diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -15,7 +15,7 @@ "logits/rejected": -2.057170867919922, "logps/chosen": -246.4422607421875, "logps/rejected": -173.7652587890625, - "loss": 10887.5, + "loss": 7612.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,3024 +27,3024 @@ { "epoch": 0.01, "learning_rate": 2.6881720430107527e-08, - "logits/chosen": -2.3337254524230957, - "logits/rejected": -2.1098954677581787, - "logps/chosen": -199.181396484375, - "logps/rejected": -169.34323120117188, - "loss": 12433.158, - "rewards/accuracies": 0.4409722089767456, - "rewards/chosen": -0.00023617834085598588, - "rewards/margins": -0.0004530403239186853, - "rewards/rejected": 0.0002168620121665299, - "rewards/safe_rewards": -0.0001589212188264355, - "rewards/unsafe_rewards": -0.0003134354774374515, + "logits/chosen": -2.3338747024536133, + "logits/rejected": -2.1101577281951904, + "logps/chosen": -199.17718505859375, + "logps/rejected": -169.33853149414062, + "loss": 8542.1936, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": -0.00019407388754189014, + "rewards/margins": -0.0004579208616632968, + "rewards/rejected": 0.00026384700322523713, + "rewards/safe_rewards": -7.85427400842309e-05, + "rewards/unsafe_rewards": -0.0003096049767918885, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.3763440860215054e-08, - "logits/chosen": -2.329373836517334, - "logits/rejected": -2.085947036743164, - "logps/chosen": -215.31210327148438, - "logps/rejected": -176.8895263671875, - "loss": 12634.6484, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -0.0002022087574005127, - "rewards/margins": -0.00027598050655797124, - "rewards/rejected": 7.377170550171286e-05, - "rewards/safe_rewards": 0.00012937009159941226, - "rewards/unsafe_rewards": -0.0005337875918485224, + "logits/chosen": -2.3297791481018066, + "logits/rejected": -2.0859668254852295, + "logps/chosen": -215.31082153320312, + "logps/rejected": -176.90184020996094, + "loss": 8615.6992, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -0.0001893683074740693, + "rewards/margins": -0.00013989376020617783, + "rewards/rejected": -4.9474612751509994e-05, + "rewards/safe_rewards": 2.0141320419497788e-05, + "rewards/unsafe_rewards": -0.00039887792081572115, "step": 20 }, { "epoch": 0.02, "learning_rate": 8.064516129032257e-08, - "logits/chosen": -2.3231472969055176, - "logits/rejected": -2.1040878295898438, - "logps/chosen": -199.29074096679688, - "logps/rejected": -180.7845916748047, - "loss": 12483.55, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.0001314308901783079, - "rewards/margins": 0.0002524043375160545, - "rewards/rejected": -0.00012097340368200094, - "rewards/safe_rewards": -4.216630622977391e-05, - "rewards/unsafe_rewards": 0.00030502810841426253, + "logits/chosen": -2.3232197761535645, + "logits/rejected": -2.104114532470703, + "logps/chosen": -199.29653930664062, + "logps/rejected": -180.81167602539062, + "loss": 8655.0875, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 7.340547745116055e-05, + "rewards/margins": 0.00046515712165273726, + "rewards/rejected": -0.00039175161509774625, + "rewards/safe_rewards": 0.00016999247600324452, + "rewards/unsafe_rewards": -2.318151564395521e-05, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0752688172043011e-07, - "logits/chosen": -2.269274950027466, - "logits/rejected": -1.9993311166763306, - "logps/chosen": -197.7371368408203, - "logps/rejected": -177.80197143554688, - "loss": 12187.3875, - "rewards/accuracies": 0.5531250238418579, - "rewards/chosen": 7.160066161304712e-05, - "rewards/margins": 0.0010266354074701667, - "rewards/rejected": -0.0009550346876494586, - "rewards/safe_rewards": 6.697652861475945e-05, - "rewards/unsafe_rewards": 7.622483099112287e-05, + "logits/chosen": -2.26924991607666, + "logits/rejected": -1.9993360042572021, + "logps/chosen": -197.77706909179688, + "logps/rejected": -177.79495239257812, + "loss": 8307.5406, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0003278615477029234, + "rewards/margins": 0.000557084393221885, + "rewards/rejected": -0.000884945853613317, + "rewards/safe_rewards": -0.00021670451678801328, + "rewards/unsafe_rewards": -0.0004390186513774097, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3440860215053762e-07, - "logits/chosen": -2.377847671508789, - "logits/rejected": -2.0819365978240967, - "logps/chosen": -191.5760040283203, - "logps/rejected": -162.4053955078125, - "loss": 12239.6664, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": 0.00046143392683006823, - "rewards/margins": 0.0037944589275866747, - "rewards/rejected": -0.0033330251462757587, - "rewards/safe_rewards": 2.9730377718806267e-05, - "rewards/unsafe_rewards": 0.0008931377669796348, + "logits/chosen": -2.3771767616271973, + "logits/rejected": -2.081153392791748, + "logps/chosen": -191.57861328125, + "logps/rejected": -162.315185546875, + "loss": 8305.5875, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0004353286640252918, + "rewards/margins": 0.0028661643154919147, + "rewards/rejected": -0.0024308357387781143, + "rewards/safe_rewards": -0.00014582322910428047, + "rewards/unsafe_rewards": 0.0010164804989472032, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.6129032258064515e-07, - "logits/chosen": -2.3524489402770996, - "logits/rejected": -2.135829448699951, - "logps/chosen": -186.65359497070312, - "logps/rejected": -175.50205993652344, - "loss": 12036.8937, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.0014410190051421523, - "rewards/margins": 0.003365521552041173, - "rewards/rejected": -0.0048065404407680035, - "rewards/safe_rewards": -0.001406198600307107, - "rewards/unsafe_rewards": -0.0014758387114852667, + "logits/chosen": -2.350670576095581, + "logits/rejected": -2.13375186920166, + "logps/chosen": -186.54489135742188, + "logps/rejected": -175.369873046875, + "loss": 8113.0641, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00035417350591160357, + "rewards/margins": 0.0031304885633289814, + "rewards/rejected": -0.0034846621565520763, + "rewards/safe_rewards": -0.0002569267526268959, + "rewards/unsafe_rewards": -0.0004514198808465153, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8817204301075268e-07, - "logits/chosen": -2.324812650680542, - "logits/rejected": -2.111722469329834, - "logps/chosen": -221.7294464111328, - "logps/rejected": -180.26425170898438, - "loss": 11819.9391, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.0033922013826668262, - "rewards/margins": 0.007680011447519064, - "rewards/rejected": -0.01107221283018589, - "rewards/safe_rewards": -0.004657374229282141, - "rewards/unsafe_rewards": -0.0021270285360515118, + "logits/chosen": -2.322838306427002, + "logits/rejected": -2.109368085861206, + "logps/chosen": -221.5357208251953, + "logps/rejected": -179.95974731445312, + "loss": 8132.6656, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.0014549014158546925, + "rewards/margins": 0.006572114769369364, + "rewards/rejected": -0.008027016185224056, + "rewards/safe_rewards": -0.00250421604141593, + "rewards/unsafe_rewards": -0.00040558649925515056, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.1505376344086022e-07, - "logits/chosen": -2.3345985412597656, - "logits/rejected": -2.1188461780548096, - "logps/chosen": -199.0458984375, - "logps/rejected": -179.85231018066406, - "loss": 12035.2875, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.016937915235757828, - "rewards/margins": 0.012463906779885292, - "rewards/rejected": -0.02940182387828827, - "rewards/safe_rewards": -0.01989533193409443, - "rewards/unsafe_rewards": -0.013980497606098652, + "logits/chosen": -2.331857919692993, + "logits/rejected": -2.11533784866333, + "logps/chosen": -198.42869567871094, + "logps/rejected": -178.8989715576172, + "loss": 8433.2156, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.010765710845589638, + "rewards/margins": 0.009102851152420044, + "rewards/rejected": -0.019868558272719383, + "rewards/safe_rewards": -0.01374301128089428, + "rewards/unsafe_rewards": -0.007788407150655985, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.4193548387096775e-07, - "logits/chosen": -2.340318202972412, - "logits/rejected": -2.109419822692871, - "logps/chosen": -220.1958465576172, - "logps/rejected": -175.22171020507812, - "loss": 12008.7812, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.040661171078681946, - "rewards/margins": 0.02644166722893715, - "rewards/rejected": -0.0671028420329094, - "rewards/safe_rewards": -0.0411004014313221, - "rewards/unsafe_rewards": -0.04022195190191269, + "logits/chosen": -2.335395097732544, + "logits/rejected": -2.1030659675598145, + "logps/chosen": -218.33627319335938, + "logps/rejected": -172.68142700195312, + "loss": 8180.8656, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.022065121680498123, + "rewards/margins": 0.01963471807539463, + "rewards/rejected": -0.0416998453438282, + "rewards/safe_rewards": -0.02183777093887329, + "rewards/unsafe_rewards": -0.022292476147413254, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.6881720430107523e-07, - "logits/chosen": -2.3278441429138184, - "logits/rejected": -2.1185097694396973, - "logps/chosen": -209.53744506835938, - "logps/rejected": -202.1396942138672, - "loss": 11892.225, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.0774240791797638, - "rewards/margins": 0.0386374369263649, - "rewards/rejected": -0.11606152355670929, - "rewards/safe_rewards": -0.07736717909574509, - "rewards/unsafe_rewards": -0.0774809718132019, + "logits/chosen": -2.324553966522217, + "logits/rejected": -2.1137032508850098, + "logps/chosen": -206.8110809326172, + "logps/rejected": -198.6154327392578, + "loss": 8117.1062, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.050160281360149384, + "rewards/margins": 0.0306589063256979, + "rewards/rejected": -0.08081920444965363, + "rewards/safe_rewards": -0.04988502338528633, + "rewards/unsafe_rewards": -0.05043555423617363, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.956989247311828e-07, - "logits/chosen": -2.336418628692627, - "logits/rejected": -2.100806713104248, - "logps/chosen": -218.48379516601562, - "logps/rejected": -191.77542114257812, - "loss": 12213.7945, + "logits/chosen": -2.341404914855957, + "logits/rejected": -2.1051011085510254, + "logps/chosen": -215.013427734375, + "logps/rejected": -187.58670043945312, + "loss": 8260.3953, "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.11473993957042694, - "rewards/margins": 0.04081073775887489, - "rewards/rejected": -0.15555070340633392, - "rewards/safe_rewards": -0.11462344229221344, - "rewards/unsafe_rewards": -0.11485648155212402, + "rewards/chosen": -0.0800362378358841, + "rewards/margins": 0.03362729027867317, + "rewards/rejected": -0.11366353929042816, + "rewards/safe_rewards": -0.079714335501194, + "rewards/unsafe_rewards": -0.08035816252231598, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.225806451612903e-07, - "logits/chosen": -2.3020033836364746, - "logits/rejected": -2.040771961212158, - "logps/chosen": -215.15774536132812, - "logps/rejected": -185.8181915283203, - "loss": 11595.3016, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.111789271235466, - "rewards/margins": 0.057957328855991364, - "rewards/rejected": -0.16974660754203796, - "rewards/safe_rewards": -0.1049695611000061, - "rewards/unsafe_rewards": -0.1186089739203453, + "logits/chosen": -2.317996025085449, + "logits/rejected": -2.05900502204895, + "logps/chosen": -213.1697235107422, + "logps/rejected": -182.4001922607422, + "loss": 7881.807, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09190914034843445, + "rewards/margins": 0.043657559901475906, + "rewards/rejected": -0.13556669652462006, + "rewards/safe_rewards": -0.08523599803447723, + "rewards/unsafe_rewards": -0.09858228266239166, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.4946236559139783e-07, - "logits/chosen": -2.3317370414733887, - "logits/rejected": -2.075387954711914, - "logps/chosen": -221.8496551513672, - "logps/rejected": -185.46388244628906, - "loss": 11215.5039, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.10837243497371674, - "rewards/margins": 0.07047604024410248, - "rewards/rejected": -0.1788484752178192, - "rewards/safe_rewards": -0.09084135293960571, - "rewards/unsafe_rewards": -0.12590351700782776, + "logits/chosen": -2.354461193084717, + "logits/rejected": -2.1032989025115967, + "logps/chosen": -220.25735473632812, + "logps/rejected": -181.8567657470703, + "loss": 7824.5234, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.09244942665100098, + "rewards/margins": 0.05032774806022644, + "rewards/rejected": -0.14277717471122742, + "rewards/safe_rewards": -0.07864584028720856, + "rewards/unsafe_rewards": -0.1062530130147934, "step": 130 }, { "epoch": 0.08, "learning_rate": 3.7634408602150537e-07, - "logits/chosen": -2.3039958477020264, - "logits/rejected": -2.056906223297119, - "logps/chosen": -236.0045928955078, - "logps/rejected": -204.21005249023438, - "loss": 10992.7797, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.16659508645534515, - "rewards/margins": 0.07402800768613815, - "rewards/rejected": -0.2406230866909027, - "rewards/safe_rewards": -0.15856191515922546, - "rewards/unsafe_rewards": -0.17462827265262604, + "logits/chosen": -2.331094741821289, + "logits/rejected": -2.0899605751037598, + "logps/chosen": -230.6691131591797, + "logps/rejected": -196.5701141357422, + "loss": 7728.4891, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1132405549287796, + "rewards/margins": 0.05098314210772514, + "rewards/rejected": -0.16422370076179504, + "rewards/safe_rewards": -0.10501746088266373, + "rewards/unsafe_rewards": -0.12146364152431488, "step": 140 }, { "epoch": 0.08, "learning_rate": 4.0322580645161285e-07, - "logits/chosen": -2.2005298137664795, - "logits/rejected": -1.986655831336975, - "logps/chosen": -225.0756378173828, - "logps/rejected": -196.11924743652344, - "loss": 10241.9297, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.20789048075675964, - "rewards/margins": 0.09014638513326645, - "rewards/rejected": -0.2980369031429291, - "rewards/safe_rewards": -0.21130767464637756, - "rewards/unsafe_rewards": -0.20447330176830292, + "logits/chosen": -2.230912208557129, + "logits/rejected": -2.0194005966186523, + "logps/chosen": -217.6083526611328, + "logps/rejected": -185.86074829101562, + "loss": 7080.1617, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1332172453403473, + "rewards/margins": 0.062234263867139816, + "rewards/rejected": -0.1954515129327774, + "rewards/safe_rewards": -0.13880565762519836, + "rewards/unsafe_rewards": -0.1276288479566574, "step": 150 }, { "epoch": 0.09, "learning_rate": 4.3010752688172043e-07, - "logits/chosen": -2.113193988800049, - "logits/rejected": -1.8298135995864868, - "logps/chosen": -229.54745483398438, - "logps/rejected": -214.3673858642578, - "loss": 11174.7664, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.20709700882434845, - "rewards/margins": 0.07644233852624893, - "rewards/rejected": -0.283539354801178, - "rewards/safe_rewards": -0.2122730314731598, - "rewards/unsafe_rewards": -0.2019209861755371, + "logits/chosen": -2.15290904045105, + "logits/rejected": -1.8807637691497803, + "logps/chosen": -228.8234405517578, + "logps/rejected": -211.19802856445312, + "loss": 7557.0047, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.1998569667339325, + "rewards/margins": 0.051988668739795685, + "rewards/rejected": -0.2518456280231476, + "rewards/safe_rewards": -0.20650985836982727, + "rewards/unsafe_rewards": -0.19320407509803772, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.569892473118279e-07, - "logits/chosen": -2.0915558338165283, - "logits/rejected": -1.7845208644866943, - "logps/chosen": -209.046142578125, - "logps/rejected": -181.61386108398438, - "loss": 10357.8453, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.16017496585845947, - "rewards/margins": 0.08110227435827255, - "rewards/rejected": -0.24127721786499023, - "rewards/safe_rewards": -0.15518422424793243, - "rewards/unsafe_rewards": -0.16516566276550293, + "logits/chosen": -2.163496494293213, + "logits/rejected": -1.8812412023544312, + "logps/chosen": -209.8403778076172, + "logps/rejected": -179.97496032714844, + "loss": 7154.7875, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.16811709105968475, + "rewards/margins": 0.05677107721567154, + "rewards/rejected": -0.2248881608247757, + "rewards/safe_rewards": -0.1681157797574997, + "rewards/unsafe_rewards": -0.16811838746070862, "step": 170 }, { "epoch": 0.1, "learning_rate": 4.838709677419355e-07, - "logits/chosen": -1.9759992361068726, - "logits/rejected": -1.6107635498046875, - "logps/chosen": -224.05850219726562, - "logps/rejected": -208.45144653320312, - "loss": 10663.718, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.23630261421203613, - "rewards/margins": 0.09500019252300262, - "rewards/rejected": -0.33130279183387756, - "rewards/safe_rewards": -0.23541903495788574, - "rewards/unsafe_rewards": -0.23718611896038055, + "logits/chosen": -2.079279899597168, + "logits/rejected": -1.7519344091415405, + "logps/chosen": -228.12905883789062, + "logps/rejected": -209.23184204101562, + "loss": 7375.8891, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.2770082652568817, + "rewards/margins": 0.06209835410118103, + "rewards/rejected": -0.33910661935806274, + "rewards/safe_rewards": -0.27290791273117065, + "rewards/unsafe_rewards": -0.2811085879802704, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.999929391798331e-07, - "logits/chosen": -2.0257487297058105, - "logits/rejected": -1.5829004049301147, - "logps/chosen": -248.24057006835938, - "logps/rejected": -218.6661376953125, - "loss": 10641.9109, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.35525068640708923, - "rewards/margins": 0.11016690731048584, - "rewards/rejected": -0.4654175639152527, - "rewards/safe_rewards": -0.35236117243766785, - "rewards/unsafe_rewards": -0.35814011096954346, + "logits/chosen": -2.132680892944336, + "logits/rejected": -1.7472947835922241, + "logps/chosen": -237.369384765625, + "logps/rejected": -204.32150268554688, + "loss": 7349.0242, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.24653896689414978, + "rewards/margins": 0.07543239742517471, + "rewards/rejected": -0.3219713568687439, + "rewards/safe_rewards": -0.24422617256641388, + "rewards/unsafe_rewards": -0.24885177612304688, "step": 190 }, { "epoch": 0.11, "learning_rate": 4.9991350953333e-07, - "logits/chosen": -1.8850326538085938, - "logits/rejected": -1.4665980339050293, - "logps/chosen": -254.23593139648438, - "logps/rejected": -234.6528778076172, - "loss": 10264.4211, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.4368314743041992, - "rewards/margins": 0.07941244542598724, - "rewards/rejected": -0.5162439346313477, - "rewards/safe_rewards": -0.4102691113948822, - "rewards/unsafe_rewards": -0.46339383721351624, + "logits/chosen": -2.0583910942077637, + "logits/rejected": -1.7268041372299194, + "logps/chosen": -248.58944702148438, + "logps/rejected": -226.4416046142578, + "loss": 7112.2336, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.38036662340164185, + "rewards/margins": 0.05376458168029785, + "rewards/rejected": -0.4341312348842621, + "rewards/safe_rewards": -0.3542497754096985, + "rewards/unsafe_rewards": -0.40648356080055237, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.997458523498236e-07, - "logits/chosen": -1.944396734237671, - "logits/rejected": -1.5599733591079712, - "logps/chosen": -226.7266387939453, - "logps/rejected": -202.39404296875, - "loss": 10025.3039, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.3351445198059082, - "rewards/margins": 0.07879569381475449, - "rewards/rejected": -0.4139402508735657, - "rewards/safe_rewards": -0.3409610688686371, - "rewards/unsafe_rewards": -0.32932794094085693, + "logits/chosen": -2.1265130043029785, + "logits/rejected": -1.8143631219863892, + "logps/chosen": -225.8239288330078, + "logps/rejected": -198.24595642089844, + "loss": 7020.6945, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.32611721754074097, + "rewards/margins": 0.04634212702512741, + "rewards/rejected": -0.372459352016449, + "rewards/safe_rewards": -0.33249932527542114, + "rewards/unsafe_rewards": -0.3197351396083832, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.99490026817712e-07, - "logits/chosen": -1.9720779657363892, - "logits/rejected": -1.5801351070404053, - "logps/chosen": -241.8947296142578, - "logps/rejected": -219.57498168945312, - "loss": 10269.6352, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.3446536064147949, - "rewards/margins": 0.11220917850732803, - "rewards/rejected": -0.45686277747154236, - "rewards/safe_rewards": -0.3444424867630005, - "rewards/unsafe_rewards": -0.3448646664619446, + "logits/chosen": -2.1002330780029297, + "logits/rejected": -1.7667055130004883, + "logps/chosen": -240.4268798828125, + "logps/rejected": -214.17312622070312, + "loss": 6918.2156, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.3299749791622162, + "rewards/margins": 0.07286922633647919, + "rewards/rejected": -0.40284425020217896, + "rewards/safe_rewards": -0.33103418350219727, + "rewards/unsafe_rewards": -0.3289158344268799, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.991461232516674e-07, - "logits/chosen": -2.0816266536712646, - "logits/rejected": -1.6995646953582764, - "logps/chosen": -251.25466918945312, - "logps/rejected": -235.22006225585938, - "loss": 11163.5422, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.33530837297439575, - "rewards/margins": 0.12319612503051758, - "rewards/rejected": -0.45850443840026855, - "rewards/safe_rewards": -0.32858791947364807, - "rewards/unsafe_rewards": -0.34202873706817627, + "logits/chosen": -2.132354259490967, + "logits/rejected": -1.7921921014785767, + "logps/chosen": -249.4021453857422, + "logps/rejected": -229.88247680664062, + "loss": 7642.9477, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3167831003665924, + "rewards/margins": 0.08834515511989594, + "rewards/rejected": -0.4051283001899719, + "rewards/safe_rewards": -0.3021748661994934, + "rewards/unsafe_rewards": -0.3313913643360138, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.98714263060751e-07, - "logits/chosen": -2.211090326309204, - "logits/rejected": -1.7899425029754639, - "logps/chosen": -214.9016571044922, - "logps/rejected": -193.04257202148438, - "loss": 10506.5781, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.26967987418174744, - "rewards/margins": 0.10429898649454117, - "rewards/rejected": -0.3739788830280304, - "rewards/safe_rewards": -0.2796659469604492, - "rewards/unsafe_rewards": -0.25969380140304565, + "logits/chosen": -2.251471519470215, + "logits/rejected": -1.886639952659607, + "logps/chosen": -207.30184936523438, + "logps/rejected": -179.5860595703125, + "loss": 7244.568, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.19368217885494232, + "rewards/margins": 0.04573160782456398, + "rewards/rejected": -0.239413782954216, + "rewards/safe_rewards": -0.2037370651960373, + "rewards/unsafe_rewards": -0.18362729251384735, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.98194598705552e-07, - "logits/chosen": -2.1962857246398926, - "logits/rejected": -1.912153959274292, - "logps/chosen": -243.35693359375, - "logps/rejected": -224.67507934570312, - "loss": 10806.9805, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.40015238523483276, - "rewards/margins": 0.09695416688919067, - "rewards/rejected": -0.49710655212402344, - "rewards/safe_rewards": -0.4010702967643738, - "rewards/unsafe_rewards": -0.3992345333099365, + "logits/chosen": -2.288367748260498, + "logits/rejected": -2.053013563156128, + "logps/chosen": -224.88290405273438, + "logps/rejected": -203.2853240966797, + "loss": 7454.4141, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.21541249752044678, + "rewards/margins": 0.06779678165912628, + "rewards/rejected": -0.28320926427841187, + "rewards/safe_rewards": -0.22131112217903137, + "rewards/unsafe_rewards": -0.20951387286186218, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.975873136443648e-07, - "logits/chosen": -2.223935604095459, - "logits/rejected": -1.9479196071624756, - "logps/chosen": -256.0559997558594, - "logps/rejected": -234.63931274414062, - "loss": 9752.6734, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.36628082394599915, - "rewards/margins": 0.09993860870599747, - "rewards/rejected": -0.466219425201416, - "rewards/safe_rewards": -0.377407968044281, - "rewards/unsafe_rewards": -0.3551536500453949, + "logits/chosen": -2.33178973197937, + "logits/rejected": -2.110666036605835, + "logps/chosen": -250.9014434814453, + "logps/rejected": -225.95675659179688, + "loss": 6933.1133, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3147348463535309, + "rewards/margins": 0.06465913355350494, + "rewards/rejected": -0.379393994808197, + "rewards/safe_rewards": -0.3217589557170868, + "rewards/unsafe_rewards": -0.307710736989975, "step": 260 }, { "epoch": 0.15, "learning_rate": 4.968926222684212e-07, - "logits/chosen": -2.1337714195251465, - "logits/rejected": -1.8700491189956665, - "logps/chosen": -233.6194610595703, - "logps/rejected": -223.4718780517578, - "loss": 9947.2914, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.3725220263004303, - "rewards/margins": 0.12324249744415283, - "rewards/rejected": -0.49576449394226074, - "rewards/safe_rewards": -0.36347565054893494, - "rewards/unsafe_rewards": -0.38156840205192566, + "logits/chosen": -2.2538371086120605, + "logits/rejected": -2.0441083908081055, + "logps/chosen": -231.07321166992188, + "logps/rejected": -215.10226440429688, + "loss": 7070.5437, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.347059428691864, + "rewards/margins": 0.0650092363357544, + "rewards/rejected": -0.4120686650276184, + "rewards/safe_rewards": -0.3425368070602417, + "rewards/unsafe_rewards": -0.3515821099281311, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.961107698262044e-07, - "logits/chosen": -2.147411346435547, - "logits/rejected": -1.8175932168960571, - "logps/chosen": -252.82357788085938, - "logps/rejected": -226.16012573242188, - "loss": 10183.2313, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.41806760430336, - "rewards/margins": 0.09029438346624374, - "rewards/rejected": -0.5083619952201843, - "rewards/safe_rewards": -0.4310055375099182, - "rewards/unsafe_rewards": -0.40512973070144653, + "logits/chosen": -2.240346670150757, + "logits/rejected": -1.963796615600586, + "logps/chosen": -245.887939453125, + "logps/rejected": -214.9673309326172, + "loss": 7118.2719, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3487110733985901, + "rewards/margins": 0.047723181545734406, + "rewards/rejected": -0.3964342474937439, + "rewards/safe_rewards": -0.36569300293922424, + "rewards/unsafe_rewards": -0.33172911405563354, "step": 280 }, { "epoch": 0.16, "learning_rate": 4.952420323368673e-07, - "logits/chosen": -2.1101536750793457, - "logits/rejected": -1.7720788717269897, - "logps/chosen": -244.27499389648438, - "logps/rejected": -226.966552734375, - "loss": 10267.0086, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.40319761633872986, - "rewards/margins": 0.12694020569324493, - "rewards/rejected": -0.5301378965377808, - "rewards/safe_rewards": -0.39970266819000244, - "rewards/unsafe_rewards": -0.40669265389442444, + "logits/chosen": -2.258303642272949, + "logits/rejected": -1.9825427532196045, + "logps/chosen": -243.5653839111328, + "logps/rejected": -221.63369750976562, + "loss": 7056.0906, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3961013853549957, + "rewards/margins": 0.0807078555226326, + "rewards/rejected": -0.4768092632293701, + "rewards/safe_rewards": -0.39416471123695374, + "rewards/unsafe_rewards": -0.3980380594730377, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.942867164927899e-07, - "logits/chosen": -2.1024351119995117, - "logits/rejected": -1.8436143398284912, - "logps/chosen": -236.44937133789062, - "logps/rejected": -222.50283813476562, - "loss": 10763.8734, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.350605845451355, - "rewards/margins": 0.12936124205589294, - "rewards/rejected": -0.47996705770492554, - "rewards/safe_rewards": -0.33073925971984863, - "rewards/unsafe_rewards": -0.37047240138053894, + "logits/chosen": -2.2862093448638916, + "logits/rejected": -2.087953805923462, + "logps/chosen": -232.6704864501953, + "logps/rejected": -214.80374145507812, + "loss": 7359.575, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3128170967102051, + "rewards/margins": 0.09015890210866928, + "rewards/rejected": -0.40297597646713257, + "rewards/safe_rewards": -0.2959223985671997, + "rewards/unsafe_rewards": -0.3297117352485657, "step": 300 }, { "epoch": 0.17, "learning_rate": 4.932451595513062e-07, - "logits/chosen": -2.1173605918884277, - "logits/rejected": -1.754899263381958, - "logps/chosen": -264.0268249511719, - "logps/rejected": -244.3734893798828, - "loss": 9657.9672, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.40277355909347534, - "rewards/margins": 0.13846410810947418, - "rewards/rejected": -0.5412376523017883, - "rewards/safe_rewards": -0.4034086763858795, - "rewards/unsafe_rewards": -0.40213847160339355, + "logits/chosen": -2.278350591659546, + "logits/rejected": -1.9919618368148804, + "logps/chosen": -261.5721740722656, + "logps/rejected": -237.3791961669922, + "loss": 6861.6203, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.3782269060611725, + "rewards/margins": 0.09306754171848297, + "rewards/rejected": -0.4712944030761719, + "rewards/safe_rewards": -0.37359195947647095, + "rewards/unsafe_rewards": -0.38286182284355164, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.921177292156419e-07, - "logits/chosen": -2.1319141387939453, - "logits/rejected": -1.7121671438217163, - "logps/chosen": -235.3206787109375, - "logps/rejected": -223.6913604736328, - "loss": 9518.4141, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": -0.3673807978630066, - "rewards/margins": 0.12836968898773193, - "rewards/rejected": -0.4957505166530609, - "rewards/safe_rewards": -0.3563651442527771, - "rewards/unsafe_rewards": -0.37839651107788086, + "logits/chosen": -2.2868103981018066, + "logits/rejected": -1.9530357122421265, + "logps/chosen": -238.9241485595703, + "logps/rejected": -221.6173095703125, + "loss": 6779.8594, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.4034157395362854, + "rewards/margins": 0.0715942531824112, + "rewards/rejected": -0.4750100076198578, + "rewards/safe_rewards": -0.39607614278793335, + "rewards/unsafe_rewards": -0.41075533628463745, "step": 320 }, { "epoch": 0.18, "learning_rate": 4.909048235051033e-07, - "logits/chosen": -2.0745625495910645, - "logits/rejected": -1.8059850931167603, - "logps/chosen": -234.7843017578125, - "logps/rejected": -227.23898315429688, - "loss": 10123.7766, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": -0.3189774751663208, - "rewards/margins": 0.14415447413921356, - "rewards/rejected": -0.46313196420669556, - "rewards/safe_rewards": -0.3179768919944763, - "rewards/unsafe_rewards": -0.3199780583381653, + "logits/chosen": -2.248591184616089, + "logits/rejected": -2.041801929473877, + "logps/chosen": -233.26199340820312, + "logps/rejected": -221.5611572265625, + "loss": 6971.7297, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.30375435948371887, + "rewards/margins": 0.10259900242090225, + "rewards/rejected": -0.40635329484939575, + "rewards/safe_rewards": -0.3013189435005188, + "rewards/unsafe_rewards": -0.30618971586227417, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.896068706145631e-07, - "logits/chosen": -2.053877353668213, - "logits/rejected": -1.679033637046814, - "logps/chosen": -257.50213623046875, - "logps/rejected": -221.5078582763672, - "loss": 10452.5141, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.4782954752445221, - "rewards/margins": 0.11873143911361694, - "rewards/rejected": -0.5970267653465271, - "rewards/safe_rewards": -0.46968308091163635, - "rewards/unsafe_rewards": -0.4869077801704407, + "logits/chosen": -2.239377975463867, + "logits/rejected": -1.9611393213272095, + "logps/chosen": -252.949951171875, + "logps/rejected": -212.8416290283203, + "loss": 7127.9594, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.43277350068092346, + "rewards/margins": 0.07759107649326324, + "rewards/rejected": -0.5103645920753479, + "rewards/safe_rewards": -0.42603859305381775, + "rewards/unsafe_rewards": -0.4395083785057068, "step": 340 }, { "epoch": 0.19, "learning_rate": 4.882243287632946e-07, - "logits/chosen": -2.0686936378479004, - "logits/rejected": -1.7236442565917969, - "logps/chosen": -238.40576171875, - "logps/rejected": -228.39682006835938, - "loss": 9810.4336, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.47843924164772034, - "rewards/margins": 0.13079717755317688, - "rewards/rejected": -0.6092364192008972, - "rewards/safe_rewards": -0.4754910469055176, - "rewards/unsafe_rewards": -0.48138752579689026, + "logits/chosen": -2.258354425430298, + "logits/rejected": -1.9823997020721436, + "logps/chosen": -240.0575714111328, + "logps/rejected": -224.93618774414062, + "loss": 6857.2828, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4949573576450348, + "rewards/margins": 0.07967236638069153, + "rewards/rejected": -0.5746296644210815, + "rewards/safe_rewards": -0.4892745912075043, + "rewards/unsafe_rewards": -0.5006400346755981, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.867576860332048e-07, - "logits/chosen": -2.110980749130249, - "logits/rejected": -1.7398662567138672, - "logps/chosen": -221.39352416992188, - "logps/rejected": -209.3349609375, - "loss": 9735.3125, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.3842235207557678, - "rewards/margins": 0.13459259271621704, - "rewards/rejected": -0.5188161134719849, - "rewards/safe_rewards": -0.388908326625824, - "rewards/unsafe_rewards": -0.37953871488571167, + "logits/chosen": -2.325746774673462, + "logits/rejected": -2.0531835556030273, + "logps/chosen": -223.86062622070312, + "logps/rejected": -206.43603515625, + "loss": 6847.6039, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.4088946282863617, + "rewards/margins": 0.08093228191137314, + "rewards/rejected": -0.48982691764831543, + "rewards/safe_rewards": -0.41694265604019165, + "rewards/unsafe_rewards": -0.4008466303348541, "step": 360 }, { "epoch": 0.2, "learning_rate": 4.85207460196526e-07, - "logits/chosen": -2.033064365386963, - "logits/rejected": -1.7036364078521729, - "logps/chosen": -249.37808227539062, - "logps/rejected": -240.7008056640625, - "loss": 9924.3289, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.48289600014686584, - "rewards/margins": 0.12102627754211426, - "rewards/rejected": -0.6039222478866577, - "rewards/safe_rewards": -0.49458175897598267, - "rewards/unsafe_rewards": -0.47121015191078186, + "logits/chosen": -2.2898755073547363, + "logits/rejected": -2.0400357246398926, + "logps/chosen": -238.4681396484375, + "logps/rejected": -224.0597686767578, + "loss": 6861.7656, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.373796671628952, + "rewards/margins": 0.06371524930000305, + "rewards/rejected": -0.4375119209289551, + "rewards/safe_rewards": -0.3857375979423523, + "rewards/unsafe_rewards": -0.361855685710907, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.835741985330259e-07, - "logits/chosen": -1.9914309978485107, - "logits/rejected": -1.6691398620605469, - "logps/chosen": -241.2218475341797, - "logps/rejected": -221.5822296142578, - "loss": 9384.0953, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.4463868737220764, - "rewards/margins": 0.12155318260192871, - "rewards/rejected": -0.5679400563240051, - "rewards/safe_rewards": -0.4332394599914551, - "rewards/unsafe_rewards": -0.45953425765037537, + "logits/chosen": -2.275742292404175, + "logits/rejected": -2.050511121749878, + "logps/chosen": -232.7757110595703, + "logps/rejected": -209.2894287109375, + "loss": 6446.7797, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.3619254231452942, + "rewards/margins": 0.08308672159910202, + "rewards/rejected": -0.4450121521949768, + "rewards/safe_rewards": -0.3546380400657654, + "rewards/unsafe_rewards": -0.3692127764225006, "step": 380 }, { "epoch": 0.21, "learning_rate": 4.818584776367992e-07, - "logits/chosen": -1.8350143432617188, - "logits/rejected": -1.5597480535507202, - "logps/chosen": -243.4880828857422, - "logps/rejected": -234.6859588623047, - "loss": 9768.3547, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.35229426622390747, - "rewards/margins": 0.12931743264198303, - "rewards/rejected": -0.4816117286682129, - "rewards/safe_rewards": -0.3771669864654541, - "rewards/unsafe_rewards": -0.32742154598236084, + "logits/chosen": -2.1700243949890137, + "logits/rejected": -1.9761574268341064, + "logps/chosen": -248.4570770263672, + "logps/rejected": -236.026611328125, + "loss": 6785.3953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4019840657711029, + "rewards/margins": 0.09303420037031174, + "rewards/rejected": -0.49501824378967285, + "rewards/safe_rewards": -0.41988474130630493, + "rewards/unsafe_rewards": -0.38408344984054565, "step": 390 }, { "epoch": 0.22, "learning_rate": 4.800609032127122e-07, - "logits/chosen": -1.8959261178970337, - "logits/rejected": -1.4644628763198853, - "logps/chosen": -245.52548217773438, - "logps/rejected": -224.02798461914062, - "loss": 10171.7398, + "logits/chosen": -2.213772773742676, + "logits/rejected": -1.9254707098007202, + "logps/chosen": -244.1571044921875, + "logps/rejected": -219.7991943359375, + "loss": 6917.5641, "rewards/accuracies": 0.609375, - "rewards/chosen": -0.3962332010269165, - "rewards/margins": 0.09811348468065262, - "rewards/rejected": -0.4943466782569885, - "rewards/safe_rewards": -0.38441720604896545, - "rewards/unsafe_rewards": -0.40804919600486755, + "rewards/chosen": -0.3825494050979614, + "rewards/margins": 0.06950954347848892, + "rewards/rejected": -0.45205894112586975, + "rewards/safe_rewards": -0.3747442066669464, + "rewards/unsafe_rewards": -0.39035457372665405, "step": 400 }, { "epoch": 0.22, "learning_rate": 4.78182109862569e-07, - "logits/chosen": -1.7905447483062744, - "logits/rejected": -1.4820103645324707, - "logps/chosen": -240.4580841064453, - "logps/rejected": -227.4237060546875, - "loss": 10629.6891, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.46166953444480896, - "rewards/margins": 0.10201609134674072, - "rewards/rejected": -0.5636855959892273, - "rewards/safe_rewards": -0.4457937180995941, - "rewards/unsafe_rewards": -0.4775453507900238, + "logits/chosen": -2.149578809738159, + "logits/rejected": -1.9344444274902344, + "logps/chosen": -230.39987182617188, + "logps/rejected": -214.0001678466797, + "loss": 7505.7875, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.3610875904560089, + "rewards/margins": 0.06836280971765518, + "rewards/rejected": -0.4294503629207611, + "rewards/safe_rewards": -0.3561205565929413, + "rewards/unsafe_rewards": -0.36605459451675415, "step": 410 }, { "epoch": 0.23, "learning_rate": 4.7622276086107677e-07, - "logits/chosen": -1.9193427562713623, - "logits/rejected": -1.4959001541137695, - "logps/chosen": -262.74884033203125, - "logps/rejected": -236.9444122314453, - "loss": 9956.0648, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.39339667558670044, - "rewards/margins": 0.12222667783498764, - "rewards/rejected": -0.5156232714653015, - "rewards/safe_rewards": -0.4004043936729431, - "rewards/unsafe_rewards": -0.38638895750045776, + "logits/chosen": -2.2380170822143555, + "logits/rejected": -1.9596973657608032, + "logps/chosen": -260.88958740234375, + "logps/rejected": -231.9978485107422, + "loss": 7023.0203, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.3748038709163666, + "rewards/margins": 0.09135393798351288, + "rewards/rejected": -0.46615785360336304, + "rewards/safe_rewards": -0.37403541803359985, + "rewards/unsafe_rewards": -0.3755723834037781, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.741835479216879e-07, - "logits/chosen": -1.7996397018432617, - "logits/rejected": -1.3732637166976929, - "logps/chosen": -268.1848449707031, - "logps/rejected": -258.21844482421875, - "loss": 9698.1516, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.4201679229736328, - "rewards/margins": 0.12748563289642334, - "rewards/rejected": -0.5476535558700562, - "rewards/safe_rewards": -0.43537646532058716, - "rewards/unsafe_rewards": -0.4049593508243561, + "logits/chosen": -2.1961891651153564, + "logits/rejected": -1.9379104375839233, + "logps/chosen": -265.2579650878906, + "logps/rejected": -250.7912139892578, + "loss": 6681.5836, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.3908992409706116, + "rewards/margins": 0.08248193562030792, + "rewards/rejected": -0.4733811914920807, + "rewards/safe_rewards": -0.40423211455345154, + "rewards/unsafe_rewards": -0.3775663375854492, "step": 430 }, { "epoch": 0.24, "learning_rate": 4.720651909524036e-07, - "logits/chosen": -1.694494605064392, - "logits/rejected": -1.2918825149536133, - "logps/chosen": -239.9465789794922, - "logps/rejected": -224.81838989257812, - "loss": 9903.8234, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.40551429986953735, - "rewards/margins": 0.12095747143030167, - "rewards/rejected": -0.526471734046936, - "rewards/safe_rewards": -0.4219624996185303, - "rewards/unsafe_rewards": -0.38906604051589966, + "logits/chosen": -2.185619831085205, + "logits/rejected": -1.9523022174835205, + "logps/chosen": -230.352294921875, + "logps/rejected": -210.71206665039062, + "loss": 6920.7953, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.30957138538360596, + "rewards/margins": 0.0758373960852623, + "rewards/rejected": -0.38540878891944885, + "rewards/safe_rewards": -0.3266102075576782, + "rewards/unsafe_rewards": -0.2925325632095337, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.698684378016222e-07, - "logits/chosen": -1.8708194494247437, - "logits/rejected": -1.4681661128997803, - "logps/chosen": -251.16006469726562, - "logps/rejected": -224.59683227539062, - "loss": 10020.0453, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.4468216001987457, - "rewards/margins": 0.13241124153137207, - "rewards/rejected": -0.5792328119277954, - "rewards/safe_rewards": -0.44445109367370605, - "rewards/unsafe_rewards": -0.44919198751449585, + "logits/chosen": -2.2177813053131104, + "logits/rejected": -1.9565246105194092, + "logps/chosen": -239.7762451171875, + "logps/rejected": -208.4960479736328, + "loss": 6838.5781, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.3329833149909973, + "rewards/margins": 0.08524159342050552, + "rewards/rejected": -0.41822490096092224, + "rewards/safe_rewards": -0.32857412099838257, + "rewards/unsafe_rewards": -0.3373924791812897, "step": 450 }, { "epoch": 0.25, "learning_rate": 4.675940639941256e-07, - "logits/chosen": -2.047152042388916, - "logits/rejected": -1.792803406715393, - "logps/chosen": -248.07742309570312, - "logps/rejected": -235.62014770507812, - "loss": 9748.4266, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.4478784203529358, - "rewards/margins": 0.12355700880289078, - "rewards/rejected": -0.5714353322982788, - "rewards/safe_rewards": -0.44087591767311096, - "rewards/unsafe_rewards": -0.4548807740211487, + "logits/chosen": -2.1189966201782227, + "logits/rejected": -1.9002841711044312, + "logps/chosen": -242.6001739501953, + "logps/rejected": -224.87594604492188, + "loss": 6790.9219, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.393105685710907, + "rewards/margins": 0.07088775932788849, + "rewards/rejected": -0.46399348974227905, + "rewards/safe_rewards": -0.3847719132900238, + "rewards/unsafe_rewards": -0.40143948793411255, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.6524287245729286e-07, - "logits/chosen": -2.1067752838134766, - "logits/rejected": -1.8487707376480103, - "logps/chosen": -236.04287719726562, - "logps/rejected": -220.44912719726562, - "loss": 9640.7938, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.3775666654109955, - "rewards/margins": 0.16341695189476013, - "rewards/rejected": -0.5409836173057556, - "rewards/safe_rewards": -0.3839438259601593, - "rewards/unsafe_rewards": -0.3711894452571869, + "logits/chosen": -2.0616540908813477, + "logits/rejected": -1.8031222820281982, + "logps/chosen": -235.3325958251953, + "logps/rejected": -215.1667022705078, + "loss": 6658.0063, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.37046387791633606, + "rewards/margins": 0.11769552528858185, + "rewards/rejected": -0.4881593585014343, + "rewards/safe_rewards": -0.37195223569869995, + "rewards/unsafe_rewards": -0.3689754605293274, "step": 470 }, { "epoch": 0.26, "learning_rate": 4.628156932376418e-07, - "logits/chosen": -2.167776584625244, - "logits/rejected": -1.872856855392456, - "logps/chosen": -248.55838012695312, - "logps/rejected": -225.5117950439453, - "loss": 9688.1625, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.4561285078525543, - "rewards/margins": 0.14114831387996674, - "rewards/rejected": -0.5972768664360046, - "rewards/safe_rewards": -0.4667208194732666, - "rewards/unsafe_rewards": -0.44553622603416443, + "logits/chosen": -2.127551555633545, + "logits/rejected": -1.8270717859268188, + "logps/chosen": -238.7838592529297, + "logps/rejected": -211.7913360595703, + "loss": 6571.0922, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.358383446931839, + "rewards/margins": 0.10168886184692383, + "rewards/rejected": -0.4600723385810852, + "rewards/safe_rewards": -0.3699692189693451, + "rewards/unsafe_rewards": -0.3467976748943329, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-07, - "logits/chosen": -2.161594867706299, - "logits/rejected": -1.9173338413238525, - "logps/chosen": -257.06353759765625, - "logps/rejected": -258.7896423339844, - "loss": 9681.8422, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.47079768776893616, - "rewards/margins": 0.11741242557764053, - "rewards/rejected": -0.5882101058959961, - "rewards/safe_rewards": -0.4799569249153137, - "rewards/unsafe_rewards": -0.4616383910179138, + "logits/chosen": -2.1131837368011475, + "logits/rejected": -1.8676449060440063, + "logps/chosen": -252.4139404296875, + "logps/rejected": -249.065185546875, + "loss": 6808.1812, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.42430177330970764, + "rewards/margins": 0.06666339188814163, + "rewards/rejected": -0.49096518754959106, + "rewards/safe_rewards": -0.4318160116672516, + "rewards/unsafe_rewards": -0.4167875647544861, "step": 490 }, { "epoch": 0.27, "learning_rate": 4.5773682576397776e-07, - "logits/chosen": -2.180304527282715, - "logits/rejected": -1.9387527704238892, - "logps/chosen": -235.86962890625, - "logps/rejected": -215.8931427001953, - "loss": 10014.0016, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.34283483028411865, - "rewards/margins": 0.11132063716650009, - "rewards/rejected": -0.45415550470352173, - "rewards/safe_rewards": -0.34071075916290283, - "rewards/unsafe_rewards": -0.3449589014053345, + "logits/chosen": -2.147376537322998, + "logits/rejected": -1.9110462665557861, + "logps/chosen": -240.01022338867188, + "logps/rejected": -216.8660125732422, + "loss": 6785.8539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38424068689346313, + "rewards/margins": 0.07964359223842621, + "rewards/rejected": -0.46388429403305054, + "rewards/safe_rewards": -0.376841276884079, + "rewards/unsafe_rewards": -0.39164015650749207, "step": 500 }, { "epoch": 0.27, - "eval_logits/chosen": -1.766314148902893, - "eval_logits/rejected": -1.439511775970459, - "eval_logps/chosen": -193.2104034423828, - "eval_logps/rejected": -157.86143493652344, - "eval_loss": 3855.205810546875, - "eval_rewards/accuracies": 0.5693368911743164, - "eval_rewards/chosen": -0.6234450936317444, - "eval_rewards/margins": 0.0316503532230854, - "eval_rewards/rejected": -0.6550954580307007, - "eval_rewards/safe_rewards": -0.6247209906578064, - "eval_rewards/unsafe_rewards": -0.622212827205658, - "eval_runtime": 1813.9451, - "eval_samples_per_second": 18.217, - "eval_steps_per_second": 1.139, + "eval_logits/chosen": -1.7600023746490479, + "eval_logits/rejected": -1.4655544757843018, + "eval_logps/chosen": -195.498779296875, + "eval_logps/rejected": -155.7913818359375, + "eval_loss": 3119.06689453125, + "eval_rewards/accuracies": 0.47652468085289, + "eval_rewards/chosen": -0.6463291049003601, + "eval_rewards/margins": -0.011934175156056881, + "eval_rewards/rejected": -0.6343949437141418, + "eval_rewards/safe_rewards": -0.6474616527557373, + "eval_rewards/unsafe_rewards": -0.6439096927642822, + "eval_runtime": 1793.7735, + "eval_samples_per_second": 18.422, + "eval_steps_per_second": 1.152, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.5508693051414774e-07, - "logits/chosen": -2.214632987976074, - "logits/rejected": -1.9986759424209595, - "logps/chosen": -236.26504516601562, - "logps/rejected": -232.67257690429688, - "loss": 9382.2266, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.38486331701278687, - "rewards/margins": 0.14760960638523102, - "rewards/rejected": -0.5324729084968567, - "rewards/safe_rewards": -0.38839876651763916, - "rewards/unsafe_rewards": -0.38132789731025696, + "logits/chosen": -2.199183225631714, + "logits/rejected": -1.9916718006134033, + "logps/chosen": -239.27005004882812, + "logps/rejected": -231.099853515625, + "loss": 6512.4289, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4149134159088135, + "rewards/margins": 0.10183224827051163, + "rewards/rejected": -0.5167456865310669, + "rewards/safe_rewards": -0.4191102981567383, + "rewards/unsafe_rewards": -0.4107164740562439, "step": 510 }, { "epoch": 0.28, "learning_rate": 4.52364632956877e-07, - "logits/chosen": -2.153568744659424, - "logits/rejected": -1.90863835811615, - "logps/chosen": -254.1094207763672, - "logps/rejected": -228.3584747314453, - "loss": 10537.8664, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.4435926079750061, - "rewards/margins": 0.1338704377412796, - "rewards/rejected": -0.5774630308151245, - "rewards/safe_rewards": -0.4331069588661194, - "rewards/unsafe_rewards": -0.45407819747924805, + "logits/chosen": -2.1832046508789062, + "logits/rejected": -1.9507207870483398, + "logps/chosen": -252.33029174804688, + "logps/rejected": -222.3158416748047, + "loss": 7180.6562, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.42580121755599976, + "rewards/margins": 0.09123551845550537, + "rewards/rejected": -0.5170367956161499, + "rewards/safe_rewards": -0.4184451103210449, + "rewards/unsafe_rewards": -0.43315738439559937, "step": 520 }, { "epoch": 0.29, "learning_rate": 4.4957089415108895e-07, - "logits/chosen": -2.141589641571045, - "logits/rejected": -1.8750436305999756, - "logps/chosen": -234.4965362548828, - "logps/rejected": -225.8005828857422, - "loss": 9642.6016, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.45910438895225525, - "rewards/margins": 0.1458037942647934, - "rewards/rejected": -0.6049081683158875, - "rewards/safe_rewards": -0.4470369219779968, - "rewards/unsafe_rewards": -0.47117185592651367, + "logits/chosen": -2.1672799587249756, + "logits/rejected": -1.9240920543670654, + "logps/chosen": -227.26931762695312, + "logps/rejected": -213.41995239257812, + "loss": 6686.6555, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.38683217763900757, + "rewards/margins": 0.09426978975534439, + "rewards/rejected": -0.481101930141449, + "rewards/safe_rewards": -0.37474173307418823, + "rewards/unsafe_rewards": -0.3989226520061493, "step": 530 }, { "epoch": 0.29, "learning_rate": 4.467067003767745e-07, - "logits/chosen": -2.2650766372680664, - "logits/rejected": -1.9944769144058228, - "logps/chosen": -260.74176025390625, - "logps/rejected": -238.8166046142578, - "loss": 9908.1234, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.45205292105674744, - "rewards/margins": 0.15067532658576965, - "rewards/rejected": -0.6027282476425171, - "rewards/safe_rewards": -0.4535408914089203, - "rewards/unsafe_rewards": -0.4505649507045746, + "logits/chosen": -2.2515788078308105, + "logits/rejected": -1.981323003768921, + "logps/chosen": -250.74972534179688, + "logps/rejected": -223.9951629638672, + "loss": 6660.8516, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.35213276743888855, + "rewards/margins": 0.10238126665353775, + "rewards/rejected": -0.4545140266418457, + "rewards/safe_rewards": -0.35352325439453125, + "rewards/unsafe_rewards": -0.35074225068092346, "step": 540 }, { "epoch": 0.3, "learning_rate": 4.437730627868027e-07, - "logits/chosen": -2.177022933959961, - "logits/rejected": -1.876518964767456, - "logps/chosen": -224.05197143554688, - "logps/rejected": -219.69790649414062, - "loss": 9283.0344, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.4288279414176941, - "rewards/margins": 0.15696583688259125, - "rewards/rejected": -0.5857938528060913, - "rewards/safe_rewards": -0.4391719698905945, - "rewards/unsafe_rewards": -0.41848403215408325, + "logits/chosen": -2.167005777359009, + "logits/rejected": -1.8684250116348267, + "logps/chosen": -221.5258026123047, + "logps/rejected": -211.6287384033203, + "loss": 6479.7195, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.4035661816596985, + "rewards/margins": 0.10153625160455704, + "rewards/rejected": -0.5051023960113525, + "rewards/safe_rewards": -0.40968450903892517, + "rewards/unsafe_rewards": -0.39744776487350464, "step": 550 }, { "epoch": 0.3, "learning_rate": 4.4077101704995163e-07, - "logits/chosen": -2.166729688644409, - "logits/rejected": -1.8793588876724243, - "logps/chosen": -241.890625, - "logps/rejected": -240.79116821289062, - "loss": 9689.2141, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.3801850974559784, - "rewards/margins": 0.14441236853599548, - "rewards/rejected": -0.5245975255966187, - "rewards/safe_rewards": -0.38108500838279724, - "rewards/unsafe_rewards": -0.37928515672683716, + "logits/chosen": -2.207162380218506, + "logits/rejected": -1.9421132802963257, + "logps/chosen": -243.27017211914062, + "logps/rejected": -237.16085815429688, + "loss": 6695.0641, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3939805030822754, + "rewards/margins": 0.09431411325931549, + "rewards/rejected": -0.4882946014404297, + "rewards/safe_rewards": -0.3932721018791199, + "rewards/unsafe_rewards": -0.3946888744831085, "step": 560 }, { "epoch": 0.31, "learning_rate": 4.3770162298528356e-07, - "logits/chosen": -2.13301944732666, - "logits/rejected": -1.8853784799575806, - "logps/chosen": -246.5365753173828, - "logps/rejected": -228.28970336914062, - "loss": 9945.7938, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.44606542587280273, - "rewards/margins": 0.14534635841846466, - "rewards/rejected": -0.5914117693901062, - "rewards/safe_rewards": -0.4420045018196106, - "rewards/unsafe_rewards": -0.45012640953063965, + "logits/chosen": -2.230370283126831, + "logits/rejected": -2.010568857192993, + "logps/chosen": -239.39120483398438, + "logps/rejected": -215.85934448242188, + "loss": 6883.2531, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.37461167573928833, + "rewards/margins": 0.09249675273895264, + "rewards/rejected": -0.4671083986759186, + "rewards/safe_rewards": -0.37565773725509644, + "rewards/unsafe_rewards": -0.3735656142234802, "step": 570 }, { "epoch": 0.31, "learning_rate": 4.3456596418799476e-07, - "logits/chosen": -2.0352725982666016, - "logits/rejected": -1.7989652156829834, - "logps/chosen": -260.2506103515625, - "logps/rejected": -239.3428192138672, - "loss": 9081.4078, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5129576921463013, - "rewards/margins": 0.15186551213264465, - "rewards/rejected": -0.6648232340812683, - "rewards/safe_rewards": -0.5223872661590576, - "rewards/unsafe_rewards": -0.5035280585289001, + "logits/chosen": -2.1258435249328613, + "logits/rejected": -1.9077991247177124, + "logps/chosen": -251.86349487304688, + "logps/rejected": -226.2504425048828, + "loss": 6485.532, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4290865361690521, + "rewards/margins": 0.10481268167495728, + "rewards/rejected": -0.533899188041687, + "rewards/safe_rewards": -0.4373076856136322, + "rewards/unsafe_rewards": -0.42086538672447205, "step": 580 }, { "epoch": 0.32, "learning_rate": 4.313651476468715e-07, - "logits/chosen": -2.0924770832061768, - "logits/rejected": -1.8049672842025757, - "logps/chosen": -249.8899383544922, - "logps/rejected": -239.24252319335938, - "loss": 9272.5719, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.4385497570037842, - "rewards/margins": 0.1377258002758026, - "rewards/rejected": -0.5762755274772644, - "rewards/safe_rewards": -0.453767865896225, - "rewards/unsafe_rewards": -0.4233315885066986, + "logits/chosen": -2.182234764099121, + "logits/rejected": -1.9257800579071045, + "logps/chosen": -245.7446746826172, + "logps/rejected": -230.3665313720703, + "loss": 6464.5875, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3970971703529358, + "rewards/margins": 0.09041866660118103, + "rewards/rejected": -0.4875158369541168, + "rewards/safe_rewards": -0.4161418378353119, + "rewards/unsafe_rewards": -0.3780525028705597, "step": 590 }, { "epoch": 0.32, "learning_rate": 4.2810030335348693e-07, - "logits/chosen": -2.0217080116271973, - "logits/rejected": -1.762096643447876, - "logps/chosen": -262.0118713378906, - "logps/rejected": -224.1782989501953, - "loss": 9632.9078, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.43455037474632263, - "rewards/margins": 0.11642362177371979, - "rewards/rejected": -0.5509740114212036, - "rewards/safe_rewards": -0.4458708167076111, - "rewards/unsafe_rewards": -0.4232299327850342, + "logits/chosen": -2.113201379776001, + "logits/rejected": -1.8778488636016846, + "logps/chosen": -256.13543701171875, + "logps/rejected": -213.571044921875, + "loss": 6756.0938, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.37578636407852173, + "rewards/margins": 0.06911512464284897, + "rewards/rejected": -0.4449015259742737, + "rewards/safe_rewards": -0.38628315925598145, + "rewards/unsafe_rewards": -0.3652896285057068, "step": 600 }, { "epoch": 0.33, "learning_rate": 4.2477258390327806e-07, - "logits/chosen": -2.0361132621765137, - "logits/rejected": -1.7106812000274658, - "logps/chosen": -235.67434692382812, - "logps/rejected": -231.11856079101562, - "loss": 9407.8156, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.4568377137184143, - "rewards/margins": 0.17628352344036102, - "rewards/rejected": -0.6331211924552917, - "rewards/safe_rewards": -0.4708985388278961, - "rewards/unsafe_rewards": -0.44277673959732056, + "logits/chosen": -2.132603645324707, + "logits/rejected": -1.8388347625732422, + "logps/chosen": -228.72323608398438, + "logps/rejected": -217.3914031982422, + "loss": 6564.557, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.3873269259929657, + "rewards/margins": 0.10852308571338654, + "rewards/rejected": -0.4958500266075134, + "rewards/safe_rewards": -0.3940195143222809, + "rewards/unsafe_rewards": -0.3806343674659729, "step": 610 }, { "epoch": 0.33, "learning_rate": 4.2138316408864197e-07, - "logits/chosen": -2.072322368621826, - "logits/rejected": -1.7710635662078857, - "logps/chosen": -238.7259063720703, - "logps/rejected": -225.31277465820312, - "loss": 8506.775, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.42509979009628296, - "rewards/margins": 0.1986309140920639, - "rewards/rejected": -0.6237307786941528, - "rewards/safe_rewards": -0.4370320439338684, - "rewards/unsafe_rewards": -0.4131676256656647, + "logits/chosen": -2.144538402557373, + "logits/rejected": -1.8722187280654907, + "logps/chosen": -231.070068359375, + "logps/rejected": -211.3180694580078, + "loss": 5815.3586, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3485415577888489, + "rewards/margins": 0.13524214923381805, + "rewards/rejected": -0.48378363251686096, + "rewards/safe_rewards": -0.36353689432144165, + "rewards/unsafe_rewards": -0.33354613184928894, "step": 620 }, { "epoch": 0.34, "learning_rate": 4.179332404841962e-07, - "logits/chosen": -2.065585136413574, - "logits/rejected": -1.7641960382461548, - "logps/chosen": -254.5260009765625, - "logps/rejected": -238.98910522460938, - "loss": 9063.3977, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.4591439664363861, - "rewards/margins": 0.1645183116197586, - "rewards/rejected": -0.6236622929573059, - "rewards/safe_rewards": -0.4708753228187561, - "rewards/unsafe_rewards": -0.4474126398563385, + "logits/chosen": -2.1126229763031006, + "logits/rejected": -1.8241170644760132, + "logps/chosen": -250.9884490966797, + "logps/rejected": -230.13394165039062, + "loss": 6280.5078, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4237682819366455, + "rewards/margins": 0.11134223639965057, + "rewards/rejected": -0.5351104736328125, + "rewards/safe_rewards": -0.4349561631679535, + "rewards/unsafe_rewards": -0.4125804305076599, "step": 630 }, { "epoch": 0.34, "learning_rate": 4.1442403102434954e-07, - "logits/chosen": -2.0620930194854736, - "logits/rejected": -1.7930257320404053, - "logps/chosen": -259.7158203125, - "logps/rejected": -241.06173706054688, - "loss": 9654.3422, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.4739893078804016, - "rewards/margins": 0.14570489525794983, - "rewards/rejected": -0.6196941137313843, - "rewards/safe_rewards": -0.471531480550766, - "rewards/unsafe_rewards": -0.4764471650123596, + "logits/chosen": -2.112366199493408, + "logits/rejected": -1.851994276046753, + "logps/chosen": -256.3794860839844, + "logps/rejected": -232.27554321289062, + "loss": 6645.3945, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.44062596559524536, + "rewards/margins": 0.09120626747608185, + "rewards/rejected": -0.531832218170166, + "rewards/safe_rewards": -0.4353967607021332, + "rewards/unsafe_rewards": -0.4458550810813904, "step": 640 }, { "epoch": 0.35, "learning_rate": 4.108567745733318e-07, - "logits/chosen": -2.0093209743499756, - "logits/rejected": -1.6891580820083618, - "logps/chosen": -225.36874389648438, - "logps/rejected": -220.1201629638672, - "loss": 9953.3375, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.40882912278175354, - "rewards/margins": 0.1250211000442505, - "rewards/rejected": -0.5338501930236816, - "rewards/safe_rewards": -0.4028988778591156, - "rewards/unsafe_rewards": -0.4147593379020691, + "logits/chosen": -2.0930869579315186, + "logits/rejected": -1.8053241968154907, + "logps/chosen": -219.0164794921875, + "logps/rejected": -208.75341796875, + "loss": 6823.0883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.34530672430992126, + "rewards/margins": 0.07487599551677704, + "rewards/rejected": -0.4201827645301819, + "rewards/safe_rewards": -0.3363416790962219, + "rewards/unsafe_rewards": -0.354271799325943, "step": 650 }, { "epoch": 0.36, "learning_rate": 4.0723273048783426e-07, - "logits/chosen": -2.0602688789367676, - "logits/rejected": -1.7822468280792236, - "logps/chosen": -252.5499725341797, - "logps/rejected": -217.19607543945312, - "loss": 9878.5469, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.4066733419895172, - "rewards/margins": 0.11300390958786011, - "rewards/rejected": -0.5196772813796997, - "rewards/safe_rewards": -0.3962500989437103, - "rewards/unsafe_rewards": -0.4170965254306793, + "logits/chosen": -2.1528728008270264, + "logits/rejected": -1.9104461669921875, + "logps/chosen": -246.50015258789062, + "logps/rejected": -206.0511932373047, + "loss": 6722.1141, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.34617477655410767, + "rewards/margins": 0.06205359101295471, + "rewards/rejected": -0.40822833776474, + "rewards/safe_rewards": -0.33560293912887573, + "rewards/unsafe_rewards": -0.3567466139793396, "step": 660 }, { "epoch": 0.36, "learning_rate": 4.0355317817241697e-07, - "logits/chosen": -2.016270160675049, - "logits/rejected": -1.720707654953003, - "logps/chosen": -271.5939025878906, - "logps/rejected": -230.49514770507812, - "loss": 9712.3578, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.41256681084632874, - "rewards/margins": 0.12585967779159546, - "rewards/rejected": -0.5384265184402466, - "rewards/safe_rewards": -0.3883249759674072, - "rewards/unsafe_rewards": -0.43680867552757263, + "logits/chosen": -2.108217716217041, + "logits/rejected": -1.8410425186157227, + "logps/chosen": -265.3013916015625, + "logps/rejected": -219.53402709960938, + "loss": 6793.2109, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.3496417999267578, + "rewards/margins": 0.07917375862598419, + "rewards/rejected": -0.4288156032562256, + "rewards/safe_rewards": -0.3285156488418579, + "rewards/unsafe_rewards": -0.3707680106163025, "step": 670 }, { "epoch": 0.37, "learning_rate": 3.998194166278367e-07, - "logits/chosen": -2.061642646789551, - "logits/rejected": -1.7631162405014038, - "logps/chosen": -236.6819610595703, - "logps/rejected": -211.05215454101562, - "loss": 9874.0938, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4390217363834381, - "rewards/margins": 0.1045667976140976, - "rewards/rejected": -0.5435885190963745, - "rewards/safe_rewards": -0.43371590971946716, - "rewards/unsafe_rewards": -0.4443275034427643, + "logits/chosen": -2.1824183464050293, + "logits/rejected": -1.9256740808486938, + "logps/chosen": -228.2741241455078, + "logps/rejected": -197.99484252929688, + "loss": 6763.4398, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.35494309663772583, + "rewards/margins": 0.058072127401828766, + "rewards/rejected": -0.4130152761936188, + "rewards/safe_rewards": -0.353218674659729, + "rewards/unsafe_rewards": -0.3566676080226898, "step": 680 }, { "epoch": 0.37, "learning_rate": 3.9603276399245855e-07, - "logits/chosen": -2.1031622886657715, - "logits/rejected": -1.786027193069458, - "logps/chosen": -255.8426971435547, - "logps/rejected": -234.46176147460938, - "loss": 10056.725, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.4331499934196472, - "rewards/margins": 0.1870744228363037, - "rewards/rejected": -0.6202244758605957, - "rewards/safe_rewards": -0.43039584159851074, - "rewards/unsafe_rewards": -0.4359041750431061, + "logits/chosen": -2.2138686180114746, + "logits/rejected": -1.9390268325805664, + "logps/chosen": -246.8724822998047, + "logps/rejected": -219.68814086914062, + "loss": 6987.0063, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.34344762563705444, + "rewards/margins": 0.12904086709022522, + "rewards/rejected": -0.47248849272727966, + "rewards/safe_rewards": -0.3371937572956085, + "rewards/unsafe_rewards": -0.34970152378082275, "step": 690 }, { "epoch": 0.38, "learning_rate": 3.9219455707691e-07, - "logits/chosen": -2.0409488677978516, - "logits/rejected": -1.721737265586853, - "logps/chosen": -275.725341796875, - "logps/rejected": -255.3893585205078, - "loss": 9441.6844, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.533936083316803, - "rewards/margins": 0.14196796715259552, - "rewards/rejected": -0.6759039759635925, - "rewards/safe_rewards": -0.5558069348335266, - "rewards/unsafe_rewards": -0.5120651721954346, + "logits/chosen": -2.201855421066284, + "logits/rejected": -1.927875280380249, + "logps/chosen": -269.231201171875, + "logps/rejected": -243.1079559326172, + "loss": 6437.2254, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.4689943194389343, + "rewards/margins": 0.08409595489501953, + "rewards/rejected": -0.5530902147293091, + "rewards/safe_rewards": -0.4955593943595886, + "rewards/unsafe_rewards": -0.44242924451828003, "step": 700 }, { "epoch": 0.38, "learning_rate": 3.883061508921439e-07, - "logits/chosen": -1.9712127447128296, - "logits/rejected": -1.703774094581604, - "logps/chosen": -254.61148071289062, - "logps/rejected": -257.2832336425781, - "loss": 9573.2711, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.5562829971313477, - "rewards/margins": 0.10704982280731201, - "rewards/rejected": -0.6633327007293701, - "rewards/safe_rewards": -0.5785806775093079, - "rewards/unsafe_rewards": -0.5339852571487427, + "logits/chosen": -2.2111687660217285, + "logits/rejected": -2.002533197402954, + "logps/chosen": -251.5044403076172, + "logps/rejected": -248.65231323242188, + "loss": 6540.6648, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.5252124071121216, + "rewards/margins": 0.051811158657073975, + "rewards/rejected": -0.5770235061645508, + "rewards/safe_rewards": -0.5454440712928772, + "rewards/unsafe_rewards": -0.5049806833267212, "step": 710 }, { "epoch": 0.39, "learning_rate": 3.8436891817107555e-07, - "logits/chosen": -1.8322317600250244, - "logits/rejected": -1.5908830165863037, - "logps/chosen": -239.904541015625, - "logps/rejected": -233.88052368164062, - "loss": 9692.9688, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.47806644439697266, - "rewards/margins": 0.1413581669330597, - "rewards/rejected": -0.6194247007369995, - "rewards/safe_rewards": -0.48141616582870483, - "rewards/unsafe_rewards": -0.47471684217453003, + "logits/chosen": -2.166321277618408, + "logits/rejected": -2.001189947128296, + "logps/chosen": -240.1176300048828, + "logps/rejected": -228.7688446044922, + "loss": 6719.8211, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.48019713163375854, + "rewards/margins": 0.08811075985431671, + "rewards/rejected": -0.5683078765869141, + "rewards/safe_rewards": -0.48789653182029724, + "rewards/unsafe_rewards": -0.47249770164489746, "step": 720 }, { "epoch": 0.39, "learning_rate": 3.8038424888396414e-07, - "logits/chosen": -2.0533385276794434, - "logits/rejected": -1.7637081146240234, - "logps/chosen": -235.538330078125, - "logps/rejected": -233.2519073486328, - "loss": 9168.9617, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.46311601996421814, - "rewards/margins": 0.14198057353496552, - "rewards/rejected": -0.6050965785980225, - "rewards/safe_rewards": -0.4721951484680176, - "rewards/unsafe_rewards": -0.4540369510650635, + "logits/chosen": -2.277644157409668, + "logits/rejected": -2.064286708831787, + "logps/chosen": -227.665771484375, + "logps/rejected": -219.80184936523438, + "loss": 6400.2289, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.384390264749527, + "rewards/margins": 0.08620595932006836, + "rewards/rejected": -0.47059616446495056, + "rewards/safe_rewards": -0.39066264033317566, + "rewards/unsafe_rewards": -0.3781178593635559, "step": 730 }, { "epoch": 0.4, "learning_rate": 3.763535497477079e-07, - "logits/chosen": -2.088493824005127, - "logits/rejected": -1.8092867136001587, - "logps/chosen": -252.17929077148438, - "logps/rejected": -242.96066284179688, - "loss": 9250.4531, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.4875727593898773, - "rewards/margins": 0.1561814844608307, - "rewards/rejected": -0.6437541246414185, - "rewards/safe_rewards": -0.486122190952301, - "rewards/unsafe_rewards": -0.4890231490135193, + "logits/chosen": -2.284943103790283, + "logits/rejected": -2.068763017654419, + "logps/chosen": -244.8645782470703, + "logps/rejected": -230.4162139892578, + "loss": 6407.4703, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.4144255518913269, + "rewards/margins": 0.10388362407684326, + "rewards/rejected": -0.5183092355728149, + "rewards/safe_rewards": -0.4147794246673584, + "rewards/unsafe_rewards": -0.4140717387199402, "step": 740 }, { "epoch": 0.4, "learning_rate": 3.7227824372922795e-07, - "logits/chosen": -2.0786495208740234, - "logits/rejected": -1.7780708074569702, - "logps/chosen": -243.3920440673828, - "logps/rejected": -236.19357299804688, - "loss": 9091.2016, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.5411256551742554, - "rewards/margins": 0.1493724137544632, - "rewards/rejected": -0.6904980540275574, - "rewards/safe_rewards": -0.535223126411438, - "rewards/unsafe_rewards": -0.5470282435417175, + "logits/chosen": -2.2572951316833496, + "logits/rejected": -2.0249342918395996, + "logps/chosen": -233.4899444580078, + "logps/rejected": -220.7420654296875, + "loss": 6312.2859, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4421048164367676, + "rewards/margins": 0.09387814998626709, + "rewards/rejected": -0.5359830260276794, + "rewards/safe_rewards": -0.43796879053115845, + "rewards/unsafe_rewards": -0.4462409019470215, "step": 750 }, { "epoch": 0.41, "learning_rate": 3.681597695431148e-07, - "logits/chosen": -2.0165421962738037, - "logits/rejected": -1.8141343593597412, - "logps/chosen": -250.3253936767578, - "logps/rejected": -248.60049438476562, - "loss": 9329.7344, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.49013805389404297, - "rewards/margins": 0.16623175144195557, - "rewards/rejected": -0.6563698053359985, - "rewards/safe_rewards": -0.5089055895805359, - "rewards/unsafe_rewards": -0.4713704586029053, + "logits/chosen": -2.2042198181152344, + "logits/rejected": -2.0483179092407227, + "logps/chosen": -239.17190551757812, + "logps/rejected": -231.75057983398438, + "loss": 6550.8, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37860310077667236, + "rewards/margins": 0.10926772654056549, + "rewards/rejected": -0.4878707826137543, + "rewards/safe_rewards": -0.39898186922073364, + "rewards/unsafe_rewards": -0.3582242727279663, "step": 760 }, { "epoch": 0.41, "learning_rate": 3.639995811437159e-07, - "logits/chosen": -2.0057313442230225, - "logits/rejected": -1.7652851343154907, - "logps/chosen": -250.0684051513672, - "logps/rejected": -247.67379760742188, - "loss": 9334.7859, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5272955894470215, - "rewards/margins": 0.1573256254196167, - "rewards/rejected": -0.6846212148666382, - "rewards/safe_rewards": -0.5325959324836731, - "rewards/unsafe_rewards": -0.5219953060150146, + "logits/chosen": -2.19938063621521, + "logits/rejected": -2.0218288898468018, + "logps/chosen": -238.9772186279297, + "logps/rejected": -230.42684936523438, + "loss": 6573.4984, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.41638341546058655, + "rewards/margins": 0.0957685336470604, + "rewards/rejected": -0.5121519565582275, + "rewards/safe_rewards": -0.4294784963130951, + "rewards/unsafe_rewards": -0.4032882750034332, "step": 770 }, { "epoch": 0.42, "learning_rate": 3.597991472118426e-07, - "logits/chosen": -2.06282114982605, - "logits/rejected": -1.7575162649154663, - "logps/chosen": -261.662109375, - "logps/rejected": -244.2794189453125, - "loss": 9522.6227, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.5493914484977722, - "rewards/margins": 0.13128037750720978, - "rewards/rejected": -0.6806718111038208, - "rewards/safe_rewards": -0.56357741355896, - "rewards/unsafe_rewards": -0.5352054834365845, + "logits/chosen": -2.2364182472229004, + "logits/rejected": -2.0072903633117676, + "logps/chosen": -250.245361328125, + "logps/rejected": -227.9338836669922, + "loss": 6494.7406, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.43522390723228455, + "rewards/margins": 0.08199247717857361, + "rewards/rejected": -0.5172163844108582, + "rewards/safe_rewards": -0.4500158727169037, + "rewards/unsafe_rewards": -0.4204320013523102, "step": 780 }, { "epoch": 0.43, "learning_rate": 3.5555995063627836e-07, - "logits/chosen": -2.0601704120635986, - "logits/rejected": -1.7872107028961182, - "logps/chosen": -276.6053466796875, - "logps/rejected": -260.53863525390625, - "loss": 9302.9094, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5393366813659668, - "rewards/margins": 0.15365658700466156, - "rewards/rejected": -0.6929932832717896, - "rewards/safe_rewards": -0.5410887002944946, - "rewards/unsafe_rewards": -0.5375847220420837, + "logits/chosen": -2.1975796222686768, + "logits/rejected": -1.9783557653427124, + "logps/chosen": -269.57159423828125, + "logps/rejected": -248.4347381591797, + "loss": 6381.1195, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.46899929642677307, + "rewards/margins": 0.10295481979846954, + "rewards/rejected": -0.5719541311264038, + "rewards/safe_rewards": -0.4677364230155945, + "rewards/unsafe_rewards": -0.47026222944259644, "step": 790 }, { "epoch": 0.43, "learning_rate": 3.512834879902715e-07, - "logits/chosen": -2.083620548248291, - "logits/rejected": -1.7811193466186523, - "logps/chosen": -242.5504913330078, - "logps/rejected": -236.78921508789062, - "loss": 9299.475, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.4886937737464905, - "rewards/margins": 0.18866673111915588, - "rewards/rejected": -0.677360475063324, - "rewards/safe_rewards": -0.47278815507888794, - "rewards/unsafe_rewards": -0.504599392414093, + "logits/chosen": -2.1973164081573486, + "logits/rejected": -1.9568490982055664, + "logps/chosen": -239.0543975830078, + "logps/rejected": -227.7292938232422, + "loss": 6457.2102, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4537328779697418, + "rewards/margins": 0.13302846252918243, + "rewards/rejected": -0.5867613554000854, + "rewards/safe_rewards": -0.43677186965942383, + "rewards/unsafe_rewards": -0.4706939160823822, "step": 800 }, { "epoch": 0.44, "learning_rate": 3.4697126900319616e-07, - "logits/chosen": -2.040707588195801, - "logits/rejected": -1.7349131107330322, - "logps/chosen": -250.6739044189453, - "logps/rejected": -235.45639038085938, - "loss": 9702.6547, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.49636808037757874, - "rewards/margins": 0.1771651953458786, - "rewards/rejected": -0.6735332608222961, - "rewards/safe_rewards": -0.4862751066684723, - "rewards/unsafe_rewards": -0.5064610242843628, + "logits/chosen": -2.1789050102233887, + "logits/rejected": -1.9364265203475952, + "logps/chosen": -245.84500122070312, + "logps/rejected": -226.31884765625, + "loss": 6765.2641, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.4480791985988617, + "rewards/margins": 0.13407856225967407, + "rewards/rejected": -0.5821577310562134, + "rewards/safe_rewards": -0.4408470690250397, + "rewards/unsafe_rewards": -0.4553113579750061, "step": 810 }, { "epoch": 0.44, "learning_rate": 3.426248160275693e-07, - "logits/chosen": -2.048630475997925, - "logits/rejected": -1.8007519245147705, - "logps/chosen": -245.1805877685547, - "logps/rejected": -239.1412353515625, - "loss": 9643.8516, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.4880673289299011, - "rewards/margins": 0.12483618408441544, - "rewards/rejected": -0.6129035949707031, - "rewards/safe_rewards": -0.5116968750953674, - "rewards/unsafe_rewards": -0.46443790197372437, + "logits/chosen": -2.1804230213165283, + "logits/rejected": -1.9869539737701416, + "logps/chosen": -239.50259399414062, + "logps/rejected": -229.2792510986328, + "loss": 6714.4187, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.43128710985183716, + "rewards/margins": 0.08299657702445984, + "rewards/rejected": -0.5142837166786194, + "rewards/safe_rewards": -0.4506538510322571, + "rewards/unsafe_rewards": -0.4119204580783844, "step": 820 }, { "epoch": 0.45, "learning_rate": 3.3824566350161094e-07, - "logits/chosen": -2.0375399589538574, - "logits/rejected": -1.7124494314193726, - "logps/chosen": -253.0242462158203, - "logps/rejected": -223.63064575195312, - "loss": 9343.9945, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.41412609815597534, - "rewards/margins": 0.16684550046920776, - "rewards/rejected": -0.5809715986251831, - "rewards/safe_rewards": -0.4085913598537445, - "rewards/unsafe_rewards": -0.4196607172489166, + "logits/chosen": -2.2021470069885254, + "logits/rejected": -1.9547889232635498, + "logps/chosen": -247.32504272460938, + "logps/rejected": -212.7538604736328, + "loss": 6502.4563, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3571341633796692, + "rewards/margins": 0.11506947129964828, + "rewards/rejected": -0.47220364212989807, + "rewards/safe_rewards": -0.3600516617298126, + "rewards/unsafe_rewards": -0.35421669483184814, "step": 830 }, { "epoch": 0.45, "learning_rate": 3.338353574075381e-07, - "logits/chosen": -1.9761826992034912, - "logits/rejected": -1.7232366800308228, - "logps/chosen": -235.00265502929688, - "logps/rejected": -225.74734497070312, - "loss": 10735.957, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.4648624062538147, - "rewards/margins": 0.12639179825782776, - "rewards/rejected": -0.5912541747093201, - "rewards/safe_rewards": -0.4789041578769684, - "rewards/unsafe_rewards": -0.4508206248283386, + "logits/chosen": -2.1721179485321045, + "logits/rejected": -1.9909629821777344, + "logps/chosen": -226.5969696044922, + "logps/rejected": -212.7884063720703, + "loss": 7247.418, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.38080552220344543, + "rewards/margins": 0.08085910975933075, + "rewards/rejected": -0.46166467666625977, + "rewards/safe_rewards": -0.3966561555862427, + "rewards/unsafe_rewards": -0.3649549186229706, "step": 840 }, { "epoch": 0.46, "learning_rate": 3.2939545472578314e-07, - "logits/chosen": -2.068755865097046, - "logits/rejected": -1.6798311471939087, - "logps/chosen": -271.3804626464844, - "logps/rejected": -238.9709014892578, - "loss": 9617.9484, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.49935182929039, - "rewards/margins": 0.11103274673223495, - "rewards/rejected": -0.6103845834732056, - "rewards/safe_rewards": -0.49606090784072876, - "rewards/unsafe_rewards": -0.5026428699493408, + "logits/chosen": -2.2441887855529785, + "logits/rejected": -1.9507675170898438, + "logps/chosen": -263.19268798828125, + "logps/rejected": -226.2810516357422, + "loss": 6521.1922, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.41747385263442993, + "rewards/margins": 0.06601213663816452, + "rewards/rejected": -0.48348602652549744, + "rewards/safe_rewards": -0.4087367653846741, + "rewards/unsafe_rewards": -0.42621102929115295, "step": 850 }, { "epoch": 0.46, "learning_rate": 3.2492752288532916e-07, - "logits/chosen": -2.0368332862854004, - "logits/rejected": -1.7316844463348389, - "logps/chosen": -239.3889617919922, - "logps/rejected": -232.4639434814453, - "loss": 9418.6711, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.4682197570800781, - "rewards/margins": 0.14151941239833832, - "rewards/rejected": -0.6097391843795776, - "rewards/safe_rewards": -0.46087655425071716, - "rewards/unsafe_rewards": -0.4755628705024719, + "logits/chosen": -2.2073259353637695, + "logits/rejected": -1.9797918796539307, + "logps/chosen": -234.3251495361328, + "logps/rejected": -222.50390625, + "loss": 6354.6242, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.41758179664611816, + "rewards/margins": 0.0925571545958519, + "rewards/rejected": -0.5101389288902283, + "rewards/safe_rewards": -0.40995293855667114, + "rewards/unsafe_rewards": -0.4252106547355652, "step": 860 }, { "epoch": 0.47, "learning_rate": 3.204331392103574e-07, - "logits/chosen": -2.0891008377075195, - "logits/rejected": -1.7016685009002686, - "logps/chosen": -259.34075927734375, - "logps/rejected": -223.6948699951172, - "loss": 9582.4391, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.4757116734981537, - "rewards/margins": 0.12248358875513077, - "rewards/rejected": -0.5981953144073486, - "rewards/safe_rewards": -0.47992807626724243, - "rewards/unsafe_rewards": -0.47149524092674255, + "logits/chosen": -2.2528116703033447, + "logits/rejected": -1.9719661474227905, + "logps/chosen": -254.8304443359375, + "logps/rejected": -214.4097137451172, + "loss": 6612.2297, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4306084215641022, + "rewards/margins": 0.07473549991846085, + "rewards/rejected": -0.5053439736366272, + "rewards/safe_rewards": -0.4317397475242615, + "rewards/unsafe_rewards": -0.42947712540626526, "step": 870 }, { "epoch": 0.47, "learning_rate": 3.159138903634006e-07, - "logits/chosen": -1.9104595184326172, - "logits/rejected": -1.6603901386260986, - "logps/chosen": -260.9939880371094, - "logps/rejected": -244.0780029296875, - "loss": 9344.7211, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5664443373680115, - "rewards/margins": 0.13713577389717102, - "rewards/rejected": -0.7035800814628601, - "rewards/safe_rewards": -0.5669212341308594, - "rewards/unsafe_rewards": -0.5659674406051636, + "logits/chosen": -2.15206241607666, + "logits/rejected": -1.9694172143936157, + "logps/chosen": -251.68887329101562, + "logps/rejected": -230.56942749023438, + "loss": 6409.4055, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.473393052816391, + "rewards/margins": 0.0951012596487999, + "rewards/rejected": -0.5684942603111267, + "rewards/safe_rewards": -0.4793036878108978, + "rewards/unsafe_rewards": -0.46748247742652893, "step": 880 }, { "epoch": 0.48, "learning_rate": 3.1137137178519977e-07, - "logits/chosen": -1.8854684829711914, - "logits/rejected": -1.5808637142181396, - "logps/chosen": -240.728759765625, - "logps/rejected": -227.60595703125, - "loss": 9511.5703, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.5657208561897278, - "rewards/margins": 0.13940104842185974, - "rewards/rejected": -0.7051218748092651, - "rewards/safe_rewards": -0.5532920360565186, - "rewards/unsafe_rewards": -0.578149676322937, + "logits/chosen": -2.1355667114257812, + "logits/rejected": -1.9253263473510742, + "logps/chosen": -233.0535888671875, + "logps/rejected": -214.96810913085938, + "loss": 6620.7125, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.4889693260192871, + "rewards/margins": 0.08977462351322174, + "rewards/rejected": -0.5787439942359924, + "rewards/safe_rewards": -0.4807513654232025, + "rewards/unsafe_rewards": -0.49718719720840454, "step": 890 }, { "epoch": 0.48, "learning_rate": 3.068071871314626e-07, - "logits/chosen": -1.8297315835952759, - "logits/rejected": -1.5169748067855835, - "logps/chosen": -242.3647918701172, - "logps/rejected": -221.9072723388672, - "loss": 9144.8594, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.48873788118362427, - "rewards/margins": 0.14859852194786072, - "rewards/rejected": -0.6373364329338074, - "rewards/safe_rewards": -0.5038073062896729, - "rewards/unsafe_rewards": -0.47366851568222046, + "logits/chosen": -2.1069016456604004, + "logits/rejected": -1.892960548400879, + "logps/chosen": -236.0681915283203, + "logps/rejected": -210.6871337890625, + "loss": 6417.7578, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.42577171325683594, + "rewards/margins": 0.09936337172985077, + "rewards/rejected": -0.5251351594924927, + "rewards/safe_rewards": -0.43935972452163696, + "rewards/unsafe_rewards": -0.4121837019920349, "step": 900 }, { "epoch": 0.49, "learning_rate": 3.022229477067205e-07, - "logits/chosen": -1.9186245203018188, - "logits/rejected": -1.613037109375, - "logps/chosen": -264.6602478027344, - "logps/rejected": -230.00244140625, - "loss": 8938.2313, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5231385231018066, - "rewards/margins": 0.1498262733221054, - "rewards/rejected": -0.6729647517204285, - "rewards/safe_rewards": -0.5162819623947144, - "rewards/unsafe_rewards": -0.5299952030181885, + "logits/chosen": -2.155628204345703, + "logits/rejected": -1.9340698719024658, + "logps/chosen": -254.1663055419922, + "logps/rejected": -214.5603790283203, + "loss": 6326.4648, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.4181991219520569, + "rewards/margins": 0.10034485161304474, + "rewards/rejected": -0.5185439586639404, + "rewards/safe_rewards": -0.40648728609085083, + "rewards/unsafe_rewards": -0.4299109876155853, "step": 910 }, { "epoch": 0.49, "learning_rate": 2.976202718954869e-07, - "logits/chosen": -1.9586397409439087, - "logits/rejected": -1.6425132751464844, - "logps/chosen": -262.44537353515625, - "logps/rejected": -252.6732940673828, - "loss": 9940.3102, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.5410134196281433, - "rewards/margins": 0.1333283931016922, - "rewards/rejected": -0.6743417978286743, - "rewards/safe_rewards": -0.5536484122276306, - "rewards/unsafe_rewards": -0.5283784866333008, + "logits/chosen": -2.1675617694854736, + "logits/rejected": -1.925453543663025, + "logps/chosen": -253.51510620117188, + "logps/rejected": -238.5836181640625, + "loss": 6792.7578, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.4517107903957367, + "rewards/margins": 0.08173434436321259, + "rewards/rejected": -0.5334451794624329, + "rewards/safe_rewards": -0.4674789309501648, + "rewards/unsafe_rewards": -0.4359425902366638, "step": 920 }, { "epoch": 0.5, "learning_rate": 2.930007845909146e-07, - "logits/chosen": -2.0048108100891113, - "logits/rejected": -1.7904495000839233, - "logps/chosen": -276.6556091308594, - "logps/rejected": -264.38922119140625, - "loss": 9747.1859, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5601319074630737, - "rewards/margins": 0.14147058129310608, - "rewards/rejected": -0.701602578163147, - "rewards/safe_rewards": -0.5517798662185669, - "rewards/unsafe_rewards": -0.5684840083122253, + "logits/chosen": -2.1799418926239014, + "logits/rejected": -2.0040533542633057, + "logps/chosen": -266.9742126464844, + "logps/rejected": -249.785888671875, + "loss": 6767.8273, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.46331778168678284, + "rewards/margins": 0.09225159883499146, + "rewards/rejected": -0.5555693507194519, + "rewards/safe_rewards": -0.45066460967063904, + "rewards/unsafe_rewards": -0.47597089409828186, "step": 930 }, { "epoch": 0.51, "learning_rate": 2.8836611662115634e-07, - "logits/chosen": -1.9245948791503906, - "logits/rejected": -1.6107795238494873, - "logps/chosen": -260.14898681640625, - "logps/rejected": -232.1200714111328, - "loss": 9986.2594, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.5849139094352722, - "rewards/margins": 0.1456686407327652, - "rewards/rejected": -0.730582594871521, - "rewards/safe_rewards": -0.5763916373252869, - "rewards/unsafe_rewards": -0.5934361219406128, + "logits/chosen": -2.0794804096221924, + "logits/rejected": -1.8282148838043213, + "logps/chosen": -250.9180450439453, + "logps/rejected": -217.40322875976562, + "loss": 6787.9852, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.49260464310646057, + "rewards/margins": 0.09080933034420013, + "rewards/rejected": -0.5834139585494995, + "rewards/safe_rewards": -0.484276682138443, + "rewards/unsafe_rewards": -0.5009325742721558, "step": 940 }, { "epoch": 0.51, "learning_rate": 2.8371790417362986e-07, - "logits/chosen": -1.9404735565185547, - "logits/rejected": -1.6653703451156616, - "logps/chosen": -247.77474975585938, - "logps/rejected": -249.87548828125, - "loss": 10431.0484, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.5242490768432617, - "rewards/margins": 0.12247468531131744, - "rewards/rejected": -0.646723747253418, - "rewards/safe_rewards": -0.5213707685470581, - "rewards/unsafe_rewards": -0.5271273851394653, + "logits/chosen": -2.115889310836792, + "logits/rejected": -1.9069702625274658, + "logps/chosen": -242.36587524414062, + "logps/rejected": -239.2466278076172, + "loss": 7101.5695, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.47016048431396484, + "rewards/margins": 0.0702749639749527, + "rewards/rejected": -0.5404354333877563, + "rewards/safe_rewards": -0.46828895807266235, + "rewards/unsafe_rewards": -0.47203201055526733, "step": 950 }, { "epoch": 0.52, "learning_rate": 2.7905778821739056e-07, - "logits/chosen": -1.9483007192611694, - "logits/rejected": -1.5984771251678467, - "logps/chosen": -254.5243682861328, - "logps/rejected": -223.98788452148438, - "loss": 9637.5156, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.4666689336299896, - "rewards/margins": 0.15206432342529297, - "rewards/rejected": -0.618733286857605, - "rewards/safe_rewards": -0.44323453307151794, - "rewards/unsafe_rewards": -0.4901033341884613, + "logits/chosen": -2.122325897216797, + "logits/rejected": -1.8465118408203125, + "logps/chosen": -248.68179321289062, + "logps/rejected": -213.2493133544922, + "loss": 6604.2383, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.40824347734451294, + "rewards/margins": 0.10310404002666473, + "rewards/rejected": -0.5113475322723389, + "rewards/safe_rewards": -0.390902042388916, + "rewards/unsafe_rewards": -0.42558494210243225, "step": 960 }, { "epoch": 0.52, "learning_rate": 2.74387413923817e-07, - "logits/chosen": -1.8648412227630615, - "logits/rejected": -1.6388845443725586, - "logps/chosen": -273.03314208984375, - "logps/rejected": -261.84686279296875, - "loss": 9818.5922, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.5642322897911072, - "rewards/margins": 0.1341981738805771, - "rewards/rejected": -0.6984304785728455, - "rewards/safe_rewards": -0.5430005788803101, - "rewards/unsafe_rewards": -0.5854640603065491, + "logits/chosen": -2.0418407917022705, + "logits/rejected": -1.8530842065811157, + "logps/chosen": -263.1913757324219, + "logps/rejected": -247.3702392578125, + "loss": 6765.8367, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.4658144414424896, + "rewards/margins": 0.08784971386194229, + "rewards/rejected": -0.5536641478538513, + "rewards/safe_rewards": -0.4485572874546051, + "rewards/unsafe_rewards": -0.4830716550350189, "step": 970 }, { "epoch": 0.53, "learning_rate": 2.69708430085812e-07, - "logits/chosen": -1.9712398052215576, - "logits/rejected": -1.6769657135009766, - "logps/chosen": -271.9842834472656, - "logps/rejected": -256.37957763671875, - "loss": 10124.4766, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6092468500137329, - "rewards/margins": 0.17060907185077667, - "rewards/rejected": -0.7798559665679932, - "rewards/safe_rewards": -0.6114919781684875, - "rewards/unsafe_rewards": -0.6070017218589783, + "logits/chosen": -2.1047558784484863, + "logits/rejected": -1.8455266952514648, + "logps/chosen": -260.87078857421875, + "logps/rejected": -240.1245574951172, + "loss": 6804.8938, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.4981120526790619, + "rewards/margins": 0.11919406801462173, + "rewards/rejected": -0.617306113243103, + "rewards/safe_rewards": -0.5019406080245972, + "rewards/unsafe_rewards": -0.494283527135849, "step": 980 }, { "epoch": 0.53, "learning_rate": 2.6502248853572504e-07, - "logits/chosen": -1.9350106716156006, - "logits/rejected": -1.6565732955932617, - "logps/chosen": -250.8092803955078, - "logps/rejected": -234.9805145263672, - "loss": 10128.2734, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.5938445329666138, - "rewards/margins": 0.12586383521556854, - "rewards/rejected": -0.7197084426879883, - "rewards/safe_rewards": -0.5987473726272583, - "rewards/unsafe_rewards": -0.588941752910614, + "logits/chosen": -2.065930128097534, + "logits/rejected": -1.8151044845581055, + "logps/chosen": -241.413818359375, + "logps/rejected": -220.83212280273438, + "loss": 6964.4953, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4998900294303894, + "rewards/margins": 0.0783344954252243, + "rewards/rejected": -0.5782245397567749, + "rewards/safe_rewards": -0.5021571516990662, + "rewards/unsafe_rewards": -0.49762290716171265, "step": 990 }, { "epoch": 0.54, "learning_rate": 2.6033124356220325e-07, - "logits/chosen": -1.9529399871826172, - "logits/rejected": -1.6564655303955078, - "logps/chosen": -246.7399139404297, - "logps/rejected": -222.8207550048828, - "loss": 9326.0547, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.4765213429927826, - "rewards/margins": 0.15468816459178925, - "rewards/rejected": -0.6312094926834106, - "rewards/safe_rewards": -0.47257018089294434, - "rewards/unsafe_rewards": -0.48047250509262085, + "logits/chosen": -2.0734333992004395, + "logits/rejected": -1.8190828561782837, + "logps/chosen": -241.6118621826172, + "logps/rejected": -212.5392303466797, + "loss": 6458.732, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.4252408444881439, + "rewards/margins": 0.10315348953008652, + "rewards/rejected": -0.528394341468811, + "rewards/safe_rewards": -0.4195174276828766, + "rewards/unsafe_rewards": -0.43096423149108887, "step": 1000 }, { "epoch": 0.54, - "eval_logits/chosen": -1.459219217300415, - "eval_logits/rejected": -1.0852044820785522, - "eval_logps/chosen": -198.62112426757812, - "eval_logps/rejected": -165.0467987060547, - "eval_loss": 3789.448974609375, - "eval_rewards/accuracies": 0.5768998265266418, - "eval_rewards/chosen": -0.6775526404380798, - "eval_rewards/margins": 0.04939630255103111, - "eval_rewards/rejected": -0.7269489765167236, - "eval_rewards/safe_rewards": -0.6763142347335815, - "eval_rewards/unsafe_rewards": -0.6752175688743591, - "eval_runtime": 1813.2799, - "eval_samples_per_second": 18.223, - "eval_steps_per_second": 1.139, + "eval_logits/chosen": -1.7035794258117676, + "eval_logits/rejected": -1.4110270738601685, + "eval_logps/chosen": -194.92967224121094, + "eval_logps/rejected": -155.5638885498047, + "eval_loss": 3057.427490234375, + "eval_rewards/accuracies": 0.4847531318664551, + "eval_rewards/chosen": -0.6406379342079163, + "eval_rewards/margins": -0.008517943322658539, + "eval_rewards/rejected": -0.6321200728416443, + "eval_rewards/safe_rewards": -0.6396505832672119, + "eval_rewards/unsafe_rewards": -0.6373612284660339, + "eval_runtime": 1793.558, + "eval_samples_per_second": 18.424, + "eval_steps_per_second": 1.152, "step": 1000 }, { "epoch": 0.54, "learning_rate": 2.55636351326173e-07, - "logits/chosen": -1.9889262914657593, - "logits/rejected": -1.725890874862671, - "logps/chosen": -260.699951171875, - "logps/rejected": -237.9333038330078, - "loss": 8729.0844, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.455497682094574, - "rewards/margins": 0.16349969804286957, - "rewards/rejected": -0.6189973950386047, - "rewards/safe_rewards": -0.45311230421066284, - "rewards/unsafe_rewards": -0.4578830301761627, + "logits/chosen": -2.128598690032959, + "logits/rejected": -1.909714937210083, + "logps/chosen": -256.15692138671875, + "logps/rejected": -227.8979949951172, + "loss": 6005.793, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4100671708583832, + "rewards/margins": 0.1085771769285202, + "rewards/rejected": -0.518644392490387, + "rewards/safe_rewards": -0.4100631773471832, + "rewards/unsafe_rewards": -0.4100712239742279, "step": 1010 }, { "epoch": 0.55, "learning_rate": 2.509394692761622e-07, - "logits/chosen": -1.934045433998108, - "logits/rejected": -1.6091651916503906, - "logps/chosen": -266.6876525878906, - "logps/rejected": -247.41781616210938, - "loss": 9747.668, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.4815899729728699, - "rewards/margins": 0.18374037742614746, - "rewards/rejected": -0.6653302907943726, - "rewards/safe_rewards": -0.484616219997406, - "rewards/unsafe_rewards": -0.47856369614601135, + "logits/chosen": -2.1080050468444824, + "logits/rejected": -1.8461472988128662, + "logps/chosen": -260.11492919921875, + "logps/rejected": -234.23159790039062, + "loss": 6730.5984, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4158630967140198, + "rewards/margins": 0.11760497093200684, + "rewards/rejected": -0.5334680676460266, + "rewards/safe_rewards": -0.413133442401886, + "rewards/unsafe_rewards": -0.41859275102615356, "step": 1020 }, { "epoch": 0.55, "learning_rate": 2.462422555631674e-07, - "logits/chosen": -1.9403159618377686, - "logits/rejected": -1.611532211303711, - "logps/chosen": -248.64016723632812, - "logps/rejected": -230.0417022705078, - "loss": 9524.2289, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.51064532995224, - "rewards/margins": 0.1778976172208786, - "rewards/rejected": -0.6885429620742798, - "rewards/safe_rewards": -0.5135669708251953, - "rewards/unsafe_rewards": -0.5077236890792847, + "logits/chosen": -2.129202127456665, + "logits/rejected": -1.863739252090454, + "logps/chosen": -240.8564910888672, + "logps/rejected": -216.32406616210938, + "loss": 6471.7312, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4328088164329529, + "rewards/margins": 0.11855790764093399, + "rewards/rejected": -0.5513667464256287, + "rewards/safe_rewards": -0.4395027160644531, + "rewards/unsafe_rewards": -0.426114946603775, "step": 1030 }, { "epoch": 0.56, "learning_rate": 2.415463684552728e-07, - "logits/chosen": -1.8627121448516846, - "logits/rejected": -1.6023216247558594, - "logps/chosen": -242.73544311523438, - "logps/rejected": -228.3376007080078, - "loss": 9847.2266, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.5525670051574707, - "rewards/margins": 0.13925854861736298, - "rewards/rejected": -0.6918255090713501, - "rewards/safe_rewards": -0.5477128028869629, - "rewards/unsafe_rewards": -0.5574211478233337, + "logits/chosen": -2.0589680671691895, + "logits/rejected": -1.8485314846038818, + "logps/chosen": -236.1913299560547, + "logps/rejected": -216.6231231689453, + "loss": 6823.7234, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.48712578415870667, + "rewards/margins": 0.08755507320165634, + "rewards/rejected": -0.5746808648109436, + "rewards/safe_rewards": -0.4804520010948181, + "rewards/unsafe_rewards": -0.4937995970249176, "step": 1040 }, { "epoch": 0.56, "learning_rate": 2.3685346575222807e-07, - "logits/chosen": -1.9260387420654297, - "logits/rejected": -1.5772655010223389, - "logps/chosen": -257.85382080078125, - "logps/rejected": -236.52359008789062, - "loss": 9434.0938, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5095101594924927, - "rewards/margins": 0.1503477394580841, - "rewards/rejected": -0.6598579287528992, - "rewards/safe_rewards": -0.522178590297699, - "rewards/unsafe_rewards": -0.4968417286872864, + "logits/chosen": -2.0947399139404297, + "logits/rejected": -1.8131996393203735, + "logps/chosen": -252.2243194580078, + "logps/rejected": -225.3450164794922, + "loss": 6464.8258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4532151222229004, + "rewards/margins": 0.09485702961683273, + "rewards/rejected": -0.5480721592903137, + "rewards/safe_rewards": -0.464500367641449, + "rewards/unsafe_rewards": -0.4419298768043518, "step": 1050 }, { "epoch": 0.57, "learning_rate": 2.321652042001919e-07, - "logits/chosen": -1.9460338354110718, - "logits/rejected": -1.5725082159042358, - "logps/chosen": -263.1464538574219, - "logps/rejected": -253.8524627685547, - "loss": 9367.7727, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5323795080184937, - "rewards/margins": 0.17391470074653625, - "rewards/rejected": -0.7062941789627075, - "rewards/safe_rewards": -0.532899022102356, - "rewards/unsafe_rewards": -0.5318600535392761, + "logits/chosen": -2.109151601791382, + "logits/rejected": -1.7986654043197632, + "logps/chosen": -256.6636657714844, + "logps/rejected": -242.2730255126953, + "loss": 6561.9937, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.46755141019821167, + "rewards/margins": 0.12294862419366837, + "rewards/rejected": -0.590499997138977, + "rewards/safe_rewards": -0.4698343276977539, + "rewards/unsafe_rewards": -0.4652685225009918, "step": 1060 }, { "epoch": 0.58, "learning_rate": 2.2748323890684662e-07, - "logits/chosen": -1.9387753009796143, - "logits/rejected": -1.628309965133667, - "logps/chosen": -252.60769653320312, - "logps/rejected": -239.25009155273438, - "loss": 9435.8875, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.5392150282859802, - "rewards/margins": 0.1553070843219757, - "rewards/rejected": -0.6945220828056335, - "rewards/safe_rewards": -0.556832492351532, - "rewards/unsafe_rewards": -0.5215975642204285, + "logits/chosen": -2.102909803390503, + "logits/rejected": -1.8447599411010742, + "logps/chosen": -245.5310821533203, + "logps/rejected": -227.4581756591797, + "loss": 6596.2875, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.46844926476478577, + "rewards/margins": 0.10815373808145523, + "rewards/rejected": -0.5766030550003052, + "rewards/safe_rewards": -0.4830314517021179, + "rewards/unsafe_rewards": -0.4538671374320984, "step": 1070 }, { "epoch": 0.58, "learning_rate": 2.2280922275709213e-07, - "logits/chosen": -1.9359849691390991, - "logits/rejected": -1.6030991077423096, - "logps/chosen": -256.9175720214844, - "logps/rejected": -248.23233032226562, - "loss": 9624.1469, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.5274389982223511, - "rewards/margins": 0.1654755175113678, - "rewards/rejected": -0.6929145455360413, - "rewards/safe_rewards": -0.5106499791145325, - "rewards/unsafe_rewards": -0.5442280769348145, + "logits/chosen": -2.111426830291748, + "logits/rejected": -1.8440996408462524, + "logps/chosen": -249.52572631835938, + "logps/rejected": -235.2782745361328, + "loss": 6598.8516, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4535207152366638, + "rewards/margins": 0.10985337197780609, + "rewards/rejected": -0.5633742213249207, + "rewards/safe_rewards": -0.4334963262081146, + "rewards/unsafe_rewards": -0.47354525327682495, "step": 1080 }, { "epoch": 0.59, "learning_rate": 2.1814480582952375e-07, - "logits/chosen": -1.9188129901885986, - "logits/rejected": -1.5995439291000366, - "logps/chosen": -259.3883056640625, - "logps/rejected": -253.4076385498047, - "loss": 9489.5648, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.5569387674331665, - "rewards/margins": 0.16238811612129211, - "rewards/rejected": -0.7193268537521362, - "rewards/safe_rewards": -0.5516430139541626, - "rewards/unsafe_rewards": -0.5622345209121704, + "logits/chosen": -2.1081595420837402, + "logits/rejected": -1.845463514328003, + "logps/chosen": -252.86978149414062, + "logps/rejected": -241.583984375, + "loss": 6642.643, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.49175339937210083, + "rewards/margins": 0.10933689773082733, + "rewards/rejected": -0.6010903716087341, + "rewards/safe_rewards": -0.49363985657691956, + "rewards/unsafe_rewards": -0.48986703157424927, "step": 1090 }, { "epoch": 0.59, "learning_rate": 2.1349163481390187e-07, - "logits/chosen": -1.8804569244384766, - "logits/rejected": -1.5835000276565552, - "logps/chosen": -243.643310546875, - "logps/rejected": -237.51461791992188, - "loss": 9540.782, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.504102349281311, - "rewards/margins": 0.15102139115333557, - "rewards/rejected": -0.655123770236969, - "rewards/safe_rewards": -0.4968837797641754, - "rewards/unsafe_rewards": -0.511320948600769, + "logits/chosen": -2.0954482555389404, + "logits/rejected": -1.8629337549209595, + "logps/chosen": -238.1507568359375, + "logps/rejected": -226.63101196289062, + "loss": 6691.6672, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.4491768479347229, + "rewards/margins": 0.09711066633462906, + "rewards/rejected": -0.5462875366210938, + "rewards/safe_rewards": -0.4436133801937103, + "rewards/unsafe_rewards": -0.45474034547805786, "step": 1100 }, { "epoch": 0.6, "learning_rate": 2.0885135242981647e-07, - "logits/chosen": -1.8656593561172485, - "logits/rejected": -1.5086779594421387, - "logps/chosen": -264.73736572265625, - "logps/rejected": -229.5301055908203, - "loss": 8907.8078, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5125385522842407, - "rewards/margins": 0.16026821732521057, - "rewards/rejected": -0.6728067398071289, - "rewards/safe_rewards": -0.5168980956077576, - "rewards/unsafe_rewards": -0.5081789493560791, + "logits/chosen": -2.0993576049804688, + "logits/rejected": -1.8268959522247314, + "logps/chosen": -258.9464111328125, + "logps/rejected": -218.21286010742188, + "loss": 6055.6734, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.45462924242019653, + "rewards/margins": 0.10500512272119522, + "rewards/rejected": -0.5596343278884888, + "rewards/safe_rewards": -0.4565979540348053, + "rewards/unsafe_rewards": -0.4526605010032654, "step": 1110 }, { "epoch": 0.6, "learning_rate": 2.0422559684675494e-07, - "logits/chosen": -1.9330692291259766, - "logits/rejected": -1.5238492488861084, - "logps/chosen": -266.99786376953125, - "logps/rejected": -233.84716796875, - "loss": 9386.2531, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5010424256324768, - "rewards/margins": 0.14765089750289917, - "rewards/rejected": -0.648693323135376, - "rewards/safe_rewards": -0.4836382269859314, - "rewards/unsafe_rewards": -0.5184466242790222, + "logits/chosen": -2.1483044624328613, + "logits/rejected": -1.8406091928482056, + "logps/chosen": -258.66229248046875, + "logps/rejected": -219.83975219726562, + "loss": 6445.002, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.4176865220069885, + "rewards/margins": 0.09093281626701355, + "rewards/rejected": -0.5086194276809692, + "rewards/safe_rewards": -0.3987955152988434, + "rewards/unsafe_rewards": -0.43657755851745605, "step": 1120 }, { "epoch": 0.61, "learning_rate": 1.9961600110577457e-07, - "logits/chosen": -1.8306465148925781, - "logits/rejected": -1.5314967632293701, - "logps/chosen": -259.0641784667969, - "logps/rejected": -257.81201171875, - "loss": 9385.1578, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5168896913528442, - "rewards/margins": 0.1426219940185547, - "rewards/rejected": -0.6595116853713989, - "rewards/safe_rewards": -0.4919961988925934, - "rewards/unsafe_rewards": -0.5417832732200623, + "logits/chosen": -2.055497646331787, + "logits/rejected": -1.8122695684432983, + "logps/chosen": -249.71914672851562, + "logps/rejected": -244.0706024169922, + "loss": 6471.9266, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4234393239021301, + "rewards/margins": 0.09865859895944595, + "rewards/rejected": -0.5220978856086731, + "rewards/safe_rewards": -0.39991191029548645, + "rewards/unsafe_rewards": -0.4469667077064514, "step": 1130 }, { "epoch": 0.61, "learning_rate": 1.950241925429867e-07, - "logits/chosen": -1.9569222927093506, - "logits/rejected": -1.6478564739227295, - "logps/chosen": -250.83657836914062, - "logps/rejected": -236.1243896484375, - "loss": 9243.4328, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.4856063425540924, - "rewards/margins": 0.14849331974983215, - "rewards/rejected": -0.6340996026992798, - "rewards/safe_rewards": -0.4814511239528656, - "rewards/unsafe_rewards": -0.4897615909576416, + "logits/chosen": -2.1423842906951904, + "logits/rejected": -1.8863725662231445, + "logps/chosen": -242.42935180664062, + "logps/rejected": -222.77133178710938, + "loss": 6285.3195, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4015336036682129, + "rewards/margins": 0.09903542697429657, + "rewards/rejected": -0.5005691051483154, + "rewards/safe_rewards": -0.4006478786468506, + "rewards/unsafe_rewards": -0.40241941809654236, "step": 1140 }, { "epoch": 0.62, "learning_rate": 1.9045179221505495e-07, - "logits/chosen": -1.8818387985229492, - "logits/rejected": -1.5910406112670898, - "logps/chosen": -275.05859375, - "logps/rejected": -250.9720001220703, - "loss": 9168.943, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5314797163009644, - "rewards/margins": 0.13968445360660553, - "rewards/rejected": -0.6711641550064087, - "rewards/safe_rewards": -0.5201884508132935, - "rewards/unsafe_rewards": -0.5427709817886353, + "logits/chosen": -2.0579514503479004, + "logits/rejected": -1.8176921606063843, + "logps/chosen": -267.66314697265625, + "logps/rejected": -238.9211883544922, + "loss": 6360.9383, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.45752495527267456, + "rewards/margins": 0.0931306928396225, + "rewards/rejected": -0.550655722618103, + "rewards/safe_rewards": -0.4510927200317383, + "rewards/unsafe_rewards": -0.4639572501182556, "step": 1150 }, { "epoch": 0.62, "learning_rate": 1.8590041432690893e-07, - "logits/chosen": -1.7795979976654053, - "logits/rejected": -1.5073283910751343, - "logps/chosen": -247.8355712890625, - "logps/rejected": -237.6390838623047, - "loss": 9365.9641, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.5587030649185181, - "rewards/margins": 0.13812333345413208, - "rewards/rejected": -0.6968263387680054, - "rewards/safe_rewards": -0.5600093603134155, - "rewards/unsafe_rewards": -0.557396650314331, + "logits/chosen": -1.9864181280136108, + "logits/rejected": -1.7629731893539429, + "logps/chosen": -241.68258666992188, + "logps/rejected": -226.5916748046875, + "loss": 6329.5895, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.497173547744751, + "rewards/margins": 0.08917877823114395, + "rewards/rejected": -0.5863522887229919, + "rewards/safe_rewards": -0.49702000617980957, + "rewards/unsafe_rewards": -0.49732694029808044, "step": 1160 }, { "epoch": 0.63, "learning_rate": 1.813716656618788e-07, - "logits/chosen": -1.8242908716201782, - "logits/rejected": -1.5308293104171753, - "logps/chosen": -241.4901885986328, - "logps/rejected": -229.1571807861328, - "loss": 10065.7359, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.5598009824752808, - "rewards/margins": 0.13132891058921814, - "rewards/rejected": -0.6911298632621765, - "rewards/safe_rewards": -0.5591174364089966, - "rewards/unsafe_rewards": -0.5604844093322754, + "logits/chosen": -2.0296473503112793, + "logits/rejected": -1.793302297592163, + "logps/chosen": -234.65371704101562, + "logps/rejected": -217.74581909179688, + "loss": 7077.6555, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.49143609404563904, + "rewards/margins": 0.08558019250631332, + "rewards/rejected": -0.577016294002533, + "rewards/safe_rewards": -0.499739408493042, + "rewards/unsafe_rewards": -0.48313283920288086, "step": 1170 }, { "epoch": 0.63, "learning_rate": 1.7686714501444788e-07, - "logits/chosen": -1.8726997375488281, - "logits/rejected": -1.432035207748413, - "logps/chosen": -278.05572509765625, - "logps/rejected": -250.7410125732422, - "loss": 9749.7609, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.5793915390968323, - "rewards/margins": 0.15176436305046082, - "rewards/rejected": -0.7311559915542603, - "rewards/safe_rewards": -0.5804899334907532, - "rewards/unsafe_rewards": -0.5782932043075562, + "logits/chosen": -2.088947296142578, + "logits/rejected": -1.7341728210449219, + "logps/chosen": -270.2713317871094, + "logps/rejected": -237.0913543701172, + "loss": 6750.0367, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5015474557876587, + "rewards/margins": 0.09311170130968094, + "rewards/rejected": -0.594659149646759, + "rewards/safe_rewards": -0.5031787157058716, + "rewards/unsafe_rewards": -0.499916136264801, "step": 1180 }, { "epoch": 0.64, "learning_rate": 1.7238844262582768e-07, - "logits/chosen": -1.839916467666626, - "logits/rejected": -1.592898964881897, - "logps/chosen": -268.4849548339844, - "logps/rejected": -251.6371612548828, - "loss": 8888.5844, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.5369018316268921, - "rewards/margins": 0.12209179252386093, - "rewards/rejected": -0.6589936017990112, - "rewards/safe_rewards": -0.5295559167861938, - "rewards/unsafe_rewards": -0.5442477464675903, + "logits/chosen": -2.0681557655334473, + "logits/rejected": -1.87997567653656, + "logps/chosen": -261.2225036621094, + "logps/rejected": -239.202880859375, + "loss": 6169.3504, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.4642775058746338, + "rewards/margins": 0.07037371397018433, + "rewards/rejected": -0.5346512198448181, + "rewards/safe_rewards": -0.4574803411960602, + "rewards/unsafe_rewards": -0.4710747301578522, "step": 1190 }, { "epoch": 0.65, "learning_rate": 1.679371396225504e-07, - "logits/chosen": -1.8138126134872437, - "logits/rejected": -1.4759351015090942, - "logps/chosen": -255.7080078125, - "logps/rejected": -245.1534423828125, - "loss": 9579.3203, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.512513279914856, - "rewards/margins": 0.13113752007484436, - "rewards/rejected": -0.6436507701873779, - "rewards/safe_rewards": -0.5054439902305603, - "rewards/unsafe_rewards": -0.5195825695991516, + "logits/chosen": -2.0639595985412598, + "logits/rejected": -1.796931266784668, + "logps/chosen": -248.5899658203125, + "logps/rejected": -232.2371368408203, + "loss": 6636.9758, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4413328170776367, + "rewards/margins": 0.07315496355295181, + "rewards/rejected": -0.5144877433776855, + "rewards/safe_rewards": -0.4346230924129486, + "rewards/unsafe_rewards": -0.44804254174232483, "step": 1200 }, { "epoch": 0.65, "learning_rate": 1.6351480745828096e-07, - "logits/chosen": -1.8510429859161377, - "logits/rejected": -1.525810718536377, - "logps/chosen": -247.984130859375, - "logps/rejected": -239.0982208251953, - "loss": 8662.3109, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.48912423849105835, - "rewards/margins": 0.17965053021907806, - "rewards/rejected": -0.6687747836112976, - "rewards/safe_rewards": -0.4798237383365631, - "rewards/unsafe_rewards": -0.498424768447876, + "logits/chosen": -2.096024513244629, + "logits/rejected": -1.840306043624878, + "logps/chosen": -239.71939086914062, + "logps/rejected": -224.69808959960938, + "loss": 6078.8516, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.40647679567337036, + "rewards/margins": 0.11829674243927002, + "rewards/rejected": -0.5247735381126404, + "rewards/safe_rewards": -0.4001832604408264, + "rewards/unsafe_rewards": -0.4127703607082367, "step": 1210 }, { "epoch": 0.66, "learning_rate": 1.5912300735904248e-07, - "logits/chosen": -1.9010947942733765, - "logits/rejected": -1.4919841289520264, - "logps/chosen": -278.72515869140625, - "logps/rejected": -244.07601928710938, - "loss": 9658.5812, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.5516470670700073, - "rewards/margins": 0.14930351078510284, - "rewards/rejected": -0.7009506821632385, - "rewards/safe_rewards": -0.5480190515518188, - "rewards/unsafe_rewards": -0.5552752017974854, + "logits/chosen": -2.1436526775360107, + "logits/rejected": -1.8284460306167603, + "logps/chosen": -269.2972106933594, + "logps/rejected": -228.4302215576172, + "loss": 6714.8125, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.45736759901046753, + "rewards/margins": 0.08712475001811981, + "rewards/rejected": -0.5444923639297485, + "rewards/safe_rewards": -0.44737473130226135, + "rewards/unsafe_rewards": -0.46736055612564087, "step": 1220 }, { "epoch": 0.66, "learning_rate": 1.5476328977205395e-07, - "logits/chosen": -1.788690209388733, - "logits/rejected": -1.4725120067596436, - "logps/chosen": -249.85226440429688, - "logps/rejected": -238.50814819335938, - "loss": 9309.0977, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.551167368888855, - "rewards/margins": 0.18140828609466553, - "rewards/rejected": -0.7325756549835205, - "rewards/safe_rewards": -0.5781738758087158, - "rewards/unsafe_rewards": -0.5241607427597046, + "logits/chosen": -2.0719504356384277, + "logits/rejected": -1.8282197713851929, + "logps/chosen": -240.5180206298828, + "logps/rejected": -223.3226318359375, + "loss": 6474.3016, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4578246474266052, + "rewards/margins": 0.12289600074291229, + "rewards/rejected": -0.5807207226753235, + "rewards/safe_rewards": -0.479422003030777, + "rewards/unsafe_rewards": -0.43622738122940063, "step": 1230 }, { "epoch": 0.67, "learning_rate": 1.5043719381837112e-07, - "logits/chosen": -1.8141076564788818, - "logits/rejected": -1.5068457126617432, - "logps/chosen": -272.9197692871094, - "logps/rejected": -258.5969543457031, - "loss": 9430.2188, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.5348296165466309, - "rewards/margins": 0.15831992030143738, - "rewards/rejected": -0.6931496262550354, - "rewards/safe_rewards": -0.5212498903274536, - "rewards/unsafe_rewards": -0.5484093427658081, + "logits/chosen": -2.0963845252990723, + "logits/rejected": -1.8657476902008057, + "logps/chosen": -265.12103271484375, + "logps/rejected": -245.16470336914062, + "loss": 6423.2137, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4568420350551605, + "rewards/margins": 0.10198511928319931, + "rewards/rejected": -0.5588272213935852, + "rewards/safe_rewards": -0.44269007444381714, + "rewards/unsafe_rewards": -0.47099393606185913, "step": 1240 }, { "epoch": 0.67, "learning_rate": 1.461462467495284e-07, - "logits/chosen": -1.765628457069397, - "logits/rejected": -1.4665768146514893, - "logps/chosen": -248.0455780029297, - "logps/rejected": -240.01553344726562, - "loss": 8793.5156, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5249950289726257, - "rewards/margins": 0.19615288078784943, - "rewards/rejected": -0.721147894859314, - "rewards/safe_rewards": -0.5359389185905457, - "rewards/unsafe_rewards": -0.514051079750061, + "logits/chosen": -2.0682883262634277, + "logits/rejected": -1.8371692895889282, + "logps/chosen": -242.6766815185547, + "logps/rejected": -227.84335327148438, + "loss": 6296.4098, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.4713061451911926, + "rewards/margins": 0.12812045216560364, + "rewards/rejected": -0.5994266271591187, + "rewards/safe_rewards": -0.4832456111907959, + "rewards/unsafe_rewards": -0.45936664938926697, "step": 1250 }, { "epoch": 0.68, "learning_rate": 1.4189196340836865e-07, - "logits/chosen": -1.8182131052017212, - "logits/rejected": -1.4369385242462158, - "logps/chosen": -251.8483123779297, - "logps/rejected": -233.54336547851562, - "loss": 8949.3219, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5261684656143188, - "rewards/margins": 0.14398489892482758, - "rewards/rejected": -0.6701533794403076, - "rewards/safe_rewards": -0.5305696725845337, - "rewards/unsafe_rewards": -0.5217671990394592, + "logits/chosen": -2.127166986465454, + "logits/rejected": -1.8461487293243408, + "logps/chosen": -246.42178344726562, + "logps/rejected": -222.8474578857422, + "loss": 6083.118, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.4719027876853943, + "rewards/margins": 0.09129159897565842, + "rewards/rejected": -0.5631943941116333, + "rewards/safe_rewards": -0.47717374563217163, + "rewards/unsafe_rewards": -0.4666318893432617, "step": 1260 }, { "epoch": 0.68, "learning_rate": 1.3767584569425561e-07, - "logits/chosen": -1.8838729858398438, - "logits/rejected": -1.463894009590149, - "logps/chosen": -266.53594970703125, - "logps/rejected": -246.49050903320312, - "loss": 9353.8953, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5170857310295105, - "rewards/margins": 0.16659244894981384, - "rewards/rejected": -0.6836782693862915, - "rewards/safe_rewards": -0.5318734049797058, - "rewards/unsafe_rewards": -0.5022979974746704, + "logits/chosen": -2.1966021060943604, + "logits/rejected": -1.8988735675811768, + "logps/chosen": -259.76678466796875, + "logps/rejected": -234.09518432617188, + "loss": 6448.4398, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44939374923706055, + "rewards/margins": 0.11033139377832413, + "rewards/rejected": -0.5597251653671265, + "rewards/safe_rewards": -0.4581433832645416, + "rewards/unsafe_rewards": -0.44064411520957947, "step": 1270 }, { "epoch": 0.69, "learning_rate": 1.334993820328541e-07, - "logits/chosen": -1.820159912109375, - "logits/rejected": -1.4640798568725586, - "logps/chosen": -257.88616943359375, - "logps/rejected": -245.20046997070312, - "loss": 8817.425, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5360870957374573, - "rewards/margins": 0.19950588047504425, - "rewards/rejected": -0.7355929613113403, - "rewards/safe_rewards": -0.5117971301078796, - "rewards/unsafe_rewards": -0.5603770017623901, + "logits/chosen": -2.136979579925537, + "logits/rejected": -1.8722255229949951, + "logps/chosen": -250.0802001953125, + "logps/rejected": -230.7494659423828, + "loss": 5991.9031, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.45802736282348633, + "rewards/margins": 0.1330554187297821, + "rewards/rejected": -0.5910828113555908, + "rewards/safe_rewards": -0.43893709778785706, + "rewards/unsafe_rewards": -0.4771176874637604, "step": 1280 }, { "epoch": 0.69, "learning_rate": 1.2936404685066852e-07, - "logits/chosen": -1.7333199977874756, - "logits/rejected": -1.4254271984100342, - "logps/chosen": -256.46014404296875, - "logps/rejected": -246.2918701171875, - "loss": 9496.3672, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.5112076997756958, - "rewards/margins": 0.14339503645896912, - "rewards/rejected": -0.6546027064323425, - "rewards/safe_rewards": -0.5295280814170837, - "rewards/unsafe_rewards": -0.49288731813430786, + "logits/chosen": -2.0627663135528564, + "logits/rejected": -1.8343334197998047, + "logps/chosen": -250.0009307861328, + "logps/rejected": -234.5965118408203, + "loss": 6591.9164, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44661563634872437, + "rewards/margins": 0.09103361517190933, + "rewards/rejected": -0.5376492738723755, + "rewards/safe_rewards": -0.4586601257324219, + "rewards/unsafe_rewards": -0.43457117676734924, "step": 1290 }, { "epoch": 0.7, "learning_rate": 1.252713000545221e-07, - "logits/chosen": -1.874889612197876, - "logits/rejected": -1.4825398921966553, - "logps/chosen": -259.69683837890625, - "logps/rejected": -237.74618530273438, - "loss": 8541.1219, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.47730475664138794, - "rewards/margins": 0.17003121972084045, - "rewards/rejected": -0.6473358869552612, - "rewards/safe_rewards": -0.4752611517906189, - "rewards/unsafe_rewards": -0.4793483316898346, + "logits/chosen": -2.161848545074463, + "logits/rejected": -1.8758856058120728, + "logps/chosen": -255.42941284179688, + "logps/rejected": -227.38516235351562, + "loss": 5795.3254, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.43463054299354553, + "rewards/margins": 0.10909499228000641, + "rewards/rejected": -0.5437254309654236, + "rewards/safe_rewards": -0.43680882453918457, + "rewards/unsafe_rewards": -0.43245211243629456, "step": 1300 }, { "epoch": 0.7, "learning_rate": 1.2122258651616304e-07, - "logits/chosen": -1.8564685583114624, - "logits/rejected": -1.4926542043685913, - "logps/chosen": -261.3155822753906, - "logps/rejected": -238.84375, - "loss": 9066.7328, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.514704167842865, - "rewards/margins": 0.1353214532136917, - "rewards/rejected": -0.6500256657600403, - "rewards/safe_rewards": -0.5182002186775208, - "rewards/unsafe_rewards": -0.5112081170082092, + "logits/chosen": -2.141693115234375, + "logits/rejected": -1.872849702835083, + "logps/chosen": -255.8201141357422, + "logps/rejected": -228.45068359375, + "loss": 6160.4359, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4597500264644623, + "rewards/margins": 0.08634473383426666, + "rewards/rejected": -0.5460947751998901, + "rewards/safe_rewards": -0.47202786803245544, + "rewards/unsafe_rewards": -0.44747209548950195, "step": 1310 }, { "epoch": 0.71, "learning_rate": 1.1721933556217792e-07, - "logits/chosen": -1.8125495910644531, - "logits/rejected": -1.517917513847351, - "logps/chosen": -246.9369659423828, - "logps/rejected": -244.0240478515625, - "loss": 9356.8258, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.508848249912262, - "rewards/margins": 0.17571674287319183, - "rewards/rejected": -0.684565007686615, - "rewards/safe_rewards": -0.5253952145576477, - "rewards/unsafe_rewards": -0.4923012852668762, + "logits/chosen": -2.105320453643799, + "logits/rejected": -1.879272699356079, + "logps/chosen": -239.11795043945312, + "logps/rejected": -230.5526885986328, + "loss": 6437.4898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43065792322158813, + "rewards/margins": 0.1191934198141098, + "rewards/rejected": -0.5498512983322144, + "rewards/safe_rewards": -0.4450756907463074, + "rewards/unsafe_rewards": -0.4162401258945465, "step": 1320 }, { "epoch": 0.72, "learning_rate": 1.1326296046939333e-07, - "logits/chosen": -1.7542698383331299, - "logits/rejected": -1.397963285446167, - "logps/chosen": -236.148681640625, - "logps/rejected": -222.0939483642578, - "loss": 8821.2047, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.5135654807090759, - "rewards/margins": 0.17041505873203278, - "rewards/rejected": -0.6839805841445923, - "rewards/safe_rewards": -0.4959505498409271, - "rewards/unsafe_rewards": -0.5311804413795471, + "logits/chosen": -2.050401210784912, + "logits/rejected": -1.7805709838867188, + "logps/chosen": -227.81747436523438, + "logps/rejected": -207.8745574951172, + "loss": 5906.4801, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.43025341629981995, + "rewards/margins": 0.11153332889080048, + "rewards/rejected": -0.5417866706848145, + "rewards/safe_rewards": -0.41498079895973206, + "rewards/unsafe_rewards": -0.4455259442329407, "step": 1330 }, { "epoch": 0.72, "learning_rate": 1.0935485796594351e-07, - "logits/chosen": -1.9061870574951172, - "logits/rejected": -1.5161683559417725, - "logps/chosen": -276.02056884765625, - "logps/rejected": -245.89279174804688, - "loss": 9924.2531, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5345636010169983, - "rewards/margins": 0.16327166557312012, - "rewards/rejected": -0.6978353261947632, - "rewards/safe_rewards": -0.5395291447639465, - "rewards/unsafe_rewards": -0.52959805727005, + "logits/chosen": -2.155824661254883, + "logits/rejected": -1.8547401428222656, + "logps/chosen": -268.284423828125, + "logps/rejected": -232.23007202148438, + "loss": 6847.3039, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.4572017788887024, + "rewards/margins": 0.10400652885437012, + "rewards/rejected": -0.5612083673477173, + "rewards/safe_rewards": -0.4599490165710449, + "rewards/unsafe_rewards": -0.4544545114040375, "step": 1340 }, { "epoch": 0.73, "learning_rate": 1.0549640773818028e-07, - "logits/chosen": -1.8199176788330078, - "logits/rejected": -1.5144760608673096, - "logps/chosen": -263.22930908203125, - "logps/rejected": -229.4552764892578, - "loss": 9813.3, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5825823545455933, - "rewards/margins": 0.1226869598031044, - "rewards/rejected": -0.7052692770957947, - "rewards/safe_rewards": -0.597611665725708, - "rewards/unsafe_rewards": -0.5675531029701233, + "logits/chosen": -2.066615581512451, + "logits/rejected": -1.831392526626587, + "logps/chosen": -254.32418823242188, + "logps/rejected": -215.6810760498047, + "loss": 6832.7969, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.49353137612342834, + "rewards/margins": 0.0739959180355072, + "rewards/rejected": -0.5675273537635803, + "rewards/safe_rewards": -0.505325198173523, + "rewards/unsafe_rewards": -0.4817374646663666, "step": 1350 }, { "epoch": 0.73, "learning_rate": 1.0168897194359921e-07, - "logits/chosen": -1.8647133111953735, - "logits/rejected": -1.4758477210998535, - "logps/chosen": -283.9356384277344, - "logps/rejected": -259.8861999511719, - "loss": 9215.2531, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.616075873374939, - "rewards/margins": 0.15026313066482544, - "rewards/rejected": -0.7663390040397644, - "rewards/safe_rewards": -0.6119508743286133, - "rewards/unsafe_rewards": -0.6202008128166199, + "logits/chosen": -2.1050429344177246, + "logits/rejected": -1.8025562763214111, + "logps/chosen": -274.6943054199219, + "logps/rejected": -244.18777465820312, + "loss": 6459.3121, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.523662805557251, + "rewards/margins": 0.08569201081991196, + "rewards/rejected": -0.6093548536300659, + "rewards/safe_rewards": -0.5233367681503296, + "rewards/unsafe_rewards": -0.5239888429641724, "step": 1360 }, { "epoch": 0.74, "learning_rate": 9.793389472995392e-08, - "logits/chosen": -1.7711708545684814, - "logits/rejected": -1.3847808837890625, - "logps/chosen": -264.9733581542969, - "logps/rejected": -240.27877807617188, - "loss": 8490.6859, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.5528773069381714, - "rewards/margins": 0.18372176587581635, - "rewards/rejected": -0.7365990877151489, - "rewards/safe_rewards": -0.532310962677002, - "rewards/unsafe_rewards": -0.5734436511993408, + "logits/chosen": -2.055671453475952, + "logits/rejected": -1.7598520517349243, + "logps/chosen": -256.25750732421875, + "logps/rejected": -225.249267578125, + "loss": 5955.8453, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.4657188951969147, + "rewards/margins": 0.12058509886264801, + "rewards/rejected": -0.5863040089607239, + "rewards/safe_rewards": -0.44779258966445923, + "rewards/unsafe_rewards": -0.4836452007293701, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.423250176072874e-08, - "logits/chosen": -1.7696157693862915, - "logits/rejected": -1.4456861019134521, - "logps/chosen": -238.8620147705078, - "logps/rejected": -223.49612426757812, - "loss": 10557.7023, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.5741257071495056, - "rewards/margins": 0.12087428569793701, - "rewards/rejected": -0.6949999332427979, - "rewards/safe_rewards": -0.570341944694519, - "rewards/unsafe_rewards": -0.5779094099998474, + "logits/chosen": -2.042423725128174, + "logits/rejected": -1.7933346033096313, + "logps/chosen": -229.9665985107422, + "logps/rejected": -209.86416625976562, + "loss": 7156.8875, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4851716160774231, + "rewards/margins": 0.07350887358188629, + "rewards/rejected": -0.5586804747581482, + "rewards/safe_rewards": -0.48476117849349976, + "rewards/unsafe_rewards": -0.4855819642543793, "step": 1380 }, { "epoch": 0.75, "learning_rate": 9.058609974713654e-08, - "logits/chosen": -1.8793513774871826, - "logits/rejected": -1.4532734155654907, - "logps/chosen": -259.194091796875, - "logps/rejected": -242.09628295898438, - "loss": 8920.968, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5298846960067749, - "rewards/margins": 0.17306169867515564, - "rewards/rejected": -0.7029464840888977, - "rewards/safe_rewards": -0.5209446549415588, - "rewards/unsafe_rewards": -0.5388248562812805, + "logits/chosen": -2.1118245124816895, + "logits/rejected": -1.7858221530914307, + "logps/chosen": -252.89590454101562, + "logps/rejected": -229.27572631835938, + "loss": 6091.018, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.46690288186073303, + "rewards/margins": 0.10783787816762924, + "rewards/rejected": -0.5747407674789429, + "rewards/safe_rewards": -0.4574856758117676, + "rewards/unsafe_rewards": -0.4763200283050537, "step": 1390 }, { "epoch": 0.75, "learning_rate": 8.699597598680753e-08, - "logits/chosen": -1.7697757482528687, - "logits/rejected": -1.4160401821136475, - "logps/chosen": -234.19204711914062, - "logps/rejected": -238.2794647216797, - "loss": 8582.0086, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5013014674186707, - "rewards/margins": 0.19070756435394287, - "rewards/rejected": -0.6920090913772583, - "rewards/safe_rewards": -0.5159270763397217, - "rewards/unsafe_rewards": -0.486675888299942, + "logits/chosen": -2.0331668853759766, + "logits/rejected": -1.7570956945419312, + "logps/chosen": -229.2872314453125, + "logps/rejected": -227.269287109375, + "loss": 5937.1336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4522533416748047, + "rewards/margins": 0.12965361773967743, + "rewards/rejected": -0.5819069743156433, + "rewards/safe_rewards": -0.4643983840942383, + "rewards/unsafe_rewards": -0.44010835886001587, "step": 1400 }, { "epoch": 0.76, "learning_rate": 8.346339790933166e-08, - "logits/chosen": -1.8758964538574219, - "logits/rejected": -1.4784882068634033, - "logps/chosen": -251.67575073242188, - "logps/rejected": -227.55606079101562, - "loss": 9586.2859, - "rewards/accuracies": 0.6656249761581421, - "rewards/chosen": -0.514674961566925, - "rewards/margins": 0.16187819838523865, - "rewards/rejected": -0.6765531301498413, - "rewards/safe_rewards": -0.5080689787864685, - "rewards/unsafe_rewards": -0.5212809443473816, + "logits/chosen": -2.1295323371887207, + "logits/rejected": -1.8313289880752563, + "logps/chosen": -246.0663299560547, + "logps/rejected": -216.1509552001953, + "loss": 6727.4266, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.4585806727409363, + "rewards/margins": 0.10392139852046967, + "rewards/rejected": -0.5625020265579224, + "rewards/safe_rewards": -0.45531362295150757, + "rewards/unsafe_rewards": -0.461847722530365, "step": 1410 }, { "epoch": 0.76, "learning_rate": 7.998961262881506e-08, - "logits/chosen": -1.8012908697128296, - "logits/rejected": -1.3700876235961914, - "logps/chosen": -268.8277282714844, - "logps/rejected": -236.764404296875, - "loss": 9110.9188, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.47869497537612915, - "rewards/margins": 0.15951481461524963, - "rewards/rejected": -0.6382097005844116, - "rewards/safe_rewards": -0.4757060110569, - "rewards/unsafe_rewards": -0.4816839098930359, + "logits/chosen": -2.066490650177002, + "logits/rejected": -1.7475519180297852, + "logps/chosen": -263.6619567871094, + "logps/rejected": -227.02685546875, + "loss": 6193.2656, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42703723907470703, + "rewards/margins": 0.11379702389240265, + "rewards/rejected": -0.5408341884613037, + "rewards/safe_rewards": -0.42232245206832886, + "rewards/unsafe_rewards": -0.43175190687179565, "step": 1420 }, { "epoch": 0.77, "learning_rate": 7.657584650360846e-08, - "logits/chosen": -1.7479966878890991, - "logits/rejected": -1.4162769317626953, - "logps/chosen": -247.37594604492188, - "logps/rejected": -238.7945098876953, - "loss": 9581.7031, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.47960662841796875, - "rewards/margins": 0.18462765216827393, - "rewards/rejected": -0.6642343401908875, - "rewards/safe_rewards": -0.4892401099205017, - "rewards/unsafe_rewards": -0.46997323632240295, + "logits/chosen": -2.0444042682647705, + "logits/rejected": -1.795877456665039, + "logps/chosen": -242.45947265625, + "logps/rejected": -228.25601196289062, + "loss": 6729.2211, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.43044179677963257, + "rewards/margins": 0.12840771675109863, + "rewards/rejected": -0.5588495135307312, + "rewards/safe_rewards": -0.4417741894721985, + "rewards/unsafe_rewards": -0.4191093444824219, "step": 1430 }, { "epoch": 0.77, "learning_rate": 7.322330470336313e-08, - "logits/chosen": -1.7617130279541016, - "logits/rejected": -1.4174513816833496, - "logps/chosen": -238.3271942138672, - "logps/rejected": -236.63491821289062, - "loss": 9112.6594, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.4836384356021881, - "rewards/margins": 0.18336188793182373, - "rewards/rejected": -0.6670002341270447, - "rewards/safe_rewards": -0.4770020842552185, - "rewards/unsafe_rewards": -0.49027466773986816, + "logits/chosen": -2.052246570587158, + "logits/rejected": -1.7939999103546143, + "logps/chosen": -232.52175903320312, + "logps/rejected": -225.86880493164062, + "loss": 6284.6258, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.42558392882347107, + "rewards/margins": 0.13375507295131683, + "rewards/rejected": -0.5593389272689819, + "rewards/safe_rewards": -0.4226798117160797, + "rewards/unsafe_rewards": -0.4284881055355072, "step": 1440 }, { "epoch": 0.78, "learning_rate": 6.993317078356709e-08, - "logits/chosen": -1.7554283142089844, - "logits/rejected": -1.4887868165969849, - "logps/chosen": -250.6308135986328, - "logps/rejected": -233.91183471679688, - "loss": 9465.5375, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.5166350603103638, - "rewards/margins": 0.12070528417825699, - "rewards/rejected": -0.6373403668403625, - "rewards/safe_rewards": -0.538526713848114, - "rewards/unsafe_rewards": -0.4947434067726135, + "logits/chosen": -2.0569279193878174, + "logits/rejected": -1.8582913875579834, + "logps/chosen": -244.96859741210938, + "logps/rejected": -223.7991485595703, + "loss": 6497.4699, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.460013210773468, + "rewards/margins": 0.07620026171207428, + "rewards/rejected": -0.5362134575843811, + "rewards/safe_rewards": -0.47913751006126404, + "rewards/unsafe_rewards": -0.4408888816833496, "step": 1450 }, { "epoch": 0.79, "learning_rate": 6.67066062677118e-08, - "logits/chosen": -1.8329315185546875, - "logits/rejected": -1.4907639026641846, - "logps/chosen": -259.4100341796875, - "logps/rejected": -232.3672637939453, - "loss": 9874.7938, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.507000744342804, - "rewards/margins": 0.13802625238895416, - "rewards/rejected": -0.6450269222259521, - "rewards/safe_rewards": -0.5120242238044739, - "rewards/unsafe_rewards": -0.5019772052764893, + "logits/chosen": -2.1144156455993652, + "logits/rejected": -1.8631536960601807, + "logps/chosen": -254.3730010986328, + "logps/rejected": -221.9481201171875, + "loss": 6792.3, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.456630140542984, + "rewards/margins": 0.08420534431934357, + "rewards/rejected": -0.5408354997634888, + "rewards/safe_rewards": -0.4593842029571533, + "rewards/unsafe_rewards": -0.4538760185241699, "step": 1460 }, { "epoch": 0.79, "learning_rate": 6.354475023723685e-08, - "logits/chosen": -1.8093589544296265, - "logits/rejected": -1.4304033517837524, - "logps/chosen": -269.5719909667969, - "logps/rejected": -242.3310089111328, - "loss": 9231.9594, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5239914059638977, - "rewards/margins": 0.18032845854759216, - "rewards/rejected": -0.7043198347091675, - "rewards/safe_rewards": -0.5223089456558228, - "rewards/unsafe_rewards": -0.5256738662719727, + "logits/chosen": -2.077465534210205, + "logits/rejected": -1.8094123601913452, + "logps/chosen": -262.92901611328125, + "logps/rejected": -229.3378448486328, + "loss": 6367.159, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.457561731338501, + "rewards/margins": 0.11682651191949844, + "rewards/rejected": -0.5743882060050964, + "rewards/safe_rewards": -0.45188575983047485, + "rewards/unsafe_rewards": -0.4632377028465271, "step": 1470 }, { "epoch": 0.8, "learning_rate": 6.044871892939746e-08, - "logits/chosen": -1.8364317417144775, - "logits/rejected": -1.5104265213012695, - "logps/chosen": -279.77197265625, - "logps/rejected": -257.052001953125, - "loss": 9246.6406, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.5413323044776917, - "rewards/margins": 0.14012053608894348, - "rewards/rejected": -0.6814528703689575, - "rewards/safe_rewards": -0.5333670973777771, - "rewards/unsafe_rewards": -0.5492974519729614, + "logits/chosen": -2.107585906982422, + "logits/rejected": -1.8727686405181885, + "logps/chosen": -272.7564392089844, + "logps/rejected": -245.385498046875, + "loss": 6379.8508, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.47117677330970764, + "rewards/margins": 0.0936112180352211, + "rewards/rejected": -0.5647879242897034, + "rewards/safe_rewards": -0.46114739775657654, + "rewards/unsafe_rewards": -0.4812060296535492, "step": 1480 }, { "epoch": 0.8, "learning_rate": 5.741960534319676e-08, - "logits/chosen": -1.8173694610595703, - "logits/rejected": -1.513812780380249, - "logps/chosen": -244.9208221435547, - "logps/rejected": -228.85092163085938, - "loss": 9268.5633, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.542190432548523, - "rewards/margins": 0.13879111409187317, - "rewards/rejected": -0.6809815168380737, - "rewards/safe_rewards": -0.5698345303535461, - "rewards/unsafe_rewards": -0.5145464539527893, + "logits/chosen": -2.0843305587768555, + "logits/rejected": -1.864058494567871, + "logps/chosen": -237.5848846435547, + "logps/rejected": -216.74258422851562, + "loss": 6329.8926, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4688313901424408, + "rewards/margins": 0.09106676280498505, + "rewards/rejected": -0.5598980784416199, + "rewards/safe_rewards": -0.48697489500045776, + "rewards/unsafe_rewards": -0.45068782567977905, "step": 1490 }, { "epoch": 0.81, "learning_rate": 5.44584788535217e-08, - "logits/chosen": -1.8249263763427734, - "logits/rejected": -1.475886344909668, - "logps/chosen": -266.36285400390625, - "logps/rejected": -246.23397827148438, - "loss": 8544.8563, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.5337814688682556, - "rewards/margins": 0.16020691394805908, - "rewards/rejected": -0.6939883828163147, - "rewards/safe_rewards": -0.5405424237251282, - "rewards/unsafe_rewards": -0.5270205736160278, + "logits/chosen": -2.1008994579315186, + "logits/rejected": -1.84600031375885, + "logps/chosen": -259.45599365234375, + "logps/rejected": -233.75082397460938, + "loss": 5925.3711, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.464712917804718, + "rewards/margins": 0.1044440045952797, + "rewards/rejected": -0.56915682554245, + "rewards/safe_rewards": -0.47588276863098145, + "rewards/unsafe_rewards": -0.4535430073738098, "step": 1500 }, { "epoch": 0.81, - "eval_logits/chosen": -1.192850112915039, - "eval_logits/rejected": -0.7429371476173401, - "eval_logps/chosen": -206.96099853515625, - "eval_logps/rejected": -172.02581787109375, - "eval_loss": 3742.037841796875, - "eval_rewards/accuracies": 0.545074999332428, - "eval_rewards/chosen": -0.7609512209892273, - "eval_rewards/margins": 0.03578811138868332, - "eval_rewards/rejected": -0.7967393398284912, - "eval_rewards/safe_rewards": -0.7613836526870728, - "eval_rewards/unsafe_rewards": -0.760369062423706, - "eval_runtime": 1814.7579, - "eval_samples_per_second": 18.208, - "eval_steps_per_second": 1.138, + "eval_logits/chosen": -1.6828917264938354, + "eval_logits/rejected": -1.3719456195831299, + "eval_logps/chosen": -201.3945770263672, + "eval_logps/rejected": -161.33763122558594, + "eval_loss": 3026.4443359375, + "eval_rewards/accuracies": 0.46672314405441284, + "eval_rewards/chosen": -0.7052872776985168, + "eval_rewards/margins": -0.01542994100600481, + "eval_rewards/rejected": -0.6898572444915771, + "eval_rewards/safe_rewards": -0.705478847026825, + "eval_rewards/unsafe_rewards": -0.7033840417861938, + "eval_runtime": 1797.98, + "eval_samples_per_second": 18.378, + "eval_steps_per_second": 1.149, "step": 1500 }, { "epoch": 0.81, "learning_rate": 5.156638483361933e-08, - "logits/chosen": -1.8345102071762085, - "logits/rejected": -1.4621663093566895, - "logps/chosen": -256.693603515625, - "logps/rejected": -243.7182159423828, - "loss": 9258.7203, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.4995827078819275, - "rewards/margins": 0.1932138353586197, - "rewards/rejected": -0.692796528339386, - "rewards/safe_rewards": -0.4974733889102936, - "rewards/unsafe_rewards": -0.5016920566558838, + "logits/chosen": -2.107635021209717, + "logits/rejected": -1.8341983556747437, + "logps/chosen": -250.5362091064453, + "logps/rejected": -231.42312622070312, + "loss": 6448.6469, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.43800896406173706, + "rewards/margins": 0.13183660805225372, + "rewards/rejected": -0.5698455572128296, + "rewards/safe_rewards": -0.4343613088130951, + "rewards/unsafe_rewards": -0.44165658950805664, "step": 1510 }, { "epoch": 0.82, "learning_rate": 4.8744344286046236e-08, - "logits/chosen": -1.7860386371612549, - "logits/rejected": -1.433927059173584, - "logps/chosen": -261.70098876953125, - "logps/rejected": -237.2835693359375, - "loss": 9460.0688, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5433987379074097, - "rewards/margins": 0.14123043417930603, - "rewards/rejected": -0.6846292614936829, - "rewards/safe_rewards": -0.5410050749778748, - "rewards/unsafe_rewards": -0.5457924604415894, + "logits/chosen": -2.0658557415008545, + "logits/rejected": -1.8039467334747314, + "logps/chosen": -254.8303680419922, + "logps/rejected": -224.9630584716797, + "loss": 6587.8852, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.4746926724910736, + "rewards/margins": 0.08673126995563507, + "rewards/rejected": -0.5614239573478699, + "rewards/safe_rewards": -0.4753999710083008, + "rewards/unsafe_rewards": -0.4739854335784912, "step": 1520 }, { "epoch": 0.82, "learning_rate": 4.599335348222169e-08, - "logits/chosen": -1.8373982906341553, - "logits/rejected": -1.5454175472259521, - "logps/chosen": -259.53271484375, - "logps/rejected": -255.33447265625, - "loss": 8876.5953, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5245291590690613, - "rewards/margins": 0.16881993412971497, - "rewards/rejected": -0.6933490633964539, - "rewards/safe_rewards": -0.5141156911849976, - "rewards/unsafe_rewards": -0.5349426865577698, + "logits/chosen": -2.104322910308838, + "logits/rejected": -1.8812888860702515, + "logps/chosen": -253.0201416015625, + "logps/rejected": -242.67636108398438, + "loss": 6177.6988, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.4594038128852844, + "rewards/margins": 0.10736414045095444, + "rewards/rejected": -0.5667679309844971, + "rewards/safe_rewards": -0.45568904280662537, + "rewards/unsafe_rewards": -0.46311864256858826, "step": 1530 }, { "epoch": 0.83, "learning_rate": 4.331438361071163e-08, - "logits/chosen": -1.7704732418060303, - "logits/rejected": -1.5517984628677368, - "logps/chosen": -265.05096435546875, - "logps/rejected": -259.9892578125, - "loss": 9630.9031, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5252208709716797, - "rewards/margins": 0.13485923409461975, - "rewards/rejected": -0.660080075263977, - "rewards/safe_rewards": -0.5125634074211121, - "rewards/unsafe_rewards": -0.5378782153129578, + "logits/chosen": -2.03129243850708, + "logits/rejected": -1.8628809452056885, + "logps/chosen": -258.9869079589844, + "logps/rejected": -249.06881713867188, + "loss": 6718.5445, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4645802080631256, + "rewards/margins": 0.08629541099071503, + "rewards/rejected": -0.5508756637573242, + "rewards/safe_rewards": -0.45355597138404846, + "rewards/unsafe_rewards": -0.4756045341491699, "step": 1540 }, { "epoch": 0.83, "learning_rate": 4.0708380434367864e-08, - "logits/chosen": -1.8555190563201904, - "logits/rejected": -1.4889328479766846, - "logps/chosen": -251.518798828125, - "logps/rejected": -240.971923828125, - "loss": 8757.7797, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5223067402839661, - "rewards/margins": 0.17267867922782898, - "rewards/rejected": -0.6949853897094727, - "rewards/safe_rewards": -0.51201331615448, - "rewards/unsafe_rewards": -0.5326001644134521, + "logits/chosen": -2.1100175380706787, + "logits/rejected": -1.8269599676132202, + "logps/chosen": -244.19241333007812, + "logps/rejected": -228.1028594970703, + "loss": 6020.5559, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.4490428566932678, + "rewards/margins": 0.117251917719841, + "rewards/rejected": -0.5662947297096252, + "rewards/safe_rewards": -0.4383305609226227, + "rewards/unsafe_rewards": -0.4597550928592682, "step": 1550 }, { "epoch": 0.84, "learning_rate": 3.817626395644305e-08, - "logits/chosen": -1.8922573328018188, - "logits/rejected": -1.5942326784133911, - "logps/chosen": -260.1571350097656, - "logps/rejected": -242.51220703125, - "loss": 9391.0875, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.542544424533844, - "rewards/margins": 0.1094987541437149, - "rewards/rejected": -0.6520432233810425, - "rewards/safe_rewards": -0.5353507995605469, - "rewards/unsafe_rewards": -0.5497379899024963, + "logits/chosen": -2.1250922679901123, + "logits/rejected": -1.9055827856063843, + "logps/chosen": -252.9553985595703, + "logps/rejected": -230.5366668701172, + "loss": 6467.1945, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4705273509025574, + "rewards/margins": 0.061760313808918, + "rewards/rejected": -0.5322877168655396, + "rewards/safe_rewards": -0.46299856901168823, + "rewards/unsafe_rewards": -0.4780561327934265, "step": 1560 }, { "epoch": 0.84, "learning_rate": 3.571892809580013e-08, - "logits/chosen": -1.8377292156219482, - "logits/rejected": -1.51100754737854, - "logps/chosen": -248.9451141357422, - "logps/rejected": -243.75479125976562, - "loss": 9603.1625, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.5371780395507812, - "rewards/margins": 0.14618846774101257, - "rewards/rejected": -0.6833664774894714, - "rewards/safe_rewards": -0.521805465221405, - "rewards/unsafe_rewards": -0.5525504350662231, + "logits/chosen": -2.091125965118408, + "logits/rejected": -1.8502193689346313, + "logps/chosen": -240.99044799804688, + "logps/rejected": -229.93167114257812, + "loss": 6833.8594, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.45763105154037476, + "rewards/margins": 0.08750450611114502, + "rewards/rejected": -0.5451356172561646, + "rewards/safe_rewards": -0.4449933171272278, + "rewards/unsafe_rewards": -0.4702689051628113, "step": 1570 }, { "epoch": 0.85, "learning_rate": 3.333724037132976e-08, - "logits/chosen": -1.860884428024292, - "logits/rejected": -1.525147557258606, - "logps/chosen": -253.8798370361328, - "logps/rejected": -240.14736938476562, - "loss": 9219.7844, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.5542603731155396, - "rewards/margins": 0.13847388327121735, - "rewards/rejected": -0.6927343606948853, - "rewards/safe_rewards": -0.5410465598106384, - "rewards/unsafe_rewards": -0.5674742460250854, + "logits/chosen": -2.106806516647339, + "logits/rejected": -1.8567653894424438, + "logps/chosen": -245.6099853515625, + "logps/rejected": -225.84719848632812, + "loss": 6472.2758, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.47156161069869995, + "rewards/margins": 0.07817086577415466, + "rewards/rejected": -0.549732506275177, + "rewards/safe_rewards": -0.4597776532173157, + "rewards/unsafe_rewards": -0.4833455979824066, "step": 1580 }, { "epoch": 0.86, "learning_rate": 3.1032041595688506e-08, - "logits/chosen": -1.8546321392059326, - "logits/rejected": -1.525552749633789, - "logps/chosen": -268.1792907714844, - "logps/rejected": -254.33023071289062, - "loss": 9052.8406, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.5202707052230835, - "rewards/margins": 0.16665077209472656, - "rewards/rejected": -0.6869214177131653, - "rewards/safe_rewards": -0.5220972895622253, - "rewards/unsafe_rewards": -0.5184441208839417, + "logits/chosen": -2.086425304412842, + "logits/rejected": -1.8307304382324219, + "logps/chosen": -259.83050537109375, + "logps/rejected": -240.1065673828125, + "loss": 6203.0508, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.43678250908851624, + "rewards/margins": 0.10790238529443741, + "rewards/rejected": -0.544684886932373, + "rewards/safe_rewards": -0.4430006444454193, + "rewards/unsafe_rewards": -0.4305643141269684, "step": 1590 }, { "epoch": 0.86, "learning_rate": 2.880414557846453e-08, - "logits/chosen": -1.8775207996368408, - "logits/rejected": -1.6342952251434326, - "logps/chosen": -249.0835418701172, - "logps/rejected": -231.78561401367188, - "loss": 8478.7961, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.4925798773765564, - "rewards/margins": 0.18134264647960663, - "rewards/rejected": -0.6739225387573242, - "rewards/safe_rewards": -0.4908020496368408, - "rewards/unsafe_rewards": -0.4943576753139496, + "logits/chosen": -2.116537570953369, + "logits/rejected": -1.9354660511016846, + "logps/chosen": -241.5932159423828, + "logps/rejected": -219.1776123046875, + "loss": 5802.0645, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.4176766276359558, + "rewards/margins": 0.13016578555107117, + "rewards/rejected": -0.5478425025939941, + "rewards/safe_rewards": -0.4161531925201416, + "rewards/unsafe_rewards": -0.4192000925540924, "step": 1600 }, { "epoch": 0.87, "learning_rate": 2.6654338838876662e-08, - "logits/chosen": -1.8869003057479858, - "logits/rejected": -1.4599946737289429, - "logps/chosen": -259.44793701171875, - "logps/rejected": -230.3014678955078, - "loss": 9434.275, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5290555357933044, - "rewards/margins": 0.15350595116615295, - "rewards/rejected": -0.6825615167617798, - "rewards/safe_rewards": -0.5380357503890991, - "rewards/unsafe_rewards": -0.5200753211975098, + "logits/chosen": -2.127192497253418, + "logits/rejected": -1.8054416179656982, + "logps/chosen": -251.4619598388672, + "logps/rejected": -217.2257537841797, + "loss": 6674.4672, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.4491957128047943, + "rewards/margins": 0.10260852426290512, + "rewards/rejected": -0.5518041849136353, + "rewards/safe_rewards": -0.45763081312179565, + "rewards/unsafe_rewards": -0.44076067209243774, "step": 1610 }, { "epoch": 0.87, "learning_rate": 2.4583380328107805e-08, - "logits/chosen": -1.8703784942626953, - "logits/rejected": -1.500836730003357, - "logps/chosen": -271.333251953125, - "logps/rejected": -241.6490478515625, - "loss": 9626.8453, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5253649353981018, - "rewards/margins": 0.14888349175453186, - "rewards/rejected": -0.674248456954956, - "rewards/safe_rewards": -0.5112279653549194, - "rewards/unsafe_rewards": -0.539501965045929, + "logits/chosen": -2.1055760383605957, + "logits/rejected": -1.831613540649414, + "logps/chosen": -264.2273864746094, + "logps/rejected": -228.84228515625, + "loss": 6595.832, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4543065130710602, + "rewards/margins": 0.09187433123588562, + "rewards/rejected": -0.5461808443069458, + "rewards/safe_rewards": -0.4405900835990906, + "rewards/unsafe_rewards": -0.46802282333374023, "step": 1620 }, { "epoch": 0.88, "learning_rate": 2.259200116137039e-08, - "logits/chosen": -1.82293701171875, - "logits/rejected": -1.5371062755584717, - "logps/chosen": -256.2450256347656, - "logps/rejected": -253.77890014648438, - "loss": 9601.7156, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5125375986099243, - "rewards/margins": 0.15483711659908295, - "rewards/rejected": -0.6673747301101685, - "rewards/safe_rewards": -0.5055891275405884, - "rewards/unsafe_rewards": -0.519486129283905, + "logits/chosen": -2.0728936195373535, + "logits/rejected": -1.8516597747802734, + "logps/chosen": -248.5038299560547, + "logps/rejected": -240.140625, + "loss": 6611.8016, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.43512576818466187, + "rewards/margins": 0.09586648643016815, + "rewards/rejected": -0.530992329120636, + "rewards/safe_rewards": -0.42405596375465393, + "rewards/unsafe_rewards": -0.4461956024169922, "step": 1630 }, { "epoch": 0.88, "learning_rate": 2.068090435979958e-08, - "logits/chosen": -1.7882747650146484, - "logits/rejected": -1.522146463394165, - "logps/chosen": -243.7785186767578, - "logps/rejected": -229.19284057617188, - "loss": 9414.1625, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.49749964475631714, - "rewards/margins": 0.14228388667106628, - "rewards/rejected": -0.6397835612297058, - "rewards/safe_rewards": -0.5161217451095581, - "rewards/unsafe_rewards": -0.47887754440307617, + "logits/chosen": -2.0459280014038086, + "logits/rejected": -1.8485805988311768, + "logps/chosen": -236.850341796875, + "logps/rejected": -217.1259307861328, + "loss": 6589.3367, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.42821797728538513, + "rewards/margins": 0.0908961072564125, + "rewards/rejected": -0.519114077091217, + "rewards/safe_rewards": -0.4395659565925598, + "rewards/unsafe_rewards": -0.4168699383735657, "step": 1640 }, { "epoch": 0.89, "learning_rate": 1.8850764602263423e-08, - "logits/chosen": -1.8769031763076782, - "logits/rejected": -1.4577304124832153, - "logps/chosen": -251.89291381835938, - "logps/rejected": -240.9708709716797, - "loss": 9424.6891, - "rewards/accuracies": 0.659375011920929, - "rewards/chosen": -0.5158787965774536, - "rewards/margins": 0.1585373878479004, - "rewards/rejected": -0.6744161248207092, - "rewards/safe_rewards": -0.5054845213890076, - "rewards/unsafe_rewards": -0.5262728929519653, + "logits/chosen": -2.1169512271881104, + "logits/rejected": -1.8033020496368408, + "logps/chosen": -244.875732421875, + "logps/rejected": -228.0574188232422, + "loss": 6550.1773, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4457070231437683, + "rewards/margins": 0.09957444667816162, + "rewards/rejected": -0.5452814698219299, + "rewards/safe_rewards": -0.43225640058517456, + "rewards/unsafe_rewards": -0.4591576159000397, "step": 1650 }, { "epoch": 0.89, "learning_rate": 1.710222798718028e-08, - "logits/chosen": -1.9094442129135132, - "logits/rejected": -1.5923407077789307, - "logps/chosen": -253.0218963623047, - "logps/rejected": -244.26025390625, - "loss": 9327.8766, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.49680081009864807, - "rewards/margins": 0.15851224958896637, - "rewards/rejected": -0.6553130745887756, - "rewards/safe_rewards": -0.4968167245388031, - "rewards/unsafe_rewards": -0.49678486585617065, + "logits/chosen": -2.144412040710449, + "logits/rejected": -1.9019086360931396, + "logps/chosen": -245.9793243408203, + "logps/rejected": -231.61734008789062, + "loss": 6516.768, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.426375150680542, + "rewards/margins": 0.102508544921875, + "rewards/rejected": -0.5288837552070618, + "rewards/safe_rewards": -0.41905418038368225, + "rewards/unsafe_rewards": -0.4336961805820465, "step": 1660 }, { "epoch": 0.9, "learning_rate": 1.5435911804424356e-08, - "logits/chosen": -1.872719168663025, - "logits/rejected": -1.5997536182403564, - "logps/chosen": -281.7905578613281, - "logps/rejected": -253.61685180664062, - "loss": 9943.9188, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5251935124397278, - "rewards/margins": 0.1569596230983734, - "rewards/rejected": -0.6821531057357788, - "rewards/safe_rewards": -0.5269424915313721, - "rewards/unsafe_rewards": -0.5234444737434387, + "logits/chosen": -2.104454755783081, + "logits/rejected": -1.9044666290283203, + "logps/chosen": -274.7130432128906, + "logps/rejected": -240.73812866210938, + "loss": 6860.0828, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4544183611869812, + "rewards/margins": 0.0989476665854454, + "rewards/rejected": -0.5533660650253296, + "rewards/safe_rewards": -0.447795569896698, + "rewards/unsafe_rewards": -0.461041122674942, "step": 1670 }, { "epoch": 0.9, "learning_rate": 1.3852404317403199e-08, - "logits/chosen": -1.8720991611480713, - "logits/rejected": -1.5759871006011963, - "logps/chosen": -270.53118896484375, - "logps/rejected": -261.20208740234375, - "loss": 9561.843, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5019877552986145, - "rewards/margins": 0.16725938022136688, - "rewards/rejected": -0.6692470908164978, - "rewards/safe_rewards": -0.518138587474823, - "rewards/unsafe_rewards": -0.48583683371543884, + "logits/chosen": -2.104949474334717, + "logits/rejected": -1.8842833042144775, + "logps/chosen": -263.541259765625, + "logps/rejected": -248.37600708007812, + "loss": 6686.3102, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.43208789825439453, + "rewards/margins": 0.10889849811792374, + "rewards/rejected": -0.5409864187240601, + "rewards/safe_rewards": -0.44288283586502075, + "rewards/unsafe_rewards": -0.4212929606437683, "step": 1680 }, { "epoch": 0.91, "learning_rate": 1.235226455538113e-08, - "logits/chosen": -1.9165754318237305, - "logits/rejected": -1.5968153476715088, - "logps/chosen": -252.7272491455078, - "logps/rejected": -232.4368896484375, - "loss": 9787.6281, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.5125941038131714, - "rewards/margins": 0.13192793726921082, - "rewards/rejected": -0.6445220112800598, - "rewards/safe_rewards": -0.5255392789840698, - "rewards/unsafe_rewards": -0.4996488690376282, + "logits/chosen": -2.1549041271209717, + "logits/rejected": -1.9211156368255615, + "logps/chosen": -245.10513305664062, + "logps/rejected": -219.42529296875, + "loss": 6679.3547, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.4363730847835541, + "rewards/margins": 0.07803308963775635, + "rewards/rejected": -0.5144062042236328, + "rewards/safe_rewards": -0.4496842324733734, + "rewards/unsafe_rewards": -0.42306193709373474, "step": 1690 }, { "epoch": 0.91, "learning_rate": 1.0936022116124321e-08, - "logits/chosen": -1.9121907949447632, - "logits/rejected": -1.566451072692871, - "logps/chosen": -250.210693359375, - "logps/rejected": -233.00064086914062, - "loss": 9021.8656, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5082031488418579, - "rewards/margins": 0.17004860937595367, - "rewards/rejected": -0.6782518029212952, - "rewards/safe_rewards": -0.517989456653595, - "rewards/unsafe_rewards": -0.4984169602394104, + "logits/chosen": -2.13358211517334, + "logits/rejected": -1.880497694015503, + "logps/chosen": -243.0107421875, + "logps/rejected": -220.0526123046875, + "loss": 6255.5586, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.4362039566040039, + "rewards/margins": 0.11256744712591171, + "rewards/rejected": -0.5487713813781738, + "rewards/safe_rewards": -0.44225654006004333, + "rewards/unsafe_rewards": -0.43015122413635254, "step": 1700 }, { "epoch": 0.92, "learning_rate": 9.60417697893534e-09, - "logits/chosen": -1.8621336221694946, - "logits/rejected": -1.581054449081421, - "logps/chosen": -251.6416015625, - "logps/rejected": -240.51083374023438, - "loss": 9547.9414, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.5166631937026978, - "rewards/margins": 0.1472282111644745, - "rewards/rejected": -0.6638914346694946, - "rewards/safe_rewards": -0.5074386596679688, - "rewards/unsafe_rewards": -0.5258878469467163, + "logits/chosen": -2.1104042530059814, + "logits/rejected": -1.9010612964630127, + "logps/chosen": -243.5632781982422, + "logps/rejected": -227.34716796875, + "loss": 6634.2828, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4358798861503601, + "rewards/margins": 0.0963749811053276, + "rewards/rejected": -0.5322549939155579, + "rewards/safe_rewards": -0.42670002579689026, + "rewards/unsafe_rewards": -0.44505977630615234, "step": 1710 }, { "epoch": 0.93, "learning_rate": 8.357199328144576e-09, - "logits/chosen": -1.8677952289581299, - "logits/rejected": -1.571240782737732, - "logps/chosen": -271.2632141113281, - "logps/rejected": -256.4866027832031, - "loss": 8577.2609, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.5463152527809143, - "rewards/margins": 0.1413048803806305, - "rewards/rejected": -0.6876201033592224, - "rewards/safe_rewards": -0.541541576385498, - "rewards/unsafe_rewards": -0.551088809967041, + "logits/chosen": -2.1054680347442627, + "logits/rejected": -1.8882167339324951, + "logps/chosen": -263.0108947753906, + "logps/rejected": -243.1034698486328, + "loss": 6000.1617, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.46379217505455017, + "rewards/margins": 0.08999677002429962, + "rewards/rejected": -0.553788959980011, + "rewards/safe_rewards": -0.4631151258945465, + "rewards/unsafe_rewards": -0.4644692540168762, "step": 1720 }, { "epoch": 0.93, "learning_rate": 7.1955293871198144e-09, - "logits/chosen": -1.8462215662002563, - "logits/rejected": -1.6382449865341187, - "logps/chosen": -239.09780883789062, - "logps/rejected": -233.4207000732422, - "loss": 9856.8133, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.5202466249465942, - "rewards/margins": 0.11259704828262329, - "rewards/rejected": -0.6328436136245728, - "rewards/safe_rewards": -0.5164317488670349, - "rewards/unsafe_rewards": -0.5240614414215088, + "logits/chosen": -2.101811170578003, + "logits/rejected": -1.945387840270996, + "logps/chosen": -230.8267059326172, + "logps/rejected": -220.24111938476562, + "loss": 6856.0781, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4375353455543518, + "rewards/margins": 0.06351236999034882, + "rewards/rejected": -0.5010477304458618, + "rewards/safe_rewards": -0.4362742006778717, + "rewards/unsafe_rewards": -0.4387964606285095, "step": 1730 }, { "epoch": 0.94, "learning_rate": 6.119577262853254e-09, - "logits/chosen": -1.8888225555419922, - "logits/rejected": -1.5417115688323975, - "logps/chosen": -245.1438751220703, - "logps/rejected": -229.68832397460938, - "loss": 9592.1523, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.5198003649711609, - "rewards/margins": 0.15061242878437042, - "rewards/rejected": -0.6704128384590149, - "rewards/safe_rewards": -0.5187990665435791, - "rewards/unsafe_rewards": -0.5208017230033875, + "logits/chosen": -2.128836154937744, + "logits/rejected": -1.8593677282333374, + "logps/chosen": -237.20468139648438, + "logps/rejected": -215.84890747070312, + "loss": 6559.5469, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.44040852785110474, + "rewards/margins": 0.09161019325256348, + "rewards/rejected": -0.5320187211036682, + "rewards/safe_rewards": -0.4399905204772949, + "rewards/unsafe_rewards": -0.4408264756202698, "step": 1740 }, { "epoch": 0.94, "learning_rate": 5.129722801180542e-09, - "logits/chosen": -1.7919261455535889, - "logits/rejected": -1.533604383468628, - "logps/chosen": -248.8613739013672, - "logps/rejected": -249.96713256835938, - "loss": 8648.2398, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5194342732429504, - "rewards/margins": 0.17617598176002502, - "rewards/rejected": -0.6956101655960083, - "rewards/safe_rewards": -0.5344281196594238, - "rewards/unsafe_rewards": -0.504440426826477, + "logits/chosen": -2.041652202606201, + "logits/rejected": -1.8507925271987915, + "logps/chosen": -242.01846313476562, + "logps/rejected": -237.16769409179688, + "loss": 6117.1355, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.4510047435760498, + "rewards/margins": 0.11661112308502197, + "rewards/rejected": -0.5676159262657166, + "rewards/safe_rewards": -0.45727530121803284, + "rewards/unsafe_rewards": -0.44473427534103394, "step": 1750 }, { "epoch": 0.95, "learning_rate": 4.226315452682816e-09, - "logits/chosen": -1.8799426555633545, - "logits/rejected": -1.5365275144577026, - "logps/chosen": -248.68115234375, - "logps/rejected": -240.21273803710938, - "loss": 9132.9219, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.5227926969528198, - "rewards/margins": 0.14881432056427002, - "rewards/rejected": -0.6716070175170898, - "rewards/safe_rewards": -0.5319480299949646, - "rewards/unsafe_rewards": -0.5136373043060303, + "logits/chosen": -2.111886501312256, + "logits/rejected": -1.8561007976531982, + "logps/chosen": -241.124267578125, + "logps/rejected": -226.9755096435547, + "loss": 6254.3047, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.44722381234169006, + "rewards/margins": 0.092010997235775, + "rewards/rejected": -0.5392348766326904, + "rewards/safe_rewards": -0.4598180651664734, + "rewards/unsafe_rewards": -0.4346295893192291, "step": 1760 }, { "epoch": 0.95, "learning_rate": 3.4096741493194193e-09, - "logits/chosen": -1.9021289348602295, - "logits/rejected": -1.6262273788452148, - "logps/chosen": -249.9552459716797, - "logps/rejected": -238.49853515625, - "loss": 9927.5625, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.5048868060112, - "rewards/margins": 0.14025315642356873, - "rewards/rejected": -0.6451399922370911, - "rewards/safe_rewards": -0.5093857049942017, - "rewards/unsafe_rewards": -0.5003879070281982, + "logits/chosen": -2.14597749710083, + "logits/rejected": -1.940734624862671, + "logps/chosen": -241.82864379882812, + "logps/rejected": -225.2042999267578, + "loss": 6777.9141, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.4236208498477936, + "rewards/margins": 0.08857716619968414, + "rewards/rejected": -0.5121980309486389, + "rewards/safe_rewards": -0.4265195429325104, + "rewards/unsafe_rewards": -0.4207221567630768, "step": 1770 }, { "epoch": 0.96, "learning_rate": 2.6800871918346846e-09, - "logits/chosen": -1.8678810596466064, - "logits/rejected": -1.4804426431655884, - "logps/chosen": -251.1927490234375, - "logps/rejected": -241.011474609375, - "loss": 9305.5492, - "rewards/accuracies": 0.684374988079071, - "rewards/chosen": -0.47926703095436096, - "rewards/margins": 0.19927099347114563, - "rewards/rejected": -0.6785380244255066, - "rewards/safe_rewards": -0.49306339025497437, - "rewards/unsafe_rewards": -0.46547073125839233, + "logits/chosen": -2.1110918521881104, + "logits/rejected": -1.8164072036743164, + "logps/chosen": -244.0395965576172, + "logps/rejected": -226.79354858398438, + "loss": 6527.3641, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4077354073524475, + "rewards/margins": 0.12862348556518555, + "rewards/rejected": -0.5363588333129883, + "rewards/safe_rewards": -0.4131808876991272, + "rewards/unsafe_rewards": -0.4022899568080902, "step": 1780 }, { "epoch": 0.96, "learning_rate": 2.0378121479783796e-09, - "logits/chosen": -1.8298466205596924, - "logits/rejected": -1.4826829433441162, - "logps/chosen": -248.55142211914062, - "logps/rejected": -236.0150146484375, - "loss": 9491.6953, + "logits/chosen": -2.0832958221435547, + "logits/rejected": -1.8225599527359009, + "logps/chosen": -240.6083984375, + "logps/rejected": -222.42355346679688, + "loss": 6531.5406, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5252212882041931, - "rewards/margins": 0.16228215396404266, - "rewards/rejected": -0.687503457069397, - "rewards/safe_rewards": -0.5260053277015686, - "rewards/unsafe_rewards": -0.5244373083114624, + "rewards/chosen": -0.4457913041114807, + "rewards/margins": 0.10579758882522583, + "rewards/rejected": -0.5515888929367065, + "rewards/safe_rewards": -0.4444062113761902, + "rewards/unsafe_rewards": -0.44717639684677124, "step": 1790 }, { "epoch": 0.97, "learning_rate": 1.4830757615760247e-09, - "logits/chosen": -1.9217582941055298, - "logits/rejected": -1.5524566173553467, - "logps/chosen": -259.631103515625, - "logps/rejected": -236.8588409423828, - "loss": 9801.7766, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5246955752372742, - "rewards/margins": 0.14091341197490692, - "rewards/rejected": -0.6656090617179871, - "rewards/safe_rewards": -0.5533384084701538, - "rewards/unsafe_rewards": -0.4960528016090393, + "logits/chosen": -2.1382036209106445, + "logits/rejected": -1.8611648082733154, + "logps/chosen": -252.05618286132812, + "logps/rejected": -223.10189819335938, + "loss": 6842.4117, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.44894617795944214, + "rewards/margins": 0.07909347116947174, + "rewards/rejected": -0.5280395746231079, + "rewards/safe_rewards": -0.4768086075782776, + "rewards/unsafe_rewards": -0.4210837781429291, "step": 1800 }, { "epoch": 0.97, "learning_rate": 1.0160738724809548e-09, - "logits/chosen": -1.9124891757965088, - "logits/rejected": -1.5568935871124268, - "logps/chosen": -247.20138549804688, - "logps/rejected": -238.9026641845703, - "loss": 8768.0156, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5117820501327515, - "rewards/margins": 0.16674897074699402, - "rewards/rejected": -0.6785309314727783, - "rewards/safe_rewards": -0.5125290155410767, - "rewards/unsafe_rewards": -0.5110349059104919, + "logits/chosen": -2.136545181274414, + "logits/rejected": -1.8660471439361572, + "logps/chosen": -239.1907958984375, + "logps/rejected": -225.1981964111328, + "loss": 6138.1797, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4316760003566742, + "rewards/margins": 0.10981061309576035, + "rewards/rejected": -0.5414865612983704, + "rewards/safe_rewards": -0.43494072556495667, + "rewards/unsafe_rewards": -0.42841118574142456, "step": 1810 }, { "epoch": 0.98, "learning_rate": 6.369713474366212e-10, - "logits/chosen": -1.8888686895370483, - "logits/rejected": -1.5450111627578735, - "logps/chosen": -273.55096435546875, - "logps/rejected": -252.25448608398438, - "loss": 8381.8445, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5429133772850037, - "rewards/margins": 0.16056939959526062, - "rewards/rejected": -0.7034828662872314, - "rewards/safe_rewards": -0.5493394136428833, - "rewards/unsafe_rewards": -0.5364874601364136, + "logits/chosen": -2.122779369354248, + "logits/rejected": -1.8656489849090576, + "logps/chosen": -265.56866455078125, + "logps/rejected": -237.6756591796875, + "loss": 5787.684, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.46309009194374084, + "rewards/margins": 0.09460426867008209, + "rewards/rejected": -0.5576944351196289, + "rewards/safe_rewards": -0.4682251513004303, + "rewards/unsafe_rewards": -0.4579550623893738, "step": 1820 }, { "epoch": 0.98, "learning_rate": 3.459020218731512e-10, - "logits/chosen": -1.9145901203155518, - "logits/rejected": -1.5939346551895142, - "logps/chosen": -254.10696411132812, - "logps/rejected": -237.9196014404297, - "loss": 8491.7383, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5133448839187622, - "rewards/margins": 0.19383959472179413, - "rewards/rejected": -0.7071844339370728, - "rewards/safe_rewards": -0.5064384341239929, - "rewards/unsafe_rewards": -0.5202513933181763, + "logits/chosen": -2.134183883666992, + "logits/rejected": -1.8942667245864868, + "logps/chosen": -247.1185302734375, + "logps/rejected": -223.71078491210938, + "loss": 5796.5938, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.443460613489151, + "rewards/margins": 0.12163563072681427, + "rewards/rejected": -0.5650962591171265, + "rewards/safe_rewards": -0.43802452087402344, + "rewards/unsafe_rewards": -0.44889673590660095, "step": 1830 }, { "epoch": 0.99, "learning_rate": 1.429686526593088e-10, - "logits/chosen": -1.8546758890151978, - "logits/rejected": -1.5610625743865967, - "logps/chosen": -259.65338134765625, - "logps/rejected": -243.43154907226562, - "loss": 9367.1047, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.524687647819519, - "rewards/margins": 0.14939476549625397, - "rewards/rejected": -0.6740824580192566, - "rewards/safe_rewards": -0.5346362590789795, - "rewards/unsafe_rewards": -0.5147390365600586, + "logits/chosen": -2.093444347381592, + "logits/rejected": -1.8680245876312256, + "logps/chosen": -251.08969116210938, + "logps/rejected": -229.55322265625, + "loss": 6394.6027, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.43905067443847656, + "rewards/margins": 0.0962488204240799, + "rewards/rejected": -0.5352994799613953, + "rewards/safe_rewards": -0.4522920250892639, + "rewards/unsafe_rewards": -0.4258092939853668, "step": 1840 }, { "epoch": 1.0, "learning_rate": 2.824288182584622e-11, - "logits/chosen": -1.9083442687988281, - "logits/rejected": -1.624894380569458, - "logps/chosen": -259.8470153808594, - "logps/rejected": -236.27783203125, - "loss": 8749.6625, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5334314107894897, - "rewards/margins": 0.14777374267578125, - "rewards/rejected": -0.681205153465271, - "rewards/safe_rewards": -0.5262561440467834, - "rewards/unsafe_rewards": -0.5406066179275513, + "logits/chosen": -2.135525703430176, + "logits/rejected": -1.925082802772522, + "logps/chosen": -252.08755493164062, + "logps/rejected": -223.11550903320312, + "loss": 5975.7648, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.45583659410476685, + "rewards/margins": 0.09374500811100006, + "rewards/rejected": -0.5495815873146057, + "rewards/safe_rewards": -0.4470265805721283, + "rewards/unsafe_rewards": -0.464646577835083, "step": 1850 }, { "epoch": 1.0, "step": 1858, "total_flos": 0.0, - "train_loss": 9735.328284782023, - "train_runtime": 39733.2407, - "train_samples_per_second": 1.497, + "train_loss": 6725.912355355221, + "train_runtime": 39534.0439, + "train_samples_per_second": 1.504, "train_steps_per_second": 0.047 } ],