diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9994837377387713, + "epoch": 2.998451213216314, "eval_steps": 100, - "global_step": 968, + "global_step": 2904, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 5.154639175257731e-09, - "logits/chosen": -2.251229763031006, - "logits/rejected": -2.2295913696289062, - "logps/chosen": -269.52740478515625, - "logps/rejected": -240.59812927246094, - "loss": 0.6931, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -2.9648265838623047, + "logits/rejected": -2.9711227416992188, + "logps/chosen": -256.0919494628906, + "logps/rejected": -234.60708618164062, + "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -24,1377 +24,4125 @@ }, { "epoch": 0.01, - "learning_rate": 5.154639175257731e-08, - "logits/chosen": -2.223740339279175, - "logits/rejected": -2.180643081665039, - "logps/chosen": -284.7340087890625, - "logps/rejected": -205.98194885253906, - "loss": 0.694, - "rewards/accuracies": 0.4305555522441864, - "rewards/chosen": -0.0006893649115227163, - "rewards/margins": 0.0007374237175099552, - "rewards/rejected": -0.0014267880469560623, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -3.049875497817993, + "logits/rejected": -3.0188238620758057, + "logps/chosen": -276.6912536621094, + "logps/rejected": -202.39605712890625, + "loss": 1.0001, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0005764114903286099, + "rewards/margins": -0.006484686397016048, + "rewards/rejected": 0.007061097305268049, "step": 10 }, { "epoch": 0.02, - "learning_rate": 1.0309278350515462e-07, - "logits/chosen": -2.33476185798645, - "logits/rejected": -2.2125375270843506, - "logps/chosen": -320.8204040527344, - "logps/rejected": -248.4267120361328, - "loss": 0.692, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": 0.0003039050498045981, - "rewards/margins": 0.0023796656168997288, - "rewards/rejected": -0.0020757606253027916, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.988577127456665, + "logits/rejected": -2.9995627403259277, + "logps/chosen": -312.2018127441406, + "logps/rejected": -246.76266479492188, + "loss": 1.0026, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.003506724489852786, + "rewards/margins": -0.0012849611230194569, + "rewards/rejected": 0.004791685380041599, "step": 20 }, { "epoch": 0.03, - "learning_rate": 1.5463917525773197e-07, - "logits/chosen": -2.339370012283325, - "logits/rejected": -2.304020404815674, - "logps/chosen": -268.95074462890625, - "logps/rejected": -227.067626953125, - "loss": 0.6921, - "rewards/accuracies": 0.46875, - "rewards/chosen": 0.0005883350968360901, - "rewards/margins": 0.002594549907371402, - "rewards/rejected": -0.0020062148105353117, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -3.063732624053955, + "logits/rejected": -3.0357906818389893, + "logps/chosen": -260.15679931640625, + "logps/rejected": -224.3686065673828, + "loss": 0.9974, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0021156296133995056, + "rewards/margins": 0.0043937130831182, + "rewards/rejected": -0.0022780844010412693, "step": 30 }, { "epoch": 0.04, - "learning_rate": 2.0618556701030925e-07, - "logits/chosen": -2.3392791748046875, - "logits/rejected": -2.3300938606262207, - "logps/chosen": -308.5113220214844, - "logps/rejected": -253.8385467529297, - "loss": 0.6945, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": 0.0024464379530400038, - "rewards/margins": -0.00025889737298712134, - "rewards/rejected": 0.0027053358498960733, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -3.070286273956299, + "logits/rejected": -3.0322961807250977, + "logps/chosen": -299.5580139160156, + "logps/rejected": -250.05123901367188, + "loss": 0.9971, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00013011172995902598, + "rewards/margins": -0.0006899217842146754, + "rewards/rejected": 0.0008200337179005146, "step": 40 }, { "epoch": 0.05, - "learning_rate": 2.5773195876288655e-07, - "logits/chosen": -2.251412868499756, - "logits/rejected": -2.2359275817871094, - "logps/chosen": -297.78375244140625, - "logps/rejected": -227.23556518554688, - "loss": 0.6922, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0033915191888809204, - "rewards/margins": 0.0055986023508012295, - "rewards/rejected": -0.0022070836275815964, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -3.019392490386963, + "logits/rejected": -3.024167060852051, + "logps/chosen": -289.3621520996094, + "logps/rejected": -224.00979614257812, + "loss": 1.003, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0025630726013332605, + "rewards/margins": -0.0055747563019394875, + "rewards/rejected": 0.003011685097590089, "step": 50 }, { "epoch": 0.06, - "learning_rate": 3.0927835051546394e-07, - "logits/chosen": -2.167163848876953, - "logits/rejected": -2.3376193046569824, - "logps/chosen": -256.54510498046875, - "logps/rejected": -229.5459747314453, - "loss": 0.6917, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": 0.000388039683457464, - "rewards/margins": 0.007883811369538307, - "rewards/rejected": -0.0074957734905183315, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -3.0416665077209473, + "logits/rejected": -3.020573616027832, + "logps/chosen": -247.55380249023438, + "logps/rejected": -226.4866943359375, + "loss": 0.9977, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0035637759137898684, + "rewards/margins": 0.0046446239575743675, + "rewards/rejected": -0.0010808479273691773, "step": 60 }, { "epoch": 0.07, - "learning_rate": 3.608247422680412e-07, - "logits/chosen": -2.3430614471435547, - "logits/rejected": -2.281782627105713, - "logps/chosen": -313.92608642578125, - "logps/rejected": -252.57284545898438, - "loss": 0.6924, - "rewards/accuracies": 0.46875, - "rewards/chosen": 0.0012417413527145982, - "rewards/margins": 0.0001173208438558504, - "rewards/rejected": 0.0011244199704378843, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -3.0761704444885254, + "logits/rejected": -3.058954954147339, + "logps/chosen": -305.7156677246094, + "logps/rejected": -251.75009155273438, + "loss": 0.9938, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004508626647293568, + "rewards/margins": 0.007502266205847263, + "rewards/rejected": -0.0029936402570456266, "step": 70 }, { "epoch": 0.08, - "learning_rate": 4.123711340206185e-07, - "logits/chosen": -2.337070941925049, - "logits/rejected": -2.3018112182617188, - "logps/chosen": -302.9524841308594, - "logps/rejected": -243.9047088623047, - "loss": 0.6916, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.0021400884725153446, - "rewards/margins": -0.0002812549355439842, - "rewards/rejected": 0.002421343233436346, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -3.049072742462158, + "logits/rejected": -3.0234384536743164, + "logps/chosen": -293.57989501953125, + "logps/rejected": -240.2385711669922, + "loss": 0.9985, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.002101506572216749, + "rewards/margins": 0.002054845681414008, + "rewards/rejected": 4.6660610678372905e-05, "step": 80 }, { "epoch": 0.09, - "learning_rate": 4.639175257731959e-07, - "logits/chosen": -2.259251356124878, - "logits/rejected": -2.2963995933532715, - "logps/chosen": -270.1668395996094, - "logps/rejected": -216.64822387695312, - "loss": 0.6913, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.009941437281668186, - "rewards/margins": 0.010241752490401268, - "rewards/rejected": -0.00030031436472199857, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -3.079655885696411, + "logits/rejected": -3.0430655479431152, + "logps/chosen": -259.3849792480469, + "logps/rejected": -216.38330078125, + "loss": 0.9993, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.001981315901502967, + "rewards/margins": 0.003211658913642168, + "rewards/rejected": -0.0012303430121392012, "step": 90 }, { "epoch": 0.1, - "learning_rate": 4.982778415614236e-07, - "logits/chosen": -2.1677582263946533, - "logits/rejected": -2.2741990089416504, - "logps/chosen": -274.75836181640625, - "logps/rejected": -226.3966064453125, - "loss": 0.6901, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.006115993484854698, - "rewards/margins": 0.0013887921813875437, - "rewards/rejected": 0.0047272020019590855, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -3.0111169815063477, + "logits/rejected": -3.006265640258789, + "logps/chosen": -267.73577880859375, + "logps/rejected": -222.9344482421875, + "loss": 1.0009, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.005559581331908703, + "rewards/margins": 0.004569889511913061, + "rewards/rejected": 0.000989692285656929, "step": 100 }, { "epoch": 0.11, - "learning_rate": 4.925373134328357e-07, - "logits/chosen": -2.271916389465332, - "logits/rejected": -2.197857141494751, - "logps/chosen": -274.72113037109375, - "logps/rejected": -232.5464324951172, - "loss": 0.6886, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.005831545684486628, - "rewards/margins": 0.0067709460854530334, - "rewards/rejected": -0.000939400284551084, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -3.0228209495544434, + "logits/rejected": -2.9778640270233154, + "logps/chosen": -269.3376770019531, + "logps/rejected": -230.95877075195312, + "loss": 0.9971, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0013723246520385146, + "rewards/margins": 0.0024911228101700544, + "rewards/rejected": -0.0011187975760549307, "step": 110 }, { "epoch": 0.12, - "learning_rate": 4.867967853042479e-07, - "logits/chosen": -2.2548232078552246, - "logits/rejected": -2.322075366973877, - "logps/chosen": -319.34521484375, - "logps/rejected": -235.76535034179688, - "loss": 0.689, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.013832703232765198, - "rewards/margins": 0.01176449190825224, - "rewards/rejected": 0.002068211790174246, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -3.0604119300842285, + "logits/rejected": -3.0274159908294678, + "logps/chosen": -310.97454833984375, + "logps/rejected": -232.7030029296875, + "loss": 0.9997, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0033910819329321384, + "rewards/margins": 0.0011182299349457026, + "rewards/rejected": 0.002272851997986436, "step": 120 }, { "epoch": 0.13, - "learning_rate": 4.810562571756601e-07, - "logits/chosen": -2.32174015045166, - "logits/rejected": -2.3775150775909424, - "logps/chosen": -296.20733642578125, - "logps/rejected": -245.56655883789062, - "loss": 0.6875, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.017552796751260757, - "rewards/margins": 0.013545483350753784, - "rewards/rejected": 0.004007314797490835, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -3.1223320960998535, + "logits/rejected": -3.0860095024108887, + "logps/chosen": -286.6527099609375, + "logps/rejected": -241.933349609375, + "loss": 0.996, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0031617667991667986, + "rewards/margins": 0.005714719649404287, + "rewards/rejected": -0.0025529528502374887, "step": 130 }, { "epoch": 0.14, - "learning_rate": 4.753157290470723e-07, - "logits/chosen": -2.3627283573150635, - "logits/rejected": -2.310948133468628, - "logps/chosen": -301.9321594238281, - "logps/rejected": -239.2898406982422, - "loss": 0.688, - "rewards/accuracies": 0.46875, - "rewards/chosen": 0.011156091466546059, - "rewards/margins": 0.009668431244790554, - "rewards/rejected": 0.0014876595232635736, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -3.045279026031494, + "logits/rejected": -3.040912628173828, + "logps/chosen": -292.4465637207031, + "logps/rejected": -234.72903442382812, + "loss": 0.9968, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0010984055697917938, + "rewards/margins": 0.002134153386577964, + "rewards/rejected": -0.00103574781678617, "step": 140 }, { "epoch": 0.15, - "learning_rate": 4.6957520091848447e-07, - "logits/chosen": -2.2531113624572754, - "logits/rejected": -2.348215341567993, - "logps/chosen": -284.4292907714844, - "logps/rejected": -259.6882019042969, - "loss": 0.6858, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.017186133190989494, - "rewards/margins": 0.011862866580486298, - "rewards/rejected": 0.005323265679180622, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -3.0267093181610107, + "logits/rejected": -3.0170674324035645, + "logps/chosen": -275.6455993652344, + "logps/rejected": -256.4563903808594, + "loss": 0.9998, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.0011621230514720082, + "rewards/margins": -2.7875230443896726e-05, + "rewards/rejected": -0.001134247868321836, "step": 150 }, { "epoch": 0.17, - "learning_rate": 4.6383467278989666e-07, - "logits/chosen": -2.361238956451416, - "logits/rejected": -2.4430744647979736, - "logps/chosen": -286.7644348144531, - "logps/rejected": -221.6837158203125, - "loss": 0.6857, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.01911218836903572, - "rewards/margins": 0.014816234819591045, - "rewards/rejected": 0.00429595448076725, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -3.08597731590271, + "logits/rejected": -3.0747408866882324, + "logps/chosen": -278.91754150390625, + "logps/rejected": -218.88558959960938, + "loss": 0.9949, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004570486024022102, + "rewards/margins": 0.01147634070366621, + "rewards/rejected": -0.006905855145305395, "step": 160 }, { "epoch": 0.18, - "learning_rate": 4.580941446613088e-07, - "logits/chosen": -2.32244610786438, - "logits/rejected": -2.3339757919311523, - "logps/chosen": -301.54693603515625, - "logps/rejected": -239.26095581054688, - "loss": 0.6839, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.023171866312623024, - "rewards/margins": 0.014685508795082569, - "rewards/rejected": 0.00848635844886303, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -3.0499391555786133, + "logits/rejected": -3.036341905593872, + "logps/chosen": -292.2102966308594, + "logps/rejected": -236.95703125, + "loss": 0.9925, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0019660559482872486, + "rewards/margins": 0.009962075389921665, + "rewards/rejected": -0.00799601897597313, "step": 170 }, { "epoch": 0.19, - "learning_rate": 4.52353616532721e-07, - "logits/chosen": -2.347285032272339, - "logits/rejected": -2.3244121074676514, - "logps/chosen": -257.841552734375, - "logps/rejected": -214.5565643310547, - "loss": 0.6864, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.019994111731648445, - "rewards/margins": 0.01520625315606594, - "rewards/rejected": 0.004787858575582504, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -3.091757297515869, + "logits/rejected": -3.080824375152588, + "logps/chosen": -248.6970672607422, + "logps/rejected": -211.7375946044922, + "loss": 0.993, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0008403388783335686, + "rewards/margins": 0.004896977450698614, + "rewards/rejected": -0.004056639038026333, "step": 180 }, { "epoch": 0.2, - "learning_rate": 4.4661308840413316e-07, - "logits/chosen": -2.2657313346862793, - "logits/rejected": -2.201254367828369, - "logps/chosen": -253.98916625976562, - "logps/rejected": -206.3340301513672, - "loss": 0.6833, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0245305635035038, - "rewards/margins": 0.017677443102002144, - "rewards/rejected": 0.006853120867162943, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -3.0578689575195312, + "logits/rejected": -3.033844232559204, + "logps/chosen": -246.07040405273438, + "logps/rejected": -200.9595184326172, + "loss": 0.9978, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0035698451101779938, + "rewards/margins": 0.012628579512238503, + "rewards/rejected": -0.009058734402060509, "step": 190 }, { "epoch": 0.21, - "learning_rate": 4.408725602755453e-07, - "logits/chosen": -2.284461498260498, - "logits/rejected": -2.2873706817626953, - "logps/chosen": -261.44427490234375, - "logps/rejected": -195.59422302246094, - "loss": 0.6835, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.03187788277864456, - "rewards/margins": 0.024095263332128525, - "rewards/rejected": 0.007782619446516037, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -3.08526349067688, + "logits/rejected": -3.063560724258423, + "logps/chosen": -252.265869140625, + "logps/rejected": -192.21331787109375, + "loss": 0.99, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.009008857421576977, + "rewards/margins": 0.016440508887171745, + "rewards/rejected": -0.007431652396917343, "step": 200 }, { "epoch": 0.22, - "learning_rate": 4.351320321469575e-07, - "logits/chosen": -2.18426513671875, - "logits/rejected": -2.1963071823120117, - "logps/chosen": -302.31195068359375, - "logps/rejected": -218.6005401611328, - "loss": 0.6815, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.03718667849898338, - "rewards/margins": 0.026892077177762985, - "rewards/rejected": 0.010294605046510696, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.9606690406799316, + "logits/rejected": -2.906953811645508, + "logps/chosen": -292.0260925292969, + "logps/rejected": -215.34036254882812, + "loss": 0.9855, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0046757906675338745, + "rewards/margins": 0.013047484681010246, + "rewards/rejected": -0.008371694944798946, "step": 210 }, { "epoch": 0.23, - "learning_rate": 4.2939150401836967e-07, - "logits/chosen": -2.2150394916534424, - "logits/rejected": -2.2160990238189697, - "logps/chosen": -269.44769287109375, - "logps/rejected": -235.6748504638672, - "loss": 0.6801, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.038056183606386185, - "rewards/margins": 0.023441683501005173, - "rewards/rejected": 0.014614498242735863, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.990668773651123, + "logits/rejected": -2.986696481704712, + "logps/chosen": -260.21832275390625, + "logps/rejected": -237.1192626953125, + "loss": 0.9839, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.006101504433900118, + "rewards/margins": 0.011856775730848312, + "rewards/rejected": -0.005755270831286907, "step": 220 }, { "epoch": 0.24, - "learning_rate": 4.236509758897818e-07, - "logits/chosen": -2.2152469158172607, - "logits/rejected": -2.1862380504608154, - "logps/chosen": -271.4049377441406, - "logps/rejected": -242.6397247314453, - "loss": 0.6826, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.03268683701753616, - "rewards/margins": 0.026912549510598183, - "rewards/rejected": 0.0057742842473089695, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -3.0254712104797363, + "logits/rejected": -3.0170722007751465, + "logps/chosen": -263.41680908203125, + "logps/rejected": -240.377685546875, + "loss": 0.9888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006732765585184097, + "rewards/margins": 0.014450904913246632, + "rewards/rejected": -0.007718136068433523, "step": 230 }, { "epoch": 0.25, - "learning_rate": 4.17910447761194e-07, - "logits/chosen": -2.3059380054473877, - "logits/rejected": -2.2681984901428223, - "logps/chosen": -309.55499267578125, - "logps/rejected": -221.61703491210938, - "loss": 0.6827, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.03509462997317314, - "rewards/margins": 0.012767216190695763, - "rewards/rejected": 0.02232741378247738, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.078042507171631, + "logits/rejected": -3.0471174716949463, + "logps/chosen": -299.16107177734375, + "logps/rejected": -214.18759155273438, + "loss": 0.9867, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.006704004947096109, + "rewards/margins": 0.01422835886478424, + "rewards/rejected": -0.007524352520704269, "step": 240 }, { "epoch": 0.26, - "learning_rate": 4.121699196326062e-07, - "logits/chosen": -2.307035446166992, - "logits/rejected": -2.2920923233032227, - "logps/chosen": -272.9412841796875, - "logps/rejected": -237.314208984375, - "loss": 0.6824, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.03535359352827072, - "rewards/margins": 0.012216273695230484, - "rewards/rejected": 0.023137323558330536, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.026909589767456, + "logits/rejected": -3.018611431121826, + "logps/chosen": -264.2486572265625, + "logps/rejected": -233.31826782226562, + "loss": 0.9832, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011163066141307354, + "rewards/margins": 0.019244546070694923, + "rewards/rejected": -0.008081478998064995, "step": 250 }, { "epoch": 0.27, - "learning_rate": 4.0642939150401836e-07, - "logits/chosen": -2.3456673622131348, - "logits/rejected": -2.3194832801818848, - "logps/chosen": -270.475341796875, - "logps/rejected": -221.84536743164062, - "loss": 0.6805, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.04569912329316139, - "rewards/margins": 0.029975151643157005, - "rewards/rejected": 0.015723969787359238, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -3.0170772075653076, + "logits/rejected": -3.0285823345184326, + "logps/chosen": -263.9449157714844, + "logps/rejected": -219.4688262939453, + "loss": 0.9828, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.010579807683825493, + "rewards/margins": 0.024952661246061325, + "rewards/rejected": -0.014372853562235832, "step": 260 }, { "epoch": 0.28, - "learning_rate": 4.006888633754305e-07, - "logits/chosen": -2.385854721069336, - "logits/rejected": -2.3556528091430664, - "logps/chosen": -284.36029052734375, - "logps/rejected": -232.5426788330078, - "loss": 0.6793, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.04998317360877991, - "rewards/margins": 0.032010577619075775, - "rewards/rejected": 0.017972594127058983, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.0661990642547607, + "logits/rejected": -3.0676910877227783, + "logps/chosen": -274.22003173828125, + "logps/rejected": -229.6044158935547, + "loss": 0.9799, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.004864403046667576, + "rewards/margins": 0.016584355384111404, + "rewards/rejected": -0.011719951406121254, "step": 270 }, { "epoch": 0.29, - "learning_rate": 3.949483352468427e-07, - "logits/chosen": -2.308225154876709, - "logits/rejected": -2.259629726409912, - "logps/chosen": -293.1715087890625, - "logps/rejected": -236.4293975830078, - "loss": 0.6771, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.057786036282777786, - "rewards/margins": 0.04149205610156059, - "rewards/rejected": 0.016293983906507492, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -3.031026840209961, + "logits/rejected": -3.0113377571105957, + "logps/chosen": -283.8157653808594, + "logps/rejected": -235.0233612060547, + "loss": 0.9754, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.013933306559920311, + "rewards/margins": 0.027073601260781288, + "rewards/rejected": -0.013140290975570679, "step": 280 }, { "epoch": 0.3, - "learning_rate": 3.8920780711825487e-07, - "logits/chosen": -2.278501033782959, - "logits/rejected": -2.369293689727783, - "logps/chosen": -278.4786376953125, - "logps/rejected": -227.40927124023438, - "loss": 0.6792, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0485750176012516, - "rewards/margins": 0.02242155373096466, - "rewards/rejected": 0.02615346387028694, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -3.094398021697998, + "logits/rejected": -3.0440070629119873, + "logps/chosen": -270.22052001953125, + "logps/rejected": -223.65493774414062, + "loss": 0.9774, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008421173319220543, + "rewards/margins": 0.019475247710943222, + "rewards/rejected": -0.01105407439172268, "step": 290 }, { "epoch": 0.31, - "learning_rate": 3.83467278989667e-07, - "logits/chosen": -2.2661235332489014, - "logits/rejected": -2.205644130706787, - "logps/chosen": -254.183837890625, - "logps/rejected": -221.9667510986328, - "loss": 0.6772, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.05874975398182869, - "rewards/margins": 0.03965791314840317, - "rewards/rejected": 0.019091838970780373, + "learning_rate": 4.982778415614236e-07, + "logits/chosen": -3.023087978363037, + "logits/rejected": -2.9992034435272217, + "logps/chosen": -244.30337524414062, + "logps/rejected": -218.7770538330078, + "loss": 0.9735, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.008720096200704575, + "rewards/margins": 0.029811996966600418, + "rewards/rejected": -0.021091898903250694, "step": 300 }, { "epoch": 0.32, - "learning_rate": 3.777267508610792e-07, - "logits/chosen": -2.32353138923645, - "logits/rejected": -2.3743112087249756, - "logps/chosen": -306.22711181640625, - "logps/rejected": -257.60980224609375, - "loss": 0.6783, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.04823786020278931, - "rewards/margins": 0.017192820087075233, - "rewards/rejected": 0.03104504384100437, + "learning_rate": 4.963643321852277e-07, + "logits/chosen": -3.0621352195739746, + "logits/rejected": -3.0481069087982178, + "logps/chosen": -299.58758544921875, + "logps/rejected": -257.4301452636719, + "loss": 0.9719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.011654629372060299, + "rewards/margins": 0.02595471777021885, + "rewards/rejected": -0.01430008839815855, "step": 310 }, { "epoch": 0.33, - "learning_rate": 3.7198622273249137e-07, - "logits/chosen": -2.234679698944092, - "logits/rejected": -2.211430788040161, - "logps/chosen": -251.83053588867188, - "logps/rejected": -193.01544189453125, - "loss": 0.6739, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.06583289802074432, - "rewards/margins": 0.047706056386232376, - "rewards/rejected": 0.018126841634511948, + "learning_rate": 4.944508228090318e-07, + "logits/chosen": -3.026646137237549, + "logits/rejected": -3.0066604614257812, + "logps/chosen": -242.5664825439453, + "logps/rejected": -187.6553497314453, + "loss": 0.9641, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.014026440680027008, + "rewards/margins": 0.0420592799782753, + "rewards/rejected": -0.02803283929824829, "step": 320 }, { "epoch": 0.34, - "learning_rate": 3.662456946039035e-07, - "logits/chosen": -2.259127140045166, - "logits/rejected": -2.287956714630127, - "logps/chosen": -312.1918029785156, - "logps/rejected": -239.03530883789062, - "loss": 0.6761, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.07068151980638504, - "rewards/margins": 0.051512353122234344, - "rewards/rejected": 0.0191691592335701, + "learning_rate": 4.925373134328357e-07, + "logits/chosen": -3.0689878463745117, + "logits/rejected": -3.052264928817749, + "logps/chosen": -303.94036865234375, + "logps/rejected": -238.4488067626953, + "loss": 0.9634, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02398153766989708, + "rewards/margins": 0.042572326958179474, + "rewards/rejected": -0.018590793013572693, "step": 330 }, { "epoch": 0.35, - "learning_rate": 3.605051664753157e-07, - "logits/chosen": -2.197277784347534, - "logits/rejected": -2.13037109375, - "logps/chosen": -244.2609100341797, - "logps/rejected": -238.80953979492188, - "loss": 0.6788, + "learning_rate": 4.906238040566398e-07, + "logits/chosen": -2.995407819747925, + "logits/rejected": -2.9780545234680176, + "logps/chosen": -235.09848022460938, + "logps/rejected": -236.380859375, + "loss": 0.9616, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.05732797831296921, - "rewards/margins": 0.030042264610528946, - "rewards/rejected": 0.027285713702440262, + "rewards/chosen": 0.006325787398964167, + "rewards/margins": 0.03254387527704239, + "rewards/rejected": -0.02621809020638466, "step": 340 }, { "epoch": 0.36, - "learning_rate": 3.547646383467279e-07, - "logits/chosen": -2.365830421447754, - "logits/rejected": -2.3728528022766113, - "logps/chosen": -313.7022705078125, - "logps/rejected": -248.090087890625, - "loss": 0.6746, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.08016298711299896, - "rewards/margins": 0.05509548634290695, - "rewards/rejected": 0.025067497044801712, + "learning_rate": 4.887102946804438e-07, + "logits/chosen": -3.066584348678589, + "logits/rejected": -3.03863263130188, + "logps/chosen": -306.0690612792969, + "logps/rejected": -245.27407836914062, + "loss": 0.9508, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.02622169815003872, + "rewards/margins": 0.059129487723112106, + "rewards/rejected": -0.032907791435718536, "step": 350 }, { "epoch": 0.37, - "learning_rate": 3.4902411021814007e-07, - "logits/chosen": -2.22756290435791, - "logits/rejected": -2.259359121322632, - "logps/chosen": -303.25250244140625, - "logps/rejected": -249.8985595703125, - "loss": 0.6723, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.06414168328046799, - "rewards/margins": 0.04363773763179779, - "rewards/rejected": 0.020503941923379898, + "learning_rate": 4.867967853042479e-07, + "logits/chosen": -3.0108275413513184, + "logits/rejected": -3.008779525756836, + "logps/chosen": -294.0123596191406, + "logps/rejected": -248.9111785888672, + "loss": 0.9496, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.008144224062561989, + "rewards/margins": 0.045391060411930084, + "rewards/rejected": -0.037246834486722946, "step": 360 }, { "epoch": 0.38, - "learning_rate": 3.432835820895522e-07, - "logits/chosen": -2.3700273036956787, - "logits/rejected": -2.3231639862060547, - "logps/chosen": -314.5257263183594, - "logps/rejected": -270.7105712890625, - "loss": 0.6759, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.07061124593019485, - "rewards/margins": 0.03391130641102791, - "rewards/rejected": 0.03669993579387665, + "learning_rate": 4.84883275928052e-07, + "logits/chosen": -3.048269510269165, + "logits/rejected": -3.011050224304199, + "logps/chosen": -303.9602355957031, + "logps/rejected": -269.4437561035156, + "loss": 0.9473, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.021487019956111908, + "rewards/margins": 0.06678290665149689, + "rewards/rejected": -0.04529587924480438, "step": 370 }, { "epoch": 0.39, - "learning_rate": 3.375430539609644e-07, - "logits/chosen": -2.3212878704071045, - "logits/rejected": -2.249602794647217, - "logps/chosen": -291.92474365234375, - "logps/rejected": -239.6724395751953, - "loss": 0.677, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.07373902946710587, - "rewards/margins": 0.03367278352379799, - "rewards/rejected": 0.04006624594330788, + "learning_rate": 4.82969766551856e-07, + "logits/chosen": -3.0777668952941895, + "logits/rejected": -3.068040370941162, + "logps/chosen": -282.12713623046875, + "logps/rejected": -236.7052459716797, + "loss": 0.9515, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.02360449731349945, + "rewards/margins": 0.05431375652551651, + "rewards/rejected": -0.03070926107466221, "step": 380 }, { "epoch": 0.4, - "learning_rate": 3.3180252583237657e-07, - "logits/chosen": -2.297023057937622, - "logits/rejected": -2.264172077178955, - "logps/chosen": -278.0927734375, - "logps/rejected": -237.13436889648438, - "loss": 0.6722, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.0686158686876297, - "rewards/margins": 0.051144860684871674, - "rewards/rejected": 0.01747100241482258, + "learning_rate": 4.810562571756601e-07, + "logits/chosen": -2.9710302352905273, + "logits/rejected": -2.983682155609131, + "logps/chosen": -272.12713623046875, + "logps/rejected": -235.8425750732422, + "loss": 0.9475, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.013235519640147686, + "rewards/margins": 0.05444386601448059, + "rewards/rejected": -0.04120834544301033, "step": 390 }, { "epoch": 0.41, - "learning_rate": 3.260619977037887e-07, - "logits/chosen": -2.237035036087036, - "logits/rejected": -2.2392399311065674, - "logps/chosen": -263.4399108886719, - "logps/rejected": -213.87451171875, - "loss": 0.6707, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.06768475472927094, - "rewards/margins": 0.048441771417856216, - "rewards/rejected": 0.019242987036705017, + "learning_rate": 4.791427477994642e-07, + "logits/chosen": -3.0254111289978027, + "logits/rejected": -3.006087303161621, + "logps/chosen": -254.69107055664062, + "logps/rejected": -210.39474487304688, + "loss": 0.9402, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.022726301103830338, + "rewards/margins": 0.06853805482387543, + "rewards/rejected": -0.04581175372004509, "step": 400 }, { "epoch": 0.42, - "learning_rate": 3.203214695752009e-07, - "logits/chosen": -2.2776081562042236, - "logits/rejected": -2.2924447059631348, - "logps/chosen": -268.8953857421875, - "logps/rejected": -252.852294921875, - "loss": 0.6673, + "learning_rate": 4.772292384232682e-07, + "logits/chosen": -3.0280232429504395, + "logits/rejected": -2.9936630725860596, + "logps/chosen": -261.80731201171875, + "logps/rejected": -251.14950561523438, + "loss": 0.9398, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.08111406862735748, - "rewards/margins": 0.05318716913461685, - "rewards/rejected": 0.027926897630095482, + "rewards/chosen": 0.014309520833194256, + "rewards/margins": 0.050246305763721466, + "rewards/rejected": -0.03593678027391434, "step": 410 }, { "epoch": 0.43, - "learning_rate": 3.145809414466131e-07, - "logits/chosen": -2.3054046630859375, - "logits/rejected": -2.2502362728118896, - "logps/chosen": -252.5205841064453, - "logps/rejected": -204.43344116210938, - "loss": 0.6749, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.07272285223007202, - "rewards/margins": 0.04809904843568802, - "rewards/rejected": 0.024623800069093704, + "learning_rate": 4.753157290470723e-07, + "logits/chosen": -3.0422613620758057, + "logits/rejected": -3.004459857940674, + "logps/chosen": -245.8754425048828, + "logps/rejected": -202.38157653808594, + "loss": 0.9391, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.005309578962624073, + "rewards/margins": 0.0503067672252655, + "rewards/rejected": -0.0449971929192543, "step": 420 }, { "epoch": 0.44, - "learning_rate": 3.0884041331802526e-07, - "logits/chosen": -2.3482632637023926, - "logits/rejected": -2.3258707523345947, - "logps/chosen": -263.67095947265625, - "logps/rejected": -241.14047241210938, - "loss": 0.6741, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.07022975385189056, - "rewards/margins": 0.04051927849650383, - "rewards/rejected": 0.029710477218031883, + "learning_rate": 4.7340221967087635e-07, + "logits/chosen": -3.061112880706787, + "logits/rejected": -3.045253038406372, + "logps/chosen": -257.9686584472656, + "logps/rejected": -239.047119140625, + "loss": 0.9323, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.012545446865260601, + "rewards/margins": 0.07196511328220367, + "rewards/rejected": -0.0594196543097496, "step": 430 }, { "epoch": 0.45, - "learning_rate": 3.030998851894374e-07, - "logits/chosen": -2.286533832550049, - "logits/rejected": -2.320568084716797, - "logps/chosen": -286.72894287109375, - "logps/rejected": -247.65542602539062, - "loss": 0.6705, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.07666246592998505, - "rewards/margins": 0.05972421169281006, - "rewards/rejected": 0.01693824864923954, + "learning_rate": 4.714887102946804e-07, + "logits/chosen": -3.0016167163848877, + "logits/rejected": -2.9664511680603027, + "logps/chosen": -278.2596740722656, + "logps/rejected": -246.8672637939453, + "loss": 0.9194, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.00037018657894805074, + "rewards/margins": 0.0750364139676094, + "rewards/rejected": -0.07466623187065125, "step": 440 }, { "epoch": 0.46, - "learning_rate": 2.973593570608496e-07, - "logits/chosen": -2.206477642059326, - "logits/rejected": -2.315464496612549, - "logps/chosen": -276.1682434082031, - "logps/rejected": -230.3959197998047, - "loss": 0.678, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.0718303695321083, - "rewards/margins": 0.04074189439415932, - "rewards/rejected": 0.03108847141265869, + "learning_rate": 4.6957520091848447e-07, + "logits/chosen": -3.0514495372772217, + "logits/rejected": -3.0419204235076904, + "logps/chosen": -272.3708801269531, + "logps/rejected": -227.4208221435547, + "loss": 0.9356, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.01730353757739067, + "rewards/margins": 0.059229202568531036, + "rewards/rejected": -0.04192566126585007, "step": 450 }, { "epoch": 0.47, - "learning_rate": 2.9161882893226177e-07, - "logits/chosen": -2.277815103530884, - "logits/rejected": -2.342268705368042, - "logps/chosen": -273.23773193359375, - "logps/rejected": -222.5966796875, - "loss": 0.6662, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.0802597850561142, - "rewards/margins": 0.050464123487472534, - "rewards/rejected": 0.029795657843351364, + "learning_rate": 4.6766169154228853e-07, + "logits/chosen": -3.0307857990264893, + "logits/rejected": -3.0139756202697754, + "logps/chosen": -264.16168212890625, + "logps/rejected": -219.8409881591797, + "loss": 0.9122, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.007891577668488026, + "rewards/margins": 0.07121269404888153, + "rewards/rejected": -0.06332111358642578, "step": 460 }, { "epoch": 0.49, - "learning_rate": 2.858783008036739e-07, - "logits/chosen": -2.2656216621398926, - "logits/rejected": -2.2778594493865967, - "logps/chosen": -248.9929656982422, - "logps/rejected": -215.5894012451172, - "loss": 0.6669, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.08564073592424393, - "rewards/margins": 0.06490761041641235, - "rewards/rejected": 0.020733121782541275, + "learning_rate": 4.657481821660926e-07, + "logits/chosen": -3.016810178756714, + "logits/rejected": -3.019348621368408, + "logps/chosen": -238.8083038330078, + "logps/rejected": -212.76193237304688, + "loss": 0.9028, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.028794366866350174, + "rewards/margins": 0.08307679742574692, + "rewards/rejected": -0.054282426834106445, "step": 470 }, { "epoch": 0.5, - "learning_rate": 2.801377726750861e-07, - "logits/chosen": -2.2962255477905273, - "logits/rejected": -2.27239727973938, - "logps/chosen": -289.5277404785156, - "logps/rejected": -231.601318359375, - "loss": 0.6713, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.08144901692867279, - "rewards/margins": 0.05658548325300217, - "rewards/rejected": 0.024863524362444878, + "learning_rate": 4.6383467278989666e-07, + "logits/chosen": -3.044045925140381, + "logits/rejected": -3.0309886932373047, + "logps/chosen": -282.59814453125, + "logps/rejected": -229.63858032226562, + "loss": 0.9063, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022610556334257126, + "rewards/margins": 0.09912824630737305, + "rewards/rejected": -0.07651769369840622, "step": 480 }, { "epoch": 0.51, - "learning_rate": 2.743972445464983e-07, - "logits/chosen": -2.445746660232544, - "logits/rejected": -2.267007827758789, - "logps/chosen": -293.1885986328125, - "logps/rejected": -243.8875274658203, - "loss": 0.6676, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.10828351974487305, - "rewards/margins": 0.08175922185182571, - "rewards/rejected": 0.02652430161833763, + "learning_rate": 4.6192116341370067e-07, + "logits/chosen": -3.0251693725585938, + "logits/rejected": -3.040748119354248, + "logps/chosen": -289.87896728515625, + "logps/rejected": -243.85952758789062, + "loss": 0.8948, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.03339407593011856, + "rewards/margins": 0.1147690862417221, + "rewards/rejected": -0.08137501776218414, "step": 490 }, { "epoch": 0.52, - "learning_rate": 2.686567164179104e-07, - "logits/chosen": -2.278276205062866, - "logits/rejected": -2.295633316040039, - "logps/chosen": -254.94760131835938, - "logps/rejected": -221.79452514648438, - "loss": 0.6672, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.08227074891328812, - "rewards/margins": 0.055896710604429245, - "rewards/rejected": 0.026374032720923424, + "learning_rate": 4.6000765403750473e-07, + "logits/chosen": -3.0105278491973877, + "logits/rejected": -2.9718642234802246, + "logps/chosen": -244.4474334716797, + "logps/rejected": -221.8011474609375, + "loss": 0.8829, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.017204025760293007, + "rewards/margins": 0.0894416943192482, + "rewards/rejected": -0.07223766297101974, "step": 500 }, { "epoch": 0.53, - "learning_rate": 2.629161882893226e-07, - "logits/chosen": -2.202611207962036, - "logits/rejected": -2.2495861053466797, - "logps/chosen": -310.4443664550781, - "logps/rejected": -256.72406005859375, - "loss": 0.6666, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.07021793723106384, - "rewards/margins": 0.040728576481342316, - "rewards/rejected": 0.02948935702443123, + "learning_rate": 4.580941446613088e-07, + "logits/chosen": -2.9197583198547363, + "logits/rejected": -2.915809392929077, + "logps/chosen": -304.55645751953125, + "logps/rejected": -253.95828247070312, + "loss": 0.8834, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.02509082481265068, + "rewards/margins": 0.11200642585754395, + "rewards/rejected": -0.08691558986902237, "step": 510 }, { "epoch": 0.54, - "learning_rate": 2.571756601607348e-07, - "logits/chosen": -2.3376307487487793, - "logits/rejected": -2.352074146270752, - "logps/chosen": -278.10504150390625, - "logps/rejected": -244.0722198486328, - "loss": 0.6697, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.0925985723733902, - "rewards/margins": 0.0637633204460144, - "rewards/rejected": 0.028835251927375793, + "learning_rate": 4.5618063528511285e-07, + "logits/chosen": -3.0240163803100586, + "logits/rejected": -2.998610019683838, + "logps/chosen": -270.31378173828125, + "logps/rejected": -242.4883270263672, + "loss": 0.8813, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.02489570900797844, + "rewards/margins": 0.13008132576942444, + "rewards/rejected": -0.1051856279373169, "step": 520 }, { "epoch": 0.55, - "learning_rate": 2.5143513203214697e-07, - "logits/chosen": -2.243332624435425, - "logits/rejected": -2.2513413429260254, - "logps/chosen": -242.59439086914062, - "logps/rejected": -224.13259887695312, - "loss": 0.6716, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.07866770029067993, - "rewards/margins": 0.057711243629455566, - "rewards/rejected": 0.020956454798579216, + "learning_rate": 4.542671259089169e-07, + "logits/chosen": -3.0373079776763916, + "logits/rejected": -3.0401182174682617, + "logps/chosen": -235.19534301757812, + "logps/rejected": -221.75363159179688, + "loss": 0.8933, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00027992590912617743, + "rewards/margins": 0.09569151699542999, + "rewards/rejected": -0.09541159123182297, "step": 530 }, { "epoch": 0.56, - "learning_rate": 2.456946039035591e-07, - "logits/chosen": -2.300567150115967, - "logits/rejected": -2.271827220916748, - "logps/chosen": -288.2174377441406, - "logps/rejected": -240.34439086914062, - "loss": 0.6682, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.10411250591278076, - "rewards/margins": 0.05851038545370102, - "rewards/rejected": 0.04560210928320885, + "learning_rate": 4.52353616532721e-07, + "logits/chosen": -3.0647318363189697, + "logits/rejected": -3.054605484008789, + "logps/chosen": -280.6692810058594, + "logps/rejected": -237.3024444580078, + "loss": 0.8826, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03587502986192703, + "rewards/margins": 0.12422885000705719, + "rewards/rejected": -0.08835381269454956, "step": 540 }, { "epoch": 0.57, - "learning_rate": 2.399540757749713e-07, - "logits/chosen": -2.3359756469726562, - "logits/rejected": -2.194058895111084, - "logps/chosen": -265.052001953125, - "logps/rejected": -230.23605346679688, - "loss": 0.6686, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.0775262787938118, - "rewards/margins": 0.05575944110751152, - "rewards/rejected": 0.021766824647784233, + "learning_rate": 4.5044010715652504e-07, + "logits/chosen": -3.0048820972442627, + "logits/rejected": -3.0057966709136963, + "logps/chosen": -256.4432373046875, + "logps/rejected": -229.13198852539062, + "loss": 0.878, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.039617545902729034, + "rewards/margins": 0.11413271725177765, + "rewards/rejected": -0.07451517134904861, "step": 550 }, { "epoch": 0.58, - "learning_rate": 2.3421354764638345e-07, - "logits/chosen": -2.3195242881774902, - "logits/rejected": -2.283975124359131, - "logps/chosen": -302.0104064941406, - "logps/rejected": -252.0124053955078, - "loss": 0.6708, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.10010389983654022, - "rewards/margins": 0.053703296929597855, - "rewards/rejected": 0.04640059918165207, + "learning_rate": 4.485265977803291e-07, + "logits/chosen": -3.0452122688293457, + "logits/rejected": -3.041473865509033, + "logps/chosen": -293.8966369628906, + "logps/rejected": -251.0624542236328, + "loss": 0.8656, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04036722332239151, + "rewards/margins": 0.1490786224603653, + "rewards/rejected": -0.10871138423681259, "step": 560 }, { "epoch": 0.59, - "learning_rate": 2.2847301951779563e-07, - "logits/chosen": -2.2481091022491455, - "logits/rejected": -2.400871515274048, - "logps/chosen": -268.6519775390625, - "logps/rejected": -223.69882202148438, - "loss": 0.6654, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.0826568529009819, - "rewards/margins": 0.05431235954165459, - "rewards/rejected": 0.028344491496682167, + "learning_rate": 4.4661308840413316e-07, + "logits/chosen": -3.0647714138031006, + "logits/rejected": -3.01206636428833, + "logps/chosen": -260.71343994140625, + "logps/rejected": -221.20632934570312, + "loss": 0.8666, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011453949846327305, + "rewards/margins": 0.13622693717479706, + "rewards/rejected": -0.12477298080921173, "step": 570 }, { "epoch": 0.6, - "learning_rate": 2.227324913892078e-07, - "logits/chosen": -2.299408197402954, - "logits/rejected": -2.22338604927063, - "logps/chosen": -299.3912353515625, - "logps/rejected": -236.9815216064453, - "loss": 0.661, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.10458721220493317, - "rewards/margins": 0.08465038239955902, - "rewards/rejected": 0.019936833530664444, + "learning_rate": 4.446995790279372e-07, + "logits/chosen": -3.034562349319458, + "logits/rejected": -2.989062547683716, + "logps/chosen": -291.01446533203125, + "logps/rejected": -241.4232177734375, + "loss": 0.8519, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.035387031733989716, + "rewards/margins": 0.18669307231903076, + "rewards/rejected": -0.15130606293678284, "step": 580 }, { "epoch": 0.61, - "learning_rate": 2.1699196326061998e-07, - "logits/chosen": -2.2584633827209473, - "logits/rejected": -2.2311649322509766, - "logps/chosen": -253.76913452148438, - "logps/rejected": -218.6166534423828, - "loss": 0.6687, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.07234074175357819, - "rewards/margins": 0.04758009687066078, - "rewards/rejected": 0.024760644882917404, + "learning_rate": 4.4278606965174123e-07, + "logits/chosen": -3.012864351272583, + "logits/rejected": -3.005479097366333, + "logps/chosen": -243.08975219726562, + "logps/rejected": -216.92074584960938, + "loss": 0.8492, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002515086205676198, + "rewards/margins": 0.121725894510746, + "rewards/rejected": -0.11921081691980362, "step": 590 }, { "epoch": 0.62, - "learning_rate": 2.1125143513203214e-07, - "logits/chosen": -2.318943738937378, - "logits/rejected": -2.2511682510375977, - "logps/chosen": -256.5652770996094, - "logps/rejected": -206.35586547851562, - "loss": 0.669, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.07542125880718231, - "rewards/margins": 0.0553053617477417, - "rewards/rejected": 0.020115893334150314, + "learning_rate": 4.408725602755453e-07, + "logits/chosen": -3.013345718383789, + "logits/rejected": -3.0360682010650635, + "logps/chosen": -248.50326538085938, + "logps/rejected": -203.6788787841797, + "loss": 0.8617, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.0035158656537532806, + "rewards/margins": 0.14544394612312317, + "rewards/rejected": -0.1419280618429184, "step": 600 }, { "epoch": 0.63, - "learning_rate": 2.055109070034443e-07, - "logits/chosen": -2.3058714866638184, - "logits/rejected": -2.304198741912842, - "logps/chosen": -266.4674987792969, - "logps/rejected": -223.82711791992188, - "loss": 0.6677, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.09824246913194656, - "rewards/margins": 0.06738617271184921, - "rewards/rejected": 0.03085630014538765, + "learning_rate": 4.3895905089934936e-07, + "logits/chosen": -3.047393321990967, + "logits/rejected": -3.0524885654449463, + "logps/chosen": -255.6022491455078, + "logps/rejected": -219.2570343017578, + "loss": 0.8256, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.03486616909503937, + "rewards/margins": 0.1496470421552658, + "rewards/rejected": -0.11478086560964584, "step": 610 }, { "epoch": 0.64, - "learning_rate": 1.997703788748565e-07, - "logits/chosen": -2.337787389755249, - "logits/rejected": -2.2819180488586426, - "logps/chosen": -313.7826232910156, - "logps/rejected": -249.5704803466797, - "loss": 0.6582, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.10966908931732178, - "rewards/margins": 0.08016980439424515, - "rewards/rejected": 0.029499292373657227, + "learning_rate": 4.370455415231534e-07, + "logits/chosen": -3.066741466522217, + "logits/rejected": -3.046435832977295, + "logps/chosen": -305.30712890625, + "logps/rejected": -249.01968383789062, + "loss": 0.8203, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.04162443429231644, + "rewards/margins": 0.18165114521980286, + "rewards/rejected": -0.14002671837806702, "step": 620 }, { "epoch": 0.65, - "learning_rate": 1.9402985074626865e-07, - "logits/chosen": -2.2067112922668457, - "logits/rejected": -2.246953010559082, - "logps/chosen": -259.2144775390625, - "logps/rejected": -240.3810272216797, - "loss": 0.6653, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.09941162168979645, - "rewards/margins": 0.06417630612850189, - "rewards/rejected": 0.035235337913036346, + "learning_rate": 4.351320321469575e-07, + "logits/chosen": -2.990051746368408, + "logits/rejected": -2.992987632751465, + "logps/chosen": -251.0054168701172, + "logps/rejected": -238.8704376220703, + "loss": 0.8282, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.028837282210588455, + "rewards/margins": 0.1591511368751526, + "rewards/rejected": -0.13031385838985443, "step": 630 }, { "epoch": 0.66, - "learning_rate": 1.8828932261768083e-07, - "logits/chosen": -2.2894420623779297, - "logits/rejected": -2.2385382652282715, - "logps/chosen": -266.48992919921875, - "logps/rejected": -217.8952178955078, - "loss": 0.661, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.095299132168293, - "rewards/margins": 0.07987986505031586, - "rewards/rejected": 0.01541926246136427, + "learning_rate": 4.3321852277076154e-07, + "logits/chosen": -3.0534205436706543, + "logits/rejected": -3.0434939861297607, + "logps/chosen": -256.14019775390625, + "logps/rejected": -216.6497802734375, + "loss": 0.8083, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.034832023084163666, + "rewards/margins": 0.20265790820121765, + "rewards/rejected": -0.16782590746879578, "step": 640 }, { "epoch": 0.67, - "learning_rate": 1.82548794489093e-07, - "logits/chosen": -2.33485746383667, - "logits/rejected": -2.3108019828796387, - "logps/chosen": -284.7020568847656, - "logps/rejected": -232.82080078125, - "loss": 0.664, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.10341651737689972, - "rewards/margins": 0.07464977353811264, - "rewards/rejected": 0.028766745701432228, + "learning_rate": 4.313050133945656e-07, + "logits/chosen": -3.0195870399475098, + "logits/rejected": -3.027886390686035, + "logps/chosen": -277.5207214355469, + "logps/rejected": -232.2840576171875, + "loss": 0.8119, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05330047011375427, + "rewards/margins": 0.21206972002983093, + "rewards/rejected": -0.15876924991607666, "step": 650 }, { "epoch": 0.68, - "learning_rate": 1.7680826636050515e-07, - "logits/chosen": -2.3347816467285156, - "logits/rejected": -2.2758853435516357, - "logps/chosen": -279.80059814453125, - "logps/rejected": -233.2425994873047, - "loss": 0.6608, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.11068934202194214, - "rewards/margins": 0.07695071399211884, - "rewards/rejected": 0.0337386280298233, + "learning_rate": 4.2939150401836967e-07, + "logits/chosen": -3.0407612323760986, + "logits/rejected": -3.014266014099121, + "logps/chosen": -271.2508850097656, + "logps/rejected": -233.294189453125, + "loss": 0.792, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.02551809512078762, + "rewards/margins": 0.21874013543128967, + "rewards/rejected": -0.1932220160961151, "step": 660 }, { "epoch": 0.69, - "learning_rate": 1.7106773823191734e-07, - "logits/chosen": -2.2854952812194824, - "logits/rejected": -2.273536205291748, - "logps/chosen": -295.6964416503906, - "logps/rejected": -240.4071502685547, - "loss": 0.6615, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.1013779416680336, - "rewards/margins": 0.060683172196149826, - "rewards/rejected": 0.04069476202130318, + "learning_rate": 4.2747799464217373e-07, + "logits/chosen": -2.994800567626953, + "logits/rejected": -2.9698376655578613, + "logps/chosen": -287.5726623535156, + "logps/rejected": -237.8695068359375, + "loss": 0.8042, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.006596171762794256, + "rewards/margins": 0.18577079474925995, + "rewards/rejected": -0.17917463183403015, "step": 670 }, { "epoch": 0.7, - "learning_rate": 1.653272101033295e-07, - "logits/chosen": -2.34243106842041, - "logits/rejected": -2.2720611095428467, - "logps/chosen": -289.71722412109375, - "logps/rejected": -230.321533203125, - "loss": 0.6729, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.09767869859933853, - "rewards/margins": 0.039280109107494354, - "rewards/rejected": 0.05839858204126358, + "learning_rate": 4.255644852659778e-07, + "logits/chosen": -3.0441243648529053, + "logits/rejected": -3.053039073944092, + "logps/chosen": -281.0970153808594, + "logps/rejected": -227.56851196289062, + "loss": 0.8389, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.022974971681833267, + "rewards/margins": 0.15209509432315826, + "rewards/rejected": -0.1291201412677765, "step": 680 }, { "epoch": 0.71, - "learning_rate": 1.5958668197474169e-07, - "logits/chosen": -2.371598482131958, - "logits/rejected": -2.362656354904175, - "logps/chosen": -268.17828369140625, - "logps/rejected": -229.41232299804688, - "loss": 0.6659, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0969640463590622, - "rewards/margins": 0.06369610875844955, - "rewards/rejected": 0.033267926424741745, + "learning_rate": 4.236509758897818e-07, + "logits/chosen": -3.0892813205718994, + "logits/rejected": -3.0816287994384766, + "logps/chosen": -258.57781982421875, + "logps/rejected": -230.27615356445312, + "loss": 0.8064, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.006377282552421093, + "rewards/margins": 0.21148601174354553, + "rewards/rejected": -0.20510873198509216, "step": 690 }, { "epoch": 0.72, - "learning_rate": 1.5384615384615385e-07, - "logits/chosen": -2.2588796615600586, - "logits/rejected": -2.2576823234558105, - "logps/chosen": -282.4342041015625, - "logps/rejected": -222.56381225585938, - "loss": 0.664, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.10399500280618668, - "rewards/margins": 0.08138440549373627, - "rewards/rejected": 0.0226106159389019, + "learning_rate": 4.2173746651358586e-07, + "logits/chosen": -3.0129265785217285, + "logits/rejected": -2.9877090454101562, + "logps/chosen": -275.82568359375, + "logps/rejected": -219.6047821044922, + "loss": 0.8098, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.04388806223869324, + "rewards/margins": 0.2091568410396576, + "rewards/rejected": -0.16526879370212555, "step": 700 }, { "epoch": 0.73, - "learning_rate": 1.4810562571756603e-07, - "logits/chosen": -2.3341283798217773, - "logits/rejected": -2.2046780586242676, - "logps/chosen": -272.2647399902344, - "logps/rejected": -208.01364135742188, - "loss": 0.666, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.10669133812189102, - "rewards/margins": 0.08235933631658554, - "rewards/rejected": 0.02433200553059578, + "learning_rate": 4.198239571373899e-07, + "logits/chosen": -2.9762985706329346, + "logits/rejected": -2.986323833465576, + "logps/chosen": -262.74371337890625, + "logps/rejected": -207.13418579101562, + "loss": 0.8056, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02364834025502205, + "rewards/margins": 0.2192881554365158, + "rewards/rejected": -0.19563981890678406, "step": 710 }, { "epoch": 0.74, - "learning_rate": 1.423650975889782e-07, - "logits/chosen": -2.323979139328003, - "logits/rejected": -2.340238094329834, - "logps/chosen": -303.2074279785156, - "logps/rejected": -259.44268798828125, - "loss": 0.6667, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.11533965170383453, - "rewards/margins": 0.047552816569805145, - "rewards/rejected": 0.06778682768344879, + "learning_rate": 4.17910447761194e-07, + "logits/chosen": -3.0574042797088623, + "logits/rejected": -3.0168094635009766, + "logps/chosen": -296.2746276855469, + "logps/rejected": -256.06854248046875, + "loss": 0.8097, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.013486223295331001, + "rewards/margins": 0.14934802055358887, + "rewards/rejected": -0.13586178421974182, "step": 720 }, { "epoch": 0.75, - "learning_rate": 1.3662456946039035e-07, - "logits/chosen": -2.3031513690948486, - "logits/rejected": -2.28584623336792, - "logps/chosen": -270.1670837402344, - "logps/rejected": -252.5519256591797, - "loss": 0.6642, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.10461707413196564, - "rewards/margins": 0.058367032557725906, - "rewards/rejected": 0.04625004902482033, + "learning_rate": 4.1599693838499805e-07, + "logits/chosen": -3.0353150367736816, + "logits/rejected": -3.0224339962005615, + "logps/chosen": -261.10992431640625, + "logps/rejected": -251.72293090820312, + "loss": 0.7899, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.007094231434166431, + "rewards/margins": 0.16344769299030304, + "rewards/rejected": -0.1705418974161148, "step": 730 }, { "epoch": 0.76, - "learning_rate": 1.3088404133180254e-07, - "logits/chosen": -2.2157022953033447, - "logits/rejected": -2.2670745849609375, - "logps/chosen": -276.71240234375, - "logps/rejected": -199.2496795654297, - "loss": 0.6635, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.11176248639822006, - "rewards/margins": 0.08353973925113678, - "rewards/rejected": 0.02822275087237358, + "learning_rate": 4.140834290088021e-07, + "logits/chosen": -3.008420944213867, + "logits/rejected": -2.9702980518341064, + "logps/chosen": -269.81903076171875, + "logps/rejected": -199.1494903564453, + "loss": 0.7627, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.01843448355793953, + "rewards/margins": 0.2920437455177307, + "rewards/rejected": -0.2736092209815979, "step": 740 }, { "epoch": 0.77, - "learning_rate": 1.251435132032147e-07, - "logits/chosen": -2.2043914794921875, - "logits/rejected": -2.221619129180908, - "logps/chosen": -269.0702819824219, - "logps/rejected": -220.8921356201172, - "loss": 0.665, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.09922349452972412, - "rewards/margins": 0.04318443313241005, - "rewards/rejected": 0.05603905767202377, + "learning_rate": 4.121699196326062e-07, + "logits/chosen": -3.004692792892456, + "logits/rejected": -2.9878294467926025, + "logps/chosen": -262.6126708984375, + "logps/rejected": -220.02096557617188, + "loss": 0.7653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.001659922068938613, + "rewards/margins": 0.25197452306747437, + "rewards/rejected": -0.2503146231174469, "step": 750 }, { "epoch": 0.78, - "learning_rate": 1.1940298507462686e-07, - "logits/chosen": -2.232959270477295, - "logits/rejected": -2.2529525756835938, - "logps/chosen": -267.9338684082031, - "logps/rejected": -249.4876251220703, - "loss": 0.6684, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.08004304021596909, - "rewards/margins": 0.04949140548706055, - "rewards/rejected": 0.030551627278327942, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": -2.9879119396209717, + "logits/rejected": -2.980886459350586, + "logps/chosen": -262.13287353515625, + "logps/rejected": -249.5322723388672, + "loss": 0.782, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.036843255162239075, + "rewards/margins": 0.17954595386981964, + "rewards/rejected": -0.21638920903205872, "step": 760 }, { "epoch": 0.8, - "learning_rate": 1.1366245694603903e-07, - "logits/chosen": -2.293257236480713, - "logits/rejected": -2.2078585624694824, - "logps/chosen": -273.19671630859375, - "logps/rejected": -238.57858276367188, - "loss": 0.661, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.11353409290313721, - "rewards/margins": 0.06645722687244415, - "rewards/rejected": 0.04707685858011246, + "learning_rate": 4.083429008802143e-07, + "logits/chosen": -3.0227646827697754, + "logits/rejected": -3.0109972953796387, + "logps/chosen": -266.5917663574219, + "logps/rejected": -237.83108520507812, + "loss": 0.7486, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03099486604332924, + "rewards/margins": 0.23689258098602295, + "rewards/rejected": -0.20589768886566162, "step": 770 }, { "epoch": 0.81, - "learning_rate": 1.079219288174512e-07, - "logits/chosen": -2.3507869243621826, - "logits/rejected": -2.325718879699707, - "logps/chosen": -290.9693298339844, - "logps/rejected": -236.1486358642578, - "loss": 0.6633, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.0980958342552185, - "rewards/margins": 0.07181811332702637, - "rewards/rejected": 0.026277724653482437, + "learning_rate": 4.0642939150401836e-07, + "logits/chosen": -2.9976096153259277, + "logits/rejected": -2.9851810932159424, + "logps/chosen": -283.94677734375, + "logps/rejected": -237.71841430664062, + "loss": 0.7486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020093750208616257, + "rewards/margins": 0.27248382568359375, + "rewards/rejected": -0.2925775945186615, "step": 780 }, { "epoch": 0.82, - "learning_rate": 1.0218140068886336e-07, - "logits/chosen": -2.268038272857666, - "logits/rejected": -2.286581516265869, - "logps/chosen": -270.3387451171875, - "logps/rejected": -221.06356811523438, - "loss": 0.6564, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.12088136374950409, - "rewards/margins": 0.080001600086689, - "rewards/rejected": 0.040879763662815094, + "learning_rate": 4.0451588212782237e-07, + "logits/chosen": -3.031033754348755, + "logits/rejected": -3.024602174758911, + "logps/chosen": -261.4927978515625, + "logps/rejected": -222.33651733398438, + "loss": 0.7052, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.016252126544713974, + "rewards/margins": 0.25844720005989075, + "rewards/rejected": -0.24219508469104767, "step": 790 }, { "epoch": 0.83, - "learning_rate": 9.644087256027554e-08, - "logits/chosen": -2.272735118865967, - "logits/rejected": -2.2941083908081055, - "logps/chosen": -284.6488952636719, - "logps/rejected": -243.56796264648438, - "loss": 0.6639, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.1113913282752037, - "rewards/margins": 0.05327050760388374, - "rewards/rejected": 0.05812082439661026, + "learning_rate": 4.0260237275162643e-07, + "logits/chosen": -3.017972946166992, + "logits/rejected": -2.993112564086914, + "logps/chosen": -277.76177978515625, + "logps/rejected": -241.6676483154297, + "loss": 0.765, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.010549797676503658, + "rewards/margins": 0.2188224494457245, + "rewards/rejected": -0.20827265083789825, "step": 800 }, { "epoch": 0.84, - "learning_rate": 9.070034443168771e-08, - "logits/chosen": -2.2838375568389893, - "logits/rejected": -2.289247751235962, - "logps/chosen": -269.5845642089844, - "logps/rejected": -230.6207275390625, - "loss": 0.6617, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.09149408340454102, - "rewards/margins": 0.06341233849525452, - "rewards/rejected": 0.02808173932135105, + "learning_rate": 4.006888633754305e-07, + "logits/chosen": -3.0501556396484375, + "logits/rejected": -3.039248466491699, + "logps/chosen": -261.3684387207031, + "logps/rejected": -231.1971893310547, + "loss": 0.6973, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.002077583223581314, + "rewards/margins": 0.3389395475387573, + "rewards/rejected": -0.3368619680404663, "step": 810 }, { "epoch": 0.85, - "learning_rate": 8.495981630309988e-08, - "logits/chosen": -2.365980863571167, - "logits/rejected": -2.3436598777770996, - "logps/chosen": -302.0718688964844, - "logps/rejected": -228.1407470703125, - "loss": 0.6623, + "learning_rate": 3.9877535399923456e-07, + "logits/chosen": -3.063708782196045, + "logits/rejected": -3.0399699211120605, + "logps/chosen": -294.5484924316406, + "logps/rejected": -228.28854370117188, + "loss": 0.7335, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.13062262535095215, - "rewards/margins": 0.08858474344015121, - "rewards/rejected": 0.04203786700963974, + "rewards/chosen": 0.037904877215623856, + "rewards/margins": 0.2777422368526459, + "rewards/rejected": -0.2398373782634735, "step": 820 }, { "epoch": 0.86, - "learning_rate": 7.921928817451206e-08, - "logits/chosen": -2.342413902282715, - "logits/rejected": -2.2254080772399902, - "logps/chosen": -287.4922180175781, - "logps/rejected": -222.5606231689453, - "loss": 0.6565, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.12904855608940125, - "rewards/margins": 0.08615640550851822, - "rewards/rejected": 0.04289213940501213, + "learning_rate": 3.968618446230386e-07, + "logits/chosen": -3.044889211654663, + "logits/rejected": -3.0049989223480225, + "logps/chosen": -280.3213195800781, + "logps/rejected": -223.97787475585938, + "loss": 0.6679, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.06323406845331192, + "rewards/margins": 0.37538376450538635, + "rewards/rejected": -0.31214970350265503, "step": 830 }, { "epoch": 0.87, - "learning_rate": 7.347876004592423e-08, - "logits/chosen": -2.259397029876709, - "logits/rejected": -2.227036476135254, - "logps/chosen": -258.3423767089844, - "logps/rejected": -216.99606323242188, - "loss": 0.6714, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.10358164459466934, - "rewards/margins": 0.06773830950260162, - "rewards/rejected": 0.03584333881735802, + "learning_rate": 3.949483352468427e-07, + "logits/chosen": -3.0053951740264893, + "logits/rejected": -2.98051118850708, + "logps/chosen": -252.00387573242188, + "logps/rejected": -218.14602661132812, + "loss": 0.7566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.040188662707805634, + "rewards/margins": 0.23952028155326843, + "rewards/rejected": -0.2797089219093323, "step": 840 }, { "epoch": 0.88, - "learning_rate": 6.773823191733639e-08, - "logits/chosen": -2.2834537029266357, - "logits/rejected": -2.3872971534729004, - "logps/chosen": -262.05084228515625, - "logps/rejected": -231.11306762695312, - "loss": 0.6647, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.09495140612125397, - "rewards/margins": 0.055265575647354126, - "rewards/rejected": 0.03968583419919014, + "learning_rate": 3.9303482587064674e-07, + "logits/chosen": -3.0364222526550293, + "logits/rejected": -3.0096614360809326, + "logps/chosen": -253.96923828125, + "logps/rejected": -229.8144989013672, + "loss": 0.7089, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007532055489718914, + "rewards/margins": 0.29781442880630493, + "rewards/rejected": -0.29028236865997314, "step": 850 }, { "epoch": 0.89, - "learning_rate": 6.199770378874856e-08, - "logits/chosen": -2.4065003395080566, - "logits/rejected": -2.3337345123291016, - "logps/chosen": -295.71478271484375, - "logps/rejected": -270.1822814941406, - "loss": 0.6693, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.11348612606525421, - "rewards/margins": 0.07466179132461548, - "rewards/rejected": 0.03882431983947754, + "learning_rate": 3.911213164944508e-07, + "logits/chosen": -3.0369744300842285, + "logits/rejected": -3.0353286266326904, + "logps/chosen": -290.00042724609375, + "logps/rejected": -273.43267822265625, + "loss": 0.7019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018222318962216377, + "rewards/margins": 0.31897154450416565, + "rewards/rejected": -0.3007492423057556, "step": 860 }, { "epoch": 0.9, - "learning_rate": 5.6257175660160735e-08, - "logits/chosen": -2.2463555335998535, - "logits/rejected": -2.2443947792053223, - "logps/chosen": -312.9588317871094, - "logps/rejected": -237.4109344482422, - "loss": 0.6644, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.10128283500671387, - "rewards/margins": 0.053178369998931885, - "rewards/rejected": 0.04810447618365288, + "learning_rate": 3.8920780711825487e-07, + "logits/chosen": -2.9672696590423584, + "logits/rejected": -2.9645469188690186, + "logps/chosen": -307.1138610839844, + "logps/rejected": -237.83786010742188, + "loss": 0.6937, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.005494369193911552, + "rewards/margins": 0.31811192631721497, + "rewards/rejected": -0.31261754035949707, "step": 870 }, { "epoch": 0.91, - "learning_rate": 5.05166475315729e-08, - "logits/chosen": -2.358501434326172, - "logits/rejected": -2.313483715057373, - "logps/chosen": -291.43377685546875, - "logps/rejected": -240.09054565429688, - "loss": 0.6632, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.10742716491222382, - "rewards/margins": 0.07204015552997589, - "rewards/rejected": 0.03538701683282852, + "learning_rate": 3.8729429774205893e-07, + "logits/chosen": -3.0339221954345703, + "logits/rejected": -3.0139739513397217, + "logps/chosen": -284.10772705078125, + "logps/rejected": -243.3460235595703, + "loss": 0.6766, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.01381886936724186, + "rewards/margins": 0.38050729036331177, + "rewards/rejected": -0.39432623982429504, "step": 880 }, { "epoch": 0.92, - "learning_rate": 4.477611940298507e-08, - "logits/chosen": -2.313149929046631, - "logits/rejected": -2.3558261394500732, - "logps/chosen": -285.90643310546875, - "logps/rejected": -235.43051147460938, - "loss": 0.6666, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.12259715795516968, - "rewards/margins": 0.09698096662759781, - "rewards/rejected": 0.02561618760228157, + "learning_rate": 3.8538078836586294e-07, + "logits/chosen": -3.0409202575683594, + "logits/rejected": -3.017521381378174, + "logps/chosen": -274.10675048828125, + "logps/rejected": -235.25332641601562, + "loss": 0.7214, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.045437611639499664, + "rewards/margins": 0.3723521828651428, + "rewards/rejected": -0.32691454887390137, "step": 890 }, { "epoch": 0.93, - "learning_rate": 3.903559127439724e-08, - "logits/chosen": -2.3278651237487793, - "logits/rejected": -2.195068836212158, - "logps/chosen": -272.7381896972656, - "logps/rejected": -211.40640258789062, - "loss": 0.658, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.1207551583647728, - "rewards/margins": 0.09316142648458481, - "rewards/rejected": 0.027593741193413734, + "learning_rate": 3.83467278989667e-07, + "logits/chosen": -3.0040934085845947, + "logits/rejected": -2.9863858222961426, + "logps/chosen": -264.234130859375, + "logps/rejected": -212.1512908935547, + "loss": 0.6845, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.005959497299045324, + "rewards/margins": 0.3572615683078766, + "rewards/rejected": -0.3513020873069763, "step": 900 }, { "epoch": 0.94, - "learning_rate": 3.3295063145809414e-08, - "logits/chosen": -2.290696859359741, - "logits/rejected": -2.3440823554992676, - "logps/chosen": -238.2651824951172, - "logps/rejected": -206.77969360351562, - "loss": 0.6616, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.09928463399410248, - "rewards/margins": 0.07226204872131348, - "rewards/rejected": 0.027022594586014748, + "learning_rate": 3.8155376961347106e-07, + "logits/chosen": -3.0645086765289307, + "logits/rejected": -3.007006883621216, + "logps/chosen": -230.1156005859375, + "logps/rejected": -207.70779418945312, + "loss": 0.6515, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.018168695271015167, + "rewards/margins": 0.38268524408340454, + "rewards/rejected": -0.4008539617061615, "step": 910 }, { "epoch": 0.95, - "learning_rate": 2.755453501722158e-08, - "logits/chosen": -2.375807762145996, - "logits/rejected": -2.367743730545044, - "logps/chosen": -281.56195068359375, - "logps/rejected": -225.125244140625, - "loss": 0.662, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.1072310209274292, - "rewards/margins": 0.056608647108078, - "rewards/rejected": 0.050622373819351196, + "learning_rate": 3.796402602372751e-07, + "logits/chosen": -3.0344386100769043, + "logits/rejected": -3.020028829574585, + "logps/chosen": -273.54412841796875, + "logps/rejected": -222.2449493408203, + "loss": 0.6707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.026546839624643326, + "rewards/margins": 0.2530173659324646, + "rewards/rejected": -0.27956423163414, "step": 920 }, { "epoch": 0.96, - "learning_rate": 2.1814006888633754e-08, - "logits/chosen": -2.281919002532959, - "logits/rejected": -2.254122734069824, - "logps/chosen": -256.39105224609375, - "logps/rejected": -203.3081817626953, - "loss": 0.6617, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.11211923509836197, - "rewards/margins": 0.07925260812044144, - "rewards/rejected": 0.03286661207675934, + "learning_rate": 3.777267508610792e-07, + "logits/chosen": -2.9949254989624023, + "logits/rejected": -3.0048325061798096, + "logps/chosen": -247.9457244873047, + "logps/rejected": -203.7804412841797, + "loss": 0.6833, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05914358049631119, + "rewards/margins": 0.3101976811885834, + "rewards/rejected": -0.3693412244319916, "step": 930 }, { "epoch": 0.97, - "learning_rate": 1.6073478760045924e-08, - "logits/chosen": -2.316282272338867, - "logits/rejected": -2.3123340606689453, - "logps/chosen": -271.6207580566406, - "logps/rejected": -231.7317352294922, - "loss": 0.6626, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.10637687146663666, - "rewards/margins": 0.06768520176410675, - "rewards/rejected": 0.0386916846036911, + "learning_rate": 3.7581324148488325e-07, + "logits/chosen": -3.0347888469696045, + "logits/rejected": -2.994046688079834, + "logps/chosen": -263.569091796875, + "logps/rejected": -231.2215118408203, + "loss": 0.6597, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.021738069131970406, + "rewards/margins": 0.3827739953994751, + "rewards/rejected": -0.36103588342666626, "step": 940 }, { "epoch": 0.98, - "learning_rate": 1.0332950631458094e-08, - "logits/chosen": -2.3146958351135254, - "logits/rejected": -2.2793381214141846, - "logps/chosen": -282.83270263671875, - "logps/rejected": -233.0804443359375, - "loss": 0.6612, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.11455857753753662, - "rewards/margins": 0.0838586837053299, - "rewards/rejected": 0.030699897557497025, + "learning_rate": 3.738997321086873e-07, + "logits/chosen": -3.0062592029571533, + "logits/rejected": -2.993868589401245, + "logps/chosen": -276.2633056640625, + "logps/rejected": -232.21493530273438, + "loss": 0.6677, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03908178210258484, + "rewards/margins": 0.2943773865699768, + "rewards/rejected": -0.33345913887023926, "step": 950 }, { "epoch": 0.99, - "learning_rate": 4.592422502870264e-09, - "logits/chosen": -2.251638889312744, - "logits/rejected": -2.234907627105713, - "logps/chosen": -281.0075378417969, - "logps/rejected": -239.98049926757812, - "loss": 0.661, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.1062885969877243, - "rewards/margins": 0.06708581745624542, - "rewards/rejected": 0.03920278698205948, + "learning_rate": 3.7198622273249137e-07, + "logits/chosen": -2.9820735454559326, + "logits/rejected": -3.0040230751037598, + "logps/chosen": -275.2674865722656, + "logps/rejected": -240.7872314453125, + "loss": 0.6137, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.020224738866090775, + "rewards/margins": 0.3282891809940338, + "rewards/rejected": -0.30806440114974976, "step": 960 }, { "epoch": 1.0, - "eval_logits/chosen": -2.4597132205963135, - "eval_logits/rejected": -2.398695468902588, - "eval_logps/chosen": -278.69171142578125, - "eval_logps/rejected": -230.4560089111328, - "eval_loss": 0.6642152070999146, - "eval_rewards/accuracies": 0.6480000019073486, - "eval_rewards/chosen": 0.10415761172771454, - "eval_rewards/margins": 0.06405296921730042, - "eval_rewards/rejected": 0.04010463133454323, - "eval_runtime": 443.9432, - "eval_samples_per_second": 4.505, - "eval_steps_per_second": 0.282, + "eval_logits/chosen": -3.0153579711914062, + "eval_logits/rejected": -2.9988856315612793, + "eval_logps/chosen": -271.4433288574219, + "eval_logps/rejected": -232.6822967529297, + "eval_loss": 0.6276752948760986, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -0.028680188581347466, + "eval_rewards/margins": 0.3904646337032318, + "eval_rewards/rejected": -0.4191448390483856, + "eval_runtime": 449.0184, + "eval_samples_per_second": 4.454, + "eval_steps_per_second": 0.278, "step": 968 }, { "epoch": 1.0, - "step": 968, + "learning_rate": 3.7007271335629544e-07, + "logits/chosen": -3.0252368450164795, + "logits/rejected": -3.0065531730651855, + "logps/chosen": -275.3318176269531, + "logps/rejected": -244.02072143554688, + "loss": 0.6284, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.04684365913271904, + "rewards/margins": 0.3803178668022156, + "rewards/rejected": -0.42716145515441895, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.681592039800995e-07, + "logits/chosen": -3.0262959003448486, + "logits/rejected": -3.0287508964538574, + "logps/chosen": -271.2191162109375, + "logps/rejected": -233.7063446044922, + "loss": 0.6202, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.028351956978440285, + "rewards/margins": 0.39485400915145874, + "rewards/rejected": -0.4232059419155121, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.662456946039035e-07, + "logits/chosen": -3.0115294456481934, + "logits/rejected": -3.0146679878234863, + "logps/chosen": -251.9388885498047, + "logps/rejected": -229.016357421875, + "loss": 0.6954, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.07080521434545517, + "rewards/margins": 0.35120025277137756, + "rewards/rejected": -0.42200547456741333, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6433218522770757e-07, + "logits/chosen": -3.007986068725586, + "logits/rejected": -2.9926083087921143, + "logps/chosen": -262.0426940917969, + "logps/rejected": -256.99188232421875, + "loss": 0.6533, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.070511594414711, + "rewards/margins": 0.3111626207828522, + "rewards/rejected": -0.381674200296402, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6241867585151163e-07, + "logits/chosen": -2.9853007793426514, + "logits/rejected": -3.0082547664642334, + "logps/chosen": -270.3041076660156, + "logps/rejected": -236.4681854248047, + "loss": 0.6359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.022838518023490906, + "rewards/margins": 0.3421871066093445, + "rewards/rejected": -0.3650256097316742, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.605051664753157e-07, + "logits/chosen": -2.9447665214538574, + "logits/rejected": -2.933103084564209, + "logps/chosen": -266.8084411621094, + "logps/rejected": -215.50308227539062, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04661910608410835, + "rewards/margins": 0.4469257891178131, + "rewards/rejected": -0.49354487657546997, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5859165709911975e-07, + "logits/chosen": -3.0248591899871826, + "logits/rejected": -2.98162579536438, + "logps/chosen": -277.4466552734375, + "logps/rejected": -250.49893188476562, + "loss": 0.5931, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.004877015482634306, + "rewards/margins": 0.4228130280971527, + "rewards/rejected": -0.4276900887489319, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.566781477229238e-07, + "logits/chosen": -3.0339090824127197, + "logits/rejected": -2.9957573413848877, + "logps/chosen": -275.76910400390625, + "logps/rejected": -235.79830932617188, + "loss": 0.6007, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.08596549183130264, + "rewards/margins": 0.29382139444351196, + "rewards/rejected": -0.3797869086265564, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.547646383467279e-07, + "logits/chosen": -2.9797756671905518, + "logits/rejected": -2.9523847103118896, + "logps/chosen": -284.53839111328125, + "logps/rejected": -214.0476531982422, + "loss": 0.5431, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07950444519519806, + "rewards/margins": 0.4745730459690094, + "rewards/rejected": -0.5540775060653687, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5285112897053194e-07, + "logits/chosen": -2.946882963180542, + "logits/rejected": -2.9203898906707764, + "logps/chosen": -257.57611083984375, + "logps/rejected": -244.93075561523438, + "loss": 0.5401, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.020254051312804222, + "rewards/margins": 0.4997115135192871, + "rewards/rejected": -0.5199655294418335, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.50937619594336e-07, + "logits/chosen": -2.991321563720703, + "logits/rejected": -3.0015194416046143, + "logps/chosen": -279.9203186035156, + "logps/rejected": -243.14889526367188, + "loss": 0.5657, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.08546855300664902, + "rewards/margins": 0.4243212342262268, + "rewards/rejected": -0.5097898244857788, + "step": 1070 + }, + { + "epoch": 1.12, + "learning_rate": 3.4902411021814007e-07, + "logits/chosen": -3.034787893295288, + "logits/rejected": -2.9858031272888184, + "logps/chosen": -259.6640319824219, + "logps/rejected": -238.2606201171875, + "loss": 0.5362, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11487703025341034, + "rewards/margins": 0.41897639632225037, + "rewards/rejected": -0.5338534116744995, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 3.4711060084194413e-07, + "logits/chosen": -2.951054096221924, + "logits/rejected": -2.953270673751831, + "logps/chosen": -288.8511962890625, + "logps/rejected": -241.43490600585938, + "loss": 0.5902, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.041309602558612823, + "rewards/margins": 0.47613269090652466, + "rewards/rejected": -0.5174422860145569, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4519709146574814e-07, + "logits/chosen": -3.0297837257385254, + "logits/rejected": -2.9690465927124023, + "logps/chosen": -249.8524932861328, + "logps/rejected": -211.01486206054688, + "loss": 0.579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15785838663578033, + "rewards/margins": 0.3855217397212982, + "rewards/rejected": -0.5433801412582397, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.432835820895522e-07, + "logits/chosen": -3.023965358734131, + "logits/rejected": -3.012373685836792, + "logps/chosen": -296.8121643066406, + "logps/rejected": -269.64410400390625, + "loss": 0.576, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.013493712060153484, + "rewards/margins": 0.45747965574264526, + "rewards/rejected": -0.47097334265708923, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.4137007271335626e-07, + "logits/chosen": -2.9927139282226562, + "logits/rejected": -2.995068073272705, + "logps/chosen": -238.1875, + "logps/rejected": -238.9112548828125, + "loss": 0.5405, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.07948704063892365, + "rewards/margins": 0.4464842677116394, + "rewards/rejected": -0.5259712934494019, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.394565633371603e-07, + "logits/chosen": -3.00760555267334, + "logits/rejected": -2.992061138153076, + "logps/chosen": -304.2665100097656, + "logps/rejected": -251.5317840576172, + "loss": 0.5381, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1150180920958519, + "rewards/margins": 0.46640753746032715, + "rewards/rejected": -0.581425666809082, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.375430539609644e-07, + "logits/chosen": -3.0006496906280518, + "logits/rejected": -3.0168707370758057, + "logps/chosen": -260.1852111816406, + "logps/rejected": -252.5244598388672, + "loss": 0.4865, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.16791771352291107, + "rewards/margins": 0.4443301260471344, + "rewards/rejected": -0.6122478246688843, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3562954458476845e-07, + "logits/chosen": -3.023045063018799, + "logits/rejected": -2.9913861751556396, + "logps/chosen": -295.21502685546875, + "logps/rejected": -226.15426635742188, + "loss": 0.5825, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.09654710441827774, + "rewards/margins": 0.504024088382721, + "rewards/rejected": -0.6005711555480957, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.337160352085725e-07, + "logits/chosen": -2.9731438159942627, + "logits/rejected": -2.9854893684387207, + "logps/chosen": -260.6565856933594, + "logps/rejected": -235.22476196289062, + "loss": 0.5259, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0705583468079567, + "rewards/margins": 0.5393149852752686, + "rewards/rejected": -0.6098732948303223, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.3180252583237657e-07, + "logits/chosen": -3.015777111053467, + "logits/rejected": -2.9891409873962402, + "logps/chosen": -270.416748046875, + "logps/rejected": -232.32089233398438, + "loss": 0.5407, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.13838811218738556, + "rewards/margins": 0.4489242136478424, + "rewards/rejected": -0.5873123407363892, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.2988901645618063e-07, + "logits/chosen": -2.980250597000122, + "logits/rejected": -2.9568262100219727, + "logps/chosen": -257.48504638671875, + "logps/rejected": -231.20101928710938, + "loss": 0.5238, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.26241081953048706, + "rewards/margins": 0.4477602541446686, + "rewards/rejected": -0.7101710438728333, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.279755070799847e-07, + "logits/chosen": -2.987285614013672, + "logits/rejected": -2.9756875038146973, + "logps/chosen": -250.1853790283203, + "logps/rejected": -234.43313598632812, + "loss": 0.4216, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1883806735277176, + "rewards/margins": 0.47481974959373474, + "rewards/rejected": -0.6632004380226135, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.260619977037887e-07, + "logits/chosen": -2.9940743446350098, + "logits/rejected": -2.9866111278533936, + "logps/chosen": -261.4402770996094, + "logps/rejected": -236.15670776367188, + "loss": 0.4046, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.11559490859508514, + "rewards/margins": 0.5072110891342163, + "rewards/rejected": -0.6228059530258179, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2414848832759277e-07, + "logits/chosen": -3.0217831134796143, + "logits/rejected": -2.9786057472229004, + "logps/chosen": -263.88592529296875, + "logps/rejected": -227.23828125, + "loss": 0.4809, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1470792442560196, + "rewards/margins": 0.5500885844230652, + "rewards/rejected": -0.6971677541732788, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2223497895139683e-07, + "logits/chosen": -2.983037233352661, + "logits/rejected": -2.9867138862609863, + "logps/chosen": -270.10089111328125, + "logps/rejected": -262.2214660644531, + "loss": 0.5297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19438357651233673, + "rewards/margins": 0.3766476809978485, + "rewards/rejected": -0.5710312128067017, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.203214695752009e-07, + "logits/chosen": -2.923466920852661, + "logits/rejected": -2.9358396530151367, + "logps/chosen": -291.2686767578125, + "logps/rejected": -251.558837890625, + "loss": 0.5026, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.16642309725284576, + "rewards/margins": 0.3848158121109009, + "rewards/rejected": -0.5512388944625854, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.1840796019900495e-07, + "logits/chosen": -3.007678508758545, + "logits/rejected": -2.988664388656616, + "logps/chosen": -291.9914245605469, + "logps/rejected": -263.6539611816406, + "loss": 0.4669, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1000869870185852, + "rewards/margins": 0.597449004650116, + "rewards/rejected": -0.6975361108779907, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.16494450822809e-07, + "logits/chosen": -3.0168919563293457, + "logits/rejected": -2.987907886505127, + "logps/chosen": -249.50704956054688, + "logps/rejected": -224.2857208251953, + "loss": 0.4447, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.16220508515834808, + "rewards/margins": 0.5248938798904419, + "rewards/rejected": -0.6870989203453064, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.145809414466131e-07, + "logits/chosen": -3.0371620655059814, + "logits/rejected": -3.0139615535736084, + "logps/chosen": -273.0102233886719, + "logps/rejected": -234.0775909423828, + "loss": 0.439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13673745095729828, + "rewards/margins": 0.6153367161750793, + "rewards/rejected": -0.752074122428894, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266743207041714e-07, + "logits/chosen": -3.0206820964813232, + "logits/rejected": -3.0111212730407715, + "logps/chosen": -309.1382751464844, + "logps/rejected": -276.2449645996094, + "loss": 0.4547, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1778876632452011, + "rewards/margins": 0.6206526756286621, + "rewards/rejected": -0.7985404133796692, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.107539226942212e-07, + "logits/chosen": -3.007878303527832, + "logits/rejected": -3.0043411254882812, + "logps/chosen": -238.5860137939453, + "logps/rejected": -227.81234741210938, + "loss": 0.5382, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1700361967086792, + "rewards/margins": 0.43853870034217834, + "rewards/rejected": -0.6085748672485352, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0884041331802526e-07, + "logits/chosen": -2.970362901687622, + "logits/rejected": -2.969972848892212, + "logps/chosen": -257.76812744140625, + "logps/rejected": -230.56820678710938, + "loss": 0.4622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14981916546821594, + "rewards/margins": 0.5685046315193176, + "rewards/rejected": -0.718323826789856, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.0692690394182927e-07, + "logits/chosen": -3.0062780380249023, + "logits/rejected": -2.9634342193603516, + "logps/chosen": -278.4779968261719, + "logps/rejected": -237.66873168945312, + "loss": 0.3758, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.20812156796455383, + "rewards/margins": 0.5646450519561768, + "rewards/rejected": -0.7727665305137634, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0501339456563334e-07, + "logits/chosen": -2.974381923675537, + "logits/rejected": -2.9355669021606445, + "logps/chosen": -286.21453857421875, + "logps/rejected": -236.1365203857422, + "loss": 0.378, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.08342897891998291, + "rewards/margins": 0.6408315896987915, + "rewards/rejected": -0.7242605090141296, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.030998851894374e-07, + "logits/chosen": -3.02203369140625, + "logits/rejected": -3.0106561183929443, + "logps/chosen": -244.7522430419922, + "logps/rejected": -214.9333038330078, + "loss": 0.3807, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.24016663432121277, + "rewards/margins": 0.5892859697341919, + "rewards/rejected": -0.829452633857727, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.0118637581324146e-07, + "logits/chosen": -2.9896299839019775, + "logits/rejected": -2.9215025901794434, + "logps/chosen": -263.9308166503906, + "logps/rejected": -236.4021453857422, + "loss": 0.4104, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.14813081920146942, + "rewards/margins": 0.6776358485221863, + "rewards/rejected": -0.8257666826248169, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.992728664370455e-07, + "logits/chosen": -3.0315442085266113, + "logits/rejected": -3.035961866378784, + "logps/chosen": -261.2752990722656, + "logps/rejected": -235.47705078125, + "loss": 0.3998, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2596365213394165, + "rewards/margins": 0.658948540687561, + "rewards/rejected": -0.9185851812362671, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.973593570608496e-07, + "logits/chosen": -3.0204250812530518, + "logits/rejected": -3.015413761138916, + "logps/chosen": -246.57290649414062, + "logps/rejected": -231.91238403320312, + "loss": 0.4193, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3196146488189697, + "rewards/margins": 0.4760715365409851, + "rewards/rejected": -0.7956861257553101, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9544584768465365e-07, + "logits/chosen": -3.016695022583008, + "logits/rejected": -2.9959843158721924, + "logps/chosen": -275.1457824707031, + "logps/rejected": -220.0662078857422, + "loss": 0.4373, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.33876991271972656, + "rewards/margins": 0.457784503698349, + "rewards/rejected": -0.796554446220398, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.935323383084577e-07, + "logits/chosen": -3.059567451477051, + "logits/rejected": -2.9867048263549805, + "logps/chosen": -287.88153076171875, + "logps/rejected": -235.1467742919922, + "loss": 0.3824, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.264303594827652, + "rewards/margins": 0.6922810673713684, + "rewards/rejected": -0.956584632396698, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.9161882893226177e-07, + "logits/chosen": -2.975642442703247, + "logits/rejected": -2.9326682090759277, + "logps/chosen": -251.0466766357422, + "logps/rejected": -223.5084686279297, + "loss": 0.4333, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1487085521221161, + "rewards/margins": 0.689045250415802, + "rewards/rejected": -0.8377537727355957, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 2.8970531955606583e-07, + "logits/chosen": -3.0104973316192627, + "logits/rejected": -2.9991507530212402, + "logps/chosen": -296.5973205566406, + "logps/rejected": -226.55111694335938, + "loss": 0.3786, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.20407001674175262, + "rewards/margins": 0.6605352163314819, + "rewards/rejected": -0.8646053075790405, + "step": 1390 + }, + { + "epoch": 1.45, + "learning_rate": 2.8779181017986984e-07, + "logits/chosen": -2.9817981719970703, + "logits/rejected": -2.988542079925537, + "logps/chosen": -285.8086853027344, + "logps/rejected": -253.98934936523438, + "loss": 0.4188, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25984063744544983, + "rewards/margins": 0.5409084558486938, + "rewards/rejected": -0.8007491230964661, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.858783008036739e-07, + "logits/chosen": -2.99088978767395, + "logits/rejected": -2.9880213737487793, + "logps/chosen": -269.12982177734375, + "logps/rejected": -245.9726104736328, + "loss": 0.3797, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.3341473937034607, + "rewards/margins": 0.6310809254646301, + "rewards/rejected": -0.9652281999588013, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.8396479142747797e-07, + "logits/chosen": -3.031111717224121, + "logits/rejected": -3.011475086212158, + "logps/chosen": -265.3778381347656, + "logps/rejected": -236.4595947265625, + "loss": 0.3353, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23791718482971191, + "rewards/margins": 0.7006685733795166, + "rewards/rejected": -0.938585638999939, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8205128205128203e-07, + "logits/chosen": -2.972764492034912, + "logits/rejected": -2.9663169384002686, + "logps/chosen": -261.602294921875, + "logps/rejected": -261.4017639160156, + "loss": 0.3131, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2686450779438019, + "rewards/margins": 0.671106219291687, + "rewards/rejected": -0.9397512674331665, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.801377726750861e-07, + "logits/chosen": -3.018611192703247, + "logits/rejected": -2.9920201301574707, + "logps/chosen": -253.02285766601562, + "logps/rejected": -222.1746826171875, + "loss": 0.3765, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.35335665941238403, + "rewards/margins": 0.5318818092346191, + "rewards/rejected": -0.8852384686470032, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.7822426329889015e-07, + "logits/chosen": -2.9993600845336914, + "logits/rejected": -2.953437328338623, + "logps/chosen": -272.62347412109375, + "logps/rejected": -223.8088836669922, + "loss": 0.3979, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3333335518836975, + "rewards/margins": 0.6397331953048706, + "rewards/rejected": -0.9730666875839233, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.763107539226942e-07, + "logits/chosen": -3.0080342292785645, + "logits/rejected": -2.9877243041992188, + "logps/chosen": -285.1849670410156, + "logps/rejected": -218.37954711914062, + "loss": 0.3865, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.35278716683387756, + "rewards/margins": 0.6462265253067017, + "rewards/rejected": -0.9990137815475464, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.743972445464983e-07, + "logits/chosen": -2.989492177963257, + "logits/rejected": -2.9886727333068848, + "logps/chosen": -276.1953430175781, + "logps/rejected": -228.0172882080078, + "loss": 0.4094, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3594445586204529, + "rewards/margins": 0.6459834575653076, + "rewards/rejected": -1.0054280757904053, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.7248373517030234e-07, + "logits/chosen": -2.9692909717559814, + "logits/rejected": -2.978031635284424, + "logps/chosen": -260.3178405761719, + "logps/rejected": -236.1215362548828, + "loss": 0.3159, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.33140963315963745, + "rewards/margins": 0.5777403116226196, + "rewards/rejected": -0.9091499447822571, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.705702257941064e-07, + "logits/chosen": -2.9462051391601562, + "logits/rejected": -2.9286155700683594, + "logps/chosen": -289.8208923339844, + "logps/rejected": -247.3198699951172, + "loss": 0.3091, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3897753953933716, + "rewards/margins": 0.5884745717048645, + "rewards/rejected": -0.978249728679657, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.686567164179104e-07, + "logits/chosen": -3.018886089324951, + "logits/rejected": -2.9545936584472656, + "logps/chosen": -265.15240478515625, + "logps/rejected": -227.3736114501953, + "loss": 0.3156, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.2435026615858078, + "rewards/margins": 0.8812161684036255, + "rewards/rejected": -1.1247189044952393, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6674320704171447e-07, + "logits/chosen": -3.019289016723633, + "logits/rejected": -2.9821600914001465, + "logps/chosen": -288.81646728515625, + "logps/rejected": -226.24755859375, + "loss": 0.262, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3103589713573456, + "rewards/margins": 0.6744868159294128, + "rewards/rejected": -0.9848458170890808, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6482969766551853e-07, + "logits/chosen": -2.918996572494507, + "logits/rejected": -2.9260575771331787, + "logps/chosen": -229.50146484375, + "logps/rejected": -233.02432250976562, + "loss": 0.3571, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3969925045967102, + "rewards/margins": 0.5756487250328064, + "rewards/rejected": -0.9726413488388062, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.629161882893226e-07, + "logits/chosen": -3.0124337673187256, + "logits/rejected": -2.9816842079162598, + "logps/chosen": -272.998779296875, + "logps/rejected": -232.07638549804688, + "loss": 0.2496, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3978312909603119, + "rewards/margins": 0.6203471422195435, + "rewards/rejected": -1.0181784629821777, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.6100267891312666e-07, + "logits/chosen": -3.0306756496429443, + "logits/rejected": -3.016284465789795, + "logps/chosen": -293.61407470703125, + "logps/rejected": -241.7076873779297, + "loss": 0.2565, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3279929757118225, + "rewards/margins": 0.6071383953094482, + "rewards/rejected": -0.9351313710212708, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.590891695369307e-07, + "logits/chosen": -2.9170010089874268, + "logits/rejected": -2.904465436935425, + "logps/chosen": -251.91354370117188, + "logps/rejected": -234.70751953125, + "loss": 0.2908, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.33537206053733826, + "rewards/margins": 0.6658948063850403, + "rewards/rejected": -1.0012669563293457, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.571756601607348e-07, + "logits/chosen": -3.0143392086029053, + "logits/rejected": -2.9897260665893555, + "logps/chosen": -301.2428894042969, + "logps/rejected": -244.96145629882812, + "loss": 0.2721, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3570322096347809, + "rewards/margins": 0.7769542336463928, + "rewards/rejected": -1.133986473083496, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5526215078453884e-07, + "logits/chosen": -3.022491216659546, + "logits/rejected": -2.9913883209228516, + "logps/chosen": -284.7639465332031, + "logps/rejected": -248.6448211669922, + "loss": 0.2595, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3536800742149353, + "rewards/margins": 0.7084030508995056, + "rewards/rejected": -1.0620832443237305, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.533486414083429e-07, + "logits/chosen": -3.0248942375183105, + "logits/rejected": -2.9973325729370117, + "logps/chosen": -284.40472412109375, + "logps/rejected": -241.6700897216797, + "loss": 0.1589, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.3532782196998596, + "rewards/margins": 0.7998963594436646, + "rewards/rejected": -1.153174638748169, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5143513203214697e-07, + "logits/chosen": -2.9913268089294434, + "logits/rejected": -2.971450090408325, + "logps/chosen": -319.8451232910156, + "logps/rejected": -254.3859100341797, + "loss": 0.3029, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.31969529390335083, + "rewards/margins": 0.671107292175293, + "rewards/rejected": -0.9908025860786438, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.49521622655951e-07, + "logits/chosen": -2.959479331970215, + "logits/rejected": -2.908963441848755, + "logps/chosen": -272.8207092285156, + "logps/rejected": -235.93020629882812, + "loss": 0.2796, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.34670349955558777, + "rewards/margins": 0.7621387243270874, + "rewards/rejected": -1.108842134475708, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.4760811327975504e-07, + "logits/chosen": -2.9981067180633545, + "logits/rejected": -2.9972851276397705, + "logps/chosen": -278.3575744628906, + "logps/rejected": -239.57223510742188, + "loss": 0.2067, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5294784307479858, + "rewards/margins": 0.6004038453102112, + "rewards/rejected": -1.1298822164535522, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456946039035591e-07, + "logits/chosen": -3.030169725418091, + "logits/rejected": -2.9914770126342773, + "logps/chosen": -289.63116455078125, + "logps/rejected": -249.74649047851562, + "loss": 0.3592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.44787079095840454, + "rewards/margins": 0.63862544298172, + "rewards/rejected": -1.086496114730835, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4378109452736316e-07, + "logits/chosen": -2.9960074424743652, + "logits/rejected": -2.980121374130249, + "logps/chosen": -267.38446044921875, + "logps/rejected": -244.11593627929688, + "loss": 0.2287, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.34468650817871094, + "rewards/margins": 0.8173492550849915, + "rewards/rejected": -1.1620357036590576, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.418675851511672e-07, + "logits/chosen": -2.913470506668091, + "logits/rejected": -2.90181040763855, + "logps/chosen": -268.4359436035156, + "logps/rejected": -238.85806274414062, + "loss": 0.2227, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.43024927377700806, + "rewards/margins": 0.583633303642273, + "rewards/rejected": -1.0138825178146362, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.399540757749713e-07, + "logits/chosen": -2.9995226860046387, + "logits/rejected": -2.971139430999756, + "logps/chosen": -276.39031982421875, + "logps/rejected": -242.7020263671875, + "loss": 0.2584, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5093586444854736, + "rewards/margins": 0.657143771648407, + "rewards/rejected": -1.1665024757385254, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3804056639877535e-07, + "logits/chosen": -2.95731258392334, + "logits/rejected": -2.9587581157684326, + "logps/chosen": -248.0506134033203, + "logps/rejected": -226.7265625, + "loss": 0.2349, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.3466361165046692, + "rewards/margins": 0.8473577499389648, + "rewards/rejected": -1.1939939260482788, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.361270570225794e-07, + "logits/chosen": -3.0136914253234863, + "logits/rejected": -2.9863269329071045, + "logps/chosen": -285.3213195800781, + "logps/rejected": -233.94619750976562, + "loss": 0.1567, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3250366151332855, + "rewards/margins": 0.8868409991264343, + "rewards/rejected": -1.2118775844573975, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.3421354764638345e-07, + "logits/chosen": -2.9790773391723633, + "logits/rejected": -2.952543020248413, + "logps/chosen": -276.2936706542969, + "logps/rejected": -257.79718017578125, + "loss": 0.1955, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4911643862724304, + "rewards/margins": 0.6523909568786621, + "rewards/rejected": -1.1435552835464478, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.323000382701875e-07, + "logits/chosen": -2.9914324283599854, + "logits/rejected": -2.98734974861145, + "logps/chosen": -322.0937805175781, + "logps/rejected": -261.5361633300781, + "loss": 0.1863, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.3622845411300659, + "rewards/margins": 0.9477508664131165, + "rewards/rejected": -1.3100353479385376, + "step": 1690 + }, + { + "epoch": 1.76, + "learning_rate": 2.3038652889399157e-07, + "logits/chosen": -2.9850406646728516, + "logits/rejected": -2.972008228302002, + "logps/chosen": -262.46197509765625, + "logps/rejected": -237.19375610351562, + "loss": 0.1809, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.53041011095047, + "rewards/margins": 0.7410814762115479, + "rewards/rejected": -1.2714916467666626, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 2.2847301951779563e-07, + "logits/chosen": -2.983722448348999, + "logits/rejected": -2.956228256225586, + "logps/chosen": -272.92156982421875, + "logps/rejected": -256.91107177734375, + "loss": 0.1089, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5018629431724548, + "rewards/margins": 0.8895123600959778, + "rewards/rejected": -1.391375184059143, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.265595101415997e-07, + "logits/chosen": -2.9394657611846924, + "logits/rejected": -2.92991304397583, + "logps/chosen": -273.16815185546875, + "logps/rejected": -262.08917236328125, + "loss": 0.1644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5341050624847412, + "rewards/margins": 0.7611141800880432, + "rewards/rejected": -1.2952191829681396, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.2464600076540373e-07, + "logits/chosen": -2.946911334991455, + "logits/rejected": -2.931849718093872, + "logps/chosen": -308.5828552246094, + "logps/rejected": -247.25338745117188, + "loss": 0.1064, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5544737577438354, + "rewards/margins": 0.9120496511459351, + "rewards/rejected": -1.46652352809906, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.227324913892078e-07, + "logits/chosen": -2.951788902282715, + "logits/rejected": -2.909658908843994, + "logps/chosen": -295.8861389160156, + "logps/rejected": -243.9182891845703, + "loss": 0.0733, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5971349477767944, + "rewards/margins": 0.7489143013954163, + "rewards/rejected": -1.3460490703582764, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2081898201301186e-07, + "logits/chosen": -3.023937702178955, + "logits/rejected": -2.9923651218414307, + "logps/chosen": -301.7521057128906, + "logps/rejected": -257.3502197265625, + "loss": 0.1576, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5676156282424927, + "rewards/margins": 0.8253081440925598, + "rewards/rejected": -1.3929237127304077, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1890547263681592e-07, + "logits/chosen": -2.979418992996216, + "logits/rejected": -2.979773998260498, + "logps/chosen": -278.7250061035156, + "logps/rejected": -281.96417236328125, + "loss": 0.1553, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5305044651031494, + "rewards/margins": 0.8628344535827637, + "rewards/rejected": -1.393338918685913, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1699196326061998e-07, + "logits/chosen": -2.93343448638916, + "logits/rejected": -2.9477057456970215, + "logps/chosen": -259.96685791015625, + "logps/rejected": -260.68145751953125, + "loss": 0.2389, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5586804151535034, + "rewards/margins": 0.8505626916885376, + "rewards/rejected": -1.409243106842041, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.1507845388442402e-07, + "logits/chosen": -3.001537799835205, + "logits/rejected": -3.000216007232666, + "logps/chosen": -253.8172607421875, + "logps/rejected": -229.71261596679688, + "loss": 0.117, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6867496371269226, + "rewards/margins": 0.839769184589386, + "rewards/rejected": -1.5265188217163086, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.1316494450822808e-07, + "logits/chosen": -2.9645118713378906, + "logits/rejected": -2.921504497528076, + "logps/chosen": -249.779296875, + "logps/rejected": -217.2705078125, + "loss": 0.187, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6474353075027466, + "rewards/margins": 0.7807954549789429, + "rewards/rejected": -1.428230881690979, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125143513203214e-07, + "logits/chosen": -2.995269298553467, + "logits/rejected": -2.9834041595458984, + "logps/chosen": -293.59716796875, + "logps/rejected": -229.48092651367188, + "loss": 0.0751, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6189619302749634, + "rewards/margins": 0.7833685874938965, + "rewards/rejected": -1.4023306369781494, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.093379257558362e-07, + "logits/chosen": -2.9907360076904297, + "logits/rejected": -2.963219404220581, + "logps/chosen": -274.832275390625, + "logps/rejected": -237.86355590820312, + "loss": 0.2011, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7243114709854126, + "rewards/margins": 0.5377382636070251, + "rewards/rejected": -1.2620497941970825, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0742441637964026e-07, + "logits/chosen": -2.9230685234069824, + "logits/rejected": -2.88169264793396, + "logps/chosen": -309.228515625, + "logps/rejected": -246.1265411376953, + "loss": -0.039, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5196012258529663, + "rewards/margins": 1.0259116888046265, + "rewards/rejected": -1.5455129146575928, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.055109070034443e-07, + "logits/chosen": -2.9902000427246094, + "logits/rejected": -2.9472696781158447, + "logps/chosen": -300.9866027832031, + "logps/rejected": -249.76412963867188, + "loss": 0.1368, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6899394989013672, + "rewards/margins": 0.7042897343635559, + "rewards/rejected": -1.3942292928695679, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0359739762724836e-07, + "logits/chosen": -3.0022895336151123, + "logits/rejected": -2.973423719406128, + "logps/chosen": -313.137939453125, + "logps/rejected": -254.30709838867188, + "loss": 0.0727, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5544772744178772, + "rewards/margins": 1.1537253856658936, + "rewards/rejected": -1.7082027196884155, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168388825105242e-07, + "logits/chosen": -2.976815700531006, + "logits/rejected": -2.958930492401123, + "logps/chosen": -285.1069641113281, + "logps/rejected": -275.8247985839844, + "loss": 0.046, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5782260894775391, + "rewards/margins": 0.8343210220336914, + "rewards/rejected": -1.41254723072052, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -2.997415065765381, + "logits/rejected": -2.95662784576416, + "logps/chosen": -302.427734375, + "logps/rejected": -215.8485565185547, + "loss": 0.0356, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.4519409239292145, + "rewards/margins": 1.185298204421997, + "rewards/rejected": -1.6372392177581787, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9785686949866055e-07, + "logits/chosen": -2.943821668624878, + "logits/rejected": -2.9366466999053955, + "logps/chosen": -267.79620361328125, + "logps/rejected": -246.8135223388672, + "loss": 0.158, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6127734184265137, + "rewards/margins": 0.7575459480285645, + "rewards/rejected": -1.3703193664550781, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9594336012246458e-07, + "logits/chosen": -2.941462278366089, + "logits/rejected": -2.9433281421661377, + "logps/chosen": -253.16256713867188, + "logps/rejected": -241.6164093017578, + "loss": 0.0191, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5904272198677063, + "rewards/margins": 0.8854333758354187, + "rewards/rejected": -1.475860595703125, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -2.990996837615967, + "logits/rejected": -2.9396445751190186, + "logps/chosen": -286.08404541015625, + "logps/rejected": -246.9237823486328, + "loss": 0.0619, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7161077260971069, + "rewards/margins": 0.9890392422676086, + "rewards/rejected": -1.7051467895507812, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.921163413700727e-07, + "logits/chosen": -2.9665896892547607, + "logits/rejected": -2.985443353652954, + "logps/chosen": -273.56964111328125, + "logps/rejected": -255.6343536376953, + "loss": 0.1713, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7685881853103638, + "rewards/margins": 0.8922961354255676, + "rewards/rejected": -1.6608844995498657, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9020283199387677e-07, + "logits/chosen": -2.993872880935669, + "logits/rejected": -2.9838452339172363, + "logps/chosen": -268.5338134765625, + "logps/rejected": -235.5106658935547, + "loss": 0.1585, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5740541219711304, + "rewards/margins": 0.8445619344711304, + "rewards/rejected": -1.4186161756515503, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -2.918458938598633, + "logits/rejected": -2.924969434738159, + "logps/chosen": -275.4915466308594, + "logps/rejected": -230.2731170654297, + "loss": -0.0004, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.717719316482544, + "rewards/margins": 1.1288950443267822, + "rewards/rejected": -1.8466142416000366, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.8637581324148487e-07, + "logits/chosen": -3.0128183364868164, + "logits/rejected": -3.0055766105651855, + "logps/chosen": -273.6071472167969, + "logps/rejected": -248.68997192382812, + "loss": 0.0705, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7629455327987671, + "rewards/margins": 1.0152510404586792, + "rewards/rejected": -1.7781963348388672, + "step": 1930 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -2.983034372329712, + "eval_logits/rejected": -2.960925817489624, + "eval_logps/chosen": -277.86474609375, + "eval_logps/rejected": -245.16693115234375, + "eval_loss": 0.05700839310884476, + "eval_rewards/accuracies": 0.6959999799728394, + "eval_rewards/chosen": -0.6708189845085144, + "eval_rewards/margins": 0.9967920184135437, + "eval_rewards/rejected": -1.6676111221313477, + "eval_runtime": 448.2745, + "eval_samples_per_second": 4.462, + "eval_steps_per_second": 0.279, + "step": 1937 + }, + { + "epoch": 2.0, + "learning_rate": 1.8446230386528893e-07, + "logits/chosen": -2.99873423576355, + "logits/rejected": -2.9760866165161133, + "logps/chosen": -270.5379943847656, + "logps/rejected": -233.87255859375, + "loss": 0.104, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6196664571762085, + "rewards/margins": 0.8855097889900208, + "rewards/rejected": -1.505176305770874, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -2.972491502761841, + "logits/rejected": -2.9654781818389893, + "logps/chosen": -272.2441711425781, + "logps/rejected": -247.496337890625, + "loss": 0.0695, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8281705975532532, + "rewards/margins": 0.8387929201126099, + "rewards/rejected": -1.6669635772705078, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8063528511289706e-07, + "logits/chosen": -2.9022774696350098, + "logits/rejected": -2.8986656665802, + "logps/chosen": -272.8119201660156, + "logps/rejected": -286.4425964355469, + "loss": 0.0376, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6966532468795776, + "rewards/margins": 1.0244591236114502, + "rewards/rejected": -1.7211124897003174, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872177573670112e-07, + "logits/chosen": -3.0008738040924072, + "logits/rejected": -2.9414420127868652, + "logps/chosen": -269.3557434082031, + "logps/rejected": -228.98892211914062, + "loss": 0.1134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6135867238044739, + "rewards/margins": 1.0139942169189453, + "rewards/rejected": -1.627581000328064, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -2.97796893119812, + "logits/rejected": -2.9451377391815186, + "logps/chosen": -290.0522155761719, + "logps/rejected": -247.7688446044922, + "loss": 0.0676, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7555999755859375, + "rewards/margins": 0.919529139995575, + "rewards/rejected": -1.6751289367675781, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7489475698430921e-07, + "logits/chosen": -2.9556636810302734, + "logits/rejected": -2.9225146770477295, + "logps/chosen": -281.0018615722656, + "logps/rejected": -257.39471435546875, + "loss": -0.0304, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7463997006416321, + "rewards/margins": 0.9945909380912781, + "rewards/rejected": -1.7409906387329102, + "step": 1990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7298124760811328e-07, + "logits/chosen": -2.9892733097076416, + "logits/rejected": -2.9609320163726807, + "logps/chosen": -265.5694885253906, + "logps/rejected": -227.8389434814453, + "loss": 0.068, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8352034687995911, + "rewards/margins": 1.0397050380706787, + "rewards/rejected": -1.874908685684204, + "step": 2000 + }, + { + "epoch": 2.08, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -2.9671616554260254, + "logits/rejected": -2.937986373901367, + "logps/chosen": -281.4359436035156, + "logps/rejected": -241.79080200195312, + "loss": -0.0155, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6781337857246399, + "rewards/margins": 1.3058927059173584, + "rewards/rejected": -1.984026312828064, + "step": 2010 + }, + { + "epoch": 2.09, + "learning_rate": 1.691542288557214e-07, + "logits/chosen": -2.9833037853240967, + "logits/rejected": -2.943441152572632, + "logps/chosen": -295.3065490722656, + "logps/rejected": -253.66360473632812, + "loss": 0.0664, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7511407136917114, + "rewards/margins": 0.9544817209243774, + "rewards/rejected": -1.7056224346160889, + "step": 2020 + }, + { + "epoch": 2.1, + "learning_rate": 1.6724071947952544e-07, + "logits/chosen": -2.9694552421569824, + "logits/rejected": -2.9318954944610596, + "logps/chosen": -254.4512939453125, + "logps/rejected": -235.69595336914062, + "loss": 0.0632, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7740479111671448, + "rewards/margins": 0.9885379672050476, + "rewards/rejected": -1.762585997581482, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -2.9927315711975098, + "logits/rejected": -2.9617929458618164, + "logps/chosen": -286.9222412109375, + "logps/rejected": -262.54559326171875, + "loss": -0.0387, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7180510759353638, + "rewards/margins": 1.0184663534164429, + "rewards/rejected": -1.7365175485610962, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6341370072713356e-07, + "logits/chosen": -2.9783358573913574, + "logits/rejected": -2.9684910774230957, + "logps/chosen": -288.4617614746094, + "logps/rejected": -269.187255859375, + "loss": -0.0243, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.75580233335495, + "rewards/margins": 0.9117294549942017, + "rewards/rejected": -1.6675317287445068, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6150019135093762e-07, + "logits/chosen": -3.0292530059814453, + "logits/rejected": -2.9775586128234863, + "logps/chosen": -298.93585205078125, + "logps/rejected": -247.0900115966797, + "loss": -0.0146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6352591514587402, + "rewards/margins": 1.129183053970337, + "rewards/rejected": -1.7644420862197876, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -3.0193440914154053, + "logits/rejected": -2.99593186378479, + "logps/chosen": -263.82464599609375, + "logps/rejected": -277.60418701171875, + "loss": 0.0702, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6851625442504883, + "rewards/margins": 0.9985010027885437, + "rewards/rejected": -1.6836636066436768, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5767317259854572e-07, + "logits/chosen": -3.0217273235321045, + "logits/rejected": -2.970132350921631, + "logps/chosen": -307.5477600097656, + "logps/rejected": -248.69448852539062, + "loss": -0.024, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.65143883228302, + "rewards/margins": 1.1803841590881348, + "rewards/rejected": -1.8318227529525757, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5575966322234978e-07, + "logits/chosen": -2.931065559387207, + "logits/rejected": -2.9219722747802734, + "logps/chosen": -276.0883483886719, + "logps/rejected": -247.0487060546875, + "loss": -0.052, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.64451664686203, + "rewards/margins": 1.1405279636383057, + "rewards/rejected": -1.7850444316864014, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -3.0177485942840576, + "logits/rejected": -3.0004374980926514, + "logps/chosen": -263.8681640625, + "logps/rejected": -226.38119506835938, + "loss": -0.047, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8407020568847656, + "rewards/margins": 1.1751278638839722, + "rewards/rejected": -2.0158302783966064, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.519326444699579e-07, + "logits/chosen": -2.9868111610412598, + "logits/rejected": -2.961195468902588, + "logps/chosen": -294.43402099609375, + "logps/rejected": -237.2691650390625, + "loss": -0.1127, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.678774356842041, + "rewards/margins": 1.201111078262329, + "rewards/rejected": -1.8798853158950806, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.5001913509376197e-07, + "logits/chosen": -3.0007481575012207, + "logits/rejected": -2.9801931381225586, + "logps/chosen": -290.3331298828125, + "logps/rejected": -263.9862976074219, + "loss": -0.071, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.908436119556427, + "rewards/margins": 0.9944915771484375, + "rewards/rejected": -1.9029273986816406, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -3.011329174041748, + "logits/rejected": -2.980834484100342, + "logps/chosen": -259.76031494140625, + "logps/rejected": -239.0861053466797, + "loss": 0.0556, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.774993360042572, + "rewards/margins": 1.0526189804077148, + "rewards/rejected": -1.8276125192642212, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.4619211634137007e-07, + "logits/chosen": -2.9631924629211426, + "logits/rejected": -2.954556703567505, + "logps/chosen": -248.605224609375, + "logps/rejected": -215.9401397705078, + "loss": -0.0457, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8637608289718628, + "rewards/margins": 1.0648829936981201, + "rewards/rejected": -1.9286441802978516, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427860696517413e-07, + "logits/chosen": -2.950859308242798, + "logits/rejected": -2.961940288543701, + "logps/chosen": -328.16314697265625, + "logps/rejected": -277.8406982421875, + "loss": 0.0939, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7997218370437622, + "rewards/margins": 1.051257848739624, + "rewards/rejected": -1.8509798049926758, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -3.0021846294403076, + "logits/rejected": -2.9544687271118164, + "logps/chosen": -284.53570556640625, + "logps/rejected": -254.57559204101562, + "loss": -0.1516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8068240880966187, + "rewards/margins": 1.2634921073913574, + "rewards/rejected": -2.0703163146972656, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4045158821278225e-07, + "logits/chosen": -2.968425750732422, + "logits/rejected": -2.9926857948303223, + "logps/chosen": -295.5940246582031, + "logps/rejected": -265.8125, + "loss": -0.1379, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7207796573638916, + "rewards/margins": 1.3109276294708252, + "rewards/rejected": -2.0317070484161377, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.3853807883658632e-07, + "logits/chosen": -3.0091452598571777, + "logits/rejected": -2.9704108238220215, + "logps/chosen": -298.40948486328125, + "logps/rejected": -241.05337524414062, + "loss": 0.1214, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9363592863082886, + "rewards/margins": 0.984167754650116, + "rewards/rejected": -1.9205271005630493, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -2.924776554107666, + "logits/rejected": -2.920172691345215, + "logps/chosen": -259.19580078125, + "logps/rejected": -247.6489715576172, + "loss": 0.0587, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8932672739028931, + "rewards/margins": 0.9966402053833008, + "rewards/rejected": -1.8899074792861938, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3471106008419441e-07, + "logits/chosen": -2.9807791709899902, + "logits/rejected": -2.9884274005889893, + "logps/chosen": -244.3290252685547, + "logps/rejected": -231.4929656982422, + "loss": 0.0254, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9376288652420044, + "rewards/margins": 1.005837082862854, + "rewards/rejected": -1.9434658288955688, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3279755070799848e-07, + "logits/chosen": -2.9636523723602295, + "logits/rejected": -2.898196220397949, + "logps/chosen": -302.7827453613281, + "logps/rejected": -254.11367797851562, + "loss": -0.0107, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0612095594406128, + "rewards/margins": 0.9029472470283508, + "rewards/rejected": -1.9641568660736084, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -2.962622880935669, + "logits/rejected": -2.9335360527038574, + "logps/chosen": -321.44342041015625, + "logps/rejected": -250.2067108154297, + "loss": -0.2537, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8546531796455383, + "rewards/margins": 1.267183780670166, + "rewards/rejected": -2.1218371391296387, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.289705319556066e-07, + "logits/chosen": -2.961509943008423, + "logits/rejected": -2.923830509185791, + "logps/chosen": -294.05279541015625, + "logps/rejected": -256.3243103027344, + "loss": -0.0748, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8142738342285156, + "rewards/margins": 1.3003737926483154, + "rewards/rejected": -2.114647626876831, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2705702257941064e-07, + "logits/chosen": -2.9922921657562256, + "logits/rejected": -2.9666550159454346, + "logps/chosen": -281.14788818359375, + "logps/rejected": -261.50445556640625, + "loss": -0.103, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.889809787273407, + "rewards/margins": 1.2682868242263794, + "rewards/rejected": -2.1580967903137207, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -3.009636878967285, + "logits/rejected": -2.9912712574005127, + "logps/chosen": -262.1191101074219, + "logps/rejected": -254.8567352294922, + "loss": 0.0064, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9876982569694519, + "rewards/margins": 0.9436071515083313, + "rewards/rejected": -1.9313055276870728, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323000382701873e-07, + "logits/chosen": -3.0256762504577637, + "logits/rejected": -3.006740093231201, + "logps/chosen": -309.2908630371094, + "logps/rejected": -252.59323120117188, + "loss": 0.0352, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9725021123886108, + "rewards/margins": 1.1422688961029053, + "rewards/rejected": -2.1147711277008057, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.213164944508228e-07, + "logits/chosen": -2.9900639057159424, + "logits/rejected": -2.9898974895477295, + "logps/chosen": -299.51177978515625, + "logps/rejected": -267.8308410644531, + "loss": -0.1218, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0168074369430542, + "rewards/margins": 1.148054838180542, + "rewards/rejected": -2.1648621559143066, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -2.986194610595703, + "logits/rejected": -2.9787726402282715, + "logps/chosen": -285.0082702636719, + "logps/rejected": -241.463134765625, + "loss": -0.0902, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0312423706054688, + "rewards/margins": 1.1112394332885742, + "rewards/rejected": -2.142481803894043, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1748947569843092e-07, + "logits/chosen": -2.949153184890747, + "logits/rejected": -2.939624786376953, + "logps/chosen": -294.7286682128906, + "logps/rejected": -233.32864379882812, + "loss": -0.2292, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8459509015083313, + "rewards/margins": 1.125945806503296, + "rewards/rejected": -1.9718964099884033, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1557596632223497e-07, + "logits/chosen": -3.0164811611175537, + "logits/rejected": -3.015223741531372, + "logps/chosen": -283.99652099609375, + "logps/rejected": -248.39602661132812, + "loss": -0.175, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.068522334098816, + "rewards/margins": 0.9977639317512512, + "rewards/rejected": -2.066286325454712, + "step": 2300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1366245694603903e-07, + "logits/chosen": -2.9585719108581543, + "logits/rejected": -2.9644482135772705, + "logps/chosen": -282.3656921386719, + "logps/rejected": -257.11041259765625, + "loss": -0.04, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0202432870864868, + "rewards/margins": 1.0421122312545776, + "rewards/rejected": -2.0623555183410645, + "step": 2310 + }, + { + "epoch": 2.4, + "learning_rate": 1.1174894756984308e-07, + "logits/chosen": -2.976578712463379, + "logits/rejected": -2.958589792251587, + "logps/chosen": -243.4874725341797, + "logps/rejected": -247.5500030517578, + "loss": -0.1438, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.131103277206421, + "rewards/margins": 1.045607328414917, + "rewards/rejected": -2.176710605621338, + "step": 2320 + }, + { + "epoch": 2.41, + "learning_rate": 1.0983543819364714e-07, + "logits/chosen": -2.966090440750122, + "logits/rejected": -2.963219165802002, + "logps/chosen": -265.3279113769531, + "logps/rejected": -245.90011596679688, + "loss": -0.3008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1297423839569092, + "rewards/margins": 1.0481914281845093, + "rewards/rejected": -2.177933931350708, + "step": 2330 + }, + { + "epoch": 2.42, + "learning_rate": 1.079219288174512e-07, + "logits/chosen": -2.8891921043395996, + "logits/rejected": -2.8980748653411865, + "logps/chosen": -245.95260620117188, + "logps/rejected": -274.68511962890625, + "loss": -0.0161, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.029191255569458, + "rewards/margins": 1.0524277687072754, + "rewards/rejected": -2.0816187858581543, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0600841944125525e-07, + "logits/chosen": -2.954099178314209, + "logits/rejected": -2.9609456062316895, + "logps/chosen": -305.5159606933594, + "logps/rejected": -262.00189208984375, + "loss": 0.0565, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0684045553207397, + "rewards/margins": 1.0408602952957153, + "rewards/rejected": -2.109265089035034, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0409491006505931e-07, + "logits/chosen": -2.95751690864563, + "logits/rejected": -2.9540932178497314, + "logps/chosen": -272.47802734375, + "logps/rejected": -252.9427490234375, + "loss": -0.1484, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9607385396957397, + "rewards/margins": 1.1778024435043335, + "rewards/rejected": -2.1385409832000732, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -2.9053263664245605, + "logits/rejected": -2.8971660137176514, + "logps/chosen": -288.0937805175781, + "logps/rejected": -249.78091430664062, + "loss": -0.0786, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9809226989746094, + "rewards/margins": 1.1126328706741333, + "rewards/rejected": -2.0935556888580322, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0026789131266743e-07, + "logits/chosen": -2.902618169784546, + "logits/rejected": -2.8822264671325684, + "logps/chosen": -282.31939697265625, + "logps/rejected": -234.1844940185547, + "loss": -0.0294, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0446959733963013, + "rewards/margins": 1.026447057723999, + "rewards/rejected": -2.0711429119110107, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.835438193647149e-08, + "logits/chosen": -2.979097366333008, + "logits/rejected": -2.939335584640503, + "logps/chosen": -292.04156494140625, + "logps/rejected": -249.23880004882812, + "loss": -0.0569, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1423760652542114, + "rewards/margins": 1.2263530492782593, + "rewards/rejected": -2.3687291145324707, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.644087256027554e-08, + "logits/chosen": -2.9777472019195557, + "logits/rejected": -2.9249491691589355, + "logps/chosen": -300.14605712890625, + "logps/rejected": -261.2189025878906, + "loss": -0.1677, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.053171992301941, + "rewards/margins": 1.085288166999817, + "rewards/rejected": -2.1384599208831787, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.45273631840796e-08, + "logits/chosen": -2.9415881633758545, + "logits/rejected": -2.920991897583008, + "logps/chosen": -283.33758544921875, + "logps/rejected": -262.96820068359375, + "loss": -0.1276, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8991168737411499, + "rewards/margins": 1.2601044178009033, + "rewards/rejected": -2.1592211723327637, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.261385380788366e-08, + "logits/chosen": -2.9756853580474854, + "logits/rejected": -2.946068525314331, + "logps/chosen": -280.66241455078125, + "logps/rejected": -240.4520263671875, + "loss": -0.126, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0988253355026245, + "rewards/margins": 0.9154653549194336, + "rewards/rejected": -2.0142908096313477, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -2.981102466583252, + "logits/rejected": -2.961693286895752, + "logps/chosen": -277.573974609375, + "logps/rejected": -287.72515869140625, + "loss": -0.0153, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1206835508346558, + "rewards/margins": 1.1004483699798584, + "rewards/rejected": -2.2211318016052246, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.878683505549177e-08, + "logits/chosen": -2.969409465789795, + "logits/rejected": -2.9719467163085938, + "logps/chosen": -269.2553405761719, + "logps/rejected": -276.8422546386719, + "loss": -0.2463, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0086815357208252, + "rewards/margins": 1.2824007272720337, + "rewards/rejected": -2.2910819053649902, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.687332567929582e-08, + "logits/chosen": -2.9511923789978027, + "logits/rejected": -2.945295810699463, + "logps/chosen": -259.01507568359375, + "logps/rejected": -261.44781494140625, + "loss": -0.1105, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.238194465637207, + "rewards/margins": 0.8837486505508423, + "rewards/rejected": -2.1219429969787598, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -2.9451937675476074, + "logits/rejected": -2.9014077186584473, + "logps/chosen": -306.05902099609375, + "logps/rejected": -251.77072143554688, + "loss": -0.0642, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9629167318344116, + "rewards/margins": 1.1698607206344604, + "rewards/rejected": -2.132777452468872, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.304630692690395e-08, + "logits/chosen": -2.968233108520508, + "logits/rejected": -2.9597315788269043, + "logps/chosen": -272.21258544921875, + "logps/rejected": -263.3721008300781, + "loss": -0.1204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0844926834106445, + "rewards/margins": 1.1088446378707886, + "rewards/rejected": -2.1933372020721436, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.1132797550708e-08, + "logits/chosen": -3.0234591960906982, + "logits/rejected": -2.9642250537872314, + "logps/chosen": -313.2720947265625, + "logps/rejected": -251.24020385742188, + "loss": -0.1713, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7700778245925903, + "rewards/margins": 1.3790977001190186, + "rewards/rejected": -2.1491756439208984, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -2.968733549118042, + "logits/rejected": -2.9718379974365234, + "logps/chosen": -277.2424621582031, + "logps/rejected": -242.42172241210938, + "loss": -0.1563, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.095190167427063, + "rewards/margins": 1.008971095085144, + "rewards/rejected": -2.104161262512207, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.73057787983161e-08, + "logits/chosen": -2.9798028469085693, + "logits/rejected": -2.9650533199310303, + "logps/chosen": -290.0262756347656, + "logps/rejected": -264.9449157714844, + "loss": -0.2883, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0747302770614624, + "rewards/margins": 1.2478855848312378, + "rewards/rejected": -2.3226161003112793, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.539226942212017e-08, + "logits/chosen": -2.945732593536377, + "logits/rejected": -2.94990611076355, + "logps/chosen": -305.3632507324219, + "logps/rejected": -274.59722900390625, + "loss": -0.2619, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0448615550994873, + "rewards/margins": 1.2908263206481934, + "rewards/rejected": -2.3356876373291016, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -2.9896063804626465, + "logits/rejected": -2.9576222896575928, + "logps/chosen": -272.3258361816406, + "logps/rejected": -214.65298461914062, + "loss": -0.1643, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9911131858825684, + "rewards/margins": 1.24199640750885, + "rewards/rejected": -2.23310923576355, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.156525066972828e-08, + "logits/chosen": -2.9311654567718506, + "logits/rejected": -2.9140172004699707, + "logps/chosen": -268.74444580078125, + "logps/rejected": -236.22988891601562, + "loss": -0.0763, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.111546277999878, + "rewards/margins": 1.021420955657959, + "rewards/rejected": -2.132966995239258, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 6.965174129353234e-08, + "logits/chosen": -2.9497828483581543, + "logits/rejected": -2.9311363697052, + "logps/chosen": -305.40240478515625, + "logps/rejected": -264.26177978515625, + "loss": -0.1631, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7808889746665955, + "rewards/margins": 1.4971510171890259, + "rewards/rejected": -2.2780401706695557, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.773823191733639e-08, + "logits/chosen": -2.8948826789855957, + "logits/rejected": -2.8769936561584473, + "logps/chosen": -272.2402648925781, + "logps/rejected": -246.4182891845703, + "loss": -0.2377, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8501654863357544, + "rewards/margins": 1.4621152877807617, + "rewards/rejected": -2.3122806549072266, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.582472254114045e-08, + "logits/chosen": -2.9354665279388428, + "logits/rejected": -2.8916828632354736, + "logps/chosen": -272.43682861328125, + "logps/rejected": -274.33819580078125, + "loss": -0.0427, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1567806005477905, + "rewards/margins": 0.9909998178482056, + "rewards/rejected": -2.147780179977417, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.391121316494451e-08, + "logits/chosen": -2.9822142124176025, + "logits/rejected": -2.982553482055664, + "logps/chosen": -282.8048400878906, + "logps/rejected": -264.55401611328125, + "loss": -0.2413, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2675011157989502, + "rewards/margins": 1.1336963176727295, + "rewards/rejected": -2.4011974334716797, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -2.946916103363037, + "logits/rejected": -2.921715497970581, + "logps/chosen": -295.14581298828125, + "logps/rejected": -264.893798828125, + "loss": -0.1178, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0069832801818848, + "rewards/margins": 1.1966971158981323, + "rewards/rejected": -2.2036805152893066, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.008419441255262e-08, + "logits/chosen": -2.996048927307129, + "logits/rejected": -2.9277799129486084, + "logps/chosen": -288.4164733886719, + "logps/rejected": -240.5204315185547, + "loss": -0.1271, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2253119945526123, + "rewards/margins": 0.9576913714408875, + "rewards/rejected": -2.1830034255981445, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.817068503635668e-08, + "logits/chosen": -2.979665756225586, + "logits/rejected": -3.006110668182373, + "logps/chosen": -309.9973449707031, + "logps/rejected": -266.8681945800781, + "loss": -0.1954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.072455644607544, + "rewards/margins": 1.285042643547058, + "rewards/rejected": -2.3574986457824707, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6257175660160735e-08, + "logits/chosen": -2.995903968811035, + "logits/rejected": -2.985555410385132, + "logps/chosen": -255.1632537841797, + "logps/rejected": -262.6217346191406, + "loss": -0.1921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.135871171951294, + "rewards/margins": 1.1645699739456177, + "rewards/rejected": -2.300441026687622, + "step": 2610 + }, + { + "epoch": 2.71, + "learning_rate": 5.4343666283964784e-08, + "logits/chosen": -2.947361946105957, + "logits/rejected": -2.9222099781036377, + "logps/chosen": -283.39483642578125, + "logps/rejected": -236.3388214111328, + "loss": -0.2522, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0431753396987915, + "rewards/margins": 1.4626045227050781, + "rewards/rejected": -2.50577974319458, + "step": 2620 + }, + { + "epoch": 2.72, + "learning_rate": 5.243015690776884e-08, + "logits/chosen": -2.934537649154663, + "logits/rejected": -2.903463840484619, + "logps/chosen": -297.0821228027344, + "logps/rejected": -241.7076416015625, + "loss": -0.1882, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.178007960319519, + "rewards/margins": 1.0751577615737915, + "rewards/rejected": -2.2531659603118896, + "step": 2630 + }, + { + "epoch": 2.73, + "learning_rate": 5.05166475315729e-08, + "logits/chosen": -2.9361720085144043, + "logits/rejected": -2.882931709289551, + "logps/chosen": -270.83148193359375, + "logps/rejected": -255.24337768554688, + "loss": -0.1525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2923060655593872, + "rewards/margins": 1.1520329713821411, + "rewards/rejected": -2.444338798522949, + "step": 2640 + }, + { + "epoch": 2.74, + "learning_rate": 4.860313815537696e-08, + "logits/chosen": -2.9840073585510254, + "logits/rejected": -2.9296116828918457, + "logps/chosen": -283.15643310546875, + "logps/rejected": -250.57608032226562, + "loss": -0.1079, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0855220556259155, + "rewards/margins": 1.3181664943695068, + "rewards/rejected": -2.4036881923675537, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.668962877918101e-08, + "logits/chosen": -2.9590346813201904, + "logits/rejected": -2.9196345806121826, + "logps/chosen": -256.04315185546875, + "logps/rejected": -279.31787109375, + "loss": -0.1673, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3440836668014526, + "rewards/margins": 1.061953067779541, + "rewards/rejected": -2.406036853790283, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.477611940298507e-08, + "logits/chosen": -2.9351706504821777, + "logits/rejected": -2.9165854454040527, + "logps/chosen": -302.4515075683594, + "logps/rejected": -260.8388366699219, + "loss": -0.4567, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1327465772628784, + "rewards/margins": 1.2759268283843994, + "rewards/rejected": -2.4086735248565674, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.2862610026789124e-08, + "logits/chosen": -3.013035297393799, + "logits/rejected": -2.989163398742676, + "logps/chosen": -270.7334899902344, + "logps/rejected": -254.8367156982422, + "loss": -0.1713, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0498313903808594, + "rewards/margins": 1.175312876701355, + "rewards/rejected": -2.225144147872925, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.0949100650593186e-08, + "logits/chosen": -2.880460023880005, + "logits/rejected": -2.861250400543213, + "logps/chosen": -269.2752990722656, + "logps/rejected": -265.5714111328125, + "loss": -0.0863, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1616452932357788, + "rewards/margins": 1.2222042083740234, + "rewards/rejected": -2.383849620819092, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.903559127439724e-08, + "logits/chosen": -2.9040732383728027, + "logits/rejected": -2.920821189880371, + "logps/chosen": -267.18621826171875, + "logps/rejected": -227.2675018310547, + "loss": -0.3545, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.007155179977417, + "rewards/margins": 1.408747911453247, + "rewards/rejected": -2.415903091430664, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.71220818982013e-08, + "logits/chosen": -2.9555585384368896, + "logits/rejected": -2.891514778137207, + "logps/chosen": -293.35809326171875, + "logps/rejected": -241.7179412841797, + "loss": -0.0836, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3558762073516846, + "rewards/margins": 0.7381645441055298, + "rewards/rejected": -2.094040632247925, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.520857252200535e-08, + "logits/chosen": -2.9766342639923096, + "logits/rejected": -2.9516689777374268, + "logps/chosen": -288.6914978027344, + "logps/rejected": -282.8217468261719, + "loss": -0.079, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2634086608886719, + "rewards/margins": 0.9370874166488647, + "rewards/rejected": -2.200496196746826, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -2.9203312397003174, + "logits/rejected": -2.923161268234253, + "logps/chosen": -278.013916015625, + "logps/rejected": -267.54168701171875, + "loss": -0.1356, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0634254217147827, + "rewards/margins": 1.14893639087677, + "rewards/rejected": -2.212362051010132, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.138155376961347e-08, + "logits/chosen": -3.015432834625244, + "logits/rejected": -2.9883949756622314, + "logps/chosen": -306.4672546386719, + "logps/rejected": -284.0825500488281, + "loss": -0.0924, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2742321491241455, + "rewards/margins": 1.3397479057312012, + "rewards/rejected": -2.6139800548553467, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 2.9468044393417525e-08, + "logits/chosen": -2.9456729888916016, + "logits/rejected": -2.9101481437683105, + "logps/chosen": -281.6629638671875, + "logps/rejected": -247.2931671142578, + "loss": -0.2122, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8153074383735657, + "rewards/margins": 1.5913559198379517, + "rewards/rejected": -2.406663417816162, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -2.8928332328796387, + "logits/rejected": -2.8488776683807373, + "logps/chosen": -262.3460693359375, + "logps/rejected": -253.6611328125, + "loss": -0.3582, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2655181884765625, + "rewards/margins": 1.1255066394805908, + "rewards/rejected": -2.3910248279571533, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.564102564102564e-08, + "logits/chosen": -2.9541738033294678, + "logits/rejected": -2.9483609199523926, + "logps/chosen": -281.6482849121094, + "logps/rejected": -263.13116455078125, + "loss": -0.2229, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.135956048965454, + "rewards/margins": 1.4537389278411865, + "rewards/rejected": -2.5896952152252197, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.3727516264829695e-08, + "logits/chosen": -2.96710467338562, + "logits/rejected": -2.935039520263672, + "logps/chosen": -314.2068176269531, + "logps/rejected": -288.13592529296875, + "loss": -0.0912, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2961633205413818, + "rewards/margins": 1.0811083316802979, + "rewards/rejected": -2.3772716522216797, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -2.9459080696105957, + "logits/rejected": -2.89158296585083, + "logps/chosen": -286.0101318359375, + "logps/rejected": -239.49160766601562, + "loss": -0.115, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2027372121810913, + "rewards/margins": 1.0859975814819336, + "rewards/rejected": -2.2887349128723145, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 1.990049751243781e-08, + "logits/chosen": -2.9392828941345215, + "logits/rejected": -2.9443650245666504, + "logps/chosen": -291.0428771972656, + "logps/rejected": -242.24203491210938, + "loss": -0.2681, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.150545358657837, + "rewards/margins": 1.183664083480835, + "rewards/rejected": -2.334209680557251, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.7986988136241865e-08, + "logits/chosen": -2.936471462249756, + "logits/rejected": -2.891524076461792, + "logps/chosen": -287.06024169921875, + "logps/rejected": -288.4886169433594, + "loss": -0.2539, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1829683780670166, + "rewards/margins": 1.063999891281128, + "rewards/rejected": -2.2469685077667236, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -2.9530060291290283, + "logits/rejected": -2.9200854301452637, + "logps/chosen": -283.59716796875, + "logps/rejected": -236.90316772460938, + "loss": -0.1698, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1723743677139282, + "rewards/margins": 1.1659984588623047, + "rewards/rejected": -2.3383727073669434, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4159969383849981e-08, + "logits/chosen": -2.9582297801971436, + "logits/rejected": -2.9348087310791016, + "logps/chosen": -269.26531982421875, + "logps/rejected": -241.8841094970703, + "loss": -0.3465, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1937872171401978, + "rewards/margins": 1.0226060152053833, + "rewards/rejected": -2.216393232345581, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2246460007654037e-08, + "logits/chosen": -2.985964775085449, + "logits/rejected": -2.9667985439300537, + "logps/chosen": -297.0205078125, + "logps/rejected": -251.09805297851562, + "loss": -0.1413, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.407981276512146, + "rewards/margins": 0.9606062173843384, + "rewards/rejected": -2.3685877323150635, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.9180400371551514, + "logits/rejected": -2.9097437858581543, + "logps/chosen": -291.8419494628906, + "logps/rejected": -265.8045349121094, + "loss": -0.116, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1430332660675049, + "rewards/margins": 1.2068586349487305, + "rewards/rejected": -2.3498916625976562, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.419441255262151e-09, + "logits/chosen": -2.9281063079833984, + "logits/rejected": -2.8641605377197266, + "logps/chosen": -270.81256103515625, + "logps/rejected": -246.7694549560547, + "loss": -0.2278, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.335326910018921, + "rewards/margins": 1.1297729015350342, + "rewards/rejected": -2.465099811553955, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 6.505931879066207e-09, + "logits/chosen": -2.948106050491333, + "logits/rejected": -2.921739339828491, + "logps/chosen": -276.6488952636719, + "logps/rejected": -248.07608032226562, + "loss": -0.1698, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1927988529205322, + "rewards/margins": 1.302239179611206, + "rewards/rejected": -2.4950382709503174, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -2.9207305908203125, + "logits/rejected": -2.8921730518341064, + "logps/chosen": -288.192138671875, + "logps/rejected": -244.62954711914062, + "loss": -0.1019, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.206674337387085, + "rewards/margins": 0.8675975799560547, + "rewards/rejected": -2.0742716789245605, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 2.6789131266743202e-09, + "logits/chosen": -3.0061116218566895, + "logits/rejected": -2.972447156906128, + "logps/chosen": -306.70977783203125, + "logps/rejected": -261.82244873046875, + "loss": -0.1622, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2506418228149414, + "rewards/margins": 1.2404968738555908, + "rewards/rejected": -2.4911389350891113, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 7.654037504783773e-10, + "logits/chosen": -2.974818706512451, + "logits/rejected": -2.963379383087158, + "logps/chosen": -263.7333679199219, + "logps/rejected": -262.7867126464844, + "loss": -0.2602, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.150649070739746, + "rewards/margins": 1.2455689907073975, + "rewards/rejected": -2.3962180614471436, + "step": 2900 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -2.965512990951538, + "eval_logits/rejected": -2.9399757385253906, + "eval_logps/chosen": -282.7847900390625, + "eval_logps/rejected": -252.9479217529297, + "eval_loss": -0.203842431306839, + "eval_rewards/accuracies": 0.6840000152587891, + "eval_rewards/chosen": -1.1628247499465942, + "eval_rewards/margins": 1.2828813791275024, + "eval_rewards/rejected": -2.4457061290740967, + "eval_runtime": 446.4813, + "eval_samples_per_second": 4.479, + "eval_steps_per_second": 0.28, + "step": 2904 + }, + { + "epoch": 3.0, + "step": 2904, "total_flos": 0.0, - "train_loss": 0.6728762634529555, - "train_runtime": 27528.1814, - "train_samples_per_second": 2.251, - "train_steps_per_second": 0.035 + "train_loss": 0.36701411283100355, + "train_runtime": 84636.1866, + "train_samples_per_second": 2.196, + "train_steps_per_second": 0.034 } ], "logging_steps": 10, - "max_steps": 968, - "num_train_epochs": 1, + "max_steps": 2904, + "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null,