{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994111874386653, "eval_steps": 500, "global_step": 1273, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.90625e-08, "logits/chosen": 0.753882884979248, "logits/rejected": 0.9021581411361694, "logps/chosen": -402.58197021484375, "logps/rejected": -272.54583740234375, "loss": 250000.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.90625e-07, "logits/chosen": 0.7883161902427673, "logits/rejected": 0.8325157761573792, "logps/chosen": -300.5879211425781, "logps/rejected": -293.91253662109375, "loss": 249972.8333, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -2.7298270651954226e-05, "rewards/margins": 2.835398663592059e-05, "rewards/rejected": -5.5652246373938397e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 7.8125e-07, "logits/chosen": 0.7864906191825867, "logits/rejected": 0.8796188235282898, "logps/chosen": -296.65606689453125, "logps/rejected": -256.8786926269531, "loss": 250065.0, "rewards/accuracies": 0.375, "rewards/chosen": -7.715411629760638e-05, "rewards/margins": -6.430874054785818e-05, "rewards/rejected": -1.2845377568737604e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.1718750000000001e-06, "logits/chosen": 0.8224496841430664, "logits/rejected": 0.8862675428390503, "logps/chosen": -272.9891662597656, "logps/rejected": -244.2042236328125, "loss": 249965.3, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00036038103280588984, "rewards/margins": 3.5376859159441665e-05, "rewards/rejected": -0.00039575790287926793, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": 0.7388542890548706, "logits/rejected": 0.8643872141838074, "logps/chosen": -300.79156494140625, "logps/rejected": -313.8456115722656, "loss": 249955.5, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0008908492745831609, "rewards/margins": 4.541150701697916e-05, "rewards/rejected": -0.0009362607379443944, "step": 40 }, { "epoch": 0.04, "learning_rate": 1.953125e-06, "logits/chosen": 0.7705200910568237, "logits/rejected": 0.8214074969291687, "logps/chosen": -313.44525146484375, "logps/rejected": -286.5237121582031, "loss": 249616.75, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0017550220945850015, "rewards/margins": 0.0003843012382276356, "rewards/rejected": -0.002139323391020298, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.3437500000000002e-06, "logits/chosen": 0.7623851895332336, "logits/rejected": 0.8920168876647949, "logps/chosen": -350.54656982421875, "logps/rejected": -304.0585021972656, "loss": 249209.575, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003797227516770363, "rewards/margins": 0.0007935293251648545, "rewards/rejected": -0.004590757191181183, "step": 60 }, { "epoch": 0.05, "learning_rate": 2.7343750000000004e-06, "logits/chosen": 0.7647961378097534, "logits/rejected": 0.8629061579704285, "logps/chosen": -286.3582458496094, "logps/rejected": -248.80715942382812, "loss": 248737.95, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006279756315052509, "rewards/margins": 0.0012701054802164435, "rewards/rejected": -0.0075498609803617, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": 0.8171808123588562, "logits/rejected": 0.9254066348075867, "logps/chosen": -274.9587097167969, "logps/rejected": -252.5512237548828, "loss": 248480.9, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.010536030866205692, "rewards/margins": 0.0015363285783678293, "rewards/rejected": -0.012072358280420303, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.5156250000000003e-06, "logits/chosen": 0.8039876818656921, "logits/rejected": 0.8282687067985535, "logps/chosen": -333.23455810546875, "logps/rejected": -340.7774353027344, "loss": 248449.15, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.01643429882824421, "rewards/margins": 0.001596734276972711, "rewards/rejected": -0.018031032755970955, "step": 90 }, { "epoch": 0.08, "learning_rate": 3.90625e-06, "logits/chosen": 0.7214804291725159, "logits/rejected": 0.8068147897720337, "logps/chosen": -340.88555908203125, "logps/rejected": -315.34942626953125, "loss": 246475.125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02047683112323284, "rewards/margins": 0.0035947158467024565, "rewards/rejected": -0.024071548134088516, "step": 100 }, { "epoch": 0.09, "learning_rate": 4.296875e-06, "logits/chosen": 0.7144443392753601, "logits/rejected": 0.861451268196106, "logps/chosen": -326.4942932128906, "logps/rejected": -317.719482421875, "loss": 244917.625, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.026887858286499977, "rewards/margins": 0.005240024067461491, "rewards/rejected": -0.032127875834703445, "step": 110 }, { "epoch": 0.09, "learning_rate": 4.6875000000000004e-06, "logits/chosen": 0.7016499042510986, "logits/rejected": 0.6916753053665161, "logps/chosen": -354.7713928222656, "logps/rejected": -349.99114990234375, "loss": 243192.975, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03751590475440025, "rewards/margins": 0.007144705858081579, "rewards/rejected": -0.04466061294078827, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.999962359300416e-06, "logits/chosen": 0.6297165155410767, "logits/rejected": 0.6793630123138428, "logps/chosen": -338.17181396484375, "logps/rejected": -367.1688537597656, "loss": 241014.625, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05371738597750664, "rewards/margins": 0.009516485966742039, "rewards/rejected": -0.0632338672876358, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.998645053824218e-06, "logits/chosen": 0.5639317631721497, "logits/rejected": 0.610073447227478, "logps/chosen": -337.5505065917969, "logps/rejected": -357.8117370605469, "loss": 239699.275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06457965821027756, "rewards/margins": 0.011107666417956352, "rewards/rejected": -0.07568733394145966, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.9954468466732145e-06, "logits/chosen": 0.404470831155777, "logits/rejected": 0.439323753118515, "logps/chosen": -339.54901123046875, "logps/rejected": -331.80499267578125, "loss": 239823.925, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0667237862944603, "rewards/margins": 0.011366022750735283, "rewards/rejected": -0.07808981090784073, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.990370145357496e-06, "logits/chosen": 0.431760311126709, "logits/rejected": 0.49426621198654175, "logps/chosen": -370.2152404785156, "logps/rejected": -357.75103759765625, "loss": 236582.7, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0688774362206459, "rewards/margins": 0.015688957646489143, "rewards/rejected": -0.0845663920044899, "step": 160 }, { "epoch": 0.13, "learning_rate": 4.983418771458684e-06, "logits/chosen": 0.3582938313484192, "logits/rejected": 0.4145487844944, "logps/chosen": -393.54296875, "logps/rejected": -395.73968505859375, "loss": 233456.825, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09368976205587387, "rewards/margins": 0.019242364913225174, "rewards/rejected": -0.11293212324380875, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.97459795775315e-06, "logits/chosen": 0.24134807288646698, "logits/rejected": 0.34352773427963257, "logps/chosen": -479.29827880859375, "logps/rejected": -480.0040588378906, "loss": 226326.95, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1502467542886734, "rewards/margins": 0.02949386276304722, "rewards/rejected": -0.17974062263965607, "step": 180 }, { "epoch": 0.15, "learning_rate": 4.963914344272961e-06, "logits/chosen": 0.1726927012205124, "logits/rejected": 0.17276537418365479, "logps/chosen": -474.7080993652344, "logps/rejected": -503.10791015625, "loss": 226260.375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16851405799388885, "rewards/margins": 0.033648595213890076, "rewards/rejected": -0.20216265320777893, "step": 190 }, { "epoch": 0.16, "learning_rate": 4.951375973307458e-06, "logits/chosen": 0.08363378793001175, "logits/rejected": 0.19105622172355652, "logps/chosen": -507.63128662109375, "logps/rejected": -521.0435791015625, "loss": 229957.85, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.18896250426769257, "rewards/margins": 0.03131803497672081, "rewards/rejected": -0.22028055787086487, "step": 200 }, { "epoch": 0.16, "learning_rate": 4.93699228334928e-06, "logits/chosen": 0.027421921491622925, "logits/rejected": 0.09061434864997864, "logps/chosen": -499.6670837402344, "logps/rejected": -513.214111328125, "loss": 228005.45, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19364123046398163, "rewards/margins": 0.03244911879301071, "rewards/rejected": -0.22609035670757294, "step": 210 }, { "epoch": 0.17, "learning_rate": 4.920774101989362e-06, "logits/chosen": 0.015898525714874268, "logits/rejected": 0.06961273401975632, "logps/chosen": -542.1640625, "logps/rejected": -571.1614990234375, "loss": 220063.4, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.22638651728630066, "rewards/margins": 0.041207194328308105, "rewards/rejected": -0.2675936818122864, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.902733637766261e-06, "logits/chosen": -0.03187388926744461, "logits/rejected": -0.05228201299905777, "logps/chosen": -584.11083984375, "logps/rejected": -596.0657348632812, "loss": 223339.975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.27142053842544556, "rewards/margins": 0.04074941202998161, "rewards/rejected": -0.3121699094772339, "step": 230 }, { "epoch": 0.19, "learning_rate": 4.882884470975954e-06, "logits/chosen": -0.07718712091445923, "logits/rejected": -0.048422373831272125, "logps/chosen": -590.1216430664062, "logps/rejected": -614.7066650390625, "loss": 235878.475, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2930641770362854, "rewards/margins": 0.059319932013750076, "rewards/rejected": -0.352384090423584, "step": 240 }, { "epoch": 0.2, "learning_rate": 4.861241543449015e-06, "logits/chosen": 0.0006945926579646766, "logits/rejected": -0.017523247748613358, "logps/chosen": -501.6593322753906, "logps/rejected": -508.7500915527344, "loss": 233176.425, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22137589752674103, "rewards/margins": 0.02421090006828308, "rewards/rejected": -0.2455867975950241, "step": 250 }, { "epoch": 0.2, "learning_rate": 4.8378211473028755e-06, "logits/chosen": 0.013015779666602612, "logits/rejected": 0.12976054847240448, "logps/chosen": -529.851806640625, "logps/rejected": -552.1076049804688, "loss": 217051.375, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2184825837612152, "rewards/margins": 0.04304370656609535, "rewards/rejected": -0.26152628660202026, "step": 260 }, { "epoch": 0.21, "learning_rate": 4.812640912677624e-06, "logits/chosen": 0.10295257717370987, "logits/rejected": 0.10734937340021133, "logps/chosen": -514.9952392578125, "logps/rejected": -532.2618408203125, "loss": 228697.625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2471189945936203, "rewards/margins": 0.03284931927919388, "rewards/rejected": -0.2799683213233948, "step": 270 }, { "epoch": 0.22, "learning_rate": 4.785719794464596e-06, "logits/chosen": 0.08165857940912247, "logits/rejected": 0.11624784767627716, "logps/chosen": -614.1241455078125, "logps/rejected": -608.5601806640625, "loss": 219380.5, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.2938295304775238, "rewards/margins": 0.04496774449944496, "rewards/rejected": -0.33879727125167847, "step": 280 }, { "epoch": 0.23, "learning_rate": 4.757078058037722e-06, "logits/chosen": 0.049666643142700195, "logits/rejected": 0.037592388689517975, "logps/chosen": -709.3441162109375, "logps/rejected": -723.0515747070312, "loss": 222553.975, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.3819270133972168, "rewards/margins": 0.04088777303695679, "rewards/rejected": -0.4228147864341736, "step": 290 }, { "epoch": 0.24, "learning_rate": 4.72673726399839e-06, "logits/chosen": -0.011786766350269318, "logits/rejected": 0.012369150295853615, "logps/chosen": -711.5050659179688, "logps/rejected": -751.9043579101562, "loss": 216777.1, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3882337510585785, "rewards/margins": 0.05219928175210953, "rewards/rejected": -0.4404330849647522, "step": 300 }, { "epoch": 0.24, "learning_rate": 4.694720251945298e-06, "logits/chosen": -0.03419340401887894, "logits/rejected": 0.06611661612987518, "logps/chosen": -677.6072387695312, "logps/rejected": -711.2977905273438, "loss": 213271.225, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.36067238450050354, "rewards/margins": 0.058848440647125244, "rewards/rejected": -0.41952085494995117, "step": 310 }, { "epoch": 0.25, "learning_rate": 4.661051123281528e-06, "logits/chosen": -0.017030570656061172, "logits/rejected": -0.015286751091480255, "logps/chosen": -677.0394287109375, "logps/rejected": -715.4779052734375, "loss": 210663.625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3839905560016632, "rewards/margins": 0.06271493434906006, "rewards/rejected": -0.44670549035072327, "step": 320 }, { "epoch": 0.26, "learning_rate": 4.6257552230717536e-06, "logits/chosen": -0.07125677913427353, "logits/rejected": 0.0028305717278271914, "logps/chosen": -692.05517578125, "logps/rejected": -677.0928955078125, "loss": 225718.325, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.3579491972923279, "rewards/margins": 0.04101933166384697, "rewards/rejected": -0.39896851778030396, "step": 330 }, { "epoch": 0.27, "learning_rate": 4.588859120963282e-06, "logits/chosen": -0.02518557570874691, "logits/rejected": 0.010962575674057007, "logps/chosen": -620.8372802734375, "logps/rejected": -658.2938232421875, "loss": 223632.35, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.333041250705719, "rewards/margins": 0.04603361338376999, "rewards/rejected": -0.3790748715400696, "step": 340 }, { "epoch": 0.27, "learning_rate": 4.5503905911852435e-06, "logits/chosen": 0.007638671435415745, "logits/rejected": -0.031868986785411835, "logps/chosen": -696.9804077148438, "logps/rejected": -731.325439453125, "loss": 219627.55, "rewards/accuracies": 0.625, "rewards/chosen": -0.3883155286312103, "rewards/margins": 0.05170748755335808, "rewards/rejected": -0.4400230050086975, "step": 350 }, { "epoch": 0.28, "learning_rate": 4.510378591641036e-06, "logits/chosen": 0.0013331234222278, "logits/rejected": 0.0017831831937655807, "logps/chosen": -748.482421875, "logps/rejected": -801.7854614257812, "loss": 227919.725, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4426950514316559, "rewards/margins": 0.03837837651371956, "rewards/rejected": -0.48107343912124634, "step": 360 }, { "epoch": 0.29, "learning_rate": 4.468853242109712e-06, "logits/chosen": -0.035867899656295776, "logits/rejected": -0.031078562140464783, "logps/chosen": -724.6630859375, "logps/rejected": -755.1024169921875, "loss": 220950.85, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.43880361318588257, "rewards/margins": 0.05036715418100357, "rewards/rejected": -0.4891708493232727, "step": 370 }, { "epoch": 0.3, "learning_rate": 4.42584580157276e-06, "logits/chosen": -0.013162782415747643, "logits/rejected": -0.002673022449016571, "logps/chosen": -783.6541748046875, "logps/rejected": -803.5646362304688, "loss": 218305.25, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.44587355852127075, "rewards/margins": 0.06072569638490677, "rewards/rejected": -0.5065993070602417, "step": 380 }, { "epoch": 0.31, "learning_rate": 4.381388644683317e-06, "logits/chosen": -0.014830539003014565, "logits/rejected": -0.03430696204304695, "logps/chosen": -691.3172607421875, "logps/rejected": -730.2721557617188, "loss": 231019.025, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3912193477153778, "rewards/margins": 0.045134175568819046, "rewards/rejected": -0.43635350465774536, "step": 390 }, { "epoch": 0.31, "learning_rate": 4.33551523739555e-06, "logits/chosen": -0.07614697515964508, "logits/rejected": -0.05786391347646713, "logps/chosen": -633.596923828125, "logps/rejected": -678.1466674804688, "loss": 218172.15, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.33879750967025757, "rewards/margins": 0.05953192710876465, "rewards/rejected": -0.398329496383667, "step": 400 }, { "epoch": 0.32, "learning_rate": 4.288260111772535e-06, "logits/chosen": -0.0014561188872903585, "logits/rejected": 0.05796881392598152, "logps/chosen": -645.994384765625, "logps/rejected": -661.2260131835938, "loss": 211935.25, "rewards/accuracies": 0.625, "rewards/chosen": -0.34373944997787476, "rewards/margins": 0.05440125986933708, "rewards/rejected": -0.39814066886901855, "step": 410 }, { "epoch": 0.33, "learning_rate": 4.239658839991594e-06, "logits/chosen": -0.06332430243492126, "logits/rejected": -0.0031203762628138065, "logps/chosen": -718.9708251953125, "logps/rejected": -738.2911376953125, "loss": 210876.625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4123147428035736, "rewards/margins": 0.06380443274974823, "rewards/rejected": -0.47611913084983826, "step": 420 }, { "epoch": 0.34, "learning_rate": 4.189748007566686e-06, "logits/chosen": -0.03928182274103165, "logits/rejected": 0.015087981708347797, "logps/chosen": -762.869384765625, "logps/rejected": -791.5504760742188, "loss": 198171.0625, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.45352697372436523, "rewards/margins": 0.07380986958742142, "rewards/rejected": -0.5273367762565613, "step": 430 }, { "epoch": 0.35, "learning_rate": 4.138565185807972e-06, "logits/chosen": -0.011115262284874916, "logits/rejected": 0.020389294251799583, "logps/chosen": -725.9690551757812, "logps/rejected": -757.0069580078125, "loss": 200690.2, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40321993827819824, "rewards/margins": 0.06992204487323761, "rewards/rejected": -0.47314196825027466, "step": 440 }, { "epoch": 0.35, "learning_rate": 4.086148903539311e-06, "logits/chosen": 0.0033815137576311827, "logits/rejected": 0.02135222777724266, "logps/chosen": -694.905029296875, "logps/rejected": -763.0654907226562, "loss": 194541.9625, "rewards/accuracies": 0.625, "rewards/chosen": -0.39870306849479675, "rewards/margins": 0.08366172015666962, "rewards/rejected": -0.48236480355262756, "step": 450 }, { "epoch": 0.36, "learning_rate": 4.032538618094972e-06, "logits/chosen": 0.046546097844839096, "logits/rejected": 0.06920139491558075, "logps/chosen": -779.2876586914062, "logps/rejected": -792.000732421875, "loss": 223143.625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4378412365913391, "rewards/margins": 0.04833221063017845, "rewards/rejected": -0.48617345094680786, "step": 460 }, { "epoch": 0.37, "learning_rate": 3.977774685617386e-06, "logits/chosen": -0.024039577692747116, "logits/rejected": 0.034949518740177155, "logps/chosen": -762.0443115234375, "logps/rejected": -840.4417724609375, "loss": 188606.2, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4447455406188965, "rewards/margins": 0.09278737008571625, "rewards/rejected": -0.5375329256057739, "step": 470 }, { "epoch": 0.38, "learning_rate": 3.92189833067831e-06, "logits/chosen": -0.0034719116520136595, "logits/rejected": 0.03559238836169243, "logps/chosen": -727.2840576171875, "logps/rejected": -749.8525390625, "loss": 219781.9, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.39409303665161133, "rewards/margins": 0.05169659107923508, "rewards/rejected": -0.445789635181427, "step": 480 }, { "epoch": 0.38, "learning_rate": 3.864951615246261e-06, "logits/chosen": -0.10218246281147003, "logits/rejected": -0.04928632080554962, "logps/chosen": -696.552978515625, "logps/rejected": -761.4454956054688, "loss": 201005.8875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3766392767429352, "rewards/margins": 0.07472492754459381, "rewards/rejected": -0.4513641893863678, "step": 490 }, { "epoch": 0.39, "learning_rate": 3.806977407023581e-06, "logits/chosen": -0.08053256571292877, "logits/rejected": -0.027696877717971802, "logps/chosen": -716.932373046875, "logps/rejected": -746.5601196289062, "loss": 210761.25, "rewards/accuracies": 0.625, "rewards/chosen": -0.3994150161743164, "rewards/margins": 0.0657259151339531, "rewards/rejected": -0.4651409089565277, "step": 500 }, { "epoch": 0.4, "learning_rate": 3.7480193471769815e-06, "logits/chosen": -0.021161776036024094, "logits/rejected": 0.010638940148055553, "logps/chosen": -752.3082275390625, "logps/rejected": -824.42626953125, "loss": 208462.75, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4740660786628723, "rewards/margins": 0.07625924050807953, "rewards/rejected": -0.5503252744674683, "step": 510 }, { "epoch": 0.41, "learning_rate": 3.6881218174858354e-06, "logits/chosen": -0.08741439133882523, "logits/rejected": 0.028159821406006813, "logps/chosen": -808.7000732421875, "logps/rejected": -843.1199340820312, "loss": 204978.175, "rewards/accuracies": 0.625, "rewards/chosen": -0.4982606768608093, "rewards/margins": 0.08009181916713715, "rewards/rejected": -0.5783525109291077, "step": 520 }, { "epoch": 0.42, "learning_rate": 3.627329906932964e-06, "logits/chosen": -0.0961703211069107, "logits/rejected": 0.01851445809006691, "logps/chosen": -785.6272583007812, "logps/rejected": -845.8230590820312, "loss": 202598.85, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4964609146118164, "rewards/margins": 0.08662732690572739, "rewards/rejected": -0.583088219165802, "step": 530 }, { "epoch": 0.42, "learning_rate": 3.5656893777630686e-06, "logits/chosen": -0.09825177490711212, "logits/rejected": -0.0450114943087101, "logps/chosen": -800.4473266601562, "logps/rejected": -877.4309692382812, "loss": 206751.5, "rewards/accuracies": 0.625, "rewards/chosen": -0.49454784393310547, "rewards/margins": 0.08050151914358139, "rewards/rejected": -0.5750494003295898, "step": 540 }, { "epoch": 0.43, "learning_rate": 3.503246631034345e-06, "logits/chosen": -0.03669491782784462, "logits/rejected": 0.04769158363342285, "logps/chosen": -901.7119140625, "logps/rejected": -944.8836059570312, "loss": 232280.975, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5475555658340454, "rewards/margins": 0.05907317250967026, "rewards/rejected": -0.6066287159919739, "step": 550 }, { "epoch": 0.44, "learning_rate": 3.440048671689219e-06, "logits/chosen": -0.06334514915943146, "logits/rejected": 0.0007638297975063324, "logps/chosen": -795.4130249023438, "logps/rejected": -868.2384643554688, "loss": 209493.875, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5169624090194702, "rewards/margins": 0.07504051178693771, "rewards/rejected": -0.5920029878616333, "step": 560 }, { "epoch": 0.45, "learning_rate": 3.3761430731705056e-06, "logits/chosen": 0.016527634114027023, "logits/rejected": 0.005024082958698273, "logps/chosen": -785.9190673828125, "logps/rejected": -843.1102294921875, "loss": 221721.375, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.468578577041626, "rewards/margins": 0.05255637317895889, "rewards/rejected": -0.5211349725723267, "step": 570 }, { "epoch": 0.46, "learning_rate": 3.311577941609604e-06, "logits/chosen": -0.06407289206981659, "logits/rejected": -0.07606904208660126, "logps/chosen": -685.833740234375, "logps/rejected": -754.46240234375, "loss": 213850.075, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4029085040092468, "rewards/margins": 0.06034379079937935, "rewards/rejected": -0.46325230598449707, "step": 580 }, { "epoch": 0.46, "learning_rate": 3.2464018796137157e-06, "logits/chosen": -0.1008271723985672, "logits/rejected": -0.02092236839234829, "logps/chosen": -745.2072143554688, "logps/rejected": -780.5989379882812, "loss": 204456.325, "rewards/accuracies": 0.625, "rewards/chosen": -0.42655831575393677, "rewards/margins": 0.07653003931045532, "rewards/rejected": -0.5030883550643921, "step": 590 }, { "epoch": 0.47, "learning_rate": 3.1806639496793245e-06, "logits/chosen": -0.12006688117980957, "logits/rejected": -0.09113277494907379, "logps/chosen": -725.4483642578125, "logps/rejected": -775.4747314453125, "loss": 213843.55, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4422362446784973, "rewards/margins": 0.06481991708278656, "rewards/rejected": -0.5070561170578003, "step": 600 }, { "epoch": 0.48, "learning_rate": 3.114413637259484e-06, "logits/chosen": -0.09821692854166031, "logits/rejected": -0.005470535717904568, "logps/chosen": -746.876708984375, "logps/rejected": -782.652587890625, "loss": 219806.95, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4441962242126465, "rewards/margins": 0.06350026279687881, "rewards/rejected": -0.5076965093612671, "step": 610 }, { "epoch": 0.49, "learning_rate": 3.0477008135127247e-06, "logits/chosen": -0.09074200689792633, "logits/rejected": -0.03884665295481682, "logps/chosen": -720.9779052734375, "logps/rejected": -743.6715087890625, "loss": 208771.95, "rewards/accuracies": 0.625, "rewards/chosen": -0.40446630120277405, "rewards/margins": 0.062393445521593094, "rewards/rejected": -0.46685975790023804, "step": 620 }, { "epoch": 0.49, "learning_rate": 2.980575697761603e-06, "logits/chosen": -0.0913369357585907, "logits/rejected": -0.04820000380277634, "logps/chosen": -739.7911376953125, "logps/rejected": -790.2059936523438, "loss": 201716.1, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4266403317451477, "rewards/margins": 0.07397651672363281, "rewards/rejected": -0.5006168484687805, "step": 630 }, { "epoch": 0.5, "learning_rate": 2.9130888196891755e-06, "logits/chosen": -0.06239433214068413, "logits/rejected": 0.019041964784264565, "logps/chosen": -717.3048095703125, "logps/rejected": -778.1002807617188, "loss": 214451.2, "rewards/accuracies": 0.625, "rewards/chosen": -0.4269956946372986, "rewards/margins": 0.07155928760766983, "rewards/rejected": -0.4985550045967102, "step": 640 }, { "epoch": 0.51, "learning_rate": 2.845290981301834e-06, "logits/chosen": -0.04994695261120796, "logits/rejected": -0.04595662280917168, "logps/chosen": -726.4156494140625, "logps/rejected": -770.7301025390625, "loss": 211533.975, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4114510118961334, "rewards/margins": 0.06760390847921371, "rewards/rejected": -0.47905483841896057, "step": 650 }, { "epoch": 0.52, "learning_rate": 2.7772332186871464e-06, "logits/chosen": -0.0662728101015091, "logits/rejected": -0.038154177367687225, "logps/chosen": -671.4014892578125, "logps/rejected": -740.8782348632812, "loss": 198928.3625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.37442153692245483, "rewards/margins": 0.07407635450363159, "rewards/rejected": -0.44849786162376404, "step": 660 }, { "epoch": 0.53, "learning_rate": 2.708966763595493e-06, "logits/chosen": -0.11307473480701447, "logits/rejected": -0.09715189039707184, "logps/chosen": -669.0352783203125, "logps/rejected": -714.9685668945312, "loss": 209100.4625, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.37991762161254883, "rewards/margins": 0.07245246320962906, "rewards/rejected": -0.4523701071739197, "step": 670 }, { "epoch": 0.53, "learning_rate": 2.640543004874409e-06, "logits/chosen": -0.05394214391708374, "logits/rejected": -0.05522084981203079, "logps/chosen": -693.9007568359375, "logps/rejected": -747.3922729492188, "loss": 210209.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.39138975739479065, "rewards/margins": 0.07271454483270645, "rewards/rejected": -0.4641042649745941, "step": 680 }, { "epoch": 0.54, "learning_rate": 2.572013449784671e-06, "logits/chosen": -0.027393341064453125, "logits/rejected": -0.05223383754491806, "logps/chosen": -677.8887329101562, "logps/rejected": -740.2653198242188, "loss": 208986.8125, "rewards/accuracies": 0.625, "rewards/chosen": -0.4014677107334137, "rewards/margins": 0.07114878296852112, "rewards/rejected": -0.4726164937019348, "step": 690 }, { "epoch": 0.55, "learning_rate": 2.503429685227245e-06, "logits/chosen": -0.01898156851530075, "logits/rejected": -0.024372858926653862, "logps/chosen": -723.0344848632812, "logps/rejected": -742.4080810546875, "loss": 197337.45, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3911907970905304, "rewards/margins": 0.08259539306163788, "rewards/rejected": -0.4737861752510071, "step": 700 }, { "epoch": 0.56, "learning_rate": 2.434843338910286e-06, "logits/chosen": -0.05535392835736275, "logits/rejected": -0.07486173510551453, "logps/chosen": -754.6624755859375, "logps/rejected": -807.8843994140625, "loss": 199448.225, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4616442620754242, "rewards/margins": 0.08079784363508224, "rewards/rejected": -0.5424421429634094, "step": 710 }, { "epoch": 0.57, "learning_rate": 2.3663060404854155e-06, "logits/chosen": -0.047691427171230316, "logits/rejected": 0.02117311768233776, "logps/chosen": -810.5379028320312, "logps/rejected": -838.4991455078125, "loss": 222795.0, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.4926871359348297, "rewards/margins": 0.0528423972427845, "rewards/rejected": -0.5455294847488403, "step": 720 }, { "epoch": 0.57, "learning_rate": 2.2978693826825406e-06, "logits/chosen": -0.05972999334335327, "logits/rejected": 0.053670674562454224, "logps/chosen": -771.9221801757812, "logps/rejected": -843.3212890625, "loss": 208553.9125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.46609845757484436, "rewards/margins": 0.07322056591510773, "rewards/rejected": -0.5393189191818237, "step": 730 }, { "epoch": 0.58, "learning_rate": 2.2295848824724612e-06, "logits/chosen": -0.03266960754990578, "logits/rejected": 0.05311200022697449, "logps/chosen": -811.7847900390625, "logps/rejected": -888.1708984375, "loss": 196582.3, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4684126377105713, "rewards/margins": 0.09803617745637894, "rewards/rejected": -0.5664488077163696, "step": 740 }, { "epoch": 0.59, "learning_rate": 2.1615039422865136e-06, "logits/chosen": -0.02488083206117153, "logits/rejected": 0.00421232171356678, "logps/chosen": -749.4576416015625, "logps/rejected": -815.193115234375, "loss": 208486.4, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4397048354148865, "rewards/margins": 0.08280255645513535, "rewards/rejected": -0.52250736951828, "step": 750 }, { "epoch": 0.6, "learning_rate": 2.0936778113224253e-06, "logits/chosen": 0.0021423858124762774, "logits/rejected": 0.0016391072422266006, "logps/chosen": -767.9728393554688, "logps/rejected": -829.259765625, "loss": 207514.8625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.44562071561813354, "rewards/margins": 0.07032287120819092, "rewards/rejected": -0.5159436464309692, "step": 760 }, { "epoch": 0.6, "learning_rate": 2.0261575469655304e-06, "logits/chosen": -0.03952546417713165, "logits/rejected": 0.07452509552240372, "logps/chosen": -799.8232421875, "logps/rejected": -832.5133666992188, "loss": 207497.575, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4807282090187073, "rewards/margins": 0.07494666427373886, "rewards/rejected": -0.5556748509407043, "step": 770 }, { "epoch": 0.61, "learning_rate": 1.9589939763543693e-06, "logits/chosen": 0.030634239315986633, "logits/rejected": 0.04007285460829735, "logps/chosen": -783.6361083984375, "logps/rejected": -811.500732421875, "loss": 222795.075, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.48045220971107483, "rewards/margins": 0.06402502954006195, "rewards/rejected": -0.5444772839546204, "step": 780 }, { "epoch": 0.62, "learning_rate": 1.8922376581196107e-06, "logits/chosen": 0.03562704846262932, "logits/rejected": 0.09303382784128189, "logps/chosen": -801.5706787109375, "logps/rejected": -821.78564453125, "loss": 210521.95, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4747587740421295, "rewards/margins": 0.06814552843570709, "rewards/rejected": -0.5429043173789978, "step": 790 }, { "epoch": 0.63, "learning_rate": 1.8259388443250993e-06, "logits/chosen": 0.0239779744297266, "logits/rejected": 0.05331903696060181, "logps/chosen": -768.3508911132812, "logps/rejected": -849.29296875, "loss": 193255.2625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4437880516052246, "rewards/margins": 0.08583388477563858, "rewards/rejected": -0.5296218991279602, "step": 800 }, { "epoch": 0.64, "learning_rate": 1.760147442639679e-06, "logits/chosen": 0.038005001842975616, "logits/rejected": 0.03408225253224373, "logps/chosen": -798.1541748046875, "logps/rejected": -828.2268676757812, "loss": 208489.85, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4753784239292145, "rewards/margins": 0.0705508217215538, "rewards/rejected": -0.5459292531013489, "step": 810 }, { "epoch": 0.64, "learning_rate": 1.6949129787682628e-06, "logits/chosen": 0.03351437300443649, "logits/rejected": 0.0348244272172451, "logps/chosen": -833.7937622070312, "logps/rejected": -868.5783081054688, "loss": 203584.05, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4822169840335846, "rewards/margins": 0.07765035331249237, "rewards/rejected": -0.5598673224449158, "step": 820 }, { "epoch": 0.65, "learning_rate": 1.6302845591704348e-06, "logits/chosen": 0.028645271435379982, "logits/rejected": 0.03767635300755501, "logps/chosen": -773.2454833984375, "logps/rejected": -871.0046997070312, "loss": 205490.675, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.504454493522644, "rewards/margins": 0.07273290306329727, "rewards/rejected": -0.5771873593330383, "step": 830 }, { "epoch": 0.66, "learning_rate": 1.5663108340946465e-06, "logits/chosen": -0.016361240297555923, "logits/rejected": 0.02944091334939003, "logps/chosen": -810.2437744140625, "logps/rejected": -861.1094970703125, "loss": 201889.525, "rewards/accuracies": 0.625, "rewards/chosen": -0.5184367895126343, "rewards/margins": 0.078752800822258, "rewards/rejected": -0.5971895456314087, "step": 840 }, { "epoch": 0.67, "learning_rate": 1.5030399609558364e-06, "logits/chosen": -0.04980049282312393, "logits/rejected": -0.0025185912381857634, "logps/chosen": -838.6681518554688, "logps/rejected": -880.5914916992188, "loss": 201939.1, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.512233555316925, "rewards/margins": 0.08945199102163315, "rewards/rejected": -0.6016855835914612, "step": 850 }, { "epoch": 0.68, "learning_rate": 1.4405195680840357e-06, "logits/chosen": -0.006730606313794851, "logits/rejected": 0.008829834870994091, "logps/chosen": -809.2523193359375, "logps/rejected": -881.7073974609375, "loss": 201830.9875, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4848810136318207, "rewards/margins": 0.08851764351129532, "rewards/rejected": -0.5733986496925354, "step": 860 }, { "epoch": 0.68, "learning_rate": 1.378796718871252e-06, "logits/chosen": 0.003451726632192731, "logits/rejected": 0.04693222790956497, "logps/chosen": -828.8895263671875, "logps/rejected": -869.78515625, "loss": 223132.05, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5101678371429443, "rewards/margins": 0.06179608777165413, "rewards/rejected": -0.571963906288147, "step": 870 }, { "epoch": 0.69, "learning_rate": 1.3179178763436302e-06, "logits/chosen": -0.00423568207770586, "logits/rejected": 0.005345098674297333, "logps/chosen": -770.8553466796875, "logps/rejected": -826.5133666992188, "loss": 220881.4, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.48375120759010315, "rewards/margins": 0.054837919771671295, "rewards/rejected": -0.5385891199111938, "step": 880 }, { "epoch": 0.7, "learning_rate": 1.2579288681855364e-06, "logits/chosen": -0.04643459990620613, "logits/rejected": -0.00930569414049387, "logps/chosen": -800.4423828125, "logps/rejected": -868.2879638671875, "loss": 199618.2, "rewards/accuracies": 0.625, "rewards/chosen": -0.48793333768844604, "rewards/margins": 0.08429589867591858, "rewards/rejected": -0.572229266166687, "step": 890 }, { "epoch": 0.71, "learning_rate": 1.1988748522419163e-06, "logits/chosen": -0.021517012268304825, "logits/rejected": -0.0038474828470498323, "logps/chosen": -755.4798583984375, "logps/rejected": -817.7532348632812, "loss": 212245.85, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.46443241834640503, "rewards/margins": 0.0739377960562706, "rewards/rejected": -0.5383703112602234, "step": 900 }, { "epoch": 0.71, "learning_rate": 1.1408002825248842e-06, "logits/chosen": -0.08825428783893585, "logits/rejected": 0.0003117397427558899, "logps/chosen": -738.525146484375, "logps/rejected": -810.4708251953125, "loss": 195269.8, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.45489567518234253, "rewards/margins": 0.08346293121576309, "rewards/rejected": -0.5383585691452026, "step": 910 }, { "epoch": 0.72, "learning_rate": 1.0837488757501369e-06, "logits/chosen": -0.03578663989901543, "logits/rejected": -0.04554619640111923, "logps/chosen": -734.4967651367188, "logps/rejected": -777.3861694335938, "loss": 216987.525, "rewards/accuracies": 0.53125, "rewards/chosen": -0.44713911414146423, "rewards/margins": 0.06695397198200226, "rewards/rejected": -0.5140931010246277, "step": 920 }, { "epoch": 0.73, "learning_rate": 1.027763578428379e-06, "logits/chosen": -0.04421938955783844, "logits/rejected": 0.01812281273305416, "logps/chosen": -745.9063720703125, "logps/rejected": -811.608642578125, "loss": 198343.5375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4607219696044922, "rewards/margins": 0.08409135043621063, "rewards/rejected": -0.5448133945465088, "step": 930 }, { "epoch": 0.74, "learning_rate": 9.728865345365379e-07, "logits/chosen": -0.0179077610373497, "logits/rejected": 0.008015085011720657, "logps/chosen": -736.4071655273438, "logps/rejected": -788.5194091796875, "loss": 210246.7, "rewards/accuracies": 0.59375, "rewards/chosen": -0.45147353410720825, "rewards/margins": 0.07040319591760635, "rewards/rejected": -0.5218766927719116, "step": 940 }, { "epoch": 0.75, "learning_rate": 9.191590537930975e-07, "logits/chosen": -0.07539006322622299, "logits/rejected": 0.06443232297897339, "logps/chosen": -848.2332153320312, "logps/rejected": -881.9641723632812, "loss": 199938.1625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.49263858795166016, "rewards/margins": 0.0914057120680809, "rewards/rejected": -0.5840442776679993, "step": 950 }, { "epoch": 0.75, "learning_rate": 8.666215805614373e-07, "logits/chosen": -0.035882774740457535, "logits/rejected": -0.0026063353288918734, "logps/chosen": -816.1239013671875, "logps/rejected": -860.7882690429688, "loss": 195339.6, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.47968846559524536, "rewards/margins": 0.08311482518911362, "rewards/rejected": -0.5628032088279724, "step": 960 }, { "epoch": 0.76, "learning_rate": 8.153136634045844e-07, "logits/chosen": -0.021835245192050934, "logits/rejected": 0.0024397834204137325, "logps/chosen": -794.1774291992188, "logps/rejected": -856.4476318359375, "loss": 184634.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.48376354575157166, "rewards/margins": 0.10054849088191986, "rewards/rejected": -0.5843120813369751, "step": 970 }, { "epoch": 0.77, "learning_rate": 7.652739253142915e-07, "logits/chosen": -0.009013411588966846, "logits/rejected": 0.01207656692713499, "logps/chosen": -827.5479736328125, "logps/rejected": -872.8331909179688, "loss": 205367.375, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5190316438674927, "rewards/margins": 0.0740259662270546, "rewards/rejected": -0.5930576324462891, "step": 980 }, { "epoch": 0.78, "learning_rate": 7.165400346368648e-07, "logits/chosen": -0.027726858854293823, "logits/rejected": -0.0028947251848876476, "logps/chosen": -791.7979736328125, "logps/rejected": -881.1510620117188, "loss": 197640.8625, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5166811943054199, "rewards/margins": 0.094887375831604, "rewards/rejected": -0.6115685701370239, "step": 990 }, { "epoch": 0.79, "learning_rate": 6.691486767176092e-07, "logits/chosen": -0.025321567431092262, "logits/rejected": 0.03234836459159851, "logps/chosen": -768.285400390625, "logps/rejected": -811.7093505859375, "loss": 219588.225, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5012341737747192, "rewards/margins": 0.06108912080526352, "rewards/rejected": -0.5623233318328857, "step": 1000 }, { "epoch": 0.79, "learning_rate": 6.231355262852529e-07, "logits/chosen": -0.030688485130667686, "logits/rejected": 0.05593588203191757, "logps/chosen": -845.5794067382812, "logps/rejected": -900.7802734375, "loss": 191174.325, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5286746025085449, "rewards/margins": 0.10104093700647354, "rewards/rejected": -0.6297155618667603, "step": 1010 }, { "epoch": 0.8, "learning_rate": 5.785352205971275e-07, "logits/chosen": -0.03481902927160263, "logits/rejected": 0.09135408699512482, "logps/chosen": -841.7257080078125, "logps/rejected": -931.0070190429688, "loss": 200004.7, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.550171971321106, "rewards/margins": 0.0866222232580185, "rewards/rejected": -0.6367942094802856, "step": 1020 }, { "epoch": 0.81, "learning_rate": 5.353813333653287e-07, "logits/chosen": 0.016893912106752396, "logits/rejected": 0.0011436156928539276, "logps/chosen": -758.1476440429688, "logps/rejected": -817.4668579101562, "loss": 212142.725, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4770483374595642, "rewards/margins": 0.07137372344732285, "rewards/rejected": -0.54842209815979, "step": 1030 }, { "epoch": 0.82, "learning_rate": 4.937063494834774e-07, "logits/chosen": -0.014474679715931416, "logits/rejected": -0.032013148069381714, "logps/chosen": -804.8319091796875, "logps/rejected": -824.1060791015625, "loss": 212177.825, "rewards/accuracies": 0.5625, "rewards/chosen": -0.49243831634521484, "rewards/margins": 0.06834886968135834, "rewards/rejected": -0.5607872605323792, "step": 1040 }, { "epoch": 0.82, "learning_rate": 4.5354164057310857e-07, "logits/chosen": -0.0435628667473793, "logits/rejected": 0.027259284630417824, "logps/chosen": -789.1632080078125, "logps/rejected": -851.56298828125, "loss": 190594.05, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.49468034505844116, "rewards/margins": 0.09655535966157913, "rewards/rejected": -0.5912356972694397, "step": 1050 }, { "epoch": 0.83, "learning_rate": 4.1491744136810066e-07, "logits/chosen": -0.04903930425643921, "logits/rejected": 0.004996160510927439, "logps/chosen": -817.1986083984375, "logps/rejected": -844.3206787109375, "loss": 206357.5875, "rewards/accuracies": 0.625, "rewards/chosen": -0.5099014043807983, "rewards/margins": 0.07538522779941559, "rewards/rejected": -0.5852866768836975, "step": 1060 }, { "epoch": 0.84, "learning_rate": 3.7786282695491313e-07, "logits/chosen": -0.013560289517045021, "logits/rejected": 0.024793395772576332, "logps/chosen": -859.4154052734375, "logps/rejected": -930.1591796875, "loss": 202759.0375, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5486615300178528, "rewards/margins": 0.08299361169338226, "rewards/rejected": -0.6316550970077515, "step": 1070 }, { "epoch": 0.85, "learning_rate": 3.4240569088577564e-07, "logits/chosen": -0.043563805520534515, "logits/rejected": 0.038817066699266434, "logps/chosen": -784.8958129882812, "logps/rejected": -838.8773193359375, "loss": 196943.7125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4984671175479889, "rewards/margins": 0.0784321129322052, "rewards/rejected": -0.5768992900848389, "step": 1080 }, { "epoch": 0.86, "learning_rate": 3.0857272418129136e-07, "logits/chosen": 0.014950004406273365, "logits/rejected": 0.008164005354046822, "logps/chosen": -833.267578125, "logps/rejected": -885.646484375, "loss": 198440.8625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5129631757736206, "rewards/margins": 0.08385080099105835, "rewards/rejected": -0.596813976764679, "step": 1090 }, { "epoch": 0.86, "learning_rate": 2.7638939523827956e-07, "logits/chosen": -0.0004991426831111312, "logits/rejected": 0.01584457978606224, "logps/chosen": -853.5109252929688, "logps/rejected": -916.8821411132812, "loss": 204926.275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5368956327438354, "rewards/margins": 0.08267486840486526, "rewards/rejected": -0.6195704936981201, "step": 1100 }, { "epoch": 0.87, "learning_rate": 2.4587993065795983e-07, "logits/chosen": -0.043995875865221024, "logits/rejected": 0.016014937311410904, "logps/chosen": -809.5734252929688, "logps/rejected": -882.4210815429688, "loss": 201038.4375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5105189085006714, "rewards/margins": 0.0844583585858345, "rewards/rejected": -0.5949772596359253, "step": 1110 }, { "epoch": 0.88, "learning_rate": 2.170672970089291e-07, "logits/chosen": -0.00044642388820648193, "logits/rejected": -0.03587064892053604, "logps/chosen": -791.4276733398438, "logps/rejected": -819.1533813476562, "loss": 219342.475, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.49560171365737915, "rewards/margins": 0.05810080096125603, "rewards/rejected": -0.5537024736404419, "step": 1120 }, { "epoch": 0.89, "learning_rate": 1.8997318353864673e-07, "logits/chosen": 0.02901034988462925, "logits/rejected": -0.009537712670862675, "logps/chosen": -746.2117919921875, "logps/rejected": -798.05224609375, "loss": 214356.25, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.48434096574783325, "rewards/margins": 0.07087163627147675, "rewards/rejected": -0.5552126169204712, "step": 1130 }, { "epoch": 0.89, "learning_rate": 1.6461798584644944e-07, "logits/chosen": -0.0717676654458046, "logits/rejected": -0.04001053422689438, "logps/chosen": -844.8236083984375, "logps/rejected": -927.1820068359375, "loss": 176394.5875, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5142861008644104, "rewards/margins": 0.11530987918376923, "rewards/rejected": -0.629595935344696, "step": 1140 }, { "epoch": 0.9, "learning_rate": 1.4102079053038454e-07, "logits/chosen": -0.013663539662957191, "logits/rejected": -0.04209432005882263, "logps/chosen": -806.2244262695312, "logps/rejected": -875.3914184570312, "loss": 205334.5, "rewards/accuracies": 0.625, "rewards/chosen": -0.5035674571990967, "rewards/margins": 0.08000927418470383, "rewards/rejected": -0.5835766792297363, "step": 1150 }, { "epoch": 0.91, "learning_rate": 1.1919936081941585e-07, "logits/chosen": -0.004432595334947109, "logits/rejected": 0.02214151620864868, "logps/chosen": -842.6566162109375, "logps/rejected": -888.5098876953125, "loss": 193055.6, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5068452954292297, "rewards/margins": 0.09299594908952713, "rewards/rejected": -0.5998412370681763, "step": 1160 }, { "epoch": 0.92, "learning_rate": 9.917012320182245e-08, "logits/chosen": -0.05645931884646416, "logits/rejected": 0.1126350611448288, "logps/chosen": -817.47509765625, "logps/rejected": -878.1007690429688, "loss": 193461.9875, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4907683730125427, "rewards/margins": 0.08928798139095306, "rewards/rejected": -0.5800563097000122, "step": 1170 }, { "epoch": 0.93, "learning_rate": 8.094815505985315e-08, "logits/chosen": -0.03593028336763382, "logits/rejected": 0.005944651551544666, "logps/chosen": -824.0018310546875, "logps/rejected": -869.1282958984375, "loss": 213653.2, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5079488754272461, "rewards/margins": 0.07349041849374771, "rewards/rejected": -0.5814392566680908, "step": 1180 }, { "epoch": 0.93, "learning_rate": 6.454717331994542e-08, "logits/chosen": -0.020399611443281174, "logits/rejected": 0.020858485251665115, "logps/chosen": -794.6226806640625, "logps/rejected": -853.1974487304688, "loss": 201205.1125, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.48565536737442017, "rewards/margins": 0.08772826939821243, "rewards/rejected": -0.5733836889266968, "step": 1190 }, { "epoch": 0.94, "learning_rate": 4.9979524127052595e-08, "logits/chosen": -0.017864791676402092, "logits/rejected": 0.025667399168014526, "logps/chosen": -763.6353759765625, "logps/rejected": -851.0655517578125, "loss": 201221.4875, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4817201495170593, "rewards/margins": 0.08347930014133453, "rewards/rejected": -0.5651993751525879, "step": 1200 }, { "epoch": 0.95, "learning_rate": 3.725617355085476e-08, "logits/chosen": -0.053539175540208817, "logits/rejected": -0.005616022739559412, "logps/chosen": -837.1541137695312, "logps/rejected": -917.78955078125, "loss": 216221.275, "rewards/accuracies": 0.59375, "rewards/chosen": -0.513336181640625, "rewards/margins": 0.06921021640300751, "rewards/rejected": -0.5825464129447937, "step": 1210 }, { "epoch": 0.96, "learning_rate": 2.63866993308437e-08, "logits/chosen": -0.04671555012464523, "logits/rejected": 0.012625640258193016, "logps/chosen": -850.5285034179688, "logps/rejected": -898.7066650390625, "loss": 203778.7, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5049459934234619, "rewards/margins": 0.08938591182231903, "rewards/rejected": -0.5943318605422974, "step": 1220 }, { "epoch": 0.97, "learning_rate": 1.737928366650099e-08, "logits/chosen": -0.03952642157673836, "logits/rejected": -0.024786999449133873, "logps/chosen": -845.3889770507812, "logps/rejected": -912.8518676757812, "loss": 203856.5625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5316712856292725, "rewards/margins": 0.08585255593061447, "rewards/rejected": -0.6175239086151123, "step": 1230 }, { "epoch": 0.97, "learning_rate": 1.0240707057995735e-08, "logits/chosen": -0.009350773878395557, "logits/rejected": 0.0019861645996570587, "logps/chosen": -840.8093872070312, "logps/rejected": -873.7894287109375, "loss": 219184.125, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.5165701508522034, "rewards/margins": 0.07291480898857117, "rewards/rejected": -0.5894849896430969, "step": 1240 }, { "epoch": 0.98, "learning_rate": 4.976343202034717e-09, "logits/chosen": -0.04459824040532112, "logits/rejected": -0.026420336216688156, "logps/chosen": -822.0613403320312, "logps/rejected": -859.7013549804688, "loss": 218496.775, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5172399878501892, "rewards/margins": 0.0695352554321289, "rewards/rejected": -0.5867753028869629, "step": 1250 }, { "epoch": 0.99, "learning_rate": 1.5901549467139953e-09, "logits/chosen": 0.020182963460683823, "logits/rejected": 0.03361889719963074, "logps/chosen": -862.8426513671875, "logps/rejected": -934.9107666015625, "loss": 214655.3, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5248240828514099, "rewards/margins": 0.06909482926130295, "rewards/rejected": -0.5939189791679382, "step": 1260 }, { "epoch": 1.0, "learning_rate": 8.469130840960127e-11, "logits/chosen": -0.02860623598098755, "logits/rejected": -0.045795194804668427, "logps/chosen": -831.4700317382812, "logps/rejected": -876.29296875, "loss": 191498.6375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5061026215553284, "rewards/margins": 0.09500899165868759, "rewards/rejected": -0.601111650466919, "step": 1270 }, { "epoch": 1.0, "step": 1273, "total_flos": 0.0, "train_loss": 214508.74675962294, "train_runtime": 12229.8763, "train_samples_per_second": 1.666, "train_steps_per_second": 0.104 } ], "logging_steps": 10, "max_steps": 1273, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }