{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9801980198019802, "eval_steps": 100, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023296447291788003, "grad_norm": 3.924811840057373, "learning_rate": 2.3282887077997674e-07, "logits/chosen": -4.522591590881348, "logits/rejected": -4.452101707458496, "logps/chosen": -684.3924560546875, "logps/rejected": -665.7002563476562, "loss": 0.878, "rewards/accuracies": 0.4375, "rewards/chosen": -0.12848183512687683, "rewards/margins": -0.08472710102796555, "rewards/rejected": -0.04375474527478218, "step": 10 }, { "epoch": 0.004659289458357601, "grad_norm": 4.744262218475342, "learning_rate": 4.656577415599535e-07, "logits/chosen": -4.606254577636719, "logits/rejected": -4.580027103424072, "logps/chosen": -706.7974243164062, "logps/rejected": -657.6036987304688, "loss": 0.8944, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10672809183597565, "rewards/margins": -0.03675349801778793, "rewards/rejected": -0.06997456401586533, "step": 20 }, { "epoch": 0.006988934187536401, "grad_norm": 5.417810440063477, "learning_rate": 6.984866123399302e-07, "logits/chosen": -4.553195476531982, "logits/rejected": -4.513618469238281, "logps/chosen": -703.244873046875, "logps/rejected": -695.6546630859375, "loss": 0.9522, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.12389123439788818, "rewards/margins": -0.1695944219827652, "rewards/rejected": 0.045703209936618805, "step": 30 }, { "epoch": 0.009318578916715201, "grad_norm": 3.657839775085449, "learning_rate": 9.31315483119907e-07, "logits/chosen": -4.52838134765625, "logits/rejected": -4.499939441680908, "logps/chosen": -709.3988037109375, "logps/rejected": -729.5408935546875, "loss": 0.7638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.016293197870254517, "rewards/margins": 0.1943705528974533, "rewards/rejected": -0.21066375076770782, "step": 40 }, { "epoch": 0.011648223645894, "grad_norm": 4.347109317779541, "learning_rate": 1.1641443538998836e-06, "logits/chosen": -4.601327419281006, "logits/rejected": -4.51685094833374, "logps/chosen": -728.6759643554688, "logps/rejected": -757.50390625, "loss": 0.8954, "rewards/accuracies": 0.4375, "rewards/chosen": 0.026629159227013588, "rewards/margins": -0.0508374348282814, "rewards/rejected": 0.07746660709381104, "step": 50 }, { "epoch": 0.013977868375072802, "grad_norm": 4.963409900665283, "learning_rate": 1.3969732246798604e-06, "logits/chosen": -4.556868076324463, "logits/rejected": -4.478787422180176, "logps/chosen": -754.9805908203125, "logps/rejected": -714.3489379882812, "loss": 1.0465, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.1778850257396698, "rewards/margins": -0.26734185218811035, "rewards/rejected": 0.08945684880018234, "step": 60 }, { "epoch": 0.016307513104251603, "grad_norm": 4.398076057434082, "learning_rate": 1.629802095459837e-06, "logits/chosen": -4.506744861602783, "logits/rejected": -4.536816596984863, "logps/chosen": -714.246337890625, "logps/rejected": -728.5725708007812, "loss": 0.8227, "rewards/accuracies": 0.5, "rewards/chosen": 0.009435917250812054, "rewards/margins": 0.08285114914178848, "rewards/rejected": -0.07341523468494415, "step": 70 }, { "epoch": 0.018637157833430402, "grad_norm": 3.7324328422546387, "learning_rate": 1.862630966239814e-06, "logits/chosen": -4.5511274337768555, "logits/rejected": -4.537388801574707, "logps/chosen": -686.8524780273438, "logps/rejected": -745.160888671875, "loss": 0.8821, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.02854597568511963, "rewards/margins": -0.06114887073636055, "rewards/rejected": 0.08969485759735107, "step": 80 }, { "epoch": 0.020966802562609202, "grad_norm": 5.837793350219727, "learning_rate": 2.0954598370197905e-06, "logits/chosen": -4.489697456359863, "logits/rejected": -4.449906826019287, "logps/chosen": -680.8670654296875, "logps/rejected": -650.242431640625, "loss": 0.8567, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08860909938812256, "rewards/margins": 0.003706153482198715, "rewards/rejected": -0.09231523424386978, "step": 90 }, { "epoch": 0.023296447291788, "grad_norm": 5.7243475914001465, "learning_rate": 2.3282887077997673e-06, "logits/chosen": -4.5048298835754395, "logits/rejected": -4.545318603515625, "logps/chosen": -686.437744140625, "logps/rejected": -765.6935424804688, "loss": 0.7578, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1273827850818634, "rewards/margins": 0.3406364917755127, "rewards/rejected": -0.21325376629829407, "step": 100 }, { "epoch": 0.023296447291788, "eval_logits/chosen": -4.505529403686523, "eval_logits/rejected": -4.499370574951172, "eval_logps/chosen": -690.83056640625, "eval_logps/rejected": -706.8093872070312, "eval_loss": 0.6929930448532104, "eval_rewards/accuracies": 0.5116019248962402, "eval_rewards/chosen": -0.0002367756824241951, "eval_rewards/margins": 0.0008354606688953936, "eval_rewards/rejected": -0.0010722363367676735, "eval_runtime": 384.5796, "eval_samples_per_second": 18.602, "eval_steps_per_second": 9.301, "step": 100 }, { "epoch": 0.0256260920209668, "grad_norm": 4.138678550720215, "learning_rate": 2.5611175785797445e-06, "logits/chosen": -4.5441060066223145, "logits/rejected": -4.443483829498291, "logps/chosen": -748.9180908203125, "logps/rejected": -738.2803955078125, "loss": 0.7429, "rewards/accuracies": 0.625, "rewards/chosen": 0.06179947406053543, "rewards/margins": 0.23742561042308807, "rewards/rejected": -0.17562614381313324, "step": 110 }, { "epoch": 0.027955736750145604, "grad_norm": 3.8973371982574463, "learning_rate": 2.793946449359721e-06, "logits/chosen": -4.614588260650635, "logits/rejected": -4.612898826599121, "logps/chosen": -732.0816650390625, "logps/rejected": -785.8519287109375, "loss": 0.7942, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07624070346355438, "rewards/margins": 0.12703315913677216, "rewards/rejected": -0.05079244449734688, "step": 120 }, { "epoch": 0.030285381479324403, "grad_norm": 2.5106780529022217, "learning_rate": 3.0267753201396976e-06, "logits/chosen": -4.505324363708496, "logits/rejected": -4.493563652038574, "logps/chosen": -688.0574951171875, "logps/rejected": -751.806396484375, "loss": 0.9648, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05348503589630127, "rewards/margins": -0.17224565148353577, "rewards/rejected": 0.1187606006860733, "step": 130 }, { "epoch": 0.032615026208503206, "grad_norm": 3.486295700073242, "learning_rate": 3.259604190919674e-06, "logits/chosen": -4.501693248748779, "logits/rejected": -4.5221099853515625, "logps/chosen": -692.7454223632812, "logps/rejected": -702.983154296875, "loss": 0.8493, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.02032635547220707, "rewards/margins": 0.046482861042022705, "rewards/rejected": -0.06680919975042343, "step": 140 }, { "epoch": 0.034944670937682006, "grad_norm": 4.175703048706055, "learning_rate": 3.492433061699651e-06, "logits/chosen": -4.6017351150512695, "logits/rejected": -4.442572593688965, "logps/chosen": -810.1216430664062, "logps/rejected": -699.7447509765625, "loss": 0.9037, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.06848535686731339, "rewards/margins": -0.10816104710102081, "rewards/rejected": 0.03967570140957832, "step": 150 }, { "epoch": 0.037274315666860805, "grad_norm": 4.84321403503418, "learning_rate": 3.725261932479628e-06, "logits/chosen": -4.5791015625, "logits/rejected": -4.534665107727051, "logps/chosen": -746.9636840820312, "logps/rejected": -760.25390625, "loss": 0.8184, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.015708060935139656, "rewards/margins": 0.1453014314174652, "rewards/rejected": -0.16100946068763733, "step": 160 }, { "epoch": 0.039603960396039604, "grad_norm": 5.143755912780762, "learning_rate": 3.958090803259605e-06, "logits/chosen": -4.579737663269043, "logits/rejected": -4.561814785003662, "logps/chosen": -731.2387084960938, "logps/rejected": -708.92578125, "loss": 0.7407, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.1105731949210167, "rewards/margins": 0.251499742269516, "rewards/rejected": -0.1409265697002411, "step": 170 }, { "epoch": 0.041933605125218404, "grad_norm": 3.8264622688293457, "learning_rate": 4.190919674039581e-06, "logits/chosen": -4.5193586349487305, "logits/rejected": -4.51457405090332, "logps/chosen": -698.3035888671875, "logps/rejected": -717.1882934570312, "loss": 0.762, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.054043758660554886, "rewards/margins": 0.21840786933898926, "rewards/rejected": -0.16436411440372467, "step": 180 }, { "epoch": 0.0442632498543972, "grad_norm": 4.408588409423828, "learning_rate": 4.423748544819557e-06, "logits/chosen": -4.572482585906982, "logits/rejected": -4.569614410400391, "logps/chosen": -784.6768188476562, "logps/rejected": -776.5426635742188, "loss": 1.1467, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06866107136011124, "rewards/margins": -0.35513216257095337, "rewards/rejected": 0.28647106885910034, "step": 190 }, { "epoch": 0.046592894583576, "grad_norm": 4.938295364379883, "learning_rate": 4.6565774155995345e-06, "logits/chosen": -4.534358024597168, "logits/rejected": -4.555275917053223, "logps/chosen": -691.1871337890625, "logps/rejected": -718.7325439453125, "loss": 0.7219, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.007454717066138983, "rewards/margins": 0.2289915531873703, "rewards/rejected": -0.22153684496879578, "step": 200 }, { "epoch": 0.046592894583576, "eval_logits/chosen": -4.502413749694824, "eval_logits/rejected": -4.496292591094971, "eval_logps/chosen": -690.849365234375, "eval_logps/rejected": -706.842041015625, "eval_loss": 0.6923176050186157, "eval_rewards/accuracies": 0.517892062664032, "eval_rewards/chosen": -0.0021104670595377684, "eval_rewards/margins": 0.0022353504318743944, "eval_rewards/rejected": -0.004345817491412163, "eval_runtime": 385.229, "eval_samples_per_second": 18.571, "eval_steps_per_second": 9.285, "step": 200 }, { "epoch": 0.0489225393127548, "grad_norm": 4.715427875518799, "learning_rate": 4.889406286379512e-06, "logits/chosen": -4.474791526794434, "logits/rejected": -4.529447555541992, "logps/chosen": -701.8544921875, "logps/rejected": -734.6244506835938, "loss": 0.7614, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.10978059470653534, "rewards/margins": 0.19003726541996002, "rewards/rejected": -0.08025668561458588, "step": 210 }, { "epoch": 0.0512521840419336, "grad_norm": 5.457529067993164, "learning_rate": 5.122235157159489e-06, "logits/chosen": -4.573043346405029, "logits/rejected": -4.5491943359375, "logps/chosen": -690.1962280273438, "logps/rejected": -711.5494384765625, "loss": 0.9218, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.1155082955956459, "rewards/margins": -0.14690880477428436, "rewards/rejected": 0.03140052407979965, "step": 220 }, { "epoch": 0.05358182877111241, "grad_norm": 4.8636555671691895, "learning_rate": 5.355064027939465e-06, "logits/chosen": -4.635704040527344, "logits/rejected": -4.550039768218994, "logps/chosen": -756.4803466796875, "logps/rejected": -737.9407958984375, "loss": 0.8103, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03032105602324009, "rewards/margins": 0.0932706966996193, "rewards/rejected": -0.06294964253902435, "step": 230 }, { "epoch": 0.05591147350029121, "grad_norm": 5.851677417755127, "learning_rate": 5.587892898719442e-06, "logits/chosen": -4.482217788696289, "logits/rejected": -4.481154441833496, "logps/chosen": -727.5157470703125, "logps/rejected": -705.8177490234375, "loss": 0.8402, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.07104573398828506, "rewards/margins": 0.024619558826088905, "rewards/rejected": 0.04642615467309952, "step": 240 }, { "epoch": 0.05824111822947001, "grad_norm": 3.755674123764038, "learning_rate": 5.820721769499419e-06, "logits/chosen": -4.555675506591797, "logits/rejected": -4.55229377746582, "logps/chosen": -688.3670654296875, "logps/rejected": -759.0899047851562, "loss": 0.9507, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.029390472918748856, "rewards/margins": -0.06705763190984726, "rewards/rejected": 0.09644811600446701, "step": 250 }, { "epoch": 0.060570762958648806, "grad_norm": 4.303655624389648, "learning_rate": 6.053550640279395e-06, "logits/chosen": -4.517312049865723, "logits/rejected": -4.544795036315918, "logps/chosen": -664.3435668945312, "logps/rejected": -723.6569213867188, "loss": 0.8181, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.03719661012291908, "rewards/margins": 0.10673441737890244, "rewards/rejected": -0.1439310610294342, "step": 260 }, { "epoch": 0.0629004076878276, "grad_norm": 3.8483376502990723, "learning_rate": 6.2863795110593715e-06, "logits/chosen": -4.524532318115234, "logits/rejected": -4.585890293121338, "logps/chosen": -664.9467163085938, "logps/rejected": -704.1334228515625, "loss": 0.8362, "rewards/accuracies": 0.5, "rewards/chosen": -0.1777094155550003, "rewards/margins": 0.05187971517443657, "rewards/rejected": -0.22958913445472717, "step": 270 }, { "epoch": 0.06523005241700641, "grad_norm": 5.691915988922119, "learning_rate": 6.519208381839348e-06, "logits/chosen": -4.5398993492126465, "logits/rejected": -4.574265480041504, "logps/chosen": -702.3248291015625, "logps/rejected": -724.2568359375, "loss": 0.9145, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.08214785158634186, "rewards/margins": -0.08197866380214691, "rewards/rejected": -0.00016917586617637426, "step": 280 }, { "epoch": 0.0675596971461852, "grad_norm": 6.575222015380859, "learning_rate": 6.752037252619326e-06, "logits/chosen": -4.494467735290527, "logits/rejected": -4.485854148864746, "logps/chosen": -725.3032836914062, "logps/rejected": -746.1402587890625, "loss": 0.8225, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.014149573631584644, "rewards/margins": 0.014778072014451027, "rewards/rejected": -0.0006285011768341064, "step": 290 }, { "epoch": 0.06988934187536401, "grad_norm": 4.541840076446533, "learning_rate": 6.984866123399302e-06, "logits/chosen": -4.506882190704346, "logits/rejected": -4.480749607086182, "logps/chosen": -778.0560302734375, "logps/rejected": -752.30224609375, "loss": 0.9151, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.11194051802158356, "rewards/margins": -0.137613907456398, "rewards/rejected": 0.0256733950227499, "step": 300 }, { "epoch": 0.06988934187536401, "eval_logits/chosen": -4.500364780426025, "eval_logits/rejected": -4.49424934387207, "eval_logps/chosen": -690.8859252929688, "eval_logps/rejected": -706.9114990234375, "eval_loss": 0.6907868981361389, "eval_rewards/accuracies": 0.5388593673706055, "eval_rewards/chosen": -0.005771713797003031, "eval_rewards/margins": 0.005515825469046831, "eval_rewards/rejected": -0.011287540197372437, "eval_runtime": 385.5275, "eval_samples_per_second": 18.556, "eval_steps_per_second": 9.278, "step": 300 }, { "epoch": 0.0722189866045428, "grad_norm": 5.339130401611328, "learning_rate": 7.2176949941792785e-06, "logits/chosen": -4.623049259185791, "logits/rejected": -4.607933044433594, "logps/chosen": -787.4456176757812, "logps/rejected": -809.6871337890625, "loss": 1.0983, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.2957928776741028, "rewards/margins": -0.4004906117916107, "rewards/rejected": 0.10469770431518555, "step": 310 }, { "epoch": 0.07454863133372161, "grad_norm": 4.564884662628174, "learning_rate": 7.450523864959256e-06, "logits/chosen": -4.502755165100098, "logits/rejected": -4.475649833679199, "logps/chosen": -669.8948364257812, "logps/rejected": -723.6424560546875, "loss": 0.8954, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05094115808606148, "rewards/margins": -0.03177493438124657, "rewards/rejected": -0.019166234880685806, "step": 320 }, { "epoch": 0.0768782760629004, "grad_norm": 5.733292102813721, "learning_rate": 7.683352735739232e-06, "logits/chosen": -4.540478706359863, "logits/rejected": -4.566623210906982, "logps/chosen": -757.8765258789062, "logps/rejected": -708.9313354492188, "loss": 0.8511, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.047536421567201614, "rewards/margins": 0.053884364664554596, "rewards/rejected": -0.006347939372062683, "step": 330 }, { "epoch": 0.07920792079207921, "grad_norm": 5.753773212432861, "learning_rate": 7.91618160651921e-06, "logits/chosen": -4.459725856781006, "logits/rejected": -4.57793664932251, "logps/chosen": -723.3109741210938, "logps/rejected": -762.54736328125, "loss": 1.0541, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.049885571002960205, "rewards/margins": -0.25494030117988586, "rewards/rejected": 0.20505471527576447, "step": 340 }, { "epoch": 0.081537565521258, "grad_norm": 5.136266231536865, "learning_rate": 8.149010477299186e-06, "logits/chosen": -4.529461860656738, "logits/rejected": -4.453719615936279, "logps/chosen": -711.1002197265625, "logps/rejected": -640.2122802734375, "loss": 0.8314, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0677490234375, "rewards/margins": 0.18676519393920898, "rewards/rejected": -0.11901617050170898, "step": 350 }, { "epoch": 0.08386721025043681, "grad_norm": 5.856282711029053, "learning_rate": 8.381839348079162e-06, "logits/chosen": -4.503441333770752, "logits/rejected": -4.504217624664307, "logps/chosen": -777.5228271484375, "logps/rejected": -750.1853637695312, "loss": 0.9783, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.07102885842323303, "rewards/margins": -0.1274302750825882, "rewards/rejected": 0.19845914840698242, "step": 360 }, { "epoch": 0.08619685497961561, "grad_norm": 5.638661861419678, "learning_rate": 8.61466821885914e-06, "logits/chosen": -4.5200605392456055, "logits/rejected": -4.6175665855407715, "logps/chosen": -752.9575805664062, "logps/rejected": -788.6130981445312, "loss": 0.8494, "rewards/accuracies": 0.5, "rewards/chosen": 0.14627352356910706, "rewards/margins": 0.04493988677859306, "rewards/rejected": 0.10133364051580429, "step": 370 }, { "epoch": 0.0885264997087944, "grad_norm": 6.1573357582092285, "learning_rate": 8.847497089639115e-06, "logits/chosen": -4.608702182769775, "logits/rejected": -4.623651027679443, "logps/chosen": -706.5177612304688, "logps/rejected": -751.2601318359375, "loss": 1.0472, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.27272742986679077, "rewards/margins": -0.3680197596549988, "rewards/rejected": 0.09529231488704681, "step": 380 }, { "epoch": 0.09085614443797321, "grad_norm": 6.762535095214844, "learning_rate": 9.080325960419094e-06, "logits/chosen": -4.571442604064941, "logits/rejected": -4.51932430267334, "logps/chosen": -751.0111083984375, "logps/rejected": -743.8901977539062, "loss": 0.9392, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.057555168867111206, "rewards/margins": -0.0428026020526886, "rewards/rejected": 0.100357785820961, "step": 390 }, { "epoch": 0.093185789167152, "grad_norm": 5.95464563369751, "learning_rate": 9.313154831199069e-06, "logits/chosen": -4.542258262634277, "logits/rejected": -4.514950752258301, "logps/chosen": -790.67919921875, "logps/rejected": -716.7508544921875, "loss": 0.8007, "rewards/accuracies": 0.5, "rewards/chosen": 0.12491132318973541, "rewards/margins": 0.12783382833003998, "rewards/rejected": -0.0029225251637399197, "step": 400 }, { "epoch": 0.093185789167152, "eval_logits/chosen": -4.494009017944336, "eval_logits/rejected": -4.487839698791504, "eval_logps/chosen": -690.876220703125, "eval_logps/rejected": -706.9720458984375, "eval_loss": 0.6875714063644409, "eval_rewards/accuracies": 0.5735253095626831, "eval_rewards/chosen": -0.004798768553882837, "eval_rewards/margins": 0.012539190240204334, "eval_rewards/rejected": -0.01733795739710331, "eval_runtime": 386.3479, "eval_samples_per_second": 18.517, "eval_steps_per_second": 9.258, "step": 400 }, { "epoch": 0.09551543389633081, "grad_norm": 6.097142696380615, "learning_rate": 9.545983701979046e-06, "logits/chosen": -4.555731773376465, "logits/rejected": -4.547555923461914, "logps/chosen": -764.7968139648438, "logps/rejected": -813.4968872070312, "loss": 1.0184, "rewards/accuracies": 0.375, "rewards/chosen": -0.14817802608013153, "rewards/margins": -0.21674151718616486, "rewards/rejected": 0.06856345385313034, "step": 410 }, { "epoch": 0.0978450786255096, "grad_norm": 3.8165435791015625, "learning_rate": 9.778812572759023e-06, "logits/chosen": -4.537326812744141, "logits/rejected": -4.496391296386719, "logps/chosen": -661.8091430664062, "logps/rejected": -675.4200439453125, "loss": 0.8522, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1329130232334137, "rewards/margins": -0.04306463152170181, "rewards/rejected": -0.08984839916229248, "step": 420 }, { "epoch": 0.10017472335468841, "grad_norm": 6.400147914886475, "learning_rate": 1.0011641443538999e-05, "logits/chosen": -4.467156410217285, "logits/rejected": -4.5243730545043945, "logps/chosen": -700.6357421875, "logps/rejected": -794.3414306640625, "loss": 0.949, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02168646827340126, "rewards/margins": -0.14708887040615082, "rewards/rejected": 0.12540242075920105, "step": 430 }, { "epoch": 0.1025043680838672, "grad_norm": 5.348326206207275, "learning_rate": 1.0244470314318978e-05, "logits/chosen": -4.550189018249512, "logits/rejected": -4.601241588592529, "logps/chosen": -675.608154296875, "logps/rejected": -734.4439086914062, "loss": 0.8471, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.15092246234416962, "rewards/margins": 0.09153494983911514, "rewards/rejected": 0.05938751623034477, "step": 440 }, { "epoch": 0.10483401281304601, "grad_norm": 5.341457843780518, "learning_rate": 1.0477299185098953e-05, "logits/chosen": -4.568203449249268, "logits/rejected": -4.5304179191589355, "logps/chosen": -729.2503662109375, "logps/rejected": -744.8440551757812, "loss": 0.8296, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.04322073608636856, "rewards/margins": 0.1277836114168167, "rewards/rejected": -0.08456286042928696, "step": 450 }, { "epoch": 0.10716365754222482, "grad_norm": 4.877196788787842, "learning_rate": 1.071012805587893e-05, "logits/chosen": -4.477053165435791, "logits/rejected": -4.52747106552124, "logps/chosen": -678.0662231445312, "logps/rejected": -736.2633056640625, "loss": 0.8902, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.03617674857378006, "rewards/margins": -0.0766257494688034, "rewards/rejected": 0.040448982268571854, "step": 460 }, { "epoch": 0.10949330227140361, "grad_norm": 6.649731159210205, "learning_rate": 1.0942956926658908e-05, "logits/chosen": -4.47160005569458, "logits/rejected": -4.600485801696777, "logps/chosen": -767.81103515625, "logps/rejected": -851.0872802734375, "loss": 0.8232, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.023301953449845314, "rewards/margins": -0.006503301672637463, "rewards/rejected": 0.029805243015289307, "step": 470 }, { "epoch": 0.11182294700058241, "grad_norm": 5.362112998962402, "learning_rate": 1.1175785797438883e-05, "logits/chosen": -4.62471866607666, "logits/rejected": -4.54784631729126, "logps/chosen": -733.701416015625, "logps/rejected": -679.8546142578125, "loss": 0.7379, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1373155266046524, "rewards/margins": 0.20806872844696045, "rewards/rejected": -0.07075319439172745, "step": 480 }, { "epoch": 0.11415259172976121, "grad_norm": 4.938389301300049, "learning_rate": 1.140861466821886e-05, "logits/chosen": -4.493394374847412, "logits/rejected": -4.541499137878418, "logps/chosen": -714.9727783203125, "logps/rejected": -744.9597778320312, "loss": 0.9243, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.053753603249788284, "rewards/margins": -0.20864422619342804, "rewards/rejected": 0.15489062666893005, "step": 490 }, { "epoch": 0.11648223645894001, "grad_norm": 3.43468976020813, "learning_rate": 1.1641443538998838e-05, "logits/chosen": -4.504472732543945, "logits/rejected": -4.528023719787598, "logps/chosen": -671.1829833984375, "logps/rejected": -717.5294189453125, "loss": 0.8537, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0035562929697334766, "rewards/margins": 0.06256623566150665, "rewards/rejected": -0.05900995805859566, "step": 500 }, { "epoch": 0.11648223645894001, "eval_logits/chosen": -4.482612609863281, "eval_logits/rejected": -4.475794315338135, "eval_logps/chosen": -690.953369140625, "eval_logps/rejected": -707.1657104492188, "eval_loss": 0.6825693249702454, "eval_rewards/accuracies": 0.5932345390319824, "eval_rewards/chosen": -0.012506458908319473, "eval_rewards/margins": 0.024205248802900314, "eval_rewards/rejected": -0.03671170771121979, "eval_runtime": 386.3263, "eval_samples_per_second": 18.518, "eval_steps_per_second": 9.259, "step": 500 }, { "epoch": 0.1188118811881188, "grad_norm": 5.996777534484863, "learning_rate": 1.1874272409778813e-05, "logits/chosen": -4.582979202270508, "logits/rejected": -4.537229061126709, "logps/chosen": -756.7322998046875, "logps/rejected": -711.3515014648438, "loss": 0.8802, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.016579514369368553, "rewards/margins": -0.010887527838349342, "rewards/rejected": 0.02746700681746006, "step": 510 }, { "epoch": 0.12114152591729761, "grad_norm": 5.092260837554932, "learning_rate": 1.210710128055879e-05, "logits/chosen": -4.395134925842285, "logits/rejected": -4.46746826171875, "logps/chosen": -605.616455078125, "logps/rejected": -695.7340698242188, "loss": 0.8504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05290902778506279, "rewards/margins": 0.08201033622026443, "rewards/rejected": -0.13491936028003693, "step": 520 }, { "epoch": 0.12347117064647642, "grad_norm": 6.615758895874023, "learning_rate": 1.2339930151338766e-05, "logits/chosen": -4.505391597747803, "logits/rejected": -4.579988479614258, "logps/chosen": -662.4857177734375, "logps/rejected": -739.6719970703125, "loss": 0.8742, "rewards/accuracies": 0.5, "rewards/chosen": -0.057824719697237015, "rewards/margins": -0.011621838435530663, "rewards/rejected": -0.0462028793990612, "step": 530 }, { "epoch": 0.1258008153756552, "grad_norm": 7.350104808807373, "learning_rate": 1.2572759022118743e-05, "logits/chosen": -4.466317176818848, "logits/rejected": -4.424395561218262, "logps/chosen": -771.0465087890625, "logps/rejected": -732.0314331054688, "loss": 0.9882, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07970836013555527, "rewards/margins": -0.20637159049510956, "rewards/rejected": 0.1266632378101349, "step": 540 }, { "epoch": 0.128130460104834, "grad_norm": 6.6570281982421875, "learning_rate": 1.280558789289872e-05, "logits/chosen": -4.606103897094727, "logits/rejected": -4.553511619567871, "logps/chosen": -730.1910400390625, "logps/rejected": -739.556884765625, "loss": 0.9002, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08168536424636841, "rewards/margins": -0.03502635285258293, "rewards/rejected": -0.04665901139378548, "step": 550 }, { "epoch": 0.13046010483401282, "grad_norm": 4.95475435256958, "learning_rate": 1.3038416763678696e-05, "logits/chosen": -4.475506782531738, "logits/rejected": -4.3944501876831055, "logps/chosen": -758.3416137695312, "logps/rejected": -779.4380493164062, "loss": 0.9269, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09176072478294373, "rewards/margins": -0.047757431864738464, "rewards/rejected": -0.04400332272052765, "step": 560 }, { "epoch": 0.13278974956319162, "grad_norm": 4.766490936279297, "learning_rate": 1.3271245634458675e-05, "logits/chosen": -4.471567153930664, "logits/rejected": -4.58807373046875, "logps/chosen": -716.8036499023438, "logps/rejected": -793.9383544921875, "loss": 0.839, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.04037877544760704, "rewards/margins": 0.085518978536129, "rewards/rejected": -0.04514019936323166, "step": 570 }, { "epoch": 0.1351193942923704, "grad_norm": 6.618696689605713, "learning_rate": 1.3504074505238652e-05, "logits/chosen": -4.571255683898926, "logits/rejected": -4.508147239685059, "logps/chosen": -782.623046875, "logps/rejected": -730.2835693359375, "loss": 0.9174, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12230058014392853, "rewards/margins": -0.1568373292684555, "rewards/rejected": 0.03453673794865608, "step": 580 }, { "epoch": 0.1374490390215492, "grad_norm": 8.756173133850098, "learning_rate": 1.3736903376018627e-05, "logits/chosen": -4.579248905181885, "logits/rejected": -4.511783599853516, "logps/chosen": -723.1712646484375, "logps/rejected": -692.4654541015625, "loss": 0.8997, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.20503079891204834, "rewards/margins": -0.061770010739564896, "rewards/rejected": -0.14326077699661255, "step": 590 }, { "epoch": 0.13977868375072802, "grad_norm": 4.448201656341553, "learning_rate": 1.3969732246798604e-05, "logits/chosen": -4.505821704864502, "logits/rejected": -4.529115676879883, "logps/chosen": -720.322509765625, "logps/rejected": -713.7578735351562, "loss": 0.8746, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.03187558054924011, "rewards/margins": -0.010967904701828957, "rewards/rejected": 0.042843498289585114, "step": 600 }, { "epoch": 0.13977868375072802, "eval_logits/chosen": -4.465523719787598, "eval_logits/rejected": -4.4580254554748535, "eval_logps/chosen": -691.2229614257812, "eval_logps/rejected": -707.6290283203125, "eval_loss": 0.6756829619407654, "eval_rewards/accuracies": 0.5942130088806152, "eval_rewards/chosen": -0.03947289660573006, "eval_rewards/margins": 0.043567754328250885, "eval_rewards/rejected": -0.08304064720869064, "eval_runtime": 386.2804, "eval_samples_per_second": 18.52, "eval_steps_per_second": 9.26, "step": 600 }, { "epoch": 0.14210832847990681, "grad_norm": 3.8562746047973633, "learning_rate": 1.4202561117578582e-05, "logits/chosen": -4.432703971862793, "logits/rejected": -4.496449947357178, "logps/chosen": -671.8670043945312, "logps/rejected": -723.2869262695312, "loss": 0.8301, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.08208735287189484, "rewards/margins": 0.1055004820227623, "rewards/rejected": -0.023413140326738358, "step": 610 }, { "epoch": 0.1444379732090856, "grad_norm": 5.218752384185791, "learning_rate": 1.4435389988358557e-05, "logits/chosen": -4.540579795837402, "logits/rejected": -4.502984046936035, "logps/chosen": -732.4451904296875, "logps/rejected": -736.8656005859375, "loss": 0.8271, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07555864006280899, "rewards/margins": 0.08679278939962387, "rewards/rejected": -0.16235145926475525, "step": 620 }, { "epoch": 0.14676761793826443, "grad_norm": 4.187116622924805, "learning_rate": 1.4668218859138534e-05, "logits/chosen": -4.4309468269348145, "logits/rejected": -4.475230693817139, "logps/chosen": -670.060302734375, "logps/rejected": -692.5626831054688, "loss": 0.7864, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09609910100698471, "rewards/margins": 0.13632231950759888, "rewards/rejected": -0.04022323340177536, "step": 630 }, { "epoch": 0.14909726266744322, "grad_norm": 6.015897750854492, "learning_rate": 1.4901047729918511e-05, "logits/chosen": -4.529510498046875, "logits/rejected": -4.5059356689453125, "logps/chosen": -695.6485595703125, "logps/rejected": -638.5431518554688, "loss": 0.8831, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07902977615594864, "rewards/margins": -0.08549060672521591, "rewards/rejected": 0.006460844073444605, "step": 640 }, { "epoch": 0.151426907396622, "grad_norm": 9.856568336486816, "learning_rate": 1.5133876600698487e-05, "logits/chosen": -4.480548858642578, "logits/rejected": -4.469473838806152, "logps/chosen": -748.206787109375, "logps/rejected": -737.4120483398438, "loss": 0.9739, "rewards/accuracies": 0.5, "rewards/chosen": -0.027322787791490555, "rewards/margins": -0.0710093080997467, "rewards/rejected": 0.04368652403354645, "step": 650 }, { "epoch": 0.1537565521258008, "grad_norm": 5.658621311187744, "learning_rate": 1.5366705471478464e-05, "logits/chosen": -4.546767234802246, "logits/rejected": -4.459005355834961, "logps/chosen": -669.4649658203125, "logps/rejected": -639.6283569335938, "loss": 0.8724, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.019634447991847992, "rewards/margins": -0.0433480404317379, "rewards/rejected": 0.023713573813438416, "step": 660 }, { "epoch": 0.15608619685497963, "grad_norm": 6.9683308601379395, "learning_rate": 1.5599534342258443e-05, "logits/chosen": -4.432524681091309, "logits/rejected": -4.509583473205566, "logps/chosen": -714.3270874023438, "logps/rejected": -806.752197265625, "loss": 0.7997, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.03766901418566704, "rewards/margins": 0.21293482184410095, "rewards/rejected": -0.1752658188343048, "step": 670 }, { "epoch": 0.15841584158415842, "grad_norm": 6.2949113845825195, "learning_rate": 1.583236321303842e-05, "logits/chosen": -4.499951362609863, "logits/rejected": -4.426943302154541, "logps/chosen": -678.3032836914062, "logps/rejected": -684.98388671875, "loss": 0.8157, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.06583984196186066, "rewards/margins": 0.10869854688644409, "rewards/rejected": -0.17453840374946594, "step": 680 }, { "epoch": 0.1607454863133372, "grad_norm": 6.752503871917725, "learning_rate": 1.6065192083818394e-05, "logits/chosen": -4.490216255187988, "logits/rejected": -4.514633655548096, "logps/chosen": -717.6605224609375, "logps/rejected": -744.0989990234375, "loss": 0.9619, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10855916887521744, "rewards/margins": -0.13426534831523895, "rewards/rejected": 0.025706171989440918, "step": 690 }, { "epoch": 0.163075131042516, "grad_norm": 6.129384517669678, "learning_rate": 1.6298020954598373e-05, "logits/chosen": -4.400869846343994, "logits/rejected": -4.518251895904541, "logps/chosen": -754.826416015625, "logps/rejected": -817.2966918945312, "loss": 0.7814, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.015134274959564209, "rewards/margins": 0.2830790877342224, "rewards/rejected": -0.29821330308914185, "step": 700 }, { "epoch": 0.163075131042516, "eval_logits/chosen": -4.4405694007873535, "eval_logits/rejected": -4.4323883056640625, "eval_logps/chosen": -691.7077026367188, "eval_logps/rejected": -708.324951171875, "eval_loss": 0.6701375246047974, "eval_rewards/accuracies": 0.5978473424911499, "eval_rewards/chosen": -0.08795131742954254, "eval_rewards/margins": 0.06467774510383606, "eval_rewards/rejected": -0.1526290476322174, "eval_runtime": 388.0467, "eval_samples_per_second": 18.436, "eval_steps_per_second": 9.218, "step": 700 }, { "epoch": 0.16540477577169482, "grad_norm": 6.498562335968018, "learning_rate": 1.653084982537835e-05, "logits/chosen": -4.449611663818359, "logits/rejected": -4.466498851776123, "logps/chosen": -664.6646728515625, "logps/rejected": -748.9295654296875, "loss": 0.8716, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.07674837857484818, "rewards/margins": 0.03639410063624382, "rewards/rejected": -0.1131424754858017, "step": 710 }, { "epoch": 0.16773442050087362, "grad_norm": 7.23454475402832, "learning_rate": 1.6763678696158324e-05, "logits/chosen": -4.440798759460449, "logits/rejected": -4.442746162414551, "logps/chosen": -708.8589477539062, "logps/rejected": -670.4962158203125, "loss": 0.6813, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.19437508285045624, "rewards/margins": 0.3670928478240967, "rewards/rejected": -0.17271773517131805, "step": 720 }, { "epoch": 0.1700640652300524, "grad_norm": 4.659682273864746, "learning_rate": 1.6996507566938303e-05, "logits/chosen": -4.551013946533203, "logits/rejected": -4.534272193908691, "logps/chosen": -711.7779541015625, "logps/rejected": -745.0709228515625, "loss": 0.9104, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1599648892879486, "rewards/margins": 0.05072777345776558, "rewards/rejected": -0.2106926441192627, "step": 730 }, { "epoch": 0.17239370995923123, "grad_norm": 6.002203941345215, "learning_rate": 1.722933643771828e-05, "logits/chosen": -4.513463497161865, "logits/rejected": -4.531512260437012, "logps/chosen": -743.213623046875, "logps/rejected": -761.3690185546875, "loss": 0.8078, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0007909566047601402, "rewards/margins": 0.25189337134361267, "rewards/rejected": -0.2511024475097656, "step": 740 }, { "epoch": 0.17472335468841002, "grad_norm": 4.882469654083252, "learning_rate": 1.7462165308498257e-05, "logits/chosen": -4.452869415283203, "logits/rejected": -4.476421356201172, "logps/chosen": -705.3482055664062, "logps/rejected": -641.9478759765625, "loss": 0.7559, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00530981132760644, "rewards/margins": 0.2093144953250885, "rewards/rejected": -0.2040046900510788, "step": 750 }, { "epoch": 0.1770529994175888, "grad_norm": 8.57348918914795, "learning_rate": 1.769499417927823e-05, "logits/chosen": -4.47825813293457, "logits/rejected": -4.470673561096191, "logps/chosen": -696.1239013671875, "logps/rejected": -751.4468383789062, "loss": 0.8814, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.01002002228051424, "rewards/margins": -0.008996338583528996, "rewards/rejected": 0.019016362726688385, "step": 760 }, { "epoch": 0.1793826441467676, "grad_norm": 7.979362964630127, "learning_rate": 1.7927823050058208e-05, "logits/chosen": -4.551217079162598, "logits/rejected": -4.480177402496338, "logps/chosen": -750.54052734375, "logps/rejected": -755.7467041015625, "loss": 0.9642, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.27815550565719604, "rewards/margins": -0.1696508228778839, "rewards/rejected": -0.10850467532873154, "step": 770 }, { "epoch": 0.18171228887594643, "grad_norm": 9.473190307617188, "learning_rate": 1.8160651920838187e-05, "logits/chosen": -4.4927873611450195, "logits/rejected": -4.439538955688477, "logps/chosen": -681.1729125976562, "logps/rejected": -659.6249389648438, "loss": 0.9542, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.21827229857444763, "rewards/margins": -0.15694120526313782, "rewards/rejected": -0.061331115663051605, "step": 780 }, { "epoch": 0.18404193360512522, "grad_norm": 8.782654762268066, "learning_rate": 1.8393480791618163e-05, "logits/chosen": -4.477587699890137, "logits/rejected": -4.4893479347229, "logps/chosen": -691.896484375, "logps/rejected": -744.2025756835938, "loss": 0.8238, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06819736957550049, "rewards/margins": 0.09529562294483185, "rewards/rejected": -0.16349297761917114, "step": 790 }, { "epoch": 0.186371578334304, "grad_norm": 4.834448337554932, "learning_rate": 1.8626309662398138e-05, "logits/chosen": -4.495875835418701, "logits/rejected": -4.446606159210205, "logps/chosen": -712.1375732421875, "logps/rejected": -698.7463989257812, "loss": 0.8807, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06593946367502213, "rewards/margins": 0.09279344230890274, "rewards/rejected": -0.15873286128044128, "step": 800 }, { "epoch": 0.186371578334304, "eval_logits/chosen": -4.416188716888428, "eval_logits/rejected": -4.407541275024414, "eval_logps/chosen": -692.6242065429688, "eval_logps/rejected": -709.51318359375, "eval_loss": 0.6647844314575195, "eval_rewards/accuracies": 0.5993849635124207, "eval_rewards/chosen": -0.1795974224805832, "eval_rewards/margins": 0.09186282753944397, "eval_rewards/rejected": -0.27146023511886597, "eval_runtime": 386.9115, "eval_samples_per_second": 18.49, "eval_steps_per_second": 9.245, "step": 800 }, { "epoch": 0.18870122306348283, "grad_norm": 6.349125862121582, "learning_rate": 1.8859138533178117e-05, "logits/chosen": -4.330979824066162, "logits/rejected": -4.421043872833252, "logps/chosen": -646.6573486328125, "logps/rejected": -706.3137817382812, "loss": 0.907, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03287108615040779, "rewards/margins": 0.07876729965209961, "rewards/rejected": -0.1116383820772171, "step": 810 }, { "epoch": 0.19103086779266162, "grad_norm": 6.316599369049072, "learning_rate": 1.9091967403958092e-05, "logits/chosen": -4.432641506195068, "logits/rejected": -4.4769206047058105, "logps/chosen": -679.0020751953125, "logps/rejected": -746.960205078125, "loss": 0.8263, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.20592470467090607, "rewards/margins": 0.02772539295256138, "rewards/rejected": -0.2336500585079193, "step": 820 }, { "epoch": 0.19336051252184042, "grad_norm": 8.07143783569336, "learning_rate": 1.9324796274738068e-05, "logits/chosen": -4.513623237609863, "logits/rejected": -4.509900093078613, "logps/chosen": -767.2298583984375, "logps/rejected": -722.4786376953125, "loss": 1.1006, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.43160420656204224, "rewards/margins": -0.44003787636756897, "rewards/rejected": 0.008433675393462181, "step": 830 }, { "epoch": 0.1956901572510192, "grad_norm": 6.413561820983887, "learning_rate": 1.9557625145518047e-05, "logits/chosen": -4.438241958618164, "logits/rejected": -4.43107795715332, "logps/chosen": -717.0433959960938, "logps/rejected": -775.4155883789062, "loss": 0.774, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.01133380550891161, "rewards/margins": 0.15130652487277985, "rewards/rejected": -0.1399727165699005, "step": 840 }, { "epoch": 0.19801980198019803, "grad_norm": 7.26607608795166, "learning_rate": 1.9790454016298022e-05, "logits/chosen": -4.511332988739014, "logits/rejected": -4.516010761260986, "logps/chosen": -741.4783935546875, "logps/rejected": -765.6262817382812, "loss": 0.7566, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04076741635799408, "rewards/margins": 0.17717300355434418, "rewards/rejected": -0.21794047951698303, "step": 850 }, { "epoch": 0.20034944670937682, "grad_norm": 7.267500877380371, "learning_rate": 1.9997411003236248e-05, "logits/chosen": -4.396910667419434, "logits/rejected": -4.434980869293213, "logps/chosen": -721.0802612304688, "logps/rejected": -759.048828125, "loss": 0.8579, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.2259695827960968, "rewards/margins": -0.0007159143569879234, "rewards/rejected": -0.225253626704216, "step": 860 }, { "epoch": 0.20267909143855561, "grad_norm": 5.612873554229736, "learning_rate": 1.9971521035598705e-05, "logits/chosen": -4.474349021911621, "logits/rejected": -4.497475624084473, "logps/chosen": -735.9964599609375, "logps/rejected": -749.2137451171875, "loss": 0.7599, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.16218391060829163, "rewards/margins": 0.2070654332637787, "rewards/rejected": -0.3692493438720703, "step": 870 }, { "epoch": 0.2050087361677344, "grad_norm": 6.136228084564209, "learning_rate": 1.9945631067961166e-05, "logits/chosen": -4.436854839324951, "logits/rejected": -4.380222797393799, "logps/chosen": -713.2384643554688, "logps/rejected": -718.5233154296875, "loss": 0.8263, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.14623989164829254, "rewards/margins": 0.1452106237411499, "rewards/rejected": -0.291450560092926, "step": 880 }, { "epoch": 0.20733838089691323, "grad_norm": 8.56090259552002, "learning_rate": 1.9919741100323626e-05, "logits/chosen": -4.393536567687988, "logits/rejected": -4.48923921585083, "logps/chosen": -723.2747192382812, "logps/rejected": -769.8920288085938, "loss": 0.7963, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07917286455631256, "rewards/margins": 0.2582031786441803, "rewards/rejected": -0.33737602829933167, "step": 890 }, { "epoch": 0.20966802562609202, "grad_norm": 7.170731544494629, "learning_rate": 1.9893851132686087e-05, "logits/chosen": -4.412181854248047, "logits/rejected": -4.461060523986816, "logps/chosen": -682.1312255859375, "logps/rejected": -784.7744750976562, "loss": 0.8967, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.2536042332649231, "rewards/margins": 0.0834280326962471, "rewards/rejected": -0.3370322585105896, "step": 900 }, { "epoch": 0.20966802562609202, "eval_logits/chosen": -4.388455390930176, "eval_logits/rejected": -4.379168510437012, "eval_logps/chosen": -693.9192504882812, "eval_logps/rejected": -711.1783447265625, "eval_loss": 0.662997305393219, "eval_rewards/accuracies": 0.5950517058372498, "eval_rewards/chosen": -0.30910125374794006, "eval_rewards/margins": 0.12887336313724518, "eval_rewards/rejected": -0.43797463178634644, "eval_runtime": 386.1394, "eval_samples_per_second": 18.527, "eval_steps_per_second": 9.263, "step": 900 }, { "epoch": 0.2119976703552708, "grad_norm": 6.341297149658203, "learning_rate": 1.9867961165048548e-05, "logits/chosen": -4.427966594696045, "logits/rejected": -4.400241374969482, "logps/chosen": -660.0777587890625, "logps/rejected": -687.676513671875, "loss": 0.7655, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24212121963500977, "rewards/margins": 0.15326662361621857, "rewards/rejected": -0.39538782835006714, "step": 910 }, { "epoch": 0.21432731508444963, "grad_norm": 9.103260040283203, "learning_rate": 1.9842071197411005e-05, "logits/chosen": -4.519369125366211, "logits/rejected": -4.517698764801025, "logps/chosen": -740.9689331054688, "logps/rejected": -760.4479370117188, "loss": 0.8808, "rewards/accuracies": 0.5, "rewards/chosen": -0.2737390100955963, "rewards/margins": 0.04373549669981003, "rewards/rejected": -0.31747445464134216, "step": 920 }, { "epoch": 0.21665695981362842, "grad_norm": 9.210978507995605, "learning_rate": 1.9816181229773462e-05, "logits/chosen": -4.456936359405518, "logits/rejected": -4.353301048278809, "logps/chosen": -755.6140747070312, "logps/rejected": -709.889404296875, "loss": 0.7476, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.18943250179290771, "rewards/margins": 0.25421398878097534, "rewards/rejected": -0.44364649057388306, "step": 930 }, { "epoch": 0.21898660454280722, "grad_norm": 9.391003608703613, "learning_rate": 1.9790291262135922e-05, "logits/chosen": -4.424135684967041, "logits/rejected": -4.408591270446777, "logps/chosen": -738.0665893554688, "logps/rejected": -708.8335571289062, "loss": 0.8815, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3330017924308777, "rewards/margins": 0.11240987479686737, "rewards/rejected": -0.445411741733551, "step": 940 }, { "epoch": 0.221316249271986, "grad_norm": 9.435540199279785, "learning_rate": 1.9764401294498383e-05, "logits/chosen": -4.3817338943481445, "logits/rejected": -4.482665538787842, "logps/chosen": -679.2781982421875, "logps/rejected": -762.5584716796875, "loss": 0.8862, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4497762620449066, "rewards/margins": -0.009173527359962463, "rewards/rejected": -0.44060271978378296, "step": 950 }, { "epoch": 0.22364589400116483, "grad_norm": 5.925865173339844, "learning_rate": 1.9738511326860844e-05, "logits/chosen": -4.449588298797607, "logits/rejected": -4.440318584442139, "logps/chosen": -747.8040771484375, "logps/rejected": -748.3953857421875, "loss": 0.9623, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.4708067774772644, "rewards/margins": -0.09435828030109406, "rewards/rejected": -0.37644851207733154, "step": 960 }, { "epoch": 0.22597553873034362, "grad_norm": 9.26547908782959, "learning_rate": 1.9712621359223304e-05, "logits/chosen": -4.4216814041137695, "logits/rejected": -4.480114936828613, "logps/chosen": -650.800537109375, "logps/rejected": -739.6800537109375, "loss": 0.9505, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.33942896127700806, "rewards/margins": -0.065024234354496, "rewards/rejected": -0.27440470457077026, "step": 970 }, { "epoch": 0.22830518345952241, "grad_norm": 6.078829765319824, "learning_rate": 1.968673139158576e-05, "logits/chosen": -4.370000839233398, "logits/rejected": -4.366150856018066, "logps/chosen": -676.8948974609375, "logps/rejected": -675.8610229492188, "loss": 0.8804, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.319058895111084, "rewards/margins": -0.07290103286504745, "rewards/rejected": -0.24615785479545593, "step": 980 }, { "epoch": 0.23063482818870124, "grad_norm": 7.866084098815918, "learning_rate": 1.9660841423948222e-05, "logits/chosen": -4.496729850769043, "logits/rejected": -4.4081621170043945, "logps/chosen": -695.5631103515625, "logps/rejected": -689.4572143554688, "loss": 0.8705, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.31703701615333557, "rewards/margins": 0.10829310119152069, "rewards/rejected": -0.42533010244369507, "step": 990 }, { "epoch": 0.23296447291788003, "grad_norm": 6.092545032501221, "learning_rate": 1.963495145631068e-05, "logits/chosen": -4.45365571975708, "logits/rejected": -4.482058525085449, "logps/chosen": -757.77294921875, "logps/rejected": -785.2354736328125, "loss": 0.7651, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5000513792037964, "rewards/margins": 0.3355986475944519, "rewards/rejected": -0.8356500864028931, "step": 1000 }, { "epoch": 0.23296447291788003, "eval_logits/chosen": -4.350008010864258, "eval_logits/rejected": -4.339516639709473, "eval_logps/chosen": -694.97314453125, "eval_logps/rejected": -712.5101928710938, "eval_loss": 0.6633332967758179, "eval_rewards/accuracies": 0.5963097810745239, "eval_rewards/chosen": -0.4144977927207947, "eval_rewards/margins": 0.1566552221775055, "eval_rewards/rejected": -0.5711529850959778, "eval_runtime": 386.6444, "eval_samples_per_second": 18.503, "eval_steps_per_second": 9.251, "step": 1000 }, { "epoch": 0.23529411764705882, "grad_norm": 8.298626899719238, "learning_rate": 1.960906148867314e-05, "logits/chosen": -4.350672245025635, "logits/rejected": -4.346297740936279, "logps/chosen": -716.8140258789062, "logps/rejected": -699.349365234375, "loss": 0.8517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.43861478567123413, "rewards/margins": 0.12198150157928467, "rewards/rejected": -0.5605962872505188, "step": 1010 }, { "epoch": 0.2376237623762376, "grad_norm": 8.617300033569336, "learning_rate": 1.95831715210356e-05, "logits/chosen": -4.392172813415527, "logits/rejected": -4.394693851470947, "logps/chosen": -718.5160522460938, "logps/rejected": -735.8982543945312, "loss": 0.8677, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.34239235520362854, "rewards/margins": 0.07352287322282791, "rewards/rejected": -0.415915310382843, "step": 1020 }, { "epoch": 0.23995340710541643, "grad_norm": 7.848562717437744, "learning_rate": 1.955728155339806e-05, "logits/chosen": -4.374752998352051, "logits/rejected": -4.420811653137207, "logps/chosen": -707.7733154296875, "logps/rejected": -725.6073608398438, "loss": 0.8094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.25753355026245117, "rewards/margins": 0.20300650596618652, "rewards/rejected": -0.4605400562286377, "step": 1030 }, { "epoch": 0.24228305183459523, "grad_norm": 9.19383430480957, "learning_rate": 1.9531391585760518e-05, "logits/chosen": -4.342906475067139, "logits/rejected": -4.432077884674072, "logps/chosen": -674.3995361328125, "logps/rejected": -725.902099609375, "loss": 0.847, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.30617013573646545, "rewards/margins": 0.07979961484670639, "rewards/rejected": -0.38596972823143005, "step": 1040 }, { "epoch": 0.24461269656377402, "grad_norm": 7.746910095214844, "learning_rate": 1.950550161812298e-05, "logits/chosen": -4.319817543029785, "logits/rejected": -4.350963592529297, "logps/chosen": -718.1383666992188, "logps/rejected": -722.8746948242188, "loss": 0.9053, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.3487966060638428, "rewards/margins": 0.08206790685653687, "rewards/rejected": -0.43086448311805725, "step": 1050 }, { "epoch": 0.24694234129295284, "grad_norm": 11.534335136413574, "learning_rate": 1.947961165048544e-05, "logits/chosen": -4.359479904174805, "logits/rejected": -4.37454891204834, "logps/chosen": -709.6820068359375, "logps/rejected": -742.3812255859375, "loss": 0.8113, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.40116387605667114, "rewards/margins": 0.22071246802806854, "rewards/rejected": -0.6218763589859009, "step": 1060 }, { "epoch": 0.24927198602213163, "grad_norm": 8.217765808105469, "learning_rate": 1.9453721682847896e-05, "logits/chosen": -4.362360954284668, "logits/rejected": -4.356196403503418, "logps/chosen": -716.28076171875, "logps/rejected": -748.5253295898438, "loss": 0.8515, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.47076430916786194, "rewards/margins": 0.22061574459075928, "rewards/rejected": -0.6913800239562988, "step": 1070 }, { "epoch": 0.2516016307513104, "grad_norm": 6.56362247467041, "learning_rate": 1.9427831715210357e-05, "logits/chosen": -4.443305492401123, "logits/rejected": -4.4894914627075195, "logps/chosen": -729.105712890625, "logps/rejected": -731.7708129882812, "loss": 0.8697, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4395231306552887, "rewards/margins": 0.029054516926407814, "rewards/rejected": -0.46857762336730957, "step": 1080 }, { "epoch": 0.2539312754804892, "grad_norm": 7.56439733505249, "learning_rate": 1.9401941747572818e-05, "logits/chosen": -4.284507751464844, "logits/rejected": -4.338918209075928, "logps/chosen": -698.3824462890625, "logps/rejected": -763.0474853515625, "loss": 0.745, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.100661501288414, "rewards/margins": 0.35776785016059875, "rewards/rejected": -0.45842933654785156, "step": 1090 }, { "epoch": 0.256260920209668, "grad_norm": 8.469297409057617, "learning_rate": 1.9376051779935278e-05, "logits/chosen": -4.2608771324157715, "logits/rejected": -4.317883491516113, "logps/chosen": -715.9654541015625, "logps/rejected": -795.2291259765625, "loss": 0.6523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29540759325027466, "rewards/margins": 0.6277150511741638, "rewards/rejected": -0.9231227040290833, "step": 1100 }, { "epoch": 0.256260920209668, "eval_logits/chosen": -4.31129789352417, "eval_logits/rejected": -4.299421310424805, "eval_logps/chosen": -695.878173828125, "eval_logps/rejected": -713.7141723632812, "eval_loss": 0.6638560891151428, "eval_rewards/accuracies": 0.5961699485778809, "eval_rewards/chosen": -0.5049953460693359, "eval_rewards/margins": 0.18655292689800262, "eval_rewards/rejected": -0.691548228263855, "eval_runtime": 386.1093, "eval_samples_per_second": 18.528, "eval_steps_per_second": 9.264, "step": 1100 }, { "epoch": 0.2585905649388468, "grad_norm": 11.331911087036133, "learning_rate": 1.9350161812297735e-05, "logits/chosen": -4.392155170440674, "logits/rejected": -4.309884548187256, "logps/chosen": -723.4649658203125, "logps/rejected": -722.3359985351562, "loss": 0.8422, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.35999637842178345, "rewards/margins": 0.22510388493537903, "rewards/rejected": -0.5851002931594849, "step": 1110 }, { "epoch": 0.26092020966802565, "grad_norm": 9.10745906829834, "learning_rate": 1.9324271844660196e-05, "logits/chosen": -4.381644248962402, "logits/rejected": -4.304900169372559, "logps/chosen": -765.546875, "logps/rejected": -742.8046875, "loss": 0.8466, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6179436445236206, "rewards/margins": 0.15039575099945068, "rewards/rejected": -0.7683394551277161, "step": 1120 }, { "epoch": 0.26324985439720444, "grad_norm": 6.49190616607666, "learning_rate": 1.9298381877022656e-05, "logits/chosen": -4.226799488067627, "logits/rejected": -4.376645088195801, "logps/chosen": -648.4716796875, "logps/rejected": -772.0549926757812, "loss": 0.726, "rewards/accuracies": 0.625, "rewards/chosen": -0.18986962735652924, "rewards/margins": 0.5118517875671387, "rewards/rejected": -0.7017214894294739, "step": 1130 }, { "epoch": 0.26557949912638323, "grad_norm": 7.391536235809326, "learning_rate": 1.9272491909385117e-05, "logits/chosen": -4.360841274261475, "logits/rejected": -4.35951042175293, "logps/chosen": -775.7753295898438, "logps/rejected": -796.6776733398438, "loss": 0.7789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.459010511636734, "rewards/margins": 0.3028544783592224, "rewards/rejected": -0.7618650197982788, "step": 1140 }, { "epoch": 0.267909143855562, "grad_norm": 7.595261573791504, "learning_rate": 1.9246601941747574e-05, "logits/chosen": -4.29518461227417, "logits/rejected": -4.352611064910889, "logps/chosen": -710.1934814453125, "logps/rejected": -733.4653930664062, "loss": 0.9506, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.4916602671146393, "rewards/margins": -0.07938651740550995, "rewards/rejected": -0.4122737944126129, "step": 1150 }, { "epoch": 0.2702387885847408, "grad_norm": 7.323908805847168, "learning_rate": 1.9220711974110035e-05, "logits/chosen": -4.297641754150391, "logits/rejected": -4.3460564613342285, "logps/chosen": -671.62548828125, "logps/rejected": -700.6619873046875, "loss": 0.7593, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.31447774171829224, "rewards/margins": 0.2587481141090393, "rewards/rejected": -0.5732258558273315, "step": 1160 }, { "epoch": 0.2725684333139196, "grad_norm": 4.596531867980957, "learning_rate": 1.9194822006472492e-05, "logits/chosen": -4.439651966094971, "logits/rejected": -4.424775123596191, "logps/chosen": -746.04248046875, "logps/rejected": -760.7760009765625, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.24861034750938416, "rewards/margins": 0.4928106367588043, "rewards/rejected": -0.7414209842681885, "step": 1170 }, { "epoch": 0.2748980780430984, "grad_norm": 6.0940423011779785, "learning_rate": 1.9168932038834952e-05, "logits/chosen": -4.387946128845215, "logits/rejected": -4.351739883422852, "logps/chosen": -769.9338989257812, "logps/rejected": -746.4041748046875, "loss": 0.7558, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3385796546936035, "rewards/margins": 0.41992324590682983, "rewards/rejected": -0.7585029602050781, "step": 1180 }, { "epoch": 0.27722772277227725, "grad_norm": 7.024069309234619, "learning_rate": 1.9143042071197413e-05, "logits/chosen": -4.372438907623291, "logits/rejected": -4.313912868499756, "logps/chosen": -724.6370849609375, "logps/rejected": -753.0245361328125, "loss": 0.8215, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5210503935813904, "rewards/margins": 0.0856110230088234, "rewards/rejected": -0.6066614389419556, "step": 1190 }, { "epoch": 0.27955736750145604, "grad_norm": 9.920272827148438, "learning_rate": 1.9117152103559874e-05, "logits/chosen": -4.4457221031188965, "logits/rejected": -4.441542148590088, "logps/chosen": -741.6122436523438, "logps/rejected": -730.5281372070312, "loss": 0.8034, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3603318929672241, "rewards/margins": 0.28044313192367554, "rewards/rejected": -0.6407750844955444, "step": 1200 }, { "epoch": 0.27955736750145604, "eval_logits/chosen": -4.304655075073242, "eval_logits/rejected": -4.292832374572754, "eval_logps/chosen": -695.743896484375, "eval_logps/rejected": -713.6777954101562, "eval_loss": 0.6612324714660645, "eval_rewards/accuracies": 0.6017612814903259, "eval_rewards/chosen": -0.49156203866004944, "eval_rewards/margins": 0.19635748863220215, "eval_rewards/rejected": -0.6879194974899292, "eval_runtime": 386.8833, "eval_samples_per_second": 18.491, "eval_steps_per_second": 9.246, "step": 1200 }, { "epoch": 0.28188701223063484, "grad_norm": 7.033170223236084, "learning_rate": 1.9091262135922334e-05, "logits/chosen": -4.416142463684082, "logits/rejected": -4.44868803024292, "logps/chosen": -772.4678955078125, "logps/rejected": -797.7364501953125, "loss": 0.9886, "rewards/accuracies": 0.4375, "rewards/chosen": -0.44758549332618713, "rewards/margins": -0.10882000625133514, "rewards/rejected": -0.3387654423713684, "step": 1210 }, { "epoch": 0.28421665695981363, "grad_norm": 8.389505386352539, "learning_rate": 1.906537216828479e-05, "logits/chosen": -4.408908843994141, "logits/rejected": -4.390933036804199, "logps/chosen": -770.1661987304688, "logps/rejected": -758.59521484375, "loss": 0.764, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4254428744316101, "rewards/margins": 0.4820783734321594, "rewards/rejected": -0.9075212478637695, "step": 1220 }, { "epoch": 0.2865463016889924, "grad_norm": 8.975470542907715, "learning_rate": 1.903948220064725e-05, "logits/chosen": -4.323489189147949, "logits/rejected": -4.383948802947998, "logps/chosen": -663.6719970703125, "logps/rejected": -720.05322265625, "loss": 0.7602, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2482331544160843, "rewards/margins": 0.3147343397140503, "rewards/rejected": -0.5629674792289734, "step": 1230 }, { "epoch": 0.2888759464181712, "grad_norm": 10.377336502075195, "learning_rate": 1.901359223300971e-05, "logits/chosen": -4.380834102630615, "logits/rejected": -4.364059925079346, "logps/chosen": -719.26904296875, "logps/rejected": -734.4669799804688, "loss": 0.93, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5593451261520386, "rewards/margins": 0.018338533118367195, "rewards/rejected": -0.577683687210083, "step": 1240 }, { "epoch": 0.29120559114735, "grad_norm": 7.079504013061523, "learning_rate": 1.898770226537217e-05, "logits/chosen": -4.397341728210449, "logits/rejected": -4.27084493637085, "logps/chosen": -759.1246948242188, "logps/rejected": -689.4752197265625, "loss": 0.6636, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.29703229665756226, "rewards/margins": 0.3597315549850464, "rewards/rejected": -0.6567639112472534, "step": 1250 }, { "epoch": 0.29353523587652885, "grad_norm": 10.312458992004395, "learning_rate": 1.896181229773463e-05, "logits/chosen": -4.25052547454834, "logits/rejected": -4.339222431182861, "logps/chosen": -708.4290771484375, "logps/rejected": -756.5723876953125, "loss": 0.8946, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4672269821166992, "rewards/margins": 0.03996283560991287, "rewards/rejected": -0.5071898698806763, "step": 1260 }, { "epoch": 0.29586488060570765, "grad_norm": 7.909558296203613, "learning_rate": 1.893592233009709e-05, "logits/chosen": -4.474128723144531, "logits/rejected": -4.338606834411621, "logps/chosen": -749.0723876953125, "logps/rejected": -767.09912109375, "loss": 0.7758, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.48534131050109863, "rewards/margins": 0.2608849108219147, "rewards/rejected": -0.7462261915206909, "step": 1270 }, { "epoch": 0.29819452533488644, "grad_norm": 7.5413737297058105, "learning_rate": 1.8910032362459548e-05, "logits/chosen": -4.267604351043701, "logits/rejected": -4.25099515914917, "logps/chosen": -688.8858642578125, "logps/rejected": -764.21484375, "loss": 0.8028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4112478196620941, "rewards/margins": 0.2967201769351959, "rewards/rejected": -0.7079680562019348, "step": 1280 }, { "epoch": 0.30052417006406523, "grad_norm": 12.668158531188965, "learning_rate": 1.888414239482201e-05, "logits/chosen": -4.369383811950684, "logits/rejected": -4.310977935791016, "logps/chosen": -717.2327880859375, "logps/rejected": -691.5540161132812, "loss": 0.791, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4307662844657898, "rewards/margins": 0.2471897304058075, "rewards/rejected": -0.6779559254646301, "step": 1290 }, { "epoch": 0.302853814793244, "grad_norm": 7.491385459899902, "learning_rate": 1.8858252427184466e-05, "logits/chosen": -4.3847270011901855, "logits/rejected": -4.315208435058594, "logps/chosen": -688.0221557617188, "logps/rejected": -711.5299682617188, "loss": 0.7325, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.23917031288146973, "rewards/margins": 0.3942749500274658, "rewards/rejected": -0.6334452629089355, "step": 1300 }, { "epoch": 0.302853814793244, "eval_logits/chosen": -4.31157922744751, "eval_logits/rejected": -4.299931049346924, "eval_logps/chosen": -695.6024780273438, "eval_logps/rejected": -713.5684204101562, "eval_loss": 0.6586904525756836, "eval_rewards/accuracies": 0.6052557826042175, "eval_rewards/chosen": -0.4774321913719177, "eval_rewards/margins": 0.19954435527324677, "eval_rewards/rejected": -0.6769765019416809, "eval_runtime": 386.9714, "eval_samples_per_second": 18.487, "eval_steps_per_second": 9.244, "step": 1300 }, { "epoch": 0.3051834595224228, "grad_norm": 7.889120578765869, "learning_rate": 1.8832362459546926e-05, "logits/chosen": -4.3173346519470215, "logits/rejected": -4.450728416442871, "logps/chosen": -675.0162963867188, "logps/rejected": -716.7359008789062, "loss": 0.8424, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.42047327756881714, "rewards/margins": 0.21612660586833954, "rewards/rejected": -0.6365998983383179, "step": 1310 }, { "epoch": 0.3075131042516016, "grad_norm": 8.503684043884277, "learning_rate": 1.8806472491909387e-05, "logits/chosen": -4.400944232940674, "logits/rejected": -4.452167987823486, "logps/chosen": -692.8150024414062, "logps/rejected": -786.423828125, "loss": 0.7357, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4632536768913269, "rewards/margins": 0.29990696907043457, "rewards/rejected": -0.7631607055664062, "step": 1320 }, { "epoch": 0.30984274898078046, "grad_norm": 8.10945987701416, "learning_rate": 1.8780582524271848e-05, "logits/chosen": -4.360897541046143, "logits/rejected": -4.393882751464844, "logps/chosen": -682.2403564453125, "logps/rejected": -772.7156372070312, "loss": 0.7315, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2753627896308899, "rewards/margins": 0.5017082095146179, "rewards/rejected": -0.7770709991455078, "step": 1330 }, { "epoch": 0.31217239370995925, "grad_norm": 7.530776023864746, "learning_rate": 1.8754692556634305e-05, "logits/chosen": -4.405557155609131, "logits/rejected": -4.374571800231934, "logps/chosen": -734.5709228515625, "logps/rejected": -706.4002685546875, "loss": 0.8656, "rewards/accuracies": 0.5, "rewards/chosen": -0.5671267509460449, "rewards/margins": 0.08109476417303085, "rewards/rejected": -0.6482214331626892, "step": 1340 }, { "epoch": 0.31450203843913804, "grad_norm": 11.22525405883789, "learning_rate": 1.8728802588996765e-05, "logits/chosen": -4.321959018707275, "logits/rejected": -4.386361122131348, "logps/chosen": -718.044677734375, "logps/rejected": -772.4830932617188, "loss": 0.9531, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7779480218887329, "rewards/margins": -0.03689789026975632, "rewards/rejected": -0.741050124168396, "step": 1350 }, { "epoch": 0.31683168316831684, "grad_norm": 10.8184232711792, "learning_rate": 1.8702912621359222e-05, "logits/chosen": -4.334450721740723, "logits/rejected": -4.335959434509277, "logps/chosen": -746.68798828125, "logps/rejected": -753.0299072265625, "loss": 0.9349, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5748792886734009, "rewards/margins": 0.11975729465484619, "rewards/rejected": -0.6946366429328918, "step": 1360 }, { "epoch": 0.31916132789749563, "grad_norm": 6.547021389007568, "learning_rate": 1.8677022653721683e-05, "logits/chosen": -4.310304164886475, "logits/rejected": -4.271243095397949, "logps/chosen": -731.6600341796875, "logps/rejected": -681.7969970703125, "loss": 0.8502, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48259344696998596, "rewards/margins": 0.028724532574415207, "rewards/rejected": -0.5113179087638855, "step": 1370 }, { "epoch": 0.3214909726266744, "grad_norm": 8.706934928894043, "learning_rate": 1.8651132686084144e-05, "logits/chosen": -4.324995994567871, "logits/rejected": -4.29234504699707, "logps/chosen": -676.2664184570312, "logps/rejected": -738.682373046875, "loss": 0.8718, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.5631632804870605, "rewards/margins": 0.12288354337215424, "rewards/rejected": -0.6860467791557312, "step": 1380 }, { "epoch": 0.3238206173558532, "grad_norm": 7.237886428833008, "learning_rate": 1.8625242718446604e-05, "logits/chosen": -4.271629333496094, "logits/rejected": -4.291550636291504, "logps/chosen": -673.0308837890625, "logps/rejected": -716.6060791015625, "loss": 0.8182, "rewards/accuracies": 0.5, "rewards/chosen": -0.4574052691459656, "rewards/margins": 0.1533111035823822, "rewards/rejected": -0.6107163429260254, "step": 1390 }, { "epoch": 0.326150262085032, "grad_norm": 7.548694610595703, "learning_rate": 1.8599352750809065e-05, "logits/chosen": -4.321394920349121, "logits/rejected": -4.348613739013672, "logps/chosen": -730.42578125, "logps/rejected": -778.80615234375, "loss": 0.8771, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4148942828178406, "rewards/margins": 0.17481055855751038, "rewards/rejected": -0.5897048711776733, "step": 1400 }, { "epoch": 0.326150262085032, "eval_logits/chosen": -4.305843353271484, "eval_logits/rejected": -4.295498847961426, "eval_logps/chosen": -696.2532958984375, "eval_logps/rejected": -714.38623046875, "eval_loss": 0.6589279770851135, "eval_rewards/accuracies": 0.6079116463661194, "eval_rewards/chosen": -0.5424960255622864, "eval_rewards/margins": 0.21626760065555573, "eval_rewards/rejected": -0.7587636113166809, "eval_runtime": 386.9373, "eval_samples_per_second": 18.489, "eval_steps_per_second": 9.244, "step": 1400 }, { "epoch": 0.32847990681421085, "grad_norm": 4.411736965179443, "learning_rate": 1.8573462783171522e-05, "logits/chosen": -4.300525665283203, "logits/rejected": -4.326880931854248, "logps/chosen": -726.6177978515625, "logps/rejected": -726.2149047851562, "loss": 0.8027, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3517182171344757, "rewards/margins": 0.3191562294960022, "rewards/rejected": -0.6708744168281555, "step": 1410 }, { "epoch": 0.33080955154338965, "grad_norm": 9.20610523223877, "learning_rate": 1.8547572815533983e-05, "logits/chosen": -4.3987040519714355, "logits/rejected": -4.352162837982178, "logps/chosen": -703.8623046875, "logps/rejected": -684.692626953125, "loss": 0.8377, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5054625272750854, "rewards/margins": 0.1009974479675293, "rewards/rejected": -0.6064599752426147, "step": 1420 }, { "epoch": 0.33313919627256844, "grad_norm": 9.253528594970703, "learning_rate": 1.852168284789644e-05, "logits/chosen": -4.3750433921813965, "logits/rejected": -4.303120136260986, "logps/chosen": -717.990966796875, "logps/rejected": -651.62841796875, "loss": 0.8924, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.5147886276245117, "rewards/margins": 0.1544334441423416, "rewards/rejected": -0.6692220568656921, "step": 1430 }, { "epoch": 0.33546884100174723, "grad_norm": 10.970667839050293, "learning_rate": 1.84957928802589e-05, "logits/chosen": -4.304279327392578, "logits/rejected": -4.316771507263184, "logps/chosen": -666.4049072265625, "logps/rejected": -702.8588256835938, "loss": 0.9584, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.6840143799781799, "rewards/margins": 0.024236023426055908, "rewards/rejected": -0.7082504034042358, "step": 1440 }, { "epoch": 0.337798485730926, "grad_norm": 7.63844108581543, "learning_rate": 1.846990291262136e-05, "logits/chosen": -4.291044235229492, "logits/rejected": -4.272721290588379, "logps/chosen": -723.7853393554688, "logps/rejected": -716.9478149414062, "loss": 0.845, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3475242257118225, "rewards/margins": 0.28106704354286194, "rewards/rejected": -0.6285912394523621, "step": 1450 }, { "epoch": 0.3401281304601048, "grad_norm": 6.421882629394531, "learning_rate": 1.844401294498382e-05, "logits/chosen": -4.327022552490234, "logits/rejected": -4.355108261108398, "logps/chosen": -661.6111450195312, "logps/rejected": -672.7149658203125, "loss": 0.7624, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23156344890594482, "rewards/margins": 0.21851006150245667, "rewards/rejected": -0.4500734806060791, "step": 1460 }, { "epoch": 0.3424577751892836, "grad_norm": 3.8347349166870117, "learning_rate": 1.841812297734628e-05, "logits/chosen": -4.450470924377441, "logits/rejected": -4.4103875160217285, "logps/chosen": -746.5904541015625, "logps/rejected": -708.0609130859375, "loss": 0.7367, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.27077916264533997, "rewards/margins": 0.39619284868240356, "rewards/rejected": -0.6669719815254211, "step": 1470 }, { "epoch": 0.34478741991846246, "grad_norm": 7.18529748916626, "learning_rate": 1.839223300970874e-05, "logits/chosen": -4.261987209320068, "logits/rejected": -4.328606128692627, "logps/chosen": -706.8624267578125, "logps/rejected": -720.5556640625, "loss": 0.8931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.40044647455215454, "rewards/margins": 0.04966818541288376, "rewards/rejected": -0.4501146674156189, "step": 1480 }, { "epoch": 0.34711706464764125, "grad_norm": 8.048136711120605, "learning_rate": 1.83663430420712e-05, "logits/chosen": -4.311758041381836, "logits/rejected": -4.399693012237549, "logps/chosen": -728.1452026367188, "logps/rejected": -779.4000244140625, "loss": 0.7912, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4728098511695862, "rewards/margins": 0.32029253244400024, "rewards/rejected": -0.7931022644042969, "step": 1490 }, { "epoch": 0.34944670937682004, "grad_norm": 10.150632858276367, "learning_rate": 1.834045307443366e-05, "logits/chosen": -4.398844242095947, "logits/rejected": -4.398817539215088, "logps/chosen": -764.0753173828125, "logps/rejected": -768.4854125976562, "loss": 0.8794, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3697890341281891, "rewards/margins": 0.22263407707214355, "rewards/rejected": -0.5924230813980103, "step": 1500 }, { "epoch": 0.34944670937682004, "eval_logits/chosen": -4.326058864593506, "eval_logits/rejected": -4.316102981567383, "eval_logps/chosen": -695.7061767578125, "eval_logps/rejected": -713.8124389648438, "eval_loss": 0.6541875600814819, "eval_rewards/accuracies": 0.6108470559120178, "eval_rewards/chosen": -0.4877876341342926, "eval_rewards/margins": 0.2135990709066391, "eval_rewards/rejected": -0.7013866901397705, "eval_runtime": 386.448, "eval_samples_per_second": 18.512, "eval_steps_per_second": 9.256, "step": 1500 }, { "epoch": 0.35177635410599883, "grad_norm": 8.593611717224121, "learning_rate": 1.8314563106796118e-05, "logits/chosen": -4.412143707275391, "logits/rejected": -4.4477434158325195, "logps/chosen": -721.7753295898438, "logps/rejected": -744.5516967773438, "loss": 0.7167, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3608551621437073, "rewards/margins": 0.30276018381118774, "rewards/rejected": -0.6636154055595398, "step": 1510 }, { "epoch": 0.3541059988351776, "grad_norm": 8.981035232543945, "learning_rate": 1.8288673139158578e-05, "logits/chosen": -4.372857093811035, "logits/rejected": -4.306519508361816, "logps/chosen": -817.6216430664062, "logps/rejected": -790.0299682617188, "loss": 0.9705, "rewards/accuracies": 0.5, "rewards/chosen": -0.6924741864204407, "rewards/margins": 0.003534305142238736, "rewards/rejected": -0.6960083842277527, "step": 1520 }, { "epoch": 0.3564356435643564, "grad_norm": 6.777766227722168, "learning_rate": 1.8262783171521035e-05, "logits/chosen": -4.304599761962891, "logits/rejected": -4.376704216003418, "logps/chosen": -679.5525512695312, "logps/rejected": -784.6156005859375, "loss": 0.761, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.41890984773635864, "rewards/margins": 0.4356445372104645, "rewards/rejected": -0.854554295539856, "step": 1530 }, { "epoch": 0.3587652882935352, "grad_norm": 9.232786178588867, "learning_rate": 1.8236893203883496e-05, "logits/chosen": -4.431849956512451, "logits/rejected": -4.395738124847412, "logps/chosen": -705.1184692382812, "logps/rejected": -734.9397583007812, "loss": 0.9316, "rewards/accuracies": 0.5, "rewards/chosen": -0.665607213973999, "rewards/margins": -0.041909217834472656, "rewards/rejected": -0.6236980557441711, "step": 1540 }, { "epoch": 0.36109493302271406, "grad_norm": 10.691422462463379, "learning_rate": 1.8211003236245956e-05, "logits/chosen": -4.34941291809082, "logits/rejected": -4.335809230804443, "logps/chosen": -755.9302978515625, "logps/rejected": -768.3907470703125, "loss": 0.8211, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5321193933486938, "rewards/margins": 0.3216237425804138, "rewards/rejected": -0.8537429571151733, "step": 1550 }, { "epoch": 0.36342457775189285, "grad_norm": 6.296148777008057, "learning_rate": 1.8185113268608417e-05, "logits/chosen": -4.34104061126709, "logits/rejected": -4.358175754547119, "logps/chosen": -736.5698852539062, "logps/rejected": -756.3456420898438, "loss": 0.7405, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39380258321762085, "rewards/margins": 0.2697417140007019, "rewards/rejected": -0.6635442972183228, "step": 1560 }, { "epoch": 0.36575422248107164, "grad_norm": 6.31267786026001, "learning_rate": 1.8159223300970878e-05, "logits/chosen": -4.363784313201904, "logits/rejected": -4.2965593338012695, "logps/chosen": -703.5390014648438, "logps/rejected": -704.9164428710938, "loss": 0.7817, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.47610020637512207, "rewards/margins": 0.39951229095458984, "rewards/rejected": -0.8756124377250671, "step": 1570 }, { "epoch": 0.36808386721025044, "grad_norm": 7.0956130027771, "learning_rate": 1.8133333333333335e-05, "logits/chosen": -4.416982173919678, "logits/rejected": -4.3702006340026855, "logps/chosen": -671.6224365234375, "logps/rejected": -645.9537963867188, "loss": 0.906, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6286529302597046, "rewards/margins": -0.013773918151855469, "rewards/rejected": -0.6148789525032043, "step": 1580 }, { "epoch": 0.37041351193942923, "grad_norm": 6.213327884674072, "learning_rate": 1.8107443365695795e-05, "logits/chosen": -4.385059356689453, "logits/rejected": -4.5222296714782715, "logps/chosen": -697.7130737304688, "logps/rejected": -820.3958740234375, "loss": 0.7444, "rewards/accuracies": 0.625, "rewards/chosen": -0.5424093008041382, "rewards/margins": 0.4374067187309265, "rewards/rejected": -0.9798160791397095, "step": 1590 }, { "epoch": 0.372743156668608, "grad_norm": 8.027915000915527, "learning_rate": 1.8081553398058253e-05, "logits/chosen": -4.39331579208374, "logits/rejected": -4.453400611877441, "logps/chosen": -701.2443237304688, "logps/rejected": -756.2605590820312, "loss": 0.772, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.38883525133132935, "rewards/margins": 0.19603821635246277, "rewards/rejected": -0.5848734974861145, "step": 1600 }, { "epoch": 0.372743156668608, "eval_logits/chosen": -4.31981086730957, "eval_logits/rejected": -4.309142112731934, "eval_logps/chosen": -696.2880859375, "eval_logps/rejected": -714.566650390625, "eval_loss": 0.655998170375824, "eval_rewards/accuracies": 0.6118255257606506, "eval_rewards/chosen": -0.5459803938865662, "eval_rewards/margins": 0.23082096874713898, "eval_rewards/rejected": -0.7768014073371887, "eval_runtime": 387.0177, "eval_samples_per_second": 18.485, "eval_steps_per_second": 9.242, "step": 1600 }, { "epoch": 0.3750728013977868, "grad_norm": 6.432879447937012, "learning_rate": 1.8055663430420713e-05, "logits/chosen": -4.410266399383545, "logits/rejected": -4.367775917053223, "logps/chosen": -733.0140380859375, "logps/rejected": -738.000244140625, "loss": 0.7472, "rewards/accuracies": 0.625, "rewards/chosen": -0.3916794955730438, "rewards/margins": 0.4530234932899475, "rewards/rejected": -0.8447030186653137, "step": 1610 }, { "epoch": 0.37740244612696566, "grad_norm": 10.104506492614746, "learning_rate": 1.8029773462783174e-05, "logits/chosen": -4.375548362731934, "logits/rejected": -4.408837795257568, "logps/chosen": -729.4613037109375, "logps/rejected": -811.3401489257812, "loss": 0.9296, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3973383903503418, "rewards/margins": 0.030383765697479248, "rewards/rejected": -0.42772215604782104, "step": 1620 }, { "epoch": 0.37973209085614446, "grad_norm": 8.92159366607666, "learning_rate": 1.8003883495145634e-05, "logits/chosen": -4.325889587402344, "logits/rejected": -4.390286445617676, "logps/chosen": -704.9832763671875, "logps/rejected": -721.9518432617188, "loss": 0.7323, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4084460139274597, "rewards/margins": 0.39792656898498535, "rewards/rejected": -0.8063725233078003, "step": 1630 }, { "epoch": 0.38206173558532325, "grad_norm": 9.660841941833496, "learning_rate": 1.797799352750809e-05, "logits/chosen": -4.30095911026001, "logits/rejected": -4.326530456542969, "logps/chosen": -711.6383666992188, "logps/rejected": -741.7811279296875, "loss": 0.8577, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4106476902961731, "rewards/margins": 0.06246136501431465, "rewards/rejected": -0.47310906648635864, "step": 1640 }, { "epoch": 0.38439138031450204, "grad_norm": 5.930005073547363, "learning_rate": 1.7952103559870552e-05, "logits/chosen": -4.294757843017578, "logits/rejected": -4.306758880615234, "logps/chosen": -688.84375, "logps/rejected": -730.2039794921875, "loss": 0.7176, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3881547451019287, "rewards/margins": 0.5018659830093384, "rewards/rejected": -0.8900208473205566, "step": 1650 }, { "epoch": 0.38672102504368083, "grad_norm": 7.299777507781982, "learning_rate": 1.792621359223301e-05, "logits/chosen": -4.340909957885742, "logits/rejected": -4.285897254943848, "logps/chosen": -688.5858154296875, "logps/rejected": -659.2593383789062, "loss": 0.9392, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.6448443531990051, "rewards/margins": -0.16130118072032928, "rewards/rejected": -0.4835430979728699, "step": 1660 }, { "epoch": 0.3890506697728596, "grad_norm": 7.8056182861328125, "learning_rate": 1.790032362459547e-05, "logits/chosen": -4.302353858947754, "logits/rejected": -4.317826271057129, "logps/chosen": -735.2644653320312, "logps/rejected": -726.2462158203125, "loss": 0.8632, "rewards/accuracies": 0.5, "rewards/chosen": -0.4602840840816498, "rewards/margins": 0.10094530880451202, "rewards/rejected": -0.5612293481826782, "step": 1670 }, { "epoch": 0.3913803145020384, "grad_norm": 6.735440254211426, "learning_rate": 1.787443365695793e-05, "logits/chosen": -4.3868608474731445, "logits/rejected": -4.315372943878174, "logps/chosen": -763.2335815429688, "logps/rejected": -739.8739013671875, "loss": 0.8695, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4911941885948181, "rewards/margins": 0.14432446658611298, "rewards/rejected": -0.6355187296867371, "step": 1680 }, { "epoch": 0.39370995923121727, "grad_norm": 9.822698593139648, "learning_rate": 1.784854368932039e-05, "logits/chosen": -4.311770439147949, "logits/rejected": -4.3782219886779785, "logps/chosen": -653.6517333984375, "logps/rejected": -746.6046142578125, "loss": 0.7848, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.33290696144104004, "rewards/margins": 0.3551305830478668, "rewards/rejected": -0.6880375146865845, "step": 1690 }, { "epoch": 0.39603960396039606, "grad_norm": 6.626972198486328, "learning_rate": 1.782265372168285e-05, "logits/chosen": -4.282130241394043, "logits/rejected": -4.3887200355529785, "logps/chosen": -716.3925170898438, "logps/rejected": -813.790771484375, "loss": 0.965, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6478350758552551, "rewards/margins": 0.02205513045191765, "rewards/rejected": -0.6698901653289795, "step": 1700 }, { "epoch": 0.39603960396039606, "eval_logits/chosen": -4.333280563354492, "eval_logits/rejected": -4.32305383682251, "eval_logps/chosen": -695.7999877929688, "eval_logps/rejected": -714.0227661132812, "eval_loss": 0.6525019407272339, "eval_rewards/accuracies": 0.6146211624145508, "eval_rewards/chosen": -0.49716758728027344, "eval_rewards/margins": 0.22524842619895935, "eval_rewards/rejected": -0.7224159836769104, "eval_runtime": 386.661, "eval_samples_per_second": 18.502, "eval_steps_per_second": 9.251, "step": 1700 }, { "epoch": 0.39836924868957485, "grad_norm": 9.437028884887695, "learning_rate": 1.779676375404531e-05, "logits/chosen": -4.359824180603027, "logits/rejected": -4.38698148727417, "logps/chosen": -721.0726318359375, "logps/rejected": -744.590087890625, "loss": 0.7858, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.35493481159210205, "rewards/margins": 0.24104556441307068, "rewards/rejected": -0.5959803462028503, "step": 1710 }, { "epoch": 0.40069889341875364, "grad_norm": 4.35906457901001, "learning_rate": 1.7770873786407766e-05, "logits/chosen": -4.450234413146973, "logits/rejected": -4.379258155822754, "logps/chosen": -700.4639892578125, "logps/rejected": -632.0845947265625, "loss": 0.7615, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5291283130645752, "rewards/margins": 0.2273644655942917, "rewards/rejected": -0.7564927935600281, "step": 1720 }, { "epoch": 0.40302853814793244, "grad_norm": 4.904123783111572, "learning_rate": 1.7744983818770226e-05, "logits/chosen": -4.306130409240723, "logits/rejected": -4.339006423950195, "logps/chosen": -698.2767944335938, "logps/rejected": -711.9406127929688, "loss": 0.7953, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3669876754283905, "rewards/margins": 0.28200289607048035, "rewards/rejected": -0.6489905714988708, "step": 1730 }, { "epoch": 0.40535818287711123, "grad_norm": 6.090320587158203, "learning_rate": 1.7719093851132687e-05, "logits/chosen": -4.347052574157715, "logits/rejected": -4.323563098907471, "logps/chosen": -722.5578002929688, "logps/rejected": -726.0507202148438, "loss": 0.7954, "rewards/accuracies": 0.625, "rewards/chosen": -0.4122798442840576, "rewards/margins": 0.28659650683403015, "rewards/rejected": -0.6988764405250549, "step": 1740 }, { "epoch": 0.40768782760629, "grad_norm": 8.839376449584961, "learning_rate": 1.7693203883495148e-05, "logits/chosen": -4.381305694580078, "logits/rejected": -4.384965896606445, "logps/chosen": -760.0098876953125, "logps/rejected": -822.9794921875, "loss": 0.6897, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36683160066604614, "rewards/margins": 0.5118778944015503, "rewards/rejected": -0.8787094354629517, "step": 1750 }, { "epoch": 0.4100174723354688, "grad_norm": 9.633819580078125, "learning_rate": 1.7667313915857608e-05, "logits/chosen": -4.34865665435791, "logits/rejected": -4.264157295227051, "logps/chosen": -707.0336303710938, "logps/rejected": -721.6454467773438, "loss": 0.7819, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.34074828028678894, "rewards/margins": 0.33113110065460205, "rewards/rejected": -0.6718794107437134, "step": 1760 }, { "epoch": 0.41234711706464766, "grad_norm": 6.981215000152588, "learning_rate": 1.7641423948220065e-05, "logits/chosen": -4.279094696044922, "logits/rejected": -4.388249397277832, "logps/chosen": -693.2166748046875, "logps/rejected": -756.8175048828125, "loss": 0.6648, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4187663495540619, "rewards/margins": 0.4338448941707611, "rewards/rejected": -0.852611243724823, "step": 1770 }, { "epoch": 0.41467676179382645, "grad_norm": 9.351151466369629, "learning_rate": 1.7615533980582526e-05, "logits/chosen": -4.388346195220947, "logits/rejected": -4.325381278991699, "logps/chosen": -734.6055908203125, "logps/rejected": -761.28955078125, "loss": 0.8392, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.3040839433670044, "rewards/margins": 0.2513015866279602, "rewards/rejected": -0.5553855895996094, "step": 1780 }, { "epoch": 0.41700640652300525, "grad_norm": 9.658149719238281, "learning_rate": 1.7589644012944986e-05, "logits/chosen": -4.396363258361816, "logits/rejected": -4.409237861633301, "logps/chosen": -751.6793823242188, "logps/rejected": -754.2955932617188, "loss": 0.7963, "rewards/accuracies": 0.5, "rewards/chosen": -0.565523624420166, "rewards/margins": 0.3098762333393097, "rewards/rejected": -0.8753998875617981, "step": 1790 }, { "epoch": 0.41933605125218404, "grad_norm": 8.52037525177002, "learning_rate": 1.7563754045307444e-05, "logits/chosen": -4.327354431152344, "logits/rejected": -4.397865295410156, "logps/chosen": -631.0447998046875, "logps/rejected": -733.0093994140625, "loss": 0.7763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.38907307386398315, "rewards/margins": 0.29196518659591675, "rewards/rejected": -0.6810382604598999, "step": 1800 }, { "epoch": 0.41933605125218404, "eval_logits/chosen": -4.320523738861084, "eval_logits/rejected": -4.310983657836914, "eval_logps/chosen": -696.6057739257812, "eval_logps/rejected": -715.0689086914062, "eval_loss": 0.6532381176948547, "eval_rewards/accuracies": 0.6126642227172852, "eval_rewards/chosen": -0.577749490737915, "eval_rewards/margins": 0.24928320944309235, "eval_rewards/rejected": -0.8270328044891357, "eval_runtime": 386.9896, "eval_samples_per_second": 18.486, "eval_steps_per_second": 9.243, "step": 1800 }, { "epoch": 0.42166569598136283, "grad_norm": 6.045749664306641, "learning_rate": 1.7537864077669904e-05, "logits/chosen": -4.361016273498535, "logits/rejected": -4.3705153465271, "logps/chosen": -728.2724609375, "logps/rejected": -753.5090942382812, "loss": 0.802, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5839508771896362, "rewards/margins": 0.3277234435081482, "rewards/rejected": -0.9116743206977844, "step": 1810 }, { "epoch": 0.4239953407105416, "grad_norm": 8.700886726379395, "learning_rate": 1.7511974110032365e-05, "logits/chosen": -4.295271873474121, "logits/rejected": -4.3172783851623535, "logps/chosen": -739.1973266601562, "logps/rejected": -734.2899169921875, "loss": 0.9811, "rewards/accuracies": 0.5, "rewards/chosen": -0.833113968372345, "rewards/margins": -0.08772550523281097, "rewards/rejected": -0.745388388633728, "step": 1820 }, { "epoch": 0.4263249854397204, "grad_norm": 6.315260887145996, "learning_rate": 1.7486084142394822e-05, "logits/chosen": -4.33939790725708, "logits/rejected": -4.366152763366699, "logps/chosen": -759.6776123046875, "logps/rejected": -831.0908203125, "loss": 0.7208, "rewards/accuracies": 0.625, "rewards/chosen": -0.4980863928794861, "rewards/margins": 0.3789387345314026, "rewards/rejected": -0.8770251274108887, "step": 1830 }, { "epoch": 0.42865463016889926, "grad_norm": 5.997265815734863, "learning_rate": 1.7460194174757283e-05, "logits/chosen": -4.340653896331787, "logits/rejected": -4.308244705200195, "logps/chosen": -675.6088256835938, "logps/rejected": -747.6590576171875, "loss": 0.8243, "rewards/accuracies": 0.5, "rewards/chosen": -0.451867014169693, "rewards/margins": 0.26603466272354126, "rewards/rejected": -0.7179016470909119, "step": 1840 }, { "epoch": 0.43098427489807806, "grad_norm": 11.125731468200684, "learning_rate": 1.7434304207119743e-05, "logits/chosen": -4.373724937438965, "logits/rejected": -4.415951251983643, "logps/chosen": -709.854248046875, "logps/rejected": -764.4318237304688, "loss": 0.867, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5079065561294556, "rewards/margins": 0.07335279881954193, "rewards/rejected": -0.5812593698501587, "step": 1850 }, { "epoch": 0.43331391962725685, "grad_norm": 5.7267279624938965, "learning_rate": 1.7408414239482204e-05, "logits/chosen": -4.354807376861572, "logits/rejected": -4.387138366699219, "logps/chosen": -755.1207275390625, "logps/rejected": -792.3892822265625, "loss": 0.8665, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6459492444992065, "rewards/margins": 0.4302903711795807, "rewards/rejected": -1.0762395858764648, "step": 1860 }, { "epoch": 0.43564356435643564, "grad_norm": 8.538694381713867, "learning_rate": 1.738252427184466e-05, "logits/chosen": -4.39115047454834, "logits/rejected": -4.409638404846191, "logps/chosen": -694.1473388671875, "logps/rejected": -731.4945068359375, "loss": 0.7961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.44678935408592224, "rewards/margins": 0.20480577647686005, "rewards/rejected": -0.6515951156616211, "step": 1870 }, { "epoch": 0.43797320908561443, "grad_norm": 8.02554702758789, "learning_rate": 1.735663430420712e-05, "logits/chosen": -4.342083930969238, "logits/rejected": -4.343569278717041, "logps/chosen": -730.3948364257812, "logps/rejected": -747.2578735351562, "loss": 0.8439, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6137405633926392, "rewards/margins": 0.35736626386642456, "rewards/rejected": -0.9711068272590637, "step": 1880 }, { "epoch": 0.4403028538147932, "grad_norm": 5.830707550048828, "learning_rate": 1.7330744336569582e-05, "logits/chosen": -4.3233842849731445, "logits/rejected": -4.377291202545166, "logps/chosen": -675.5960083007812, "logps/rejected": -787.6995849609375, "loss": 0.6188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21142582595348358, "rewards/margins": 0.599100649356842, "rewards/rejected": -0.8105264902114868, "step": 1890 }, { "epoch": 0.442632498543972, "grad_norm": 7.2594194412231445, "learning_rate": 1.730485436893204e-05, "logits/chosen": -4.270476341247559, "logits/rejected": -4.268796920776367, "logps/chosen": -724.4967651367188, "logps/rejected": -682.5066528320312, "loss": 0.7585, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5578378438949585, "rewards/margins": 0.33382734656333923, "rewards/rejected": -0.8916652798652649, "step": 1900 }, { "epoch": 0.442632498543972, "eval_logits/chosen": -4.3057966232299805, "eval_logits/rejected": -4.29526424407959, "eval_logps/chosen": -696.5594482421875, "eval_logps/rejected": -715.0735473632812, "eval_loss": 0.653139054775238, "eval_rewards/accuracies": 0.6157394647598267, "eval_rewards/chosen": -0.5731170773506165, "eval_rewards/margins": 0.2543674111366272, "eval_rewards/rejected": -0.8274844884872437, "eval_runtime": 387.1244, "eval_samples_per_second": 18.48, "eval_steps_per_second": 9.24, "step": 1900 }, { "epoch": 0.44496214327315087, "grad_norm": 8.877416610717773, "learning_rate": 1.72789644012945e-05, "logits/chosen": -4.295590400695801, "logits/rejected": -4.402998447418213, "logps/chosen": -655.2674560546875, "logps/rejected": -729.1608276367188, "loss": 0.7584, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.356262743473053, "rewards/margins": 0.38617458939552307, "rewards/rejected": -0.7424373626708984, "step": 1910 }, { "epoch": 0.44729178800232966, "grad_norm": 5.766246795654297, "learning_rate": 1.725307443365696e-05, "logits/chosen": -4.296099662780762, "logits/rejected": -4.2631754875183105, "logps/chosen": -700.16455078125, "logps/rejected": -688.3130493164062, "loss": 0.7649, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5256198048591614, "rewards/margins": 0.33562612533569336, "rewards/rejected": -0.8612459301948547, "step": 1920 }, { "epoch": 0.44962143273150845, "grad_norm": 10.782840728759766, "learning_rate": 1.722718446601942e-05, "logits/chosen": -4.388967037200928, "logits/rejected": -4.391193866729736, "logps/chosen": -786.0943603515625, "logps/rejected": -793.2239379882812, "loss": 0.945, "rewards/accuracies": 0.5, "rewards/chosen": -0.6996707320213318, "rewards/margins": 0.10727129131555557, "rewards/rejected": -0.8069421052932739, "step": 1930 }, { "epoch": 0.45195107746068724, "grad_norm": 7.510255336761475, "learning_rate": 1.7201294498381878e-05, "logits/chosen": -4.252111911773682, "logits/rejected": -4.227831840515137, "logps/chosen": -678.5157470703125, "logps/rejected": -728.3531494140625, "loss": 0.864, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.39262905716896057, "rewards/margins": 0.1792100965976715, "rewards/rejected": -0.5718391537666321, "step": 1940 }, { "epoch": 0.45428072218986604, "grad_norm": 10.387371063232422, "learning_rate": 1.717540453074434e-05, "logits/chosen": -4.272866249084473, "logits/rejected": -4.370764255523682, "logps/chosen": -711.3887939453125, "logps/rejected": -780.9873657226562, "loss": 0.7402, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4937829077243805, "rewards/margins": 0.5505653619766235, "rewards/rejected": -1.044348120689392, "step": 1950 }, { "epoch": 0.45661036691904483, "grad_norm": 8.56853199005127, "learning_rate": 1.7149514563106796e-05, "logits/chosen": -4.274458885192871, "logits/rejected": -4.352829933166504, "logps/chosen": -661.9822998046875, "logps/rejected": -747.2185668945312, "loss": 0.6744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.460103839635849, "rewards/margins": 0.5049347281455994, "rewards/rejected": -0.9650384783744812, "step": 1960 }, { "epoch": 0.4589400116482236, "grad_norm": 7.608241081237793, "learning_rate": 1.7123624595469256e-05, "logits/chosen": -4.223053932189941, "logits/rejected": -4.191582202911377, "logps/chosen": -636.3646240234375, "logps/rejected": -671.3651123046875, "loss": 0.8721, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.595376193523407, "rewards/margins": 0.1826825588941574, "rewards/rejected": -0.7780587673187256, "step": 1970 }, { "epoch": 0.46126965637740247, "grad_norm": 7.9133734703063965, "learning_rate": 1.7097734627831717e-05, "logits/chosen": -4.371428489685059, "logits/rejected": -4.285247325897217, "logps/chosen": -673.6553955078125, "logps/rejected": -660.1832275390625, "loss": 0.849, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.640204906463623, "rewards/margins": 0.06438927352428436, "rewards/rejected": -0.7045941948890686, "step": 1980 }, { "epoch": 0.46359930110658126, "grad_norm": 5.946269512176514, "learning_rate": 1.7071844660194178e-05, "logits/chosen": -4.309154510498047, "logits/rejected": -4.383447647094727, "logps/chosen": -661.7271728515625, "logps/rejected": -756.4297485351562, "loss": 0.6573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.39469021558761597, "rewards/margins": 0.5698471069335938, "rewards/rejected": -0.9645372629165649, "step": 1990 }, { "epoch": 0.46592894583576006, "grad_norm": 9.132586479187012, "learning_rate": 1.7045954692556638e-05, "logits/chosen": -4.325804710388184, "logits/rejected": -4.374610900878906, "logps/chosen": -753.6336669921875, "logps/rejected": -821.5224609375, "loss": 0.7913, "rewards/accuracies": 0.5625, "rewards/chosen": -0.43599969148635864, "rewards/margins": 0.2553943991661072, "rewards/rejected": -0.691394031047821, "step": 2000 }, { "epoch": 0.46592894583576006, "eval_logits/chosen": -4.316074848175049, "eval_logits/rejected": -4.305654525756836, "eval_logps/chosen": -696.9720458984375, "eval_logps/rejected": -715.6248779296875, "eval_loss": 0.6540352702140808, "eval_rewards/accuracies": 0.6111266613006592, "eval_rewards/chosen": -0.6143832206726074, "eval_rewards/margins": 0.2682493329048157, "eval_rewards/rejected": -0.8826324939727783, "eval_runtime": 387.4872, "eval_samples_per_second": 18.463, "eval_steps_per_second": 9.231, "step": 2000 }, { "epoch": 0.46825859056493885, "grad_norm": 9.916934967041016, "learning_rate": 1.7020064724919095e-05, "logits/chosen": -4.3210673332214355, "logits/rejected": -4.371387481689453, "logps/chosen": -659.3184204101562, "logps/rejected": -739.4029541015625, "loss": 1.0101, "rewards/accuracies": 0.5, "rewards/chosen": -0.5967248678207397, "rewards/margins": -0.05980793386697769, "rewards/rejected": -0.5369168519973755, "step": 2010 }, { "epoch": 0.47058823529411764, "grad_norm": 11.016702651977539, "learning_rate": 1.6994174757281553e-05, "logits/chosen": -4.28005313873291, "logits/rejected": -4.314024925231934, "logps/chosen": -630.3978271484375, "logps/rejected": -720.5894775390625, "loss": 0.8639, "rewards/accuracies": 0.5, "rewards/chosen": -0.6581310033798218, "rewards/margins": 0.1440017968416214, "rewards/rejected": -0.802132785320282, "step": 2020 }, { "epoch": 0.47291788002329643, "grad_norm": 9.603341102600098, "learning_rate": 1.6968284789644013e-05, "logits/chosen": -4.329268455505371, "logits/rejected": -4.366645336151123, "logps/chosen": -709.0260620117188, "logps/rejected": -751.087158203125, "loss": 0.7811, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4101952612400055, "rewards/margins": 0.3048393130302429, "rewards/rejected": -0.7150346040725708, "step": 2030 }, { "epoch": 0.4752475247524752, "grad_norm": 6.993385314941406, "learning_rate": 1.6942394822006474e-05, "logits/chosen": -4.323848247528076, "logits/rejected": -4.403807640075684, "logps/chosen": -671.4868774414062, "logps/rejected": -710.583984375, "loss": 0.7097, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5717336535453796, "rewards/margins": 0.4448729157447815, "rewards/rejected": -1.0166065692901611, "step": 2040 }, { "epoch": 0.4775771694816541, "grad_norm": 9.222648620605469, "learning_rate": 1.6916504854368934e-05, "logits/chosen": -4.329625129699707, "logits/rejected": -4.326584815979004, "logps/chosen": -730.6510009765625, "logps/rejected": -748.1927490234375, "loss": 0.8364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6884124279022217, "rewards/margins": 0.2536119520664215, "rewards/rejected": -0.9420243501663208, "step": 2050 }, { "epoch": 0.47990681421083287, "grad_norm": 10.766345977783203, "learning_rate": 1.6890614886731395e-05, "logits/chosen": -4.456717491149902, "logits/rejected": -4.4114227294921875, "logps/chosen": -745.6207275390625, "logps/rejected": -780.1121826171875, "loss": 0.8846, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6652731895446777, "rewards/margins": 0.17692777514457703, "rewards/rejected": -0.8422010540962219, "step": 2060 }, { "epoch": 0.48223645894001166, "grad_norm": 9.230855941772461, "learning_rate": 1.6864724919093852e-05, "logits/chosen": -4.3310747146606445, "logits/rejected": -4.304535388946533, "logps/chosen": -767.2239990234375, "logps/rejected": -702.9869384765625, "loss": 0.8106, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.44257909059524536, "rewards/margins": 0.2606015205383301, "rewards/rejected": -0.7031804919242859, "step": 2070 }, { "epoch": 0.48456610366919045, "grad_norm": 7.361287593841553, "learning_rate": 1.6838834951456313e-05, "logits/chosen": -4.302729606628418, "logits/rejected": -4.313250541687012, "logps/chosen": -648.447998046875, "logps/rejected": -700.7833862304688, "loss": 0.7302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.41339558362960815, "rewards/margins": 0.25599178671836853, "rewards/rejected": -0.6693874001502991, "step": 2080 }, { "epoch": 0.48689574839836924, "grad_norm": 8.901455879211426, "learning_rate": 1.681294498381877e-05, "logits/chosen": -4.450462818145752, "logits/rejected": -4.434903621673584, "logps/chosen": -756.1082763671875, "logps/rejected": -768.9176635742188, "loss": 0.8059, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6943671107292175, "rewards/margins": 0.14816538989543915, "rewards/rejected": -0.8425325155258179, "step": 2090 }, { "epoch": 0.48922539312754804, "grad_norm": 11.805319786071777, "learning_rate": 1.678705501618123e-05, "logits/chosen": -4.2671709060668945, "logits/rejected": -4.283616542816162, "logps/chosen": -706.508056640625, "logps/rejected": -705.4736328125, "loss": 0.8142, "rewards/accuracies": 0.625, "rewards/chosen": -0.6520582437515259, "rewards/margins": 0.3638521134853363, "rewards/rejected": -1.0159103870391846, "step": 2100 }, { "epoch": 0.48922539312754804, "eval_logits/chosen": -4.311291217803955, "eval_logits/rejected": -4.300279140472412, "eval_logps/chosen": -697.2134399414062, "eval_logps/rejected": -715.9606323242188, "eval_loss": 0.6529696583747864, "eval_rewards/accuracies": 0.6108470559120178, "eval_rewards/chosen": -0.6385191679000854, "eval_rewards/margins": 0.27768629789352417, "eval_rewards/rejected": -0.9162055253982544, "eval_runtime": 387.8096, "eval_samples_per_second": 18.447, "eval_steps_per_second": 9.224, "step": 2100 }, { "epoch": 0.49155503785672683, "grad_norm": 7.226107120513916, "learning_rate": 1.676116504854369e-05, "logits/chosen": -4.375695705413818, "logits/rejected": -4.270474433898926, "logps/chosen": -694.0037841796875, "logps/rejected": -699.7886352539062, "loss": 0.7989, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6933953166007996, "rewards/margins": 0.17309093475341797, "rewards/rejected": -0.8664861917495728, "step": 2110 }, { "epoch": 0.4938846825859057, "grad_norm": 7.346497058868408, "learning_rate": 1.673527508090615e-05, "logits/chosen": -4.305628776550293, "logits/rejected": -4.333715438842773, "logps/chosen": -724.8486328125, "logps/rejected": -760.1463623046875, "loss": 0.8177, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.38773879408836365, "rewards/margins": 0.37163084745407104, "rewards/rejected": -0.7593695521354675, "step": 2120 }, { "epoch": 0.49621432731508447, "grad_norm": 5.669642925262451, "learning_rate": 1.670938511326861e-05, "logits/chosen": -4.388348579406738, "logits/rejected": -4.364192962646484, "logps/chosen": -783.6923828125, "logps/rejected": -751.2498779296875, "loss": 0.8655, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.48794180154800415, "rewards/margins": 0.24359111487865448, "rewards/rejected": -0.7315329313278198, "step": 2130 }, { "epoch": 0.49854397204426326, "grad_norm": 7.022860050201416, "learning_rate": 1.668349514563107e-05, "logits/chosen": -4.392494201660156, "logits/rejected": -4.390700340270996, "logps/chosen": -687.4622802734375, "logps/rejected": -636.7962646484375, "loss": 0.9793, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.7259301543235779, "rewards/margins": -0.1531188040971756, "rewards/rejected": -0.5728113055229187, "step": 2140 }, { "epoch": 0.500873616773442, "grad_norm": 4.802661895751953, "learning_rate": 1.665760517799353e-05, "logits/chosen": -4.416029930114746, "logits/rejected": -4.297064304351807, "logps/chosen": -699.6199951171875, "logps/rejected": -611.7633666992188, "loss": 0.8991, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4440249502658844, "rewards/margins": 0.1489962339401245, "rewards/rejected": -0.5930211544036865, "step": 2150 }, { "epoch": 0.5032032615026208, "grad_norm": 9.9832181930542, "learning_rate": 1.6631715210355987e-05, "logits/chosen": -4.275888919830322, "logits/rejected": -4.357396602630615, "logps/chosen": -705.728271484375, "logps/rejected": -773.0623779296875, "loss": 0.7735, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7002065777778625, "rewards/margins": 0.29984602332115173, "rewards/rejected": -1.0000526905059814, "step": 2160 }, { "epoch": 0.5055329062317997, "grad_norm": 7.250500679016113, "learning_rate": 1.6605825242718448e-05, "logits/chosen": -4.39943790435791, "logits/rejected": -4.3512492179870605, "logps/chosen": -666.4620971679688, "logps/rejected": -649.422607421875, "loss": 0.8384, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4334799647331238, "rewards/margins": 0.11691157519817352, "rewards/rejected": -0.5503915548324585, "step": 2170 }, { "epoch": 0.5078625509609784, "grad_norm": 11.566901206970215, "learning_rate": 1.6579935275080908e-05, "logits/chosen": -4.273348808288574, "logits/rejected": -4.383965969085693, "logps/chosen": -741.3567504882812, "logps/rejected": -802.16552734375, "loss": 1.0295, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.6346859335899353, "rewards/margins": -0.07815258949995041, "rewards/rejected": -0.5565333962440491, "step": 2180 }, { "epoch": 0.5101921956901573, "grad_norm": 8.966931343078613, "learning_rate": 1.655404530744337e-05, "logits/chosen": -4.340353965759277, "logits/rejected": -4.431331634521484, "logps/chosen": -661.3947143554688, "logps/rejected": -730.6881103515625, "loss": 0.797, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5236181020736694, "rewards/margins": 0.3085421919822693, "rewards/rejected": -0.8321603536605835, "step": 2190 }, { "epoch": 0.512521840419336, "grad_norm": 7.0853776931762695, "learning_rate": 1.6528155339805826e-05, "logits/chosen": -4.33907413482666, "logits/rejected": -4.302447319030762, "logps/chosen": -730.8140869140625, "logps/rejected": -779.0943603515625, "loss": 0.7641, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3492526710033417, "rewards/margins": 0.4285813271999359, "rewards/rejected": -0.7778339982032776, "step": 2200 }, { "epoch": 0.512521840419336, "eval_logits/chosen": -4.323455810546875, "eval_logits/rejected": -4.312369346618652, "eval_logps/chosen": -695.7344360351562, "eval_logps/rejected": -714.2130737304688, "eval_loss": 0.6444052457809448, "eval_rewards/accuracies": 0.6178361773490906, "eval_rewards/chosen": -0.4906233847141266, "eval_rewards/margins": 0.2508200705051422, "eval_rewards/rejected": -0.7414434552192688, "eval_runtime": 387.8429, "eval_samples_per_second": 18.446, "eval_steps_per_second": 9.223, "step": 2200 }, { "epoch": 0.5148514851485149, "grad_norm": 8.635801315307617, "learning_rate": 1.6502265372168287e-05, "logits/chosen": -4.311502456665039, "logits/rejected": -4.282750129699707, "logps/chosen": -681.6749877929688, "logps/rejected": -721.6594848632812, "loss": 0.7712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4766474664211273, "rewards/margins": 0.3283555507659912, "rewards/rejected": -0.8050029873847961, "step": 2210 }, { "epoch": 0.5171811298776936, "grad_norm": 7.167092800140381, "learning_rate": 1.6476375404530747e-05, "logits/chosen": -4.369795799255371, "logits/rejected": -4.381664276123047, "logps/chosen": -661.0784912109375, "logps/rejected": -720.4771728515625, "loss": 0.7849, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.22546792030334473, "rewards/margins": 0.4140700399875641, "rewards/rejected": -0.6395379304885864, "step": 2220 }, { "epoch": 0.5195107746068724, "grad_norm": 6.891871929168701, "learning_rate": 1.6450485436893204e-05, "logits/chosen": -4.3150835037231445, "logits/rejected": -4.312820911407471, "logps/chosen": -732.7314453125, "logps/rejected": -758.8406982421875, "loss": 0.8171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5067173838615417, "rewards/margins": 0.19723649322986603, "rewards/rejected": -0.7039539813995361, "step": 2230 }, { "epoch": 0.5218404193360513, "grad_norm": 6.103372097015381, "learning_rate": 1.6424595469255665e-05, "logits/chosen": -4.419613838195801, "logits/rejected": -4.384373188018799, "logps/chosen": -695.9058837890625, "logps/rejected": -728.0184326171875, "loss": 0.7623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5310667157173157, "rewards/margins": 0.3870542645454407, "rewards/rejected": -0.9181209802627563, "step": 2240 }, { "epoch": 0.52417006406523, "grad_norm": 7.377810955047607, "learning_rate": 1.6398705501618125e-05, "logits/chosen": -4.379231929779053, "logits/rejected": -4.402649879455566, "logps/chosen": -715.0233154296875, "logps/rejected": -724.0562133789062, "loss": 0.7516, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4546116888523102, "rewards/margins": 0.2904551029205322, "rewards/rejected": -0.74506676197052, "step": 2250 }, { "epoch": 0.5264997087944089, "grad_norm": 9.442928314208984, "learning_rate": 1.6372815533980583e-05, "logits/chosen": -4.354384422302246, "logits/rejected": -4.3555216789245605, "logps/chosen": -670.1593627929688, "logps/rejected": -718.1063232421875, "loss": 0.819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.39854809641838074, "rewards/margins": 0.2609342634677887, "rewards/rejected": -0.6594824194908142, "step": 2260 }, { "epoch": 0.5288293535235876, "grad_norm": 10.922154426574707, "learning_rate": 1.6346925566343043e-05, "logits/chosen": -4.298701763153076, "logits/rejected": -4.350190162658691, "logps/chosen": -669.03173828125, "logps/rejected": -745.9063720703125, "loss": 0.8177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5612319707870483, "rewards/margins": 0.216721773147583, "rewards/rejected": -0.7779537439346313, "step": 2270 }, { "epoch": 0.5311589982527665, "grad_norm": 7.369058609008789, "learning_rate": 1.6321035598705504e-05, "logits/chosen": -4.28975772857666, "logits/rejected": -4.325669765472412, "logps/chosen": -692.7982177734375, "logps/rejected": -773.2313232421875, "loss": 0.7424, "rewards/accuracies": 0.625, "rewards/chosen": -0.41624245047569275, "rewards/margins": 0.27775830030441284, "rewards/rejected": -0.6940008401870728, "step": 2280 }, { "epoch": 0.5334886429819452, "grad_norm": 7.548649787902832, "learning_rate": 1.6295145631067964e-05, "logits/chosen": -4.324959754943848, "logits/rejected": -4.336320877075195, "logps/chosen": -721.7525634765625, "logps/rejected": -764.1182250976562, "loss": 0.6853, "rewards/accuracies": 0.625, "rewards/chosen": -0.4422667622566223, "rewards/margins": 0.5799342393875122, "rewards/rejected": -1.0222009420394897, "step": 2290 }, { "epoch": 0.535818287711124, "grad_norm": 8.149725914001465, "learning_rate": 1.626925566343042e-05, "logits/chosen": -4.364233016967773, "logits/rejected": -4.299471855163574, "logps/chosen": -732.43701171875, "logps/rejected": -720.9097900390625, "loss": 0.8921, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6769243478775024, "rewards/margins": 0.19919486343860626, "rewards/rejected": -0.8761193156242371, "step": 2300 }, { "epoch": 0.535818287711124, "eval_logits/chosen": -4.308585166931152, "eval_logits/rejected": -4.2970194816589355, "eval_logps/chosen": -696.9252319335938, "eval_logps/rejected": -715.663330078125, "eval_loss": 0.6475224494934082, "eval_rewards/accuracies": 0.6199328899383545, "eval_rewards/chosen": -0.60969078540802, "eval_rewards/margins": 0.2767795920372009, "eval_rewards/rejected": -0.8864704370498657, "eval_runtime": 387.8643, "eval_samples_per_second": 18.445, "eval_steps_per_second": 9.222, "step": 2300 }, { "epoch": 0.5381479324403029, "grad_norm": 8.612531661987305, "learning_rate": 1.6243365695792882e-05, "logits/chosen": -4.266479969024658, "logits/rejected": -4.361386775970459, "logps/chosen": -668.0935668945312, "logps/rejected": -717.5763549804688, "loss": 0.8491, "rewards/accuracies": 0.5, "rewards/chosen": -0.5396800637245178, "rewards/margins": 0.3209841251373291, "rewards/rejected": -0.8606641888618469, "step": 2310 }, { "epoch": 0.5404775771694816, "grad_norm": 7.291008472442627, "learning_rate": 1.621747572815534e-05, "logits/chosen": -4.2777605056762695, "logits/rejected": -4.271374702453613, "logps/chosen": -713.1057739257812, "logps/rejected": -694.7840576171875, "loss": 0.8052, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5852731466293335, "rewards/margins": 0.13405922055244446, "rewards/rejected": -0.7193323969841003, "step": 2320 }, { "epoch": 0.5428072218986605, "grad_norm": 9.134176254272461, "learning_rate": 1.61915857605178e-05, "logits/chosen": -4.368586540222168, "logits/rejected": -4.426856994628906, "logps/chosen": -713.0442504882812, "logps/rejected": -725.6788940429688, "loss": 0.7815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5852844715118408, "rewards/margins": 0.25622040033340454, "rewards/rejected": -0.8415048718452454, "step": 2330 }, { "epoch": 0.5451368666278392, "grad_norm": 6.4649763107299805, "learning_rate": 1.616569579288026e-05, "logits/chosen": -4.385904788970947, "logits/rejected": -4.340734481811523, "logps/chosen": -698.6243286132812, "logps/rejected": -749.6116943359375, "loss": 0.7694, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5972713232040405, "rewards/margins": 0.24341309070587158, "rewards/rejected": -0.8406842947006226, "step": 2340 }, { "epoch": 0.5474665113570181, "grad_norm": 8.067676544189453, "learning_rate": 1.613980582524272e-05, "logits/chosen": -4.363903999328613, "logits/rejected": -4.423806667327881, "logps/chosen": -663.022705078125, "logps/rejected": -722.1543579101562, "loss": 0.7407, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.49775949120521545, "rewards/margins": 0.2423984259366989, "rewards/rejected": -0.7401579022407532, "step": 2350 }, { "epoch": 0.5497961560861968, "grad_norm": 5.057220935821533, "learning_rate": 1.611391585760518e-05, "logits/chosen": -4.31404972076416, "logits/rejected": -4.347018718719482, "logps/chosen": -756.7380981445312, "logps/rejected": -747.169189453125, "loss": 0.7591, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5640712976455688, "rewards/margins": 0.3368191719055176, "rewards/rejected": -0.9008904695510864, "step": 2360 }, { "epoch": 0.5521258008153757, "grad_norm": 10.371302604675293, "learning_rate": 1.608802588996764e-05, "logits/chosen": -4.346640586853027, "logits/rejected": -4.344121932983398, "logps/chosen": -713.8607177734375, "logps/rejected": -741.2636108398438, "loss": 0.8372, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5607599020004272, "rewards/margins": 0.2859343886375427, "rewards/rejected": -0.84669429063797, "step": 2370 }, { "epoch": 0.5544554455445545, "grad_norm": 8.637513160705566, "learning_rate": 1.60621359223301e-05, "logits/chosen": -4.243861198425293, "logits/rejected": -4.345170021057129, "logps/chosen": -642.0486450195312, "logps/rejected": -795.27294921875, "loss": 0.7589, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.675322413444519, "rewards/margins": 0.38120463490486145, "rewards/rejected": -1.056527018547058, "step": 2380 }, { "epoch": 0.5567850902737332, "grad_norm": 9.141393661499023, "learning_rate": 1.6036245954692557e-05, "logits/chosen": -4.281918525695801, "logits/rejected": -4.320727825164795, "logps/chosen": -729.5989990234375, "logps/rejected": -715.9414672851562, "loss": 0.8928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7688027024269104, "rewards/margins": 0.02522212825715542, "rewards/rejected": -0.7940248250961304, "step": 2390 }, { "epoch": 0.5591147350029121, "grad_norm": 5.217013359069824, "learning_rate": 1.6010355987055017e-05, "logits/chosen": -4.283066272735596, "logits/rejected": -4.255189895629883, "logps/chosen": -767.1123657226562, "logps/rejected": -819.5071411132812, "loss": 0.6825, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5336806178092957, "rewards/margins": 0.6160721778869629, "rewards/rejected": -1.1497528553009033, "step": 2400 }, { "epoch": 0.5591147350029121, "eval_logits/chosen": -4.297215461730957, "eval_logits/rejected": -4.28573751449585, "eval_logps/chosen": -697.8079833984375, "eval_logps/rejected": -716.7228393554688, "eval_loss": 0.6530700922012329, "eval_rewards/accuracies": 0.6114062070846558, "eval_rewards/chosen": -0.697973906993866, "eval_rewards/margins": 0.2944377660751343, "eval_rewards/rejected": -0.9924116134643555, "eval_runtime": 388.0812, "eval_samples_per_second": 18.434, "eval_steps_per_second": 9.217, "step": 2400 }, { "epoch": 0.5614443797320908, "grad_norm": 10.828152656555176, "learning_rate": 1.5984466019417478e-05, "logits/chosen": -4.3832502365112305, "logits/rejected": -4.312473297119141, "logps/chosen": -733.7899169921875, "logps/rejected": -697.7799072265625, "loss": 0.9236, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6746879816055298, "rewards/margins": 0.05621149390935898, "rewards/rejected": -0.7308995127677917, "step": 2410 }, { "epoch": 0.5637740244612697, "grad_norm": 11.380895614624023, "learning_rate": 1.5958576051779938e-05, "logits/chosen": -4.3725385665893555, "logits/rejected": -4.367467403411865, "logps/chosen": -758.9491577148438, "logps/rejected": -784.2022094726562, "loss": 0.8901, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.6533423066139221, "rewards/margins": 0.22708959877490997, "rewards/rejected": -0.8804319500923157, "step": 2420 }, { "epoch": 0.5661036691904484, "grad_norm": 9.553884506225586, "learning_rate": 1.5932686084142395e-05, "logits/chosen": -4.271600246429443, "logits/rejected": -4.332309722900391, "logps/chosen": -682.1741943359375, "logps/rejected": -768.156494140625, "loss": 0.7798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.528712272644043, "rewards/margins": 0.4405105710029602, "rewards/rejected": -0.9692228436470032, "step": 2430 }, { "epoch": 0.5684333139196273, "grad_norm": 8.388798713684082, "learning_rate": 1.5906796116504856e-05, "logits/chosen": -4.364581108093262, "logits/rejected": -4.264370441436768, "logps/chosen": -755.1317138671875, "logps/rejected": -676.2997436523438, "loss": 0.7807, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5193127393722534, "rewards/margins": 0.5082674026489258, "rewards/rejected": -1.0275801420211792, "step": 2440 }, { "epoch": 0.5707629586488061, "grad_norm": 9.449752807617188, "learning_rate": 1.5880906148867313e-05, "logits/chosen": -4.360546588897705, "logits/rejected": -4.376889705657959, "logps/chosen": -746.4757690429688, "logps/rejected": -775.843505859375, "loss": 0.8107, "rewards/accuracies": 0.625, "rewards/chosen": -0.5038831830024719, "rewards/margins": 0.48872479796409607, "rewards/rejected": -0.9926078915596008, "step": 2450 }, { "epoch": 0.5730926033779848, "grad_norm": 8.81622314453125, "learning_rate": 1.5855016181229774e-05, "logits/chosen": -4.269045829772949, "logits/rejected": -4.288808345794678, "logps/chosen": -723.4371948242188, "logps/rejected": -742.6324462890625, "loss": 0.9359, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5083256959915161, "rewards/margins": 0.05839193984866142, "rewards/rejected": -0.5667175650596619, "step": 2460 }, { "epoch": 0.5754222481071637, "grad_norm": 7.394127368927002, "learning_rate": 1.5829126213592234e-05, "logits/chosen": -4.28656005859375, "logits/rejected": -4.279847621917725, "logps/chosen": -719.2637329101562, "logps/rejected": -764.5406494140625, "loss": 0.6321, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6179152727127075, "rewards/margins": 0.6699618101119995, "rewards/rejected": -1.287877082824707, "step": 2470 }, { "epoch": 0.5777518928363424, "grad_norm": 7.791938781738281, "learning_rate": 1.5803236245954695e-05, "logits/chosen": -4.400963306427002, "logits/rejected": -4.369585990905762, "logps/chosen": -738.0977783203125, "logps/rejected": -776.5066528320312, "loss": 0.7441, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5543312430381775, "rewards/margins": 0.5328670740127563, "rewards/rejected": -1.087198257446289, "step": 2480 }, { "epoch": 0.5800815375655213, "grad_norm": 7.783450603485107, "learning_rate": 1.5777346278317155e-05, "logits/chosen": -4.258950233459473, "logits/rejected": -4.367677688598633, "logps/chosen": -688.4526977539062, "logps/rejected": -737.1357421875, "loss": 0.7498, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5971588492393494, "rewards/margins": 0.4435792565345764, "rewards/rejected": -1.0407381057739258, "step": 2490 }, { "epoch": 0.5824111822947, "grad_norm": 8.986100196838379, "learning_rate": 1.5751456310679613e-05, "logits/chosen": -4.3454484939575195, "logits/rejected": -4.2997660636901855, "logps/chosen": -679.8911743164062, "logps/rejected": -682.4745483398438, "loss": 0.8481, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.6458913683891296, "rewards/margins": 0.020166147500276566, "rewards/rejected": -0.6660575270652771, "step": 2500 }, { "epoch": 0.5824111822947, "eval_logits/chosen": -4.292790412902832, "eval_logits/rejected": -4.281363010406494, "eval_logps/chosen": -696.9899291992188, "eval_logps/rejected": -715.8394165039062, "eval_loss": 0.6466853618621826, "eval_rewards/accuracies": 0.6182554960250854, "eval_rewards/chosen": -0.6161715388298035, "eval_rewards/margins": 0.28790563344955444, "eval_rewards/rejected": -0.9040771722793579, "eval_runtime": 388.0903, "eval_samples_per_second": 18.434, "eval_steps_per_second": 9.217, "step": 2500 }, { "epoch": 0.5847408270238789, "grad_norm": 5.886007308959961, "learning_rate": 1.5725566343042073e-05, "logits/chosen": -4.2544379234313965, "logits/rejected": -4.31434440612793, "logps/chosen": -690.2918090820312, "logps/rejected": -674.09375, "loss": 0.6412, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.342994749546051, "rewards/margins": 0.6141099333763123, "rewards/rejected": -0.9571046829223633, "step": 2510 }, { "epoch": 0.5870704717530577, "grad_norm": 7.043492794036865, "learning_rate": 1.569967637540453e-05, "logits/chosen": -4.2179460525512695, "logits/rejected": -4.281126976013184, "logps/chosen": -636.9339599609375, "logps/rejected": -724.7083129882812, "loss": 0.6445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49156904220581055, "rewards/margins": 0.5859013795852661, "rewards/rejected": -1.0774705410003662, "step": 2520 }, { "epoch": 0.5894001164822364, "grad_norm": 6.933673858642578, "learning_rate": 1.567378640776699e-05, "logits/chosen": -4.389291763305664, "logits/rejected": -4.375206470489502, "logps/chosen": -784.4617919921875, "logps/rejected": -736.8739013671875, "loss": 0.8139, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5662163496017456, "rewards/margins": 0.27462419867515564, "rewards/rejected": -0.8408406376838684, "step": 2530 }, { "epoch": 0.5917297612114153, "grad_norm": 8.004260063171387, "learning_rate": 1.564789644012945e-05, "logits/chosen": -4.330020904541016, "logits/rejected": -4.290645599365234, "logps/chosen": -658.646484375, "logps/rejected": -650.4393920898438, "loss": 0.9183, "rewards/accuracies": 0.5, "rewards/chosen": -0.6325026750564575, "rewards/margins": 0.08804898709058762, "rewards/rejected": -0.7205516695976257, "step": 2540 }, { "epoch": 0.594059405940594, "grad_norm": 6.865865230560303, "learning_rate": 1.5622006472491912e-05, "logits/chosen": -4.3161234855651855, "logits/rejected": -4.298664093017578, "logps/chosen": -713.3058471679688, "logps/rejected": -761.7860107421875, "loss": 0.8578, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5262347459793091, "rewards/margins": 0.36612358689308167, "rewards/rejected": -0.8923583030700684, "step": 2550 }, { "epoch": 0.5963890506697729, "grad_norm": 8.332757949829102, "learning_rate": 1.559611650485437e-05, "logits/chosen": -4.3331499099731445, "logits/rejected": -4.270108699798584, "logps/chosen": -711.1723022460938, "logps/rejected": -717.375732421875, "loss": 0.7714, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5592328906059265, "rewards/margins": 0.31131255626678467, "rewards/rejected": -0.870545506477356, "step": 2560 }, { "epoch": 0.5987186953989516, "grad_norm": 9.551703453063965, "learning_rate": 1.557022653721683e-05, "logits/chosen": -4.328275203704834, "logits/rejected": -4.383193016052246, "logps/chosen": -684.763671875, "logps/rejected": -722.69580078125, "loss": 0.7131, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2863122522830963, "rewards/margins": 0.4516221880912781, "rewards/rejected": -0.7379344701766968, "step": 2570 }, { "epoch": 0.6010483401281305, "grad_norm": 7.2382402420043945, "learning_rate": 1.554433656957929e-05, "logits/chosen": -4.349534511566162, "logits/rejected": -4.299520015716553, "logps/chosen": -716.7404174804688, "logps/rejected": -692.8947143554688, "loss": 0.9481, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5689787268638611, "rewards/margins": -0.02323698066174984, "rewards/rejected": -0.5457417368888855, "step": 2580 }, { "epoch": 0.6033779848573093, "grad_norm": 9.520913124084473, "learning_rate": 1.5518446601941748e-05, "logits/chosen": -4.272482395172119, "logits/rejected": -4.3258376121521, "logps/chosen": -705.7965087890625, "logps/rejected": -755.6998291015625, "loss": 0.7989, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5515230894088745, "rewards/margins": 0.3450368046760559, "rewards/rejected": -0.8965598344802856, "step": 2590 }, { "epoch": 0.605707629586488, "grad_norm": 7.175052642822266, "learning_rate": 1.5492556634304208e-05, "logits/chosen": -4.400643825531006, "logits/rejected": -4.387502193450928, "logps/chosen": -768.1902465820312, "logps/rejected": -745.0789184570312, "loss": 0.7822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4362802505493164, "rewards/margins": 0.3764461576938629, "rewards/rejected": -0.8127263784408569, "step": 2600 }, { "epoch": 0.605707629586488, "eval_logits/chosen": -4.292550563812256, "eval_logits/rejected": -4.280701160430908, "eval_logps/chosen": -696.8247680664062, "eval_logps/rejected": -715.6983642578125, "eval_loss": 0.6448931694030762, "eval_rewards/accuracies": 0.6211909651756287, "eval_rewards/chosen": -0.5996540188789368, "eval_rewards/margins": 0.29031842947006226, "eval_rewards/rejected": -0.8899723887443542, "eval_runtime": 388.4993, "eval_samples_per_second": 18.414, "eval_steps_per_second": 9.207, "step": 2600 }, { "epoch": 0.6080372743156669, "grad_norm": 3.6842851638793945, "learning_rate": 1.546666666666667e-05, "logits/chosen": -4.335328102111816, "logits/rejected": -4.325960159301758, "logps/chosen": -761.3397216796875, "logps/rejected": -758.9552001953125, "loss": 0.6935, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.34092193841934204, "rewards/margins": 0.4871430993080139, "rewards/rejected": -0.828065037727356, "step": 2610 }, { "epoch": 0.6103669190448456, "grad_norm": 9.210099220275879, "learning_rate": 1.5440776699029126e-05, "logits/chosen": -4.376111030578613, "logits/rejected": -4.404354572296143, "logps/chosen": -695.9658203125, "logps/rejected": -763.7271118164062, "loss": 0.6971, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3182308077812195, "rewards/margins": 0.586915910243988, "rewards/rejected": -0.9051467180252075, "step": 2620 }, { "epoch": 0.6126965637740245, "grad_norm": 6.681046485900879, "learning_rate": 1.5414886731391587e-05, "logits/chosen": -4.412899017333984, "logits/rejected": -4.380231857299805, "logps/chosen": -720.5208129882812, "logps/rejected": -738.626708984375, "loss": 0.8028, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6240053772926331, "rewards/margins": 0.2591809630393982, "rewards/rejected": -0.8831863403320312, "step": 2630 }, { "epoch": 0.6150262085032032, "grad_norm": 7.9586968421936035, "learning_rate": 1.5388996763754047e-05, "logits/chosen": -4.327067852020264, "logits/rejected": -4.382328033447266, "logps/chosen": -682.6461791992188, "logps/rejected": -751.2008666992188, "loss": 0.7804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5338839292526245, "rewards/margins": 0.25476861000061035, "rewards/rejected": -0.7886524796485901, "step": 2640 }, { "epoch": 0.6173558532323821, "grad_norm": 4.548120021820068, "learning_rate": 1.5363106796116508e-05, "logits/chosen": -4.392605781555176, "logits/rejected": -4.419025421142578, "logps/chosen": -731.2877197265625, "logps/rejected": -758.0394897460938, "loss": 0.7281, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.34113675355911255, "rewards/margins": 0.3821430802345276, "rewards/rejected": -0.7232798337936401, "step": 2650 }, { "epoch": 0.6196854979615609, "grad_norm": 8.346933364868164, "learning_rate": 1.5337216828478965e-05, "logits/chosen": -4.402939796447754, "logits/rejected": -4.375199794769287, "logps/chosen": -735.7860107421875, "logps/rejected": -770.9775390625, "loss": 0.7416, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.42072242498397827, "rewards/margins": 0.5455988645553589, "rewards/rejected": -0.9663212895393372, "step": 2660 }, { "epoch": 0.6220151426907397, "grad_norm": 5.262242794036865, "learning_rate": 1.5311326860841425e-05, "logits/chosen": -4.312819480895996, "logits/rejected": -4.257102012634277, "logps/chosen": -670.9732055664062, "logps/rejected": -666.3642578125, "loss": 0.7165, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4910193085670471, "rewards/margins": 0.5100225210189819, "rewards/rejected": -1.0010417699813843, "step": 2670 }, { "epoch": 0.6243447874199185, "grad_norm": 7.206348896026611, "learning_rate": 1.5285436893203886e-05, "logits/chosen": -4.353832244873047, "logits/rejected": -4.302159309387207, "logps/chosen": -736.9912109375, "logps/rejected": -758.8922119140625, "loss": 0.8675, "rewards/accuracies": 0.5, "rewards/chosen": -0.5955030918121338, "rewards/margins": 0.2457858771085739, "rewards/rejected": -0.8412889242172241, "step": 2680 }, { "epoch": 0.6266744321490972, "grad_norm": 7.944403648376465, "learning_rate": 1.5259546925566343e-05, "logits/chosen": -4.326117992401123, "logits/rejected": -4.392238616943359, "logps/chosen": -655.5966796875, "logps/rejected": -687.850341796875, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4322779178619385, "rewards/margins": 0.4284347593784332, "rewards/rejected": -0.8607127070426941, "step": 2690 }, { "epoch": 0.6290040768782761, "grad_norm": 8.794605255126953, "learning_rate": 1.5233656957928804e-05, "logits/chosen": -4.378158092498779, "logits/rejected": -4.327710151672363, "logps/chosen": -737.3948364257812, "logps/rejected": -742.9942626953125, "loss": 0.9305, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.6186449527740479, "rewards/margins": -0.0013167411088943481, "rewards/rejected": -0.6173282861709595, "step": 2700 }, { "epoch": 0.6290040768782761, "eval_logits/chosen": -4.296804904937744, "eval_logits/rejected": -4.284645080566406, "eval_logps/chosen": -696.439697265625, "eval_logps/rejected": -715.352294921875, "eval_loss": 0.6431933641433716, "eval_rewards/accuracies": 0.621330738067627, "eval_rewards/chosen": -0.5611439347267151, "eval_rewards/margins": 0.29422974586486816, "eval_rewards/rejected": -0.8553736805915833, "eval_runtime": 388.3906, "eval_samples_per_second": 18.42, "eval_steps_per_second": 9.21, "step": 2700 }, { "epoch": 0.6313337216074548, "grad_norm": 5.812639236450195, "learning_rate": 1.5207766990291264e-05, "logits/chosen": -4.331504821777344, "logits/rejected": -4.388894081115723, "logps/chosen": -657.0352172851562, "logps/rejected": -717.141845703125, "loss": 0.6821, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.497232586145401, "rewards/margins": 0.3659915328025818, "rewards/rejected": -0.8632240295410156, "step": 2710 }, { "epoch": 0.6336633663366337, "grad_norm": 8.673256874084473, "learning_rate": 1.5181877022653723e-05, "logits/chosen": -4.275472164154053, "logits/rejected": -4.281885623931885, "logps/chosen": -668.5069580078125, "logps/rejected": -711.5523071289062, "loss": 0.7998, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4561544954776764, "rewards/margins": 0.27673688530921936, "rewards/rejected": -0.7328914403915405, "step": 2720 }, { "epoch": 0.6359930110658125, "grad_norm": 7.681230545043945, "learning_rate": 1.5155987055016182e-05, "logits/chosen": -4.440752983093262, "logits/rejected": -4.328701019287109, "logps/chosen": -738.4246215820312, "logps/rejected": -693.4755249023438, "loss": 0.813, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.6215518116950989, "rewards/margins": 0.08657562732696533, "rewards/rejected": -0.7081274390220642, "step": 2730 }, { "epoch": 0.6383226557949913, "grad_norm": 7.156750679016113, "learning_rate": 1.5130097087378641e-05, "logits/chosen": -4.261274814605713, "logits/rejected": -4.308469295501709, "logps/chosen": -693.0875244140625, "logps/rejected": -768.095703125, "loss": 0.768, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6500027179718018, "rewards/margins": 0.44288986921310425, "rewards/rejected": -1.0928925275802612, "step": 2740 }, { "epoch": 0.6406523005241701, "grad_norm": 7.899952411651611, "learning_rate": 1.5104207119741102e-05, "logits/chosen": -4.327859878540039, "logits/rejected": -4.248705863952637, "logps/chosen": -768.5486450195312, "logps/rejected": -713.5155639648438, "loss": 0.6868, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.42632707953453064, "rewards/margins": 0.5251134634017944, "rewards/rejected": -0.9514405131340027, "step": 2750 }, { "epoch": 0.6429819452533488, "grad_norm": 6.4603729248046875, "learning_rate": 1.507831715210356e-05, "logits/chosen": -4.355565071105957, "logits/rejected": -4.352751731872559, "logps/chosen": -706.5877685546875, "logps/rejected": -733.3256225585938, "loss": 0.8891, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.79791259765625, "rewards/margins": 0.029533350840210915, "rewards/rejected": -0.8274458646774292, "step": 2760 }, { "epoch": 0.6453115899825277, "grad_norm": 7.2228803634643555, "learning_rate": 1.5052427184466021e-05, "logits/chosen": -4.348843574523926, "logits/rejected": -4.36469030380249, "logps/chosen": -736.6920776367188, "logps/rejected": -755.9579467773438, "loss": 0.8354, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5484541058540344, "rewards/margins": 0.36284321546554565, "rewards/rejected": -0.9112973213195801, "step": 2770 }, { "epoch": 0.6476412347117064, "grad_norm": 9.550434112548828, "learning_rate": 1.502653721682848e-05, "logits/chosen": -4.265006065368652, "logits/rejected": -4.404940128326416, "logps/chosen": -741.6475830078125, "logps/rejected": -816.3297119140625, "loss": 0.7099, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.39252057671546936, "rewards/margins": 0.39079761505126953, "rewards/rejected": -0.7833182215690613, "step": 2780 }, { "epoch": 0.6499708794408853, "grad_norm": 7.080128192901611, "learning_rate": 1.500064724919094e-05, "logits/chosen": -4.357051849365234, "logits/rejected": -4.326101303100586, "logps/chosen": -781.0487670898438, "logps/rejected": -725.7145385742188, "loss": 0.6983, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22293445467948914, "rewards/margins": 0.5743099451065063, "rewards/rejected": -0.7972443699836731, "step": 2790 }, { "epoch": 0.652300524170064, "grad_norm": 6.817419052124023, "learning_rate": 1.4974757281553401e-05, "logits/chosen": -4.357829570770264, "logits/rejected": -4.383296489715576, "logps/chosen": -741.2380981445312, "logps/rejected": -783.3194580078125, "loss": 0.8684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.686312735080719, "rewards/margins": 0.21443429589271545, "rewards/rejected": -0.9007471203804016, "step": 2800 }, { "epoch": 0.652300524170064, "eval_logits/chosen": -4.2776408195495605, "eval_logits/rejected": -4.264857292175293, "eval_logps/chosen": -697.18115234375, "eval_logps/rejected": -716.2308959960938, "eval_loss": 0.6444410085678101, "eval_rewards/accuracies": 0.6178361773490906, "eval_rewards/chosen": -0.6352859735488892, "eval_rewards/margins": 0.3079439699649811, "eval_rewards/rejected": -0.9432300329208374, "eval_runtime": 388.9003, "eval_samples_per_second": 18.395, "eval_steps_per_second": 9.198, "step": 2800 }, { "epoch": 0.6546301688992429, "grad_norm": 6.708640098571777, "learning_rate": 1.4948867313915858e-05, "logits/chosen": -4.2895307540893555, "logits/rejected": -4.2548699378967285, "logps/chosen": -744.3851318359375, "logps/rejected": -756.6453247070312, "loss": 0.9964, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.845330536365509, "rewards/margins": -0.13607561588287354, "rewards/rejected": -0.7092548608779907, "step": 2810 }, { "epoch": 0.6569598136284217, "grad_norm": 8.062914848327637, "learning_rate": 1.4922977346278317e-05, "logits/chosen": -4.304629802703857, "logits/rejected": -4.2135725021362305, "logps/chosen": -758.5049438476562, "logps/rejected": -712.3289794921875, "loss": 0.9002, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.6158826947212219, "rewards/margins": 0.14774423837661743, "rewards/rejected": -0.7636269330978394, "step": 2820 }, { "epoch": 0.6592894583576004, "grad_norm": 8.468124389648438, "learning_rate": 1.4897087378640778e-05, "logits/chosen": -4.3212080001831055, "logits/rejected": -4.310835838317871, "logps/chosen": -730.5067749023438, "logps/rejected": -711.48095703125, "loss": 0.7278, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.46248364448547363, "rewards/margins": 0.3725832998752594, "rewards/rejected": -0.8350669145584106, "step": 2830 }, { "epoch": 0.6616191030867793, "grad_norm": 5.327014446258545, "learning_rate": 1.4871197411003238e-05, "logits/chosen": -4.333258152008057, "logits/rejected": -4.298372268676758, "logps/chosen": -703.5050048828125, "logps/rejected": -738.3985595703125, "loss": 0.7743, "rewards/accuracies": 0.625, "rewards/chosen": -0.5580412745475769, "rewards/margins": 0.3654623031616211, "rewards/rejected": -0.9235035181045532, "step": 2840 }, { "epoch": 0.663948747815958, "grad_norm": 8.886383056640625, "learning_rate": 1.4845307443365697e-05, "logits/chosen": -4.331049919128418, "logits/rejected": -4.31258487701416, "logps/chosen": -711.8903198242188, "logps/rejected": -739.1852416992188, "loss": 0.9072, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.6002537608146667, "rewards/margins": -0.008847487159073353, "rewards/rejected": -0.5914062261581421, "step": 2850 }, { "epoch": 0.6662783925451369, "grad_norm": 9.955883026123047, "learning_rate": 1.4819417475728158e-05, "logits/chosen": -4.360678672790527, "logits/rejected": -4.346714973449707, "logps/chosen": -741.66357421875, "logps/rejected": -740.5900268554688, "loss": 0.7513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.43228378891944885, "rewards/margins": 0.45141610503196716, "rewards/rejected": -0.883699893951416, "step": 2860 }, { "epoch": 0.6686080372743156, "grad_norm": 7.811578750610352, "learning_rate": 1.4793527508090617e-05, "logits/chosen": -4.372070789337158, "logits/rejected": -4.431691646575928, "logps/chosen": -701.4199829101562, "logps/rejected": -754.1885986328125, "loss": 0.7976, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.565507173538208, "rewards/margins": 0.3819185197353363, "rewards/rejected": -0.9474257230758667, "step": 2870 }, { "epoch": 0.6709376820034945, "grad_norm": 7.534102439880371, "learning_rate": 1.4767637540453075e-05, "logits/chosen": -4.34897518157959, "logits/rejected": -4.343818187713623, "logps/chosen": -740.5709228515625, "logps/rejected": -758.922119140625, "loss": 0.7366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5281800627708435, "rewards/margins": 0.488368421792984, "rewards/rejected": -1.0165483951568604, "step": 2880 }, { "epoch": 0.6732673267326733, "grad_norm": 4.9338884353637695, "learning_rate": 1.4741747572815534e-05, "logits/chosen": -4.355449676513672, "logits/rejected": -4.404020309448242, "logps/chosen": -711.3448486328125, "logps/rejected": -729.7279663085938, "loss": 0.8034, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5138441920280457, "rewards/margins": 0.2933233082294464, "rewards/rejected": -0.8071675300598145, "step": 2890 }, { "epoch": 0.675596971461852, "grad_norm": 8.007110595703125, "learning_rate": 1.4715857605177995e-05, "logits/chosen": -4.389501094818115, "logits/rejected": -4.405324935913086, "logps/chosen": -758.5374755859375, "logps/rejected": -777.3524169921875, "loss": 0.7807, "rewards/accuracies": 0.625, "rewards/chosen": -0.41694846749305725, "rewards/margins": 0.5797191858291626, "rewards/rejected": -0.9966676831245422, "step": 2900 }, { "epoch": 0.675596971461852, "eval_logits/chosen": -4.294885635375977, "eval_logits/rejected": -4.282512664794922, "eval_logps/chosen": -696.2392578125, "eval_logps/rejected": -715.1339111328125, "eval_loss": 0.6404834389686584, "eval_rewards/accuracies": 0.6242661476135254, "eval_rewards/chosen": -0.5410952568054199, "eval_rewards/margins": 0.29242727160453796, "eval_rewards/rejected": -0.8335224986076355, "eval_runtime": 388.2427, "eval_samples_per_second": 18.427, "eval_steps_per_second": 9.213, "step": 2900 }, { "epoch": 0.6779266161910309, "grad_norm": 8.221184730529785, "learning_rate": 1.4689967637540454e-05, "logits/chosen": -4.374337673187256, "logits/rejected": -4.375265598297119, "logps/chosen": -738.59228515625, "logps/rejected": -785.3124389648438, "loss": 0.7185, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5105136036872864, "rewards/margins": 0.46654510498046875, "rewards/rejected": -0.9770587682723999, "step": 2910 }, { "epoch": 0.6802562609202096, "grad_norm": 9.547266006469727, "learning_rate": 1.4664077669902914e-05, "logits/chosen": -4.404740333557129, "logits/rejected": -4.382708549499512, "logps/chosen": -784.3701171875, "logps/rejected": -764.4552612304688, "loss": 0.8005, "rewards/accuracies": 0.625, "rewards/chosen": -0.2633661925792694, "rewards/margins": 0.3906458020210266, "rewards/rejected": -0.6540120244026184, "step": 2920 }, { "epoch": 0.6825859056493885, "grad_norm": 5.635769844055176, "learning_rate": 1.4638187702265373e-05, "logits/chosen": -4.346009731292725, "logits/rejected": -4.309237003326416, "logps/chosen": -778.1959228515625, "logps/rejected": -724.6881713867188, "loss": 0.8783, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6418018937110901, "rewards/margins": 0.07551275193691254, "rewards/rejected": -0.717314600944519, "step": 2930 }, { "epoch": 0.6849155503785672, "grad_norm": 8.018805503845215, "learning_rate": 1.4612297734627834e-05, "logits/chosen": -4.3149094581604, "logits/rejected": -4.2781453132629395, "logps/chosen": -681.9122314453125, "logps/rejected": -679.7462768554688, "loss": 0.9115, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.6334387063980103, "rewards/margins": 0.008119463920593262, "rewards/rejected": -0.6415581703186035, "step": 2940 }, { "epoch": 0.6872451951077461, "grad_norm": 6.547179222106934, "learning_rate": 1.4586407766990291e-05, "logits/chosen": -4.2727508544921875, "logits/rejected": -4.245587348937988, "logps/chosen": -752.4075927734375, "logps/rejected": -750.0736083984375, "loss": 0.6802, "rewards/accuracies": 0.625, "rewards/chosen": -0.3730926513671875, "rewards/margins": 0.4776512086391449, "rewards/rejected": -0.8507438898086548, "step": 2950 }, { "epoch": 0.6895748398369249, "grad_norm": 6.982612133026123, "learning_rate": 1.4560517799352752e-05, "logits/chosen": -4.291788578033447, "logits/rejected": -4.315970420837402, "logps/chosen": -657.02978515625, "logps/rejected": -682.1209716796875, "loss": 0.6605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3291148841381073, "rewards/margins": 0.6222742199897766, "rewards/rejected": -0.9513891339302063, "step": 2960 }, { "epoch": 0.6919044845661037, "grad_norm": 8.867607116699219, "learning_rate": 1.453462783171521e-05, "logits/chosen": -4.366148471832275, "logits/rejected": -4.393885612487793, "logps/chosen": -675.3447265625, "logps/rejected": -713.319580078125, "loss": 0.6773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.28730329871177673, "rewards/margins": 0.5438534617424011, "rewards/rejected": -0.8311569094657898, "step": 2970 }, { "epoch": 0.6942341292952825, "grad_norm": 8.969156265258789, "learning_rate": 1.4508737864077671e-05, "logits/chosen": -4.319128036499023, "logits/rejected": -4.303523063659668, "logps/chosen": -755.8877563476562, "logps/rejected": -751.65087890625, "loss": 0.7509, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.24285368621349335, "rewards/margins": 0.549541711807251, "rewards/rejected": -0.7923953533172607, "step": 2980 }, { "epoch": 0.6965637740244612, "grad_norm": 4.678745269775391, "learning_rate": 1.4482847896440132e-05, "logits/chosen": -4.411252021789551, "logits/rejected": -4.31979513168335, "logps/chosen": -780.011962890625, "logps/rejected": -769.9260864257812, "loss": 0.8132, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4631928503513336, "rewards/margins": 0.34427186846733093, "rewards/rejected": -0.8074647188186646, "step": 2990 }, { "epoch": 0.6988934187536401, "grad_norm": 5.628600120544434, "learning_rate": 1.445695792880259e-05, "logits/chosen": -4.367173194885254, "logits/rejected": -4.255070686340332, "logps/chosen": -774.8583984375, "logps/rejected": -759.581298828125, "loss": 0.7141, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3500550389289856, "rewards/margins": 0.35884183645248413, "rewards/rejected": -0.708896815776825, "step": 3000 }, { "epoch": 0.6988934187536401, "eval_logits/chosen": -4.2953410148620605, "eval_logits/rejected": -4.283372402191162, "eval_logps/chosen": -696.170166015625, "eval_logps/rejected": -715.119384765625, "eval_loss": 0.6390213966369629, "eval_rewards/accuracies": 0.626782238483429, "eval_rewards/chosen": -0.5341863036155701, "eval_rewards/margins": 0.2978910207748413, "eval_rewards/rejected": -0.8320773839950562, "eval_runtime": 388.9031, "eval_samples_per_second": 18.395, "eval_steps_per_second": 9.198, "step": 3000 }, { "epoch": 0.7012230634828188, "grad_norm": 8.826177597045898, "learning_rate": 1.4431067961165051e-05, "logits/chosen": -4.272387504577637, "logits/rejected": -4.336935520172119, "logps/chosen": -697.0843505859375, "logps/rejected": -790.090576171875, "loss": 0.8917, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.611141562461853, "rewards/margins": 0.14556744694709778, "rewards/rejected": -0.7567089796066284, "step": 3010 }, { "epoch": 0.7035527082119977, "grad_norm": 10.059699058532715, "learning_rate": 1.4405177993527508e-05, "logits/chosen": -4.255461692810059, "logits/rejected": -4.3038225173950195, "logps/chosen": -657.3980712890625, "logps/rejected": -669.6798095703125, "loss": 0.8871, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.6021971702575684, "rewards/margins": 0.05020788311958313, "rewards/rejected": -0.6524051427841187, "step": 3020 }, { "epoch": 0.7058823529411765, "grad_norm": 8.71876049041748, "learning_rate": 1.4379288025889969e-05, "logits/chosen": -4.3381667137146, "logits/rejected": -4.318350315093994, "logps/chosen": -715.66455078125, "logps/rejected": -674.4814453125, "loss": 0.7505, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5548633933067322, "rewards/margins": 0.2882435917854309, "rewards/rejected": -0.8431070446968079, "step": 3030 }, { "epoch": 0.7082119976703553, "grad_norm": 8.695144653320312, "learning_rate": 1.4353398058252428e-05, "logits/chosen": -4.258905410766602, "logits/rejected": -4.291356086730957, "logps/chosen": -682.4697875976562, "logps/rejected": -735.2022705078125, "loss": 0.8223, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5667489767074585, "rewards/margins": 0.16179592907428741, "rewards/rejected": -0.7285449504852295, "step": 3040 }, { "epoch": 0.7105416423995341, "grad_norm": 8.749677658081055, "learning_rate": 1.4327508090614888e-05, "logits/chosen": -4.3128132820129395, "logits/rejected": -4.305615425109863, "logps/chosen": -707.4427490234375, "logps/rejected": -725.2252807617188, "loss": 0.6859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2074352204799652, "rewards/margins": 0.5990522503852844, "rewards/rejected": -0.8064874410629272, "step": 3050 }, { "epoch": 0.7128712871287128, "grad_norm": 4.48876953125, "learning_rate": 1.4301618122977347e-05, "logits/chosen": -4.397701263427734, "logits/rejected": -4.405947208404541, "logps/chosen": -702.0162963867188, "logps/rejected": -730.04150390625, "loss": 0.7349, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.35273998975753784, "rewards/margins": 0.4463699460029602, "rewards/rejected": -0.799109935760498, "step": 3060 }, { "epoch": 0.7152009318578917, "grad_norm": 7.877706050872803, "learning_rate": 1.4275728155339808e-05, "logits/chosen": -4.2988691329956055, "logits/rejected": -4.319767951965332, "logps/chosen": -718.3621826171875, "logps/rejected": -700.2260131835938, "loss": 0.89, "rewards/accuracies": 0.5, "rewards/chosen": -0.6528037190437317, "rewards/margins": 0.040874332189559937, "rewards/rejected": -0.6936780214309692, "step": 3070 }, { "epoch": 0.7175305765870704, "grad_norm": 8.941927909851074, "learning_rate": 1.4249838187702267e-05, "logits/chosen": -4.259885787963867, "logits/rejected": -4.335204124450684, "logps/chosen": -665.210205078125, "logps/rejected": -732.52978515625, "loss": 0.8301, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5433850288391113, "rewards/margins": 0.19039340317249298, "rewards/rejected": -0.7337783575057983, "step": 3080 }, { "epoch": 0.7198602213162493, "grad_norm": 9.396485328674316, "learning_rate": 1.4223948220064725e-05, "logits/chosen": -4.220810890197754, "logits/rejected": -4.346644878387451, "logps/chosen": -667.0452880859375, "logps/rejected": -778.1810913085938, "loss": 0.666, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5383417010307312, "rewards/margins": 0.5022939443588257, "rewards/rejected": -1.0406357049942017, "step": 3090 }, { "epoch": 0.7221898660454281, "grad_norm": 10.992788314819336, "learning_rate": 1.4198058252427184e-05, "logits/chosen": -4.300179481506348, "logits/rejected": -4.309948444366455, "logps/chosen": -678.5414428710938, "logps/rejected": -774.6533203125, "loss": 0.6912, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5962709784507751, "rewards/margins": 0.4656152129173279, "rewards/rejected": -1.061886191368103, "step": 3100 }, { "epoch": 0.7221898660454281, "eval_logits/chosen": -4.290231227874756, "eval_logits/rejected": -4.278411865234375, "eval_logps/chosen": -696.8681030273438, "eval_logps/rejected": -716.00634765625, "eval_loss": 0.6407871246337891, "eval_rewards/accuracies": 0.6272015571594238, "eval_rewards/chosen": -0.603983998298645, "eval_rewards/margins": 0.3167959451675415, "eval_rewards/rejected": -0.9207799434661865, "eval_runtime": 388.9741, "eval_samples_per_second": 18.392, "eval_steps_per_second": 9.196, "step": 3100 }, { "epoch": 0.7245195107746069, "grad_norm": 9.470890998840332, "learning_rate": 1.4172168284789645e-05, "logits/chosen": -4.371718406677246, "logits/rejected": -4.242143154144287, "logps/chosen": -762.9887084960938, "logps/rejected": -700.8438110351562, "loss": 0.7672, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5022997260093689, "rewards/margins": 0.3725367486476898, "rewards/rejected": -0.8748364448547363, "step": 3110 }, { "epoch": 0.7268491555037857, "grad_norm": 7.6680521965026855, "learning_rate": 1.4146278317152104e-05, "logits/chosen": -4.336404323577881, "logits/rejected": -4.37836217880249, "logps/chosen": -727.9071044921875, "logps/rejected": -763.2144775390625, "loss": 0.7235, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3655966818332672, "rewards/margins": 0.3294195234775543, "rewards/rejected": -0.6950162649154663, "step": 3120 }, { "epoch": 0.7291788002329644, "grad_norm": 6.3448991775512695, "learning_rate": 1.4120388349514564e-05, "logits/chosen": -4.234147548675537, "logits/rejected": -4.275125503540039, "logps/chosen": -657.5973510742188, "logps/rejected": -754.4498291015625, "loss": 0.8516, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6424176096916199, "rewards/margins": 0.21146734058856964, "rewards/rejected": -0.8538848757743835, "step": 3130 }, { "epoch": 0.7315084449621433, "grad_norm": 7.723195552825928, "learning_rate": 1.4094498381877025e-05, "logits/chosen": -4.297318458557129, "logits/rejected": -4.263772964477539, "logps/chosen": -720.8884887695312, "logps/rejected": -688.8799438476562, "loss": 0.9713, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.759177029132843, "rewards/margins": -0.07643283903598785, "rewards/rejected": -0.6827441453933716, "step": 3140 }, { "epoch": 0.733838089691322, "grad_norm": 8.816356658935547, "learning_rate": 1.4068608414239484e-05, "logits/chosen": -4.3540825843811035, "logits/rejected": -4.330616474151611, "logps/chosen": -720.6648559570312, "logps/rejected": -727.0189208984375, "loss": 0.6476, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4314247667789459, "rewards/margins": 0.5888901948928833, "rewards/rejected": -1.0203149318695068, "step": 3150 }, { "epoch": 0.7361677344205009, "grad_norm": 7.842738151550293, "learning_rate": 1.4042718446601944e-05, "logits/chosen": -4.272993564605713, "logits/rejected": -4.329249382019043, "logps/chosen": -707.3156127929688, "logps/rejected": -747.498291015625, "loss": 0.6714, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5479978919029236, "rewards/margins": 0.6574041247367859, "rewards/rejected": -1.20540189743042, "step": 3160 }, { "epoch": 0.7384973791496797, "grad_norm": 6.250893592834473, "learning_rate": 1.4016828478964402e-05, "logits/chosen": -4.374791145324707, "logits/rejected": -4.3477301597595215, "logps/chosen": -748.3688354492188, "logps/rejected": -742.9495239257812, "loss": 0.809, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6412941217422485, "rewards/margins": 0.2639515995979309, "rewards/rejected": -0.9052456021308899, "step": 3170 }, { "epoch": 0.7408270238788585, "grad_norm": 6.362512588500977, "learning_rate": 1.3990938511326862e-05, "logits/chosen": -4.23079252243042, "logits/rejected": -4.321897506713867, "logps/chosen": -655.5142822265625, "logps/rejected": -756.73681640625, "loss": 0.7603, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6060266494750977, "rewards/margins": 0.39030686020851135, "rewards/rejected": -0.9963334798812866, "step": 3180 }, { "epoch": 0.7431566686080373, "grad_norm": 8.123212814331055, "learning_rate": 1.3965048543689321e-05, "logits/chosen": -4.311104774475098, "logits/rejected": -4.362451553344727, "logps/chosen": -683.316162109375, "logps/rejected": -717.951904296875, "loss": 0.9672, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7517843246459961, "rewards/margins": 0.05504413694143295, "rewards/rejected": -0.8068283796310425, "step": 3190 }, { "epoch": 0.745486313337216, "grad_norm": 9.257919311523438, "learning_rate": 1.3939158576051782e-05, "logits/chosen": -4.272576808929443, "logits/rejected": -4.2962846755981445, "logps/chosen": -701.0122680664062, "logps/rejected": -724.0266723632812, "loss": 0.8165, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5627827048301697, "rewards/margins": 0.30343493819236755, "rewards/rejected": -0.8662176132202148, "step": 3200 }, { "epoch": 0.745486313337216, "eval_logits/chosen": -4.283018112182617, "eval_logits/rejected": -4.270773410797119, "eval_logps/chosen": -697.2401733398438, "eval_logps/rejected": -716.45654296875, "eval_loss": 0.6407615542411804, "eval_rewards/accuracies": 0.6269220113754272, "eval_rewards/chosen": -0.6411851048469543, "eval_rewards/margins": 0.3246031105518341, "eval_rewards/rejected": -0.9657881259918213, "eval_runtime": 388.9661, "eval_samples_per_second": 18.392, "eval_steps_per_second": 9.196, "step": 3200 }, { "epoch": 0.7478159580663949, "grad_norm": 11.506460189819336, "learning_rate": 1.391326860841424e-05, "logits/chosen": -4.346702575683594, "logits/rejected": -4.339921951293945, "logps/chosen": -722.6053466796875, "logps/rejected": -769.1424560546875, "loss": 0.8317, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7728465795516968, "rewards/margins": 0.3128949999809265, "rewards/rejected": -1.085741639137268, "step": 3210 }, { "epoch": 0.7501456027955736, "grad_norm": 10.876580238342285, "learning_rate": 1.3887378640776701e-05, "logits/chosen": -4.32213020324707, "logits/rejected": -4.19241189956665, "logps/chosen": -683.1779174804688, "logps/rejected": -726.1195068359375, "loss": 0.8666, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7728645205497742, "rewards/margins": 0.10151295363903046, "rewards/rejected": -0.8743775486946106, "step": 3220 }, { "epoch": 0.7524752475247525, "grad_norm": 5.669135093688965, "learning_rate": 1.386148867313916e-05, "logits/chosen": -4.335180282592773, "logits/rejected": -4.385754108428955, "logps/chosen": -704.3292236328125, "logps/rejected": -790.5955200195312, "loss": 0.7836, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5992025136947632, "rewards/margins": 0.3988838493824005, "rewards/rejected": -0.9980863332748413, "step": 3230 }, { "epoch": 0.7548048922539313, "grad_norm": 11.22128963470459, "learning_rate": 1.3835598705501619e-05, "logits/chosen": -4.41721248626709, "logits/rejected": -4.297575950622559, "logps/chosen": -755.6207275390625, "logps/rejected": -690.0484619140625, "loss": 0.9106, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.6340324878692627, "rewards/margins": 0.04324830323457718, "rewards/rejected": -0.6772807836532593, "step": 3240 }, { "epoch": 0.7571345369831101, "grad_norm": 7.739163875579834, "learning_rate": 1.3809708737864078e-05, "logits/chosen": -4.304110527038574, "logits/rejected": -4.264829158782959, "logps/chosen": -683.7374877929688, "logps/rejected": -689.4002685546875, "loss": 0.8206, "rewards/accuracies": 0.5, "rewards/chosen": -0.6143852472305298, "rewards/margins": 0.10701651871204376, "rewards/rejected": -0.7214018106460571, "step": 3250 }, { "epoch": 0.7594641817122889, "grad_norm": 6.535190105438232, "learning_rate": 1.3783818770226538e-05, "logits/chosen": -4.372353553771973, "logits/rejected": -4.373166561126709, "logps/chosen": -714.5510864257812, "logps/rejected": -700.5277099609375, "loss": 0.9644, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7022789120674133, "rewards/margins": 0.01919153705239296, "rewards/rejected": -0.7214704155921936, "step": 3260 }, { "epoch": 0.7617938264414676, "grad_norm": 7.50679874420166, "learning_rate": 1.3757928802588997e-05, "logits/chosen": -4.37531852722168, "logits/rejected": -4.408667087554932, "logps/chosen": -674.136962890625, "logps/rejected": -773.3675537109375, "loss": 0.7061, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.518215537071228, "rewards/margins": 0.5985016822814941, "rewards/rejected": -1.1167173385620117, "step": 3270 }, { "epoch": 0.7641234711706465, "grad_norm": 8.066099166870117, "learning_rate": 1.3732038834951458e-05, "logits/chosen": -4.234104156494141, "logits/rejected": -4.255315780639648, "logps/chosen": -681.3992919921875, "logps/rejected": -715.1346435546875, "loss": 0.7161, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4894762635231018, "rewards/margins": 0.5325658917427063, "rewards/rejected": -1.0220420360565186, "step": 3280 }, { "epoch": 0.7664531158998252, "grad_norm": 8.645896911621094, "learning_rate": 1.3706148867313918e-05, "logits/chosen": -4.340358734130859, "logits/rejected": -4.369351387023926, "logps/chosen": -731.498291015625, "logps/rejected": -842.1834106445312, "loss": 0.6648, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5124009847640991, "rewards/margins": 0.6522801518440247, "rewards/rejected": -1.1646811962127686, "step": 3290 }, { "epoch": 0.7687827606290041, "grad_norm": 8.283692359924316, "learning_rate": 1.3680258899676377e-05, "logits/chosen": -4.37287712097168, "logits/rejected": -4.286995887756348, "logps/chosen": -746.0880126953125, "logps/rejected": -711.8284301757812, "loss": 0.7196, "rewards/accuracies": 0.625, "rewards/chosen": -0.6805142164230347, "rewards/margins": 0.4170430302619934, "rewards/rejected": -1.0975573062896729, "step": 3300 }, { "epoch": 0.7687827606290041, "eval_logits/chosen": -4.292080879211426, "eval_logits/rejected": -4.279994010925293, "eval_logps/chosen": -697.4268798828125, "eval_logps/rejected": -716.6896362304688, "eval_loss": 0.6396226286888123, "eval_rewards/accuracies": 0.6272015571594238, "eval_rewards/chosen": -0.6598604917526245, "eval_rewards/margins": 0.32923582196235657, "eval_rewards/rejected": -0.9890962839126587, "eval_runtime": 389.3547, "eval_samples_per_second": 18.374, "eval_steps_per_second": 9.187, "step": 3300 }, { "epoch": 0.7711124053581829, "grad_norm": 7.7540740966796875, "learning_rate": 1.3654368932038834e-05, "logits/chosen": -4.363582134246826, "logits/rejected": -4.354387283325195, "logps/chosen": -697.3382568359375, "logps/rejected": -747.3855590820312, "loss": 0.7666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5363968014717102, "rewards/margins": 0.40406447649002075, "rewards/rejected": -0.940461277961731, "step": 3310 }, { "epoch": 0.7734420500873617, "grad_norm": 9.846510887145996, "learning_rate": 1.3628478964401295e-05, "logits/chosen": -4.361764430999756, "logits/rejected": -4.345663070678711, "logps/chosen": -691.598388671875, "logps/rejected": -733.6590576171875, "loss": 0.8794, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7674421072006226, "rewards/margins": 0.21453304588794708, "rewards/rejected": -0.9819751977920532, "step": 3320 }, { "epoch": 0.7757716948165405, "grad_norm": 5.368443489074707, "learning_rate": 1.3602588996763756e-05, "logits/chosen": -4.414107799530029, "logits/rejected": -4.219078540802002, "logps/chosen": -830.5812377929688, "logps/rejected": -738.1531982421875, "loss": 0.779, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6906629204750061, "rewards/margins": 0.3374762237071991, "rewards/rejected": -1.0281391143798828, "step": 3330 }, { "epoch": 0.7781013395457193, "grad_norm": 8.01265811920166, "learning_rate": 1.3576699029126214e-05, "logits/chosen": -4.282806396484375, "logits/rejected": -4.28933048248291, "logps/chosen": -692.3564453125, "logps/rejected": -672.9450073242188, "loss": 0.8813, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7251057028770447, "rewards/margins": 0.05957312509417534, "rewards/rejected": -0.7846788167953491, "step": 3340 }, { "epoch": 0.7804309842748981, "grad_norm": 9.943344116210938, "learning_rate": 1.3550809061488675e-05, "logits/chosen": -4.298067569732666, "logits/rejected": -4.34145975112915, "logps/chosen": -717.462890625, "logps/rejected": -672.3321533203125, "loss": 0.7298, "rewards/accuracies": 0.625, "rewards/chosen": -0.48818421363830566, "rewards/margins": 0.4551779627799988, "rewards/rejected": -0.9433622360229492, "step": 3350 }, { "epoch": 0.7827606290040768, "grad_norm": 9.782580375671387, "learning_rate": 1.3524919093851134e-05, "logits/chosen": -4.281432151794434, "logits/rejected": -4.373194694519043, "logps/chosen": -691.5526123046875, "logps/rejected": -751.7589111328125, "loss": 0.806, "rewards/accuracies": 0.625, "rewards/chosen": -0.5640980005264282, "rewards/margins": 0.3025267422199249, "rewards/rejected": -0.8666247129440308, "step": 3360 }, { "epoch": 0.7850902737332557, "grad_norm": 11.208285331726074, "learning_rate": 1.3499029126213594e-05, "logits/chosen": -4.372630596160889, "logits/rejected": -4.34097146987915, "logps/chosen": -699.0808715820312, "logps/rejected": -725.1157836914062, "loss": 0.826, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6243829131126404, "rewards/margins": 0.31237927079200745, "rewards/rejected": -0.9367621541023254, "step": 3370 }, { "epoch": 0.7874199184624345, "grad_norm": 11.51611614227295, "learning_rate": 1.3473139158576052e-05, "logits/chosen": -4.39542293548584, "logits/rejected": -4.369442939758301, "logps/chosen": -714.1785278320312, "logps/rejected": -723.6585083007812, "loss": 0.7919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3786347508430481, "rewards/margins": 0.3001879155635834, "rewards/rejected": -0.6788226962089539, "step": 3380 }, { "epoch": 0.7897495631916133, "grad_norm": 8.331809997558594, "learning_rate": 1.3447249190938512e-05, "logits/chosen": -4.329986572265625, "logits/rejected": -4.317460536956787, "logps/chosen": -695.0831298828125, "logps/rejected": -723.6203002929688, "loss": 0.6898, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5646909475326538, "rewards/margins": 0.5181136727333069, "rewards/rejected": -1.082804560661316, "step": 3390 }, { "epoch": 0.7920792079207921, "grad_norm": 8.896123886108398, "learning_rate": 1.3421359223300971e-05, "logits/chosen": -4.404135704040527, "logits/rejected": -4.331064701080322, "logps/chosen": -750.0504760742188, "logps/rejected": -708.2687377929688, "loss": 0.6001, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20269541442394257, "rewards/margins": 0.699763834476471, "rewards/rejected": -0.9024591445922852, "step": 3400 }, { "epoch": 0.7920792079207921, "eval_logits/chosen": -4.312615394592285, "eval_logits/rejected": -4.3008599281311035, "eval_logps/chosen": -696.9474487304688, "eval_logps/rejected": -716.1688842773438, "eval_loss": 0.6369568109512329, "eval_rewards/accuracies": 0.6323735117912292, "eval_rewards/chosen": -0.6119146347045898, "eval_rewards/margins": 0.32510802149772644, "eval_rewards/rejected": -0.9370226263999939, "eval_runtime": 389.7133, "eval_samples_per_second": 18.357, "eval_steps_per_second": 9.179, "step": 3400 }, { "epoch": 0.7944088526499709, "grad_norm": 12.664941787719727, "learning_rate": 1.3395469255663432e-05, "logits/chosen": -4.360062599182129, "logits/rejected": -4.405812740325928, "logps/chosen": -747.7235717773438, "logps/rejected": -776.3199462890625, "loss": 0.8625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7826858758926392, "rewards/margins": 0.23854394257068634, "rewards/rejected": -1.0212297439575195, "step": 3410 }, { "epoch": 0.7967384973791497, "grad_norm": 6.154604434967041, "learning_rate": 1.336957928802589e-05, "logits/chosen": -4.380074501037598, "logits/rejected": -4.334290504455566, "logps/chosen": -823.052734375, "logps/rejected": -751.04443359375, "loss": 0.883, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6692854166030884, "rewards/margins": 0.17931106686592102, "rewards/rejected": -0.8485965728759766, "step": 3420 }, { "epoch": 0.7990681421083284, "grad_norm": 9.303884506225586, "learning_rate": 1.3343689320388351e-05, "logits/chosen": -4.323749542236328, "logits/rejected": -4.386262893676758, "logps/chosen": -687.6109619140625, "logps/rejected": -718.0032348632812, "loss": 0.8712, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7120986580848694, "rewards/margins": 0.1205534115433693, "rewards/rejected": -0.83265221118927, "step": 3430 }, { "epoch": 0.8013977868375073, "grad_norm": 6.182861328125, "learning_rate": 1.3317799352750812e-05, "logits/chosen": -4.345360279083252, "logits/rejected": -4.34683895111084, "logps/chosen": -705.4597778320312, "logps/rejected": -755.2239990234375, "loss": 0.6721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.40056854486465454, "rewards/margins": 0.5601202845573425, "rewards/rejected": -0.9606888890266418, "step": 3440 }, { "epoch": 0.8037274315666861, "grad_norm": 10.787919998168945, "learning_rate": 1.329190938511327e-05, "logits/chosen": -4.387537956237793, "logits/rejected": -4.398334503173828, "logps/chosen": -703.9459838867188, "logps/rejected": -728.9833374023438, "loss": 0.8532, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7066787481307983, "rewards/margins": 0.12415404617786407, "rewards/rejected": -0.8308327794075012, "step": 3450 }, { "epoch": 0.8060570762958649, "grad_norm": 11.048896789550781, "learning_rate": 1.3266019417475728e-05, "logits/chosen": -4.34719705581665, "logits/rejected": -4.3350510597229, "logps/chosen": -675.7979736328125, "logps/rejected": -711.3985595703125, "loss": 0.8836, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6477685570716858, "rewards/margins": 0.05750611424446106, "rewards/rejected": -0.7052747011184692, "step": 3460 }, { "epoch": 0.8083867210250437, "grad_norm": 7.28444242477417, "learning_rate": 1.3240129449838188e-05, "logits/chosen": -4.3619279861450195, "logits/rejected": -4.376542568206787, "logps/chosen": -731.7483520507812, "logps/rejected": -795.55859375, "loss": 0.8076, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4643213748931885, "rewards/margins": 0.3656710982322693, "rewards/rejected": -0.829992413520813, "step": 3470 }, { "epoch": 0.8107163657542225, "grad_norm": 6.876546382904053, "learning_rate": 1.3214239482200649e-05, "logits/chosen": -4.338783264160156, "logits/rejected": -4.366893768310547, "logps/chosen": -692.851806640625, "logps/rejected": -746.337890625, "loss": 0.6853, "rewards/accuracies": 0.625, "rewards/chosen": -0.5015130639076233, "rewards/margins": 0.5323377251625061, "rewards/rejected": -1.033850908279419, "step": 3480 }, { "epoch": 0.8130460104834013, "grad_norm": 7.1680707931518555, "learning_rate": 1.3188349514563108e-05, "logits/chosen": -4.326290607452393, "logits/rejected": -4.401350021362305, "logps/chosen": -669.87353515625, "logps/rejected": -743.699462890625, "loss": 0.7384, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6359532475471497, "rewards/margins": 0.24711358547210693, "rewards/rejected": -0.8830667734146118, "step": 3490 }, { "epoch": 0.81537565521258, "grad_norm": 7.081179618835449, "learning_rate": 1.3162459546925568e-05, "logits/chosen": -4.389810085296631, "logits/rejected": -4.387526035308838, "logps/chosen": -713.7439575195312, "logps/rejected": -774.56298828125, "loss": 0.8193, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6019744277000427, "rewards/margins": 0.28494006395339966, "rewards/rejected": -0.8869145512580872, "step": 3500 }, { "epoch": 0.81537565521258, "eval_logits/chosen": -4.3075971603393555, "eval_logits/rejected": -4.2960638999938965, "eval_logps/chosen": -696.4827880859375, "eval_logps/rejected": -715.6641845703125, "eval_loss": 0.6347267031669617, "eval_rewards/accuracies": 0.6347497701644897, "eval_rewards/chosen": -0.5654566884040833, "eval_rewards/margins": 0.32109642028808594, "eval_rewards/rejected": -0.8865532279014587, "eval_runtime": 389.244, "eval_samples_per_second": 18.379, "eval_steps_per_second": 9.19, "step": 3500 }, { "epoch": 0.8177052999417589, "grad_norm": 8.20933723449707, "learning_rate": 1.3136569579288027e-05, "logits/chosen": -4.342705726623535, "logits/rejected": -4.373048305511475, "logps/chosen": -630.05078125, "logps/rejected": -686.9644165039062, "loss": 0.7903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4095194339752197, "rewards/margins": 0.2709464430809021, "rewards/rejected": -0.680465817451477, "step": 3510 }, { "epoch": 0.8200349446709376, "grad_norm": 7.260538578033447, "learning_rate": 1.3110679611650488e-05, "logits/chosen": -4.398225784301758, "logits/rejected": -4.343891143798828, "logps/chosen": -702.5809936523438, "logps/rejected": -723.8179321289062, "loss": 0.9465, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8503254055976868, "rewards/margins": -0.010509604588150978, "rewards/rejected": -0.8398159146308899, "step": 3520 }, { "epoch": 0.8223645894001165, "grad_norm": 8.280593872070312, "learning_rate": 1.3084789644012945e-05, "logits/chosen": -4.371305465698242, "logits/rejected": -4.4022722244262695, "logps/chosen": -680.5316162109375, "logps/rejected": -692.623046875, "loss": 0.7366, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4734654426574707, "rewards/margins": 0.33583229780197144, "rewards/rejected": -0.8092976808547974, "step": 3530 }, { "epoch": 0.8246942341292953, "grad_norm": 9.60183048248291, "learning_rate": 1.3058899676375406e-05, "logits/chosen": -4.415801048278809, "logits/rejected": -4.346750259399414, "logps/chosen": -799.20361328125, "logps/rejected": -742.3433837890625, "loss": 0.9304, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6391052603721619, "rewards/margins": 0.060576051473617554, "rewards/rejected": -0.6996814012527466, "step": 3540 }, { "epoch": 0.8270238788584741, "grad_norm": 6.183422088623047, "learning_rate": 1.3033009708737864e-05, "logits/chosen": -4.35550594329834, "logits/rejected": -4.362612724304199, "logps/chosen": -713.9013671875, "logps/rejected": -729.5386352539062, "loss": 0.7412, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4271429479122162, "rewards/margins": 0.41919174790382385, "rewards/rejected": -0.8463346362113953, "step": 3550 }, { "epoch": 0.8293535235876529, "grad_norm": 10.527793884277344, "learning_rate": 1.3007119741100325e-05, "logits/chosen": -4.403775215148926, "logits/rejected": -4.371069431304932, "logps/chosen": -736.4136352539062, "logps/rejected": -757.3355712890625, "loss": 0.8426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5583220720291138, "rewards/margins": 0.11188054084777832, "rewards/rejected": -0.6702027320861816, "step": 3560 }, { "epoch": 0.8316831683168316, "grad_norm": 5.496523857116699, "learning_rate": 1.2981229773462784e-05, "logits/chosen": -4.342892646789551, "logits/rejected": -4.3970513343811035, "logps/chosen": -684.6142578125, "logps/rejected": -785.8450927734375, "loss": 0.76, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.35788896679878235, "rewards/margins": 0.4052864909172058, "rewards/rejected": -0.7631754279136658, "step": 3570 }, { "epoch": 0.8340128130460105, "grad_norm": 6.381747245788574, "learning_rate": 1.2955339805825244e-05, "logits/chosen": -4.360800743103027, "logits/rejected": -4.352953910827637, "logps/chosen": -691.8085327148438, "logps/rejected": -736.9182739257812, "loss": 0.7469, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.47531062364578247, "rewards/margins": 0.36524698138237, "rewards/rejected": -0.8405576944351196, "step": 3580 }, { "epoch": 0.8363424577751892, "grad_norm": 10.559829711914062, "learning_rate": 1.2929449838187705e-05, "logits/chosen": -4.344983100891113, "logits/rejected": -4.416808128356934, "logps/chosen": -693.577880859375, "logps/rejected": -742.8538818359375, "loss": 0.7386, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4921267628669739, "rewards/margins": 0.42470502853393555, "rewards/rejected": -0.9168317914009094, "step": 3590 }, { "epoch": 0.8386721025043681, "grad_norm": 4.188501834869385, "learning_rate": 1.2903559870550162e-05, "logits/chosen": -4.345074653625488, "logits/rejected": -4.432697772979736, "logps/chosen": -746.7095947265625, "logps/rejected": -737.2679443359375, "loss": 0.7706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5077939033508301, "rewards/margins": 0.286722332239151, "rewards/rejected": -0.7945162653923035, "step": 3600 }, { "epoch": 0.8386721025043681, "eval_logits/chosen": -4.317319393157959, "eval_logits/rejected": -4.3060455322265625, "eval_logps/chosen": -696.902587890625, "eval_logps/rejected": -716.1546020507812, "eval_loss": 0.6351374387741089, "eval_rewards/accuracies": 0.6350293755531311, "eval_rewards/chosen": -0.6074322462081909, "eval_rewards/margins": 0.3281627297401428, "eval_rewards/rejected": -0.9355949759483337, "eval_runtime": 390.3144, "eval_samples_per_second": 18.329, "eval_steps_per_second": 9.164, "step": 3600 }, { "epoch": 0.8410017472335469, "grad_norm": 7.669933795928955, "learning_rate": 1.2877669902912621e-05, "logits/chosen": -4.36743688583374, "logits/rejected": -4.304625511169434, "logps/chosen": -701.3809814453125, "logps/rejected": -658.0457763671875, "loss": 0.7874, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5979401469230652, "rewards/margins": 0.18311890959739685, "rewards/rejected": -0.7810591459274292, "step": 3610 }, { "epoch": 0.8433313919627257, "grad_norm": 5.987900733947754, "learning_rate": 1.2851779935275082e-05, "logits/chosen": -4.3112688064575195, "logits/rejected": -4.322850227355957, "logps/chosen": -705.4276123046875, "logps/rejected": -719.9669189453125, "loss": 0.7801, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4701387882232666, "rewards/margins": 0.3039657473564148, "rewards/rejected": -0.7741045355796814, "step": 3620 }, { "epoch": 0.8456610366919045, "grad_norm": 4.629632949829102, "learning_rate": 1.2825889967637542e-05, "logits/chosen": -4.413471221923828, "logits/rejected": -4.342827320098877, "logps/chosen": -696.3890991210938, "logps/rejected": -672.6505126953125, "loss": 0.8676, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7070828676223755, "rewards/margins": 0.12928228080272675, "rewards/rejected": -0.8363651037216187, "step": 3630 }, { "epoch": 0.8479906814210832, "grad_norm": 8.022156715393066, "learning_rate": 1.2800000000000001e-05, "logits/chosen": -4.351454734802246, "logits/rejected": -4.433553218841553, "logps/chosen": -638.5418701171875, "logps/rejected": -781.3585205078125, "loss": 0.744, "rewards/accuracies": 0.5625, "rewards/chosen": -0.428017795085907, "rewards/margins": 0.3672192096710205, "rewards/rejected": -0.7952369451522827, "step": 3640 }, { "epoch": 0.8503203261502621, "grad_norm": 6.858470439910889, "learning_rate": 1.2774110032362462e-05, "logits/chosen": -4.435838222503662, "logits/rejected": -4.332773208618164, "logps/chosen": -720.262939453125, "logps/rejected": -651.93701171875, "loss": 0.6819, "rewards/accuracies": 0.625, "rewards/chosen": -0.562780499458313, "rewards/margins": 0.36187881231307983, "rewards/rejected": -0.924659252166748, "step": 3650 }, { "epoch": 0.8526499708794408, "grad_norm": 8.157905578613281, "learning_rate": 1.274822006472492e-05, "logits/chosen": -4.371639251708984, "logits/rejected": -4.241855621337891, "logps/chosen": -706.9839477539062, "logps/rejected": -693.9249267578125, "loss": 0.8243, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6309869885444641, "rewards/margins": 0.22677846252918243, "rewards/rejected": -0.8577653765678406, "step": 3660 }, { "epoch": 0.8549796156086197, "grad_norm": 7.720125198364258, "learning_rate": 1.272233009708738e-05, "logits/chosen": -4.362655162811279, "logits/rejected": -4.343399524688721, "logps/chosen": -693.0546875, "logps/rejected": -795.5716552734375, "loss": 0.7693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6363159418106079, "rewards/margins": 0.2812911570072174, "rewards/rejected": -0.9176071286201477, "step": 3670 }, { "epoch": 0.8573092603377985, "grad_norm": 4.946926116943359, "learning_rate": 1.2696440129449838e-05, "logits/chosen": -4.334009170532227, "logits/rejected": -4.393346309661865, "logps/chosen": -661.3502197265625, "logps/rejected": -723.93505859375, "loss": 0.6161, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3838418126106262, "rewards/margins": 0.6862874031066895, "rewards/rejected": -1.0701292753219604, "step": 3680 }, { "epoch": 0.8596389050669773, "grad_norm": 7.175718307495117, "learning_rate": 1.2670550161812299e-05, "logits/chosen": -4.439593315124512, "logits/rejected": -4.346892356872559, "logps/chosen": -750.6116943359375, "logps/rejected": -738.089599609375, "loss": 0.7567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6906553506851196, "rewards/margins": 0.38243815302848816, "rewards/rejected": -1.0730934143066406, "step": 3690 }, { "epoch": 0.8619685497961561, "grad_norm": 3.485154867172241, "learning_rate": 1.2644660194174758e-05, "logits/chosen": -4.373470783233643, "logits/rejected": -4.380204200744629, "logps/chosen": -722.3649291992188, "logps/rejected": -786.5294189453125, "loss": 0.7397, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6116902232170105, "rewards/margins": 0.5988327860832214, "rewards/rejected": -1.2105228900909424, "step": 3700 }, { "epoch": 0.8619685497961561, "eval_logits/chosen": -4.296970367431641, "eval_logits/rejected": -4.285056114196777, "eval_logps/chosen": -697.7417602539062, "eval_logps/rejected": -717.1804809570312, "eval_loss": 0.6377041935920715, "eval_rewards/accuracies": 0.6340509057044983, "eval_rewards/chosen": -0.6913568377494812, "eval_rewards/margins": 0.34682154655456543, "eval_rewards/rejected": -1.0381783246994019, "eval_runtime": 390.2544, "eval_samples_per_second": 18.332, "eval_steps_per_second": 9.166, "step": 3700 }, { "epoch": 0.8642981945253349, "grad_norm": 9.30069351196289, "learning_rate": 1.2618770226537218e-05, "logits/chosen": -4.300488471984863, "logits/rejected": -4.226680755615234, "logps/chosen": -686.4176025390625, "logps/rejected": -642.7655639648438, "loss": 0.8453, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8403793573379517, "rewards/margins": 0.18287669122219086, "rewards/rejected": -1.0232560634613037, "step": 3710 }, { "epoch": 0.8666278392545137, "grad_norm": 10.053071975708008, "learning_rate": 1.2592880258899677e-05, "logits/chosen": -4.428006172180176, "logits/rejected": -4.311192512512207, "logps/chosen": -745.8179931640625, "logps/rejected": -731.64208984375, "loss": 0.7535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.61701500415802, "rewards/margins": 0.3933693468570709, "rewards/rejected": -1.0103843212127686, "step": 3720 }, { "epoch": 0.8689574839836924, "grad_norm": 10.565129280090332, "learning_rate": 1.2566990291262138e-05, "logits/chosen": -4.2700724601745605, "logits/rejected": -4.286831855773926, "logps/chosen": -701.6298828125, "logps/rejected": -780.7486572265625, "loss": 0.8448, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6534292101860046, "rewards/margins": 0.44209614396095276, "rewards/rejected": -1.0955253839492798, "step": 3730 }, { "epoch": 0.8712871287128713, "grad_norm": 6.875129699707031, "learning_rate": 1.2541100323624595e-05, "logits/chosen": -4.282231330871582, "logits/rejected": -4.344172477722168, "logps/chosen": -733.3506469726562, "logps/rejected": -777.8792724609375, "loss": 0.77, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7044860124588013, "rewards/margins": 0.4850228428840637, "rewards/rejected": -1.1895086765289307, "step": 3740 }, { "epoch": 0.8736167734420501, "grad_norm": 3.8528778553009033, "learning_rate": 1.2515210355987056e-05, "logits/chosen": -4.3902411460876465, "logits/rejected": -4.359185218811035, "logps/chosen": -715.0982666015625, "logps/rejected": -738.7059326171875, "loss": 0.832, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6878668069839478, "rewards/margins": 0.247682124376297, "rewards/rejected": -0.9355489015579224, "step": 3750 }, { "epoch": 0.8759464181712289, "grad_norm": 11.82832145690918, "learning_rate": 1.2489320388349514e-05, "logits/chosen": -4.367011070251465, "logits/rejected": -4.325486183166504, "logps/chosen": -674.1287841796875, "logps/rejected": -743.0347900390625, "loss": 0.7699, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.662690281867981, "rewards/margins": 0.4709756374359131, "rewards/rejected": -1.1336658000946045, "step": 3760 }, { "epoch": 0.8782760629004077, "grad_norm": 6.591240406036377, "learning_rate": 1.2463430420711975e-05, "logits/chosen": -4.304961204528809, "logits/rejected": -4.336878776550293, "logps/chosen": -675.7276000976562, "logps/rejected": -756.3129272460938, "loss": 0.738, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5121418833732605, "rewards/margins": 0.4193623661994934, "rewards/rejected": -0.9315041303634644, "step": 3770 }, { "epoch": 0.8806057076295865, "grad_norm": 8.733416557312012, "learning_rate": 1.2437540453074436e-05, "logits/chosen": -4.301700592041016, "logits/rejected": -4.304194450378418, "logps/chosen": -664.4769287109375, "logps/rejected": -685.9580078125, "loss": 0.8389, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.555719256401062, "rewards/margins": 0.24690046906471252, "rewards/rejected": -0.8026197552680969, "step": 3780 }, { "epoch": 0.8829353523587653, "grad_norm": 7.504303455352783, "learning_rate": 1.2411650485436894e-05, "logits/chosen": -4.322279453277588, "logits/rejected": -4.386387825012207, "logps/chosen": -702.5224609375, "logps/rejected": -734.0633544921875, "loss": 0.683, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4761506915092468, "rewards/margins": 0.6575405597686768, "rewards/rejected": -1.1336911916732788, "step": 3790 }, { "epoch": 0.885264997087944, "grad_norm": 4.952593803405762, "learning_rate": 1.2385760517799355e-05, "logits/chosen": -4.264891624450684, "logits/rejected": -4.3359832763671875, "logps/chosen": -690.0963745117188, "logps/rejected": -771.6055908203125, "loss": 0.7332, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6271868944168091, "rewards/margins": 0.535195529460907, "rewards/rejected": -1.1623823642730713, "step": 3800 }, { "epoch": 0.885264997087944, "eval_logits/chosen": -4.289893627166748, "eval_logits/rejected": -4.277091979980469, "eval_logps/chosen": -697.6959228515625, "eval_logps/rejected": -717.1419067382812, "eval_loss": 0.637164831161499, "eval_rewards/accuracies": 0.6327928304672241, "eval_rewards/chosen": -0.6867589354515076, "eval_rewards/margins": 0.3475603461265564, "eval_rewards/rejected": -1.034319281578064, "eval_runtime": 390.115, "eval_samples_per_second": 18.338, "eval_steps_per_second": 9.169, "step": 3800 }, { "epoch": 0.8875946418171229, "grad_norm": 11.472143173217773, "learning_rate": 1.2359870550161814e-05, "logits/chosen": -4.364859580993652, "logits/rejected": -4.3472161293029785, "logps/chosen": -775.5863037109375, "logps/rejected": -826.8441162109375, "loss": 0.8701, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7601150870323181, "rewards/margins": 0.15106992423534393, "rewards/rejected": -0.911185085773468, "step": 3810 }, { "epoch": 0.8899242865463017, "grad_norm": 7.065751075744629, "learning_rate": 1.2333980582524273e-05, "logits/chosen": -4.3581085205078125, "logits/rejected": -4.301484107971191, "logps/chosen": -761.5343627929688, "logps/rejected": -733.0517578125, "loss": 0.7902, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6210321187973022, "rewards/margins": 0.3615582585334778, "rewards/rejected": -0.9825904965400696, "step": 3820 }, { "epoch": 0.8922539312754805, "grad_norm": 8.05189323425293, "learning_rate": 1.2308090614886732e-05, "logits/chosen": -4.315977096557617, "logits/rejected": -4.266562461853027, "logps/chosen": -676.1387329101562, "logps/rejected": -718.7037353515625, "loss": 0.7636, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.44627365469932556, "rewards/margins": 0.37797656655311584, "rewards/rejected": -0.8242501020431519, "step": 3830 }, { "epoch": 0.8945835760046593, "grad_norm": 9.934157371520996, "learning_rate": 1.2282200647249192e-05, "logits/chosen": -4.352757930755615, "logits/rejected": -4.351375102996826, "logps/chosen": -721.6439819335938, "logps/rejected": -761.50146484375, "loss": 0.9771, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.9130200147628784, "rewards/margins": 0.052966028451919556, "rewards/rejected": -0.9659860730171204, "step": 3840 }, { "epoch": 0.8969132207338381, "grad_norm": 6.56084680557251, "learning_rate": 1.2256310679611651e-05, "logits/chosen": -4.260069847106934, "logits/rejected": -4.327987194061279, "logps/chosen": -745.3139038085938, "logps/rejected": -761.8585815429688, "loss": 0.8553, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.9242075085639954, "rewards/margins": 0.1127476915717125, "rewards/rejected": -1.0369552373886108, "step": 3850 }, { "epoch": 0.8992428654630169, "grad_norm": 7.803834438323975, "learning_rate": 1.2230420711974112e-05, "logits/chosen": -4.303910255432129, "logits/rejected": -4.297524452209473, "logps/chosen": -769.2096557617188, "logps/rejected": -766.2592163085938, "loss": 0.7203, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5617278218269348, "rewards/margins": 0.6413966417312622, "rewards/rejected": -1.2031244039535522, "step": 3860 }, { "epoch": 0.9015725101921956, "grad_norm": 9.433052062988281, "learning_rate": 1.220453074433657e-05, "logits/chosen": -4.305348873138428, "logits/rejected": -4.278184413909912, "logps/chosen": -722.5424194335938, "logps/rejected": -740.0787353515625, "loss": 0.8749, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7318505644798279, "rewards/margins": 0.2239595353603363, "rewards/rejected": -0.9558101892471313, "step": 3870 }, { "epoch": 0.9039021549213745, "grad_norm": 4.370671272277832, "learning_rate": 1.2178640776699031e-05, "logits/chosen": -4.366055488586426, "logits/rejected": -4.3162760734558105, "logps/chosen": -765.7427978515625, "logps/rejected": -722.709228515625, "loss": 0.7788, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.49548083543777466, "rewards/margins": 0.2860412001609802, "rewards/rejected": -0.7815219759941101, "step": 3880 }, { "epoch": 0.9062317996505533, "grad_norm": 7.877103328704834, "learning_rate": 1.2152750809061488e-05, "logits/chosen": -4.199087142944336, "logits/rejected": -4.245467185974121, "logps/chosen": -690.6033935546875, "logps/rejected": -745.9832153320312, "loss": 0.7376, "rewards/accuracies": 0.625, "rewards/chosen": -0.4985593259334564, "rewards/margins": 0.4773792624473572, "rewards/rejected": -0.9759386777877808, "step": 3890 }, { "epoch": 0.9085614443797321, "grad_norm": 6.403533935546875, "learning_rate": 1.2126860841423949e-05, "logits/chosen": -4.394805431365967, "logits/rejected": -4.327023506164551, "logps/chosen": -719.85791015625, "logps/rejected": -745.8244018554688, "loss": 0.6818, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5105542540550232, "rewards/margins": 0.5197178721427917, "rewards/rejected": -1.030272126197815, "step": 3900 }, { "epoch": 0.9085614443797321, "eval_logits/chosen": -4.285029411315918, "eval_logits/rejected": -4.271745204925537, "eval_logps/chosen": -697.7550048828125, "eval_logps/rejected": -717.2332763671875, "eval_loss": 0.6361972093582153, "eval_rewards/accuracies": 0.634889543056488, "eval_rewards/chosen": -0.6926815509796143, "eval_rewards/margins": 0.35078319907188416, "eval_rewards/rejected": -1.0434646606445312, "eval_runtime": 390.7908, "eval_samples_per_second": 18.306, "eval_steps_per_second": 9.153, "step": 3900 }, { "epoch": 0.9108910891089109, "grad_norm": 5.553112030029297, "learning_rate": 1.2100970873786408e-05, "logits/chosen": -4.275856018066406, "logits/rejected": -4.270997047424316, "logps/chosen": -668.0219116210938, "logps/rejected": -684.3800048828125, "loss": 0.7816, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5604637861251831, "rewards/margins": 0.280496746301651, "rewards/rejected": -0.8409605026245117, "step": 3910 }, { "epoch": 0.9132207338380897, "grad_norm": 7.20487117767334, "learning_rate": 1.2075080906148868e-05, "logits/chosen": -4.234086990356445, "logits/rejected": -4.341015815734863, "logps/chosen": -652.541259765625, "logps/rejected": -708.7343139648438, "loss": 0.7336, "rewards/accuracies": 0.625, "rewards/chosen": -0.36917343735694885, "rewards/margins": 0.36028575897216797, "rewards/rejected": -0.7294591665267944, "step": 3920 }, { "epoch": 0.9155503785672685, "grad_norm": 6.224857807159424, "learning_rate": 1.2049190938511329e-05, "logits/chosen": -4.316666603088379, "logits/rejected": -4.31832218170166, "logps/chosen": -700.3021850585938, "logps/rejected": -748.1549682617188, "loss": 0.682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6771694421768188, "rewards/margins": 0.3864041566848755, "rewards/rejected": -1.0635735988616943, "step": 3930 }, { "epoch": 0.9178800232964472, "grad_norm": 9.819120407104492, "learning_rate": 1.2023300970873788e-05, "logits/chosen": -4.288181304931641, "logits/rejected": -4.2603254318237305, "logps/chosen": -701.2803955078125, "logps/rejected": -651.3552856445312, "loss": 0.8137, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7791440486907959, "rewards/margins": 0.28175079822540283, "rewards/rejected": -1.0608948469161987, "step": 3940 }, { "epoch": 0.9202096680256261, "grad_norm": 6.487390995025635, "learning_rate": 1.1997411003236248e-05, "logits/chosen": -4.354862689971924, "logits/rejected": -4.413088798522949, "logps/chosen": -704.1953125, "logps/rejected": -725.8970947265625, "loss": 0.686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6075937747955322, "rewards/margins": 0.45311158895492554, "rewards/rejected": -1.0607054233551025, "step": 3950 }, { "epoch": 0.9225393127548049, "grad_norm": 7.161795616149902, "learning_rate": 1.1971521035598706e-05, "logits/chosen": -4.353941917419434, "logits/rejected": -4.300901412963867, "logps/chosen": -712.5975341796875, "logps/rejected": -664.6988525390625, "loss": 0.6314, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5916818380355835, "rewards/margins": 0.6264206171035767, "rewards/rejected": -1.2181024551391602, "step": 3960 }, { "epoch": 0.9248689574839837, "grad_norm": 7.680700778961182, "learning_rate": 1.1945631067961166e-05, "logits/chosen": -4.289122581481934, "logits/rejected": -4.3174662590026855, "logps/chosen": -699.622314453125, "logps/rejected": -808.3905639648438, "loss": 0.7317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4722517132759094, "rewards/margins": 0.5728135704994202, "rewards/rejected": -1.04506516456604, "step": 3970 }, { "epoch": 0.9271986022131625, "grad_norm": 6.7148613929748535, "learning_rate": 1.1919741100323625e-05, "logits/chosen": -4.370627403259277, "logits/rejected": -4.364049434661865, "logps/chosen": -672.5323486328125, "logps/rejected": -693.3355712890625, "loss": 0.6916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3713933527469635, "rewards/margins": 0.619596004486084, "rewards/rejected": -0.9909893274307251, "step": 3980 }, { "epoch": 0.9295282469423413, "grad_norm": 6.711925029754639, "learning_rate": 1.1893851132686086e-05, "logits/chosen": -4.313281059265137, "logits/rejected": -4.3168206214904785, "logps/chosen": -757.818115234375, "logps/rejected": -744.3825073242188, "loss": 0.8777, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6283093690872192, "rewards/margins": 0.17840756475925446, "rewards/rejected": -0.8067169189453125, "step": 3990 }, { "epoch": 0.9318578916715201, "grad_norm": 9.016495704650879, "learning_rate": 1.1867961165048544e-05, "logits/chosen": -4.332929611206055, "logits/rejected": -4.292237758636475, "logps/chosen": -754.0440673828125, "logps/rejected": -729.8211669921875, "loss": 0.8391, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7084426283836365, "rewards/margins": 0.30985385179519653, "rewards/rejected": -1.018296480178833, "step": 4000 }, { "epoch": 0.9318578916715201, "eval_logits/chosen": -4.2970099449157715, "eval_logits/rejected": -4.284469127655029, "eval_logps/chosen": -697.7953491210938, "eval_logps/rejected": -717.3357543945312, "eval_loss": 0.6347742080688477, "eval_rewards/accuracies": 0.6379647850990295, "eval_rewards/chosen": -0.6967126131057739, "eval_rewards/margins": 0.35700222849845886, "eval_rewards/rejected": -1.0537148714065552, "eval_runtime": 390.6719, "eval_samples_per_second": 18.312, "eval_steps_per_second": 9.156, "step": 4000 }, { "epoch": 0.9341875364006988, "grad_norm": 5.923018455505371, "learning_rate": 1.1842071197411005e-05, "logits/chosen": -4.305145263671875, "logits/rejected": -4.294079303741455, "logps/chosen": -767.0081787109375, "logps/rejected": -767.0750732421875, "loss": 0.6031, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5457931756973267, "rewards/margins": 0.7378371953964233, "rewards/rejected": -1.28363037109375, "step": 4010 }, { "epoch": 0.9365171811298777, "grad_norm": 7.910881519317627, "learning_rate": 1.1816181229773464e-05, "logits/chosen": -4.279091835021973, "logits/rejected": -4.292242527008057, "logps/chosen": -677.419189453125, "logps/rejected": -674.3132934570312, "loss": 0.8111, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7452019453048706, "rewards/margins": 0.21577151119709015, "rewards/rejected": -0.9609734416007996, "step": 4020 }, { "epoch": 0.9388468258590565, "grad_norm": 6.735974311828613, "learning_rate": 1.1790291262135923e-05, "logits/chosen": -4.291772365570068, "logits/rejected": -4.343691825866699, "logps/chosen": -713.7225341796875, "logps/rejected": -804.0164184570312, "loss": 0.6981, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5853220224380493, "rewards/margins": 0.46226024627685547, "rewards/rejected": -1.0475821495056152, "step": 4030 }, { "epoch": 0.9411764705882353, "grad_norm": 8.748640060424805, "learning_rate": 1.1764401294498382e-05, "logits/chosen": -4.29034423828125, "logits/rejected": -4.391429901123047, "logps/chosen": -731.4406127929688, "logps/rejected": -813.4442138671875, "loss": 0.8127, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8790253400802612, "rewards/margins": 0.3499372601509094, "rewards/rejected": -1.2289626598358154, "step": 4040 }, { "epoch": 0.9435061153174141, "grad_norm": 8.355563163757324, "learning_rate": 1.1738511326860842e-05, "logits/chosen": -4.349619388580322, "logits/rejected": -4.397017478942871, "logps/chosen": -756.1160278320312, "logps/rejected": -741.4420776367188, "loss": 0.8698, "rewards/accuracies": 0.5, "rewards/chosen": -0.5679250359535217, "rewards/margins": 0.10850824415683746, "rewards/rejected": -0.676433265209198, "step": 4050 }, { "epoch": 0.9458357600465929, "grad_norm": 7.7695088386535645, "learning_rate": 1.1712621359223301e-05, "logits/chosen": -4.371559143066406, "logits/rejected": -4.285519599914551, "logps/chosen": -667.3077392578125, "logps/rejected": -681.5567626953125, "loss": 0.7885, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6184073686599731, "rewards/margins": 0.35303595662117004, "rewards/rejected": -0.9714432954788208, "step": 4060 }, { "epoch": 0.9481654047757717, "grad_norm": 5.586401462554932, "learning_rate": 1.1686731391585762e-05, "logits/chosen": -4.3023362159729, "logits/rejected": -4.3370680809021, "logps/chosen": -688.8123779296875, "logps/rejected": -772.4279174804688, "loss": 0.8592, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6615897417068481, "rewards/margins": 0.2921566963195801, "rewards/rejected": -0.9537464380264282, "step": 4070 }, { "epoch": 0.9504950495049505, "grad_norm": 9.162353515625, "learning_rate": 1.1660841423948222e-05, "logits/chosen": -4.320708751678467, "logits/rejected": -4.362603187561035, "logps/chosen": -680.4637451171875, "logps/rejected": -716.41796875, "loss": 0.8136, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.632215142250061, "rewards/margins": 0.2641952633857727, "rewards/rejected": -0.896410346031189, "step": 4080 }, { "epoch": 0.9528246942341293, "grad_norm": 6.786946773529053, "learning_rate": 1.1634951456310681e-05, "logits/chosen": -4.182633399963379, "logits/rejected": -4.321324348449707, "logps/chosen": -618.3949584960938, "logps/rejected": -735.050048828125, "loss": 0.8862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.725447416305542, "rewards/margins": 0.29415830969810486, "rewards/rejected": -1.0196057558059692, "step": 4090 }, { "epoch": 0.9551543389633081, "grad_norm": 9.914709091186523, "learning_rate": 1.1609061488673142e-05, "logits/chosen": -4.354341983795166, "logits/rejected": -4.419948577880859, "logps/chosen": -713.4168090820312, "logps/rejected": -794.8353271484375, "loss": 0.9021, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7848163843154907, "rewards/margins": 0.18717719614505768, "rewards/rejected": -0.9719935655593872, "step": 4100 }, { "epoch": 0.9551543389633081, "eval_logits/chosen": -4.297923564910889, "eval_logits/rejected": -4.285905838012695, "eval_logps/chosen": -697.90234375, "eval_logps/rejected": -717.4815063476562, "eval_loss": 0.6345070600509644, "eval_rewards/accuracies": 0.6341906785964966, "eval_rewards/chosen": -0.7074149250984192, "eval_rewards/margins": 0.3608627915382385, "eval_rewards/rejected": -1.0682777166366577, "eval_runtime": 390.4537, "eval_samples_per_second": 18.322, "eval_steps_per_second": 9.161, "step": 4100 }, { "epoch": 0.9574839836924869, "grad_norm": 10.379064559936523, "learning_rate": 1.1583171521035599e-05, "logits/chosen": -4.362015724182129, "logits/rejected": -4.387961387634277, "logps/chosen": -696.187255859375, "logps/rejected": -740.3745727539062, "loss": 0.7754, "rewards/accuracies": 0.625, "rewards/chosen": -0.7311137914657593, "rewards/margins": 0.3941892087459564, "rewards/rejected": -1.125303030014038, "step": 4110 }, { "epoch": 0.9598136284216657, "grad_norm": 7.714394569396973, "learning_rate": 1.155728155339806e-05, "logits/chosen": -4.350986003875732, "logits/rejected": -4.345406532287598, "logps/chosen": -687.46435546875, "logps/rejected": -715.8142700195312, "loss": 0.7417, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8579521179199219, "rewards/margins": 0.4291927218437195, "rewards/rejected": -1.2871448993682861, "step": 4120 }, { "epoch": 0.9621432731508445, "grad_norm": 10.189233779907227, "learning_rate": 1.1531391585760518e-05, "logits/chosen": -4.308093070983887, "logits/rejected": -4.3064866065979, "logps/chosen": -720.6092529296875, "logps/rejected": -737.302490234375, "loss": 0.8021, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8064984083175659, "rewards/margins": 0.40783438086509705, "rewards/rejected": -1.2143328189849854, "step": 4130 }, { "epoch": 0.9644729178800233, "grad_norm": 7.011444091796875, "learning_rate": 1.1505501618122979e-05, "logits/chosen": -4.417510032653809, "logits/rejected": -4.422568321228027, "logps/chosen": -755.2730712890625, "logps/rejected": -800.2728881835938, "loss": 0.7996, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7368863821029663, "rewards/margins": 0.3117212653160095, "rewards/rejected": -1.048607587814331, "step": 4140 }, { "epoch": 0.966802562609202, "grad_norm": 7.239262104034424, "learning_rate": 1.1479611650485438e-05, "logits/chosen": -4.340888023376465, "logits/rejected": -4.3161115646362305, "logps/chosen": -725.8350830078125, "logps/rejected": -671.7639770507812, "loss": 0.7794, "rewards/accuracies": 0.5625, "rewards/chosen": -0.71551114320755, "rewards/margins": 0.25095969438552856, "rewards/rejected": -0.9664708971977234, "step": 4150 }, { "epoch": 0.9691322073383809, "grad_norm": 10.932368278503418, "learning_rate": 1.1453721682847898e-05, "logits/chosen": -4.345290184020996, "logits/rejected": -4.2818169593811035, "logps/chosen": -714.5953369140625, "logps/rejected": -723.066162109375, "loss": 0.902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5351029634475708, "rewards/margins": 0.3501548171043396, "rewards/rejected": -0.8852578401565552, "step": 4160 }, { "epoch": 0.9714618520675598, "grad_norm": 4.409315586090088, "learning_rate": 1.1427831715210357e-05, "logits/chosen": -4.325705528259277, "logits/rejected": -4.331601142883301, "logps/chosen": -687.9990234375, "logps/rejected": -684.64013671875, "loss": 0.7662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7611755728721619, "rewards/margins": 0.32491764426231384, "rewards/rejected": -1.0860933065414429, "step": 4170 }, { "epoch": 0.9737914967967385, "grad_norm": 6.8606085777282715, "learning_rate": 1.1401941747572816e-05, "logits/chosen": -4.298327445983887, "logits/rejected": -4.327838897705078, "logps/chosen": -691.434326171875, "logps/rejected": -719.2293090820312, "loss": 0.8632, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6933432817459106, "rewards/margins": 0.22406530380249023, "rewards/rejected": -0.9174085855484009, "step": 4180 }, { "epoch": 0.9761211415259173, "grad_norm": 8.066191673278809, "learning_rate": 1.1376051779935275e-05, "logits/chosen": -4.308099746704102, "logits/rejected": -4.330200672149658, "logps/chosen": -724.9734497070312, "logps/rejected": -770.6451416015625, "loss": 0.7989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5899969935417175, "rewards/margins": 0.49291300773620605, "rewards/rejected": -1.0829100608825684, "step": 4190 }, { "epoch": 0.9784507862550961, "grad_norm": 7.6504597663879395, "learning_rate": 1.1350161812297736e-05, "logits/chosen": -4.331873893737793, "logits/rejected": -4.344123363494873, "logps/chosen": -722.4224243164062, "logps/rejected": -752.9412231445312, "loss": 0.8485, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7468961477279663, "rewards/margins": 0.22630326449871063, "rewards/rejected": -0.9731994867324829, "step": 4200 }, { "epoch": 0.9784507862550961, "eval_logits/chosen": -4.293464660644531, "eval_logits/rejected": -4.280642509460449, "eval_logps/chosen": -697.4835815429688, "eval_logps/rejected": -717.0183715820312, "eval_loss": 0.632033109664917, "eval_rewards/accuracies": 0.6395023465156555, "eval_rewards/chosen": -0.6655304431915283, "eval_rewards/margins": 0.3564453721046448, "eval_rewards/rejected": -1.0219757556915283, "eval_runtime": 390.3572, "eval_samples_per_second": 18.327, "eval_steps_per_second": 9.163, "step": 4200 }, { "epoch": 0.9807804309842749, "grad_norm": 8.241952896118164, "learning_rate": 1.1324271844660195e-05, "logits/chosen": -4.362992286682129, "logits/rejected": -4.3978118896484375, "logps/chosen": -706.3831787109375, "logps/rejected": -748.598876953125, "loss": 0.6911, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5810929536819458, "rewards/margins": 0.5309965014457703, "rewards/rejected": -1.1120895147323608, "step": 4210 }, { "epoch": 0.9831100757134537, "grad_norm": 5.629496097564697, "learning_rate": 1.1298381877022655e-05, "logits/chosen": -4.324032306671143, "logits/rejected": -4.422591686248779, "logps/chosen": -654.5308837890625, "logps/rejected": -739.7088012695312, "loss": 0.687, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5922080278396606, "rewards/margins": 0.5977957248687744, "rewards/rejected": -1.1900036334991455, "step": 4220 }, { "epoch": 0.9854397204426325, "grad_norm": 8.188299179077148, "learning_rate": 1.1272491909385116e-05, "logits/chosen": -4.344983100891113, "logits/rejected": -4.238674640655518, "logps/chosen": -704.5607299804688, "logps/rejected": -669.7830810546875, "loss": 0.8422, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7657082080841064, "rewards/margins": 0.19472898542881012, "rewards/rejected": -0.9604371190071106, "step": 4230 }, { "epoch": 0.9877693651718114, "grad_norm": 10.56550121307373, "learning_rate": 1.1246601941747575e-05, "logits/chosen": -4.41368293762207, "logits/rejected": -4.372306823730469, "logps/chosen": -759.0302124023438, "logps/rejected": -758.4656372070312, "loss": 0.7326, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5605214834213257, "rewards/margins": 0.5081143379211426, "rewards/rejected": -1.0686357021331787, "step": 4240 }, { "epoch": 0.9900990099009901, "grad_norm": 6.3407111167907715, "learning_rate": 1.1220711974110032e-05, "logits/chosen": -4.351005554199219, "logits/rejected": -4.320067405700684, "logps/chosen": -724.1296997070312, "logps/rejected": -682.4201049804688, "loss": 0.8276, "rewards/accuracies": 0.5, "rewards/chosen": -0.4309804439544678, "rewards/margins": 0.1828249841928482, "rewards/rejected": -0.6138054728507996, "step": 4250 }, { "epoch": 0.9924286546301689, "grad_norm": 7.637398719787598, "learning_rate": 1.1194822006472492e-05, "logits/chosen": -4.266735076904297, "logits/rejected": -4.263129234313965, "logps/chosen": -711.1100463867188, "logps/rejected": -707.9876708984375, "loss": 0.8207, "rewards/accuracies": 0.5, "rewards/chosen": -0.7385729551315308, "rewards/margins": 0.1418248414993286, "rewards/rejected": -0.8803977966308594, "step": 4260 }, { "epoch": 0.9947582993593477, "grad_norm": 5.631221294403076, "learning_rate": 1.1168932038834953e-05, "logits/chosen": -4.288881778717041, "logits/rejected": -4.356893539428711, "logps/chosen": -738.98046875, "logps/rejected": -761.4144897460938, "loss": 0.659, "rewards/accuracies": 0.625, "rewards/chosen": -0.4446313977241516, "rewards/margins": 0.7082819938659668, "rewards/rejected": -1.1529133319854736, "step": 4270 }, { "epoch": 0.9970879440885265, "grad_norm": 7.247366428375244, "learning_rate": 1.1143042071197412e-05, "logits/chosen": -4.375193119049072, "logits/rejected": -4.431723117828369, "logps/chosen": -710.0206298828125, "logps/rejected": -778.4895629882812, "loss": 0.5791, "rewards/accuracies": 0.6875, "rewards/chosen": -0.515932023525238, "rewards/margins": 0.9266525506973267, "rewards/rejected": -1.4425846338272095, "step": 4280 }, { "epoch": 0.9994175888177053, "grad_norm": 7.143087387084961, "learning_rate": 1.1117152103559872e-05, "logits/chosen": -4.352514743804932, "logits/rejected": -4.303081035614014, "logps/chosen": -749.5734252929688, "logps/rejected": -768.8510131835938, "loss": 0.6417, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3860852122306824, "rewards/margins": 0.6603494882583618, "rewards/rejected": -1.0464346408843994, "step": 4290 }, { "epoch": 1.001747233546884, "grad_norm": 10.907618522644043, "learning_rate": 1.1091262135922331e-05, "logits/chosen": -4.354809284210205, "logits/rejected": -4.3103461265563965, "logps/chosen": -759.310791015625, "logps/rejected": -779.1396484375, "loss": 0.8318, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6837959289550781, "rewards/margins": 0.14305201172828674, "rewards/rejected": -0.8268479108810425, "step": 4300 }, { "epoch": 1.001747233546884, "eval_logits/chosen": -4.28951358795166, "eval_logits/rejected": -4.275976657867432, "eval_logps/chosen": -697.7381591796875, "eval_logps/rejected": -717.3543701171875, "eval_loss": 0.6335008144378662, "eval_rewards/accuracies": 0.6350293755531311, "eval_rewards/chosen": -0.6909992694854736, "eval_rewards/margins": 0.36457014083862305, "eval_rewards/rejected": -1.0555692911148071, "eval_runtime": 390.7708, "eval_samples_per_second": 18.307, "eval_steps_per_second": 9.154, "step": 4300 }, { "epoch": 1.004076878276063, "grad_norm": 8.588318824768066, "learning_rate": 1.1065372168284792e-05, "logits/chosen": -4.376956939697266, "logits/rejected": -4.33894157409668, "logps/chosen": -717.998779296875, "logps/rejected": -729.5446166992188, "loss": 0.7683, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6977962255477905, "rewards/margins": 0.3738870620727539, "rewards/rejected": -1.0716831684112549, "step": 4310 }, { "epoch": 1.0064065230052417, "grad_norm": 6.269148349761963, "learning_rate": 1.1039482200647249e-05, "logits/chosen": -4.326693534851074, "logits/rejected": -4.353607177734375, "logps/chosen": -755.3944091796875, "logps/rejected": -790.818115234375, "loss": 0.7439, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8258703947067261, "rewards/margins": 0.47228875756263733, "rewards/rejected": -1.298159122467041, "step": 4320 }, { "epoch": 1.0087361677344204, "grad_norm": 10.411598205566406, "learning_rate": 1.101359223300971e-05, "logits/chosen": -4.370461940765381, "logits/rejected": -4.218691825866699, "logps/chosen": -758.8607177734375, "logps/rejected": -717.3480224609375, "loss": 0.6571, "rewards/accuracies": 0.625, "rewards/chosen": -0.5283089876174927, "rewards/margins": 0.50538170337677, "rewards/rejected": -1.0336906909942627, "step": 4330 }, { "epoch": 1.0110658124635994, "grad_norm": 5.391559600830078, "learning_rate": 1.0987702265372168e-05, "logits/chosen": -4.347881317138672, "logits/rejected": -4.359528541564941, "logps/chosen": -708.3019409179688, "logps/rejected": -712.9531860351562, "loss": 0.7141, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7340580821037292, "rewards/margins": 0.37196987867355347, "rewards/rejected": -1.1060279607772827, "step": 4340 }, { "epoch": 1.0133954571927781, "grad_norm": 5.033149242401123, "learning_rate": 1.0961812297734629e-05, "logits/chosen": -4.371495246887207, "logits/rejected": -4.206801891326904, "logps/chosen": -752.9769287109375, "logps/rejected": -694.5695190429688, "loss": 0.8001, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7193650007247925, "rewards/margins": 0.2597826421260834, "rewards/rejected": -0.979147732257843, "step": 4350 }, { "epoch": 1.0157251019219569, "grad_norm": 7.617324352264404, "learning_rate": 1.0935922330097088e-05, "logits/chosen": -4.31460428237915, "logits/rejected": -4.267983913421631, "logps/chosen": -691.3458862304688, "logps/rejected": -702.7252807617188, "loss": 0.7861, "rewards/accuracies": 0.625, "rewards/chosen": -0.7619749903678894, "rewards/margins": 0.3660062551498413, "rewards/rejected": -1.1279813051223755, "step": 4360 }, { "epoch": 1.0180547466511356, "grad_norm": 4.813138961791992, "learning_rate": 1.0910032362459548e-05, "logits/chosen": -4.37437105178833, "logits/rejected": -4.39128303527832, "logps/chosen": -753.4616088867188, "logps/rejected": -759.0370483398438, "loss": 0.7151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6380533576011658, "rewards/margins": 0.5098127126693726, "rewards/rejected": -1.1478660106658936, "step": 4370 }, { "epoch": 1.0203843913803146, "grad_norm": 7.298201084136963, "learning_rate": 1.0884142394822009e-05, "logits/chosen": -4.26786994934082, "logits/rejected": -4.357375621795654, "logps/chosen": -680.5759887695312, "logps/rejected": -726.3565673828125, "loss": 0.9456, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8680903315544128, "rewards/margins": 0.08186836540699005, "rewards/rejected": -0.9499586820602417, "step": 4380 }, { "epoch": 1.0227140361094933, "grad_norm": 8.440717697143555, "learning_rate": 1.0858252427184466e-05, "logits/chosen": -4.3552470207214355, "logits/rejected": -4.355259895324707, "logps/chosen": -598.6309814453125, "logps/rejected": -722.0578002929688, "loss": 0.732, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7806925177574158, "rewards/margins": 0.5147355198860168, "rewards/rejected": -1.2954281568527222, "step": 4390 }, { "epoch": 1.025043680838672, "grad_norm": 8.4296236038208, "learning_rate": 1.0832362459546925e-05, "logits/chosen": -4.287355899810791, "logits/rejected": -4.293883323669434, "logps/chosen": -703.4840087890625, "logps/rejected": -756.8140869140625, "loss": 0.7625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7262560725212097, "rewards/margins": 0.43259239196777344, "rewards/rejected": -1.158848524093628, "step": 4400 }, { "epoch": 1.025043680838672, "eval_logits/chosen": -4.290392875671387, "eval_logits/rejected": -4.277007102966309, "eval_logps/chosen": -698.2494506835938, "eval_logps/rejected": -717.9976806640625, "eval_loss": 0.6358251571655273, "eval_rewards/accuracies": 0.6337713003158569, "eval_rewards/chosen": -0.7421231269836426, "eval_rewards/margins": 0.3777867555618286, "eval_rewards/rejected": -1.1199098825454712, "eval_runtime": 392.2028, "eval_samples_per_second": 18.241, "eval_steps_per_second": 9.12, "step": 4400 }, { "epoch": 1.027373325567851, "grad_norm": 9.419212341308594, "learning_rate": 1.0806472491909386e-05, "logits/chosen": -4.247093200683594, "logits/rejected": -4.255812644958496, "logps/chosen": -731.6544189453125, "logps/rejected": -766.1512451171875, "loss": 0.7624, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5727885961532593, "rewards/margins": 0.5212075114250183, "rewards/rejected": -1.0939961671829224, "step": 4410 }, { "epoch": 1.0297029702970297, "grad_norm": 6.185367107391357, "learning_rate": 1.0780582524271846e-05, "logits/chosen": -4.298975944519043, "logits/rejected": -4.33304500579834, "logps/chosen": -684.7571411132812, "logps/rejected": -752.9080810546875, "loss": 0.6736, "rewards/accuracies": 0.625, "rewards/chosen": -0.694426417350769, "rewards/margins": 0.5212685465812683, "rewards/rejected": -1.2156950235366821, "step": 4420 }, { "epoch": 1.0320326150262085, "grad_norm": 8.520529747009277, "learning_rate": 1.0754692556634305e-05, "logits/chosen": -4.316971778869629, "logits/rejected": -4.40161657333374, "logps/chosen": -700.2435302734375, "logps/rejected": -781.5968627929688, "loss": 0.8329, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6743239760398865, "rewards/margins": 0.26052436232566833, "rewards/rejected": -0.9348483085632324, "step": 4430 }, { "epoch": 1.0343622597553872, "grad_norm": 8.941186904907227, "learning_rate": 1.0728802588996766e-05, "logits/chosen": -4.344968318939209, "logits/rejected": -4.354222774505615, "logps/chosen": -727.3234252929688, "logps/rejected": -796.7506103515625, "loss": 0.7991, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6619983911514282, "rewards/margins": 0.3960326611995697, "rewards/rejected": -1.0580310821533203, "step": 4440 }, { "epoch": 1.0366919044845662, "grad_norm": 9.501112937927246, "learning_rate": 1.0702912621359225e-05, "logits/chosen": -4.3322954177856445, "logits/rejected": -4.332674026489258, "logps/chosen": -724.5609130859375, "logps/rejected": -736.2202758789062, "loss": 0.554, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5000070929527283, "rewards/margins": 0.7766839265823364, "rewards/rejected": -1.276691198348999, "step": 4450 }, { "epoch": 1.039021549213745, "grad_norm": 8.592558860778809, "learning_rate": 1.0677022653721685e-05, "logits/chosen": -4.405747413635254, "logits/rejected": -4.35235071182251, "logps/chosen": -729.2918701171875, "logps/rejected": -749.1801147460938, "loss": 0.7921, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7027440071105957, "rewards/margins": 0.24186758697032928, "rewards/rejected": -0.9446115493774414, "step": 4460 }, { "epoch": 1.0413511939429236, "grad_norm": 8.168656349182129, "learning_rate": 1.0651132686084142e-05, "logits/chosen": -4.345917701721191, "logits/rejected": -4.364696025848389, "logps/chosen": -699.2119140625, "logps/rejected": -736.5466918945312, "loss": 0.8149, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6802065968513489, "rewards/margins": 0.30877023935317993, "rewards/rejected": -0.9889768362045288, "step": 4470 }, { "epoch": 1.0436808386721026, "grad_norm": 5.467567443847656, "learning_rate": 1.0625242718446603e-05, "logits/chosen": -4.320838928222656, "logits/rejected": -4.439824104309082, "logps/chosen": -714.8374633789062, "logps/rejected": -820.0277099609375, "loss": 0.8304, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8643010258674622, "rewards/margins": 0.19848057627677917, "rewards/rejected": -1.062781572341919, "step": 4480 }, { "epoch": 1.0460104834012813, "grad_norm": 10.389025688171387, "learning_rate": 1.0599352750809062e-05, "logits/chosen": -4.241874694824219, "logits/rejected": -4.292876243591309, "logps/chosen": -666.0745239257812, "logps/rejected": -695.1250610351562, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6902895569801331, "rewards/margins": 0.4029058814048767, "rewards/rejected": -1.0931955575942993, "step": 4490 }, { "epoch": 1.04834012813046, "grad_norm": 8.474845886230469, "learning_rate": 1.0573462783171522e-05, "logits/chosen": -4.303086280822754, "logits/rejected": -4.328011512756348, "logps/chosen": -727.4135131835938, "logps/rejected": -791.6959228515625, "loss": 0.625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5062735676765442, "rewards/margins": 0.6144469976425171, "rewards/rejected": -1.120720624923706, "step": 4500 }, { "epoch": 1.04834012813046, "eval_logits/chosen": -4.285947799682617, "eval_logits/rejected": -4.272801399230957, "eval_logps/chosen": -698.516357421875, "eval_logps/rejected": -718.3611450195312, "eval_loss": 0.6368241310119629, "eval_rewards/accuracies": 0.6340509057044983, "eval_rewards/chosen": -0.7688109278678894, "eval_rewards/margins": 0.3874339163303375, "eval_rewards/rejected": -1.1562447547912598, "eval_runtime": 391.1455, "eval_samples_per_second": 18.29, "eval_steps_per_second": 9.145, "step": 4500 }, { "epoch": 1.0506697728596388, "grad_norm": 5.533942699432373, "learning_rate": 1.0547572815533981e-05, "logits/chosen": -4.259266376495361, "logits/rejected": -4.302947521209717, "logps/chosen": -702.03857421875, "logps/rejected": -733.381591796875, "loss": 0.7977, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7158435583114624, "rewards/margins": 0.3514593541622162, "rewards/rejected": -1.067302942276001, "step": 4510 }, { "epoch": 1.0529994175888178, "grad_norm": 8.337974548339844, "learning_rate": 1.0521682847896442e-05, "logits/chosen": -4.3287506103515625, "logits/rejected": -4.309664249420166, "logps/chosen": -723.6080322265625, "logps/rejected": -692.8987426757812, "loss": 0.7763, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7586453557014465, "rewards/margins": 0.3815062940120697, "rewards/rejected": -1.1401516199111938, "step": 4520 }, { "epoch": 1.0553290623179965, "grad_norm": 6.677039623260498, "learning_rate": 1.0495792880258902e-05, "logits/chosen": -4.355586051940918, "logits/rejected": -4.369414329528809, "logps/chosen": -717.1536254882812, "logps/rejected": -761.757080078125, "loss": 0.6711, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8186143040657043, "rewards/margins": 0.6329285502433777, "rewards/rejected": -1.451542854309082, "step": 4530 }, { "epoch": 1.0576587070471752, "grad_norm": 7.527097225189209, "learning_rate": 1.046990291262136e-05, "logits/chosen": -4.2965521812438965, "logits/rejected": -4.304562568664551, "logps/chosen": -722.9425048828125, "logps/rejected": -719.3508911132812, "loss": 0.8067, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8394044041633606, "rewards/margins": 0.35650086402893066, "rewards/rejected": -1.195905327796936, "step": 4540 }, { "epoch": 1.0599883517763542, "grad_norm": 6.02723503112793, "learning_rate": 1.0444012944983818e-05, "logits/chosen": -4.2509379386901855, "logits/rejected": -4.236239910125732, "logps/chosen": -683.6402587890625, "logps/rejected": -730.5028076171875, "loss": 0.7061, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.834450900554657, "rewards/margins": 0.5139259696006775, "rewards/rejected": -1.348376989364624, "step": 4550 }, { "epoch": 1.062317996505533, "grad_norm": 10.399433135986328, "learning_rate": 1.0418122977346279e-05, "logits/chosen": -4.382308006286621, "logits/rejected": -4.351099967956543, "logps/chosen": -766.7814331054688, "logps/rejected": -750.8299560546875, "loss": 0.7899, "rewards/accuracies": 0.625, "rewards/chosen": -0.8868049383163452, "rewards/margins": 0.38993802666664124, "rewards/rejected": -1.276742935180664, "step": 4560 }, { "epoch": 1.0646476412347117, "grad_norm": 8.421102523803711, "learning_rate": 1.0392233009708738e-05, "logits/chosen": -4.400119304656982, "logits/rejected": -4.347952365875244, "logps/chosen": -761.1624755859375, "logps/rejected": -763.1273193359375, "loss": 0.7957, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9390009641647339, "rewards/margins": 0.27838605642318726, "rewards/rejected": -1.217387080192566, "step": 4570 }, { "epoch": 1.0669772859638904, "grad_norm": 8.368112564086914, "learning_rate": 1.0366343042071198e-05, "logits/chosen": -4.340146064758301, "logits/rejected": -4.391610145568848, "logps/chosen": -744.6365966796875, "logps/rejected": -738.5140991210938, "loss": 0.6738, "rewards/accuracies": 0.625, "rewards/chosen": -0.7065492272377014, "rewards/margins": 0.4860905706882477, "rewards/rejected": -1.1926395893096924, "step": 4580 }, { "epoch": 1.0693069306930694, "grad_norm": 5.831106185913086, "learning_rate": 1.0340453074433659e-05, "logits/chosen": -4.410943031311035, "logits/rejected": -4.265590667724609, "logps/chosen": -800.0321044921875, "logps/rejected": -748.1279907226562, "loss": 0.7077, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5919022560119629, "rewards/margins": 0.45542460680007935, "rewards/rejected": -1.047326922416687, "step": 4590 }, { "epoch": 1.071636575422248, "grad_norm": 10.359914779663086, "learning_rate": 1.0314563106796118e-05, "logits/chosen": -4.424242973327637, "logits/rejected": -4.363162040710449, "logps/chosen": -846.7982177734375, "logps/rejected": -757.5208129882812, "loss": 0.8212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9542733430862427, "rewards/margins": 0.23269712924957275, "rewards/rejected": -1.1869704723358154, "step": 4600 }, { "epoch": 1.071636575422248, "eval_logits/chosen": -4.292172431945801, "eval_logits/rejected": -4.279298305511475, "eval_logps/chosen": -698.2608032226562, "eval_logps/rejected": -718.0540771484375, "eval_loss": 0.6358359456062317, "eval_rewards/accuracies": 0.6313950419425964, "eval_rewards/chosen": -0.743259608745575, "eval_rewards/margins": 0.3822806179523468, "eval_rewards/rejected": -1.1255401372909546, "eval_runtime": 391.0127, "eval_samples_per_second": 18.296, "eval_steps_per_second": 9.148, "step": 4600 }, { "epoch": 1.0739662201514268, "grad_norm": 10.056096076965332, "learning_rate": 1.0288673139158575e-05, "logits/chosen": -4.429704666137695, "logits/rejected": -4.434037208557129, "logps/chosen": -787.2510986328125, "logps/rejected": -794.8635864257812, "loss": 0.616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5970505475997925, "rewards/margins": 0.8173857927322388, "rewards/rejected": -1.4144362211227417, "step": 4610 }, { "epoch": 1.0762958648806058, "grad_norm": 8.542181968688965, "learning_rate": 1.0262783171521036e-05, "logits/chosen": -4.355952262878418, "logits/rejected": -4.35338020324707, "logps/chosen": -760.3809814453125, "logps/rejected": -771.689453125, "loss": 0.8748, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.9295045733451843, "rewards/margins": 0.21732263267040253, "rewards/rejected": -1.1468273401260376, "step": 4620 }, { "epoch": 1.0786255096097845, "grad_norm": 9.360555648803711, "learning_rate": 1.0236893203883496e-05, "logits/chosen": -4.3113322257995605, "logits/rejected": -4.3568220138549805, "logps/chosen": -703.3505859375, "logps/rejected": -716.5987548828125, "loss": 0.8138, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5865589380264282, "rewards/margins": 0.2181408703327179, "rewards/rejected": -0.8046997785568237, "step": 4630 }, { "epoch": 1.0809551543389633, "grad_norm": 6.834812164306641, "learning_rate": 1.0211003236245955e-05, "logits/chosen": -4.268773078918457, "logits/rejected": -4.2996439933776855, "logps/chosen": -677.6121826171875, "logps/rejected": -732.3841552734375, "loss": 0.7718, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9201027154922485, "rewards/margins": 0.29583680629730225, "rewards/rejected": -1.2159394025802612, "step": 4640 }, { "epoch": 1.083284799068142, "grad_norm": 9.268996238708496, "learning_rate": 1.0185113268608416e-05, "logits/chosen": -4.294376850128174, "logits/rejected": -4.4644975662231445, "logps/chosen": -689.6966552734375, "logps/rejected": -825.1298828125, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": -0.5040693879127502, "rewards/margins": 0.5739471316337585, "rewards/rejected": -1.0780165195465088, "step": 4650 }, { "epoch": 1.085614443797321, "grad_norm": 6.353123188018799, "learning_rate": 1.0159223300970875e-05, "logits/chosen": -4.318959712982178, "logits/rejected": -4.395209312438965, "logps/chosen": -689.1337280273438, "logps/rejected": -755.0162353515625, "loss": 0.6467, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.33966705203056335, "rewards/margins": 0.7350815534591675, "rewards/rejected": -1.0747486352920532, "step": 4660 }, { "epoch": 1.0879440885264997, "grad_norm": 9.633374214172363, "learning_rate": 1.0133333333333335e-05, "logits/chosen": -4.425009727478027, "logits/rejected": -4.371664524078369, "logps/chosen": -812.9774169921875, "logps/rejected": -719.8851928710938, "loss": 0.6838, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.70526123046875, "rewards/margins": 0.6213148236274719, "rewards/rejected": -1.3265759944915771, "step": 4670 }, { "epoch": 1.0902737332556784, "grad_norm": 7.010782241821289, "learning_rate": 1.0107443365695792e-05, "logits/chosen": -4.395978927612305, "logits/rejected": -4.355780124664307, "logps/chosen": -761.0682373046875, "logps/rejected": -733.3692626953125, "loss": 0.5467, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5477423071861267, "rewards/margins": 0.8993877172470093, "rewards/rejected": -1.4471299648284912, "step": 4680 }, { "epoch": 1.0926033779848574, "grad_norm": 10.430981636047363, "learning_rate": 1.0081553398058253e-05, "logits/chosen": -4.359240531921387, "logits/rejected": -4.325311183929443, "logps/chosen": -746.3953857421875, "logps/rejected": -761.3292846679688, "loss": 0.8116, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6973962187767029, "rewards/margins": 0.5142523646354675, "rewards/rejected": -1.21164870262146, "step": 4690 }, { "epoch": 1.0949330227140361, "grad_norm": 8.63824462890625, "learning_rate": 1.0055663430420712e-05, "logits/chosen": -4.394569396972656, "logits/rejected": -4.347747325897217, "logps/chosen": -779.9010009765625, "logps/rejected": -774.9115600585938, "loss": 0.8804, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8874024152755737, "rewards/margins": 0.16095098853111267, "rewards/rejected": -1.0483534336090088, "step": 4700 }, { "epoch": 1.0949330227140361, "eval_logits/chosen": -4.293430805206299, "eval_logits/rejected": -4.280736923217773, "eval_logps/chosen": -698.022705078125, "eval_logps/rejected": -717.76416015625, "eval_loss": 0.6339041590690613, "eval_rewards/accuracies": 0.6339110732078552, "eval_rewards/chosen": -0.7194374203681946, "eval_rewards/margins": 0.3771189749240875, "eval_rewards/rejected": -1.096556544303894, "eval_runtime": 392.0836, "eval_samples_per_second": 18.246, "eval_steps_per_second": 9.123, "step": 4700 }, { "epoch": 1.0972626674432149, "grad_norm": 6.816390037536621, "learning_rate": 1.0029773462783172e-05, "logits/chosen": -4.264836311340332, "logits/rejected": -4.192754745483398, "logps/chosen": -699.5757446289062, "logps/rejected": -689.8859252929688, "loss": 0.7551, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7004824876785278, "rewards/margins": 0.3938553035259247, "rewards/rejected": -1.094337821006775, "step": 4710 }, { "epoch": 1.0995923121723936, "grad_norm": 6.77658748626709, "learning_rate": 1.0003883495145631e-05, "logits/chosen": -4.375441551208496, "logits/rejected": -4.27400541305542, "logps/chosen": -765.828369140625, "logps/rejected": -741.5504150390625, "loss": 0.7337, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6998693943023682, "rewards/margins": 0.4171157777309418, "rewards/rejected": -1.1169852018356323, "step": 4720 }, { "epoch": 1.1019219569015726, "grad_norm": 7.092930793762207, "learning_rate": 9.977993527508092e-06, "logits/chosen": -4.354989051818848, "logits/rejected": -4.303897380828857, "logps/chosen": -743.7265625, "logps/rejected": -713.6380615234375, "loss": 0.6894, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5861555933952332, "rewards/margins": 0.5605015158653259, "rewards/rejected": -1.1466572284698486, "step": 4730 }, { "epoch": 1.1042516016307513, "grad_norm": 8.570871353149414, "learning_rate": 9.95210355987055e-06, "logits/chosen": -4.348479747772217, "logits/rejected": -4.361424922943115, "logps/chosen": -715.3938598632812, "logps/rejected": -732.878662109375, "loss": 0.8737, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7268328666687012, "rewards/margins": 0.22926625609397888, "rewards/rejected": -0.9560991525650024, "step": 4740 }, { "epoch": 1.10658124635993, "grad_norm": 6.749851703643799, "learning_rate": 9.926213592233011e-06, "logits/chosen": -4.329443454742432, "logits/rejected": -4.359018802642822, "logps/chosen": -757.46630859375, "logps/rejected": -846.7803955078125, "loss": 0.8767, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8326984643936157, "rewards/margins": 0.28897660970687866, "rewards/rejected": -1.1216751337051392, "step": 4750 }, { "epoch": 1.108910891089109, "grad_norm": 6.992388725280762, "learning_rate": 9.90032362459547e-06, "logits/chosen": -4.314213752746582, "logits/rejected": -4.2905144691467285, "logps/chosen": -699.3799438476562, "logps/rejected": -697.0438232421875, "loss": 0.8799, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9238218069076538, "rewards/margins": 0.16288027167320251, "rewards/rejected": -1.0867021083831787, "step": 4760 }, { "epoch": 1.1112405358182877, "grad_norm": 8.790740013122559, "learning_rate": 9.874433656957929e-06, "logits/chosen": -4.342259407043457, "logits/rejected": -4.310133934020996, "logps/chosen": -740.4301147460938, "logps/rejected": -730.6539306640625, "loss": 0.8358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.703795313835144, "rewards/margins": 0.34042343497276306, "rewards/rejected": -1.0442187786102295, "step": 4770 }, { "epoch": 1.1135701805474665, "grad_norm": 8.854411125183105, "learning_rate": 9.84854368932039e-06, "logits/chosen": -4.325991630554199, "logits/rejected": -4.3423686027526855, "logps/chosen": -649.7286376953125, "logps/rejected": -722.8423461914062, "loss": 0.8412, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6523281335830688, "rewards/margins": 0.2276729792356491, "rewards/rejected": -0.8800011873245239, "step": 4780 }, { "epoch": 1.1158998252766452, "grad_norm": 9.450496673583984, "learning_rate": 9.822653721682848e-06, "logits/chosen": -4.22884464263916, "logits/rejected": -4.330252170562744, "logps/chosen": -689.923583984375, "logps/rejected": -777.794677734375, "loss": 0.8275, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7842633724212646, "rewards/margins": 0.30382487177848816, "rewards/rejected": -1.0880881547927856, "step": 4790 }, { "epoch": 1.1182294700058242, "grad_norm": 9.24414348602295, "learning_rate": 9.796763754045309e-06, "logits/chosen": -4.3309173583984375, "logits/rejected": -4.296918869018555, "logps/chosen": -722.8944091796875, "logps/rejected": -711.1109619140625, "loss": 0.7236, "rewards/accuracies": 0.625, "rewards/chosen": -0.7419768571853638, "rewards/margins": 0.45678672194480896, "rewards/rejected": -1.1987636089324951, "step": 4800 }, { "epoch": 1.1182294700058242, "eval_logits/chosen": -4.296524524688721, "eval_logits/rejected": -4.28386116027832, "eval_logps/chosen": -697.907470703125, "eval_logps/rejected": -717.6570434570312, "eval_loss": 0.6322656869888306, "eval_rewards/accuracies": 0.634889543056488, "eval_rewards/chosen": -0.7079183459281921, "eval_rewards/margins": 0.37791895866394043, "eval_rewards/rejected": -1.0858372449874878, "eval_runtime": 391.976, "eval_samples_per_second": 18.251, "eval_steps_per_second": 9.126, "step": 4800 }, { "epoch": 1.120559114735003, "grad_norm": 8.10059928894043, "learning_rate": 9.770873786407768e-06, "logits/chosen": -4.268490791320801, "logits/rejected": -4.312201499938965, "logps/chosen": -658.3656005859375, "logps/rejected": -724.805419921875, "loss": 0.7867, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6494590044021606, "rewards/margins": 0.30502957105636597, "rewards/rejected": -0.9544886350631714, "step": 4810 }, { "epoch": 1.1228887594641817, "grad_norm": 7.271373271942139, "learning_rate": 9.744983818770227e-06, "logits/chosen": -4.365329265594482, "logits/rejected": -4.415966987609863, "logps/chosen": -647.1744384765625, "logps/rejected": -726.2669677734375, "loss": 0.7327, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4994463324546814, "rewards/margins": 0.5641657114028931, "rewards/rejected": -1.0636119842529297, "step": 4820 }, { "epoch": 1.1252184041933604, "grad_norm": 9.720874786376953, "learning_rate": 9.719093851132687e-06, "logits/chosen": -4.329529762268066, "logits/rejected": -4.333421230316162, "logps/chosen": -696.6600341796875, "logps/rejected": -689.1129760742188, "loss": 0.7579, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6637765765190125, "rewards/margins": 0.37587255239486694, "rewards/rejected": -1.039649248123169, "step": 4830 }, { "epoch": 1.1275480489225393, "grad_norm": 5.86013650894165, "learning_rate": 9.693203883495146e-06, "logits/chosen": -4.383194923400879, "logits/rejected": -4.380536079406738, "logps/chosen": -675.5174560546875, "logps/rejected": -729.9697875976562, "loss": 0.567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.334768146276474, "rewards/margins": 0.8987871408462524, "rewards/rejected": -1.2335551977157593, "step": 4840 }, { "epoch": 1.129877693651718, "grad_norm": 11.408817291259766, "learning_rate": 9.667313915857605e-06, "logits/chosen": -4.298308849334717, "logits/rejected": -4.3367109298706055, "logps/chosen": -733.4371948242188, "logps/rejected": -817.2102661132812, "loss": 0.7335, "rewards/accuracies": 0.625, "rewards/chosen": -0.5985130071640015, "rewards/margins": 0.49690619111061096, "rewards/rejected": -1.09541916847229, "step": 4850 }, { "epoch": 1.1322073383808968, "grad_norm": 7.952821731567383, "learning_rate": 9.641423948220066e-06, "logits/chosen": -4.365422248840332, "logits/rejected": -4.373671531677246, "logps/chosen": -702.8591918945312, "logps/rejected": -762.954345703125, "loss": 0.9034, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8020332455635071, "rewards/margins": 0.20034417510032654, "rewards/rejected": -1.0023772716522217, "step": 4860 }, { "epoch": 1.1345369831100758, "grad_norm": 8.62785816192627, "learning_rate": 9.615533980582525e-06, "logits/chosen": -4.355130195617676, "logits/rejected": -4.3633222579956055, "logps/chosen": -759.4334106445312, "logps/rejected": -808.9783935546875, "loss": 0.7554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6296737790107727, "rewards/margins": 0.4100741446018219, "rewards/rejected": -1.039747953414917, "step": 4870 }, { "epoch": 1.1368666278392545, "grad_norm": 8.518671035766602, "learning_rate": 9.589644012944983e-06, "logits/chosen": -4.361384391784668, "logits/rejected": -4.2445831298828125, "logps/chosen": -756.2235717773438, "logps/rejected": -749.2499389648438, "loss": 0.853, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7251854538917542, "rewards/margins": 0.17502622306346893, "rewards/rejected": -0.90021151304245, "step": 4880 }, { "epoch": 1.1391962725684333, "grad_norm": 7.055928707122803, "learning_rate": 9.563754045307444e-06, "logits/chosen": -4.406973838806152, "logits/rejected": -4.381006717681885, "logps/chosen": -711.85888671875, "logps/rejected": -769.9061889648438, "loss": 0.6148, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5367127656936646, "rewards/margins": 0.725053608417511, "rewards/rejected": -1.2617665529251099, "step": 4890 }, { "epoch": 1.1415259172976122, "grad_norm": 6.893064498901367, "learning_rate": 9.537864077669905e-06, "logits/chosen": -4.3568034172058105, "logits/rejected": -4.30020809173584, "logps/chosen": -746.2471313476562, "logps/rejected": -722.7155151367188, "loss": 0.7335, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6146231889724731, "rewards/margins": 0.5055471658706665, "rewards/rejected": -1.1201703548431396, "step": 4900 }, { "epoch": 1.1415259172976122, "eval_logits/chosen": -4.295858860015869, "eval_logits/rejected": -4.2832417488098145, "eval_logps/chosen": -697.7133178710938, "eval_logps/rejected": -717.448974609375, "eval_loss": 0.630944013595581, "eval_rewards/accuracies": 0.64034104347229, "eval_rewards/chosen": -0.6884997487068176, "eval_rewards/margins": 0.37653934955596924, "eval_rewards/rejected": -1.065039038658142, "eval_runtime": 392.2997, "eval_samples_per_second": 18.236, "eval_steps_per_second": 9.118, "step": 4900 }, { "epoch": 1.143855562026791, "grad_norm": 11.00304126739502, "learning_rate": 9.511974110032363e-06, "logits/chosen": -4.2610039710998535, "logits/rejected": -4.306243896484375, "logps/chosen": -724.7777099609375, "logps/rejected": -772.2752685546875, "loss": 0.8958, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7763365507125854, "rewards/margins": 0.23164916038513184, "rewards/rejected": -1.0079858303070068, "step": 4910 }, { "epoch": 1.1461852067559697, "grad_norm": 5.603542327880859, "learning_rate": 9.486084142394822e-06, "logits/chosen": -4.164181232452393, "logits/rejected": -4.260239601135254, "logps/chosen": -659.2215576171875, "logps/rejected": -744.3831787109375, "loss": 0.7597, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6770327091217041, "rewards/margins": 0.40782466530799866, "rewards/rejected": -1.0848573446273804, "step": 4920 }, { "epoch": 1.1485148514851484, "grad_norm": 7.558773994445801, "learning_rate": 9.460194174757283e-06, "logits/chosen": -4.3481245040893555, "logits/rejected": -4.423264503479004, "logps/chosen": -650.3001098632812, "logps/rejected": -732.3673095703125, "loss": 0.5359, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27106544375419617, "rewards/margins": 0.8346785306930542, "rewards/rejected": -1.1057438850402832, "step": 4930 }, { "epoch": 1.1508444962143274, "grad_norm": 6.854055881500244, "learning_rate": 9.434304207119742e-06, "logits/chosen": -4.359449863433838, "logits/rejected": -4.34786319732666, "logps/chosen": -768.947998046875, "logps/rejected": -747.8753662109375, "loss": 0.8995, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.9032365679740906, "rewards/margins": 0.1830495297908783, "rewards/rejected": -1.086286187171936, "step": 4940 }, { "epoch": 1.1531741409435061, "grad_norm": 10.397383689880371, "learning_rate": 9.4084142394822e-06, "logits/chosen": -4.300423622131348, "logits/rejected": -4.38247537612915, "logps/chosen": -728.3832397460938, "logps/rejected": -797.1463623046875, "loss": 0.8069, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6606366038322449, "rewards/margins": 0.3577214777469635, "rewards/rejected": -1.0183582305908203, "step": 4950 }, { "epoch": 1.1555037856726849, "grad_norm": 5.43316650390625, "learning_rate": 9.382524271844661e-06, "logits/chosen": -4.393747329711914, "logits/rejected": -4.417483329772949, "logps/chosen": -713.3004150390625, "logps/rejected": -757.9130859375, "loss": 0.8116, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7697121500968933, "rewards/margins": 0.36362361907958984, "rewards/rejected": -1.133335828781128, "step": 4960 }, { "epoch": 1.1578334304018636, "grad_norm": 7.3963623046875, "learning_rate": 9.35663430420712e-06, "logits/chosen": -4.301112174987793, "logits/rejected": -4.343433380126953, "logps/chosen": -697.62548828125, "logps/rejected": -657.8740234375, "loss": 0.7422, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6323320269584656, "rewards/margins": 0.5476266741752625, "rewards/rejected": -1.179958701133728, "step": 4970 }, { "epoch": 1.1601630751310426, "grad_norm": 6.423781394958496, "learning_rate": 9.33074433656958e-06, "logits/chosen": -4.308186054229736, "logits/rejected": -4.321841239929199, "logps/chosen": -713.9429321289062, "logps/rejected": -762.6380615234375, "loss": 0.833, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9051560163497925, "rewards/margins": 0.27821090817451477, "rewards/rejected": -1.1833668947219849, "step": 4980 }, { "epoch": 1.1624927198602213, "grad_norm": 8.114919662475586, "learning_rate": 9.30485436893204e-06, "logits/chosen": -4.363166809082031, "logits/rejected": -4.334114074707031, "logps/chosen": -734.3272705078125, "logps/rejected": -783.6725463867188, "loss": 0.6761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5971099734306335, "rewards/margins": 0.5837138891220093, "rewards/rejected": -1.1808240413665771, "step": 4990 }, { "epoch": 1.1648223645894, "grad_norm": 2.8438472747802734, "learning_rate": 9.278964401294498e-06, "logits/chosen": -4.292839050292969, "logits/rejected": -4.3964738845825195, "logps/chosen": -685.0821533203125, "logps/rejected": -787.2464599609375, "loss": 0.6453, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6114527583122253, "rewards/margins": 0.7760822176933289, "rewards/rejected": -1.3875350952148438, "step": 5000 }, { "epoch": 1.1648223645894, "eval_logits/chosen": -4.296248912811279, "eval_logits/rejected": -4.2839531898498535, "eval_logps/chosen": -698.3717651367188, "eval_logps/rejected": -718.2405395507812, "eval_loss": 0.6339278817176819, "eval_rewards/accuracies": 0.6365669369697571, "eval_rewards/chosen": -0.7543493509292603, "eval_rewards/margins": 0.38984665274620056, "eval_rewards/rejected": -1.1441960334777832, "eval_runtime": 392.4103, "eval_samples_per_second": 18.231, "eval_steps_per_second": 9.115, "step": 5000 }, { "epoch": 1.167152009318579, "grad_norm": 9.56225872039795, "learning_rate": 9.253074433656959e-06, "logits/chosen": -4.369412422180176, "logits/rejected": -4.308619022369385, "logps/chosen": -729.121826171875, "logps/rejected": -766.3821411132812, "loss": 0.7563, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.717940628528595, "rewards/margins": 0.4016820788383484, "rewards/rejected": -1.1196227073669434, "step": 5010 }, { "epoch": 1.1694816540477577, "grad_norm": 7.350949287414551, "learning_rate": 9.227184466019418e-06, "logits/chosen": -4.340554237365723, "logits/rejected": -4.403109550476074, "logps/chosen": -728.0326538085938, "logps/rejected": -739.9369506835938, "loss": 0.7311, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7404690384864807, "rewards/margins": 0.4509666860103607, "rewards/rejected": -1.1914358139038086, "step": 5020 }, { "epoch": 1.1718112987769365, "grad_norm": 4.559691429138184, "learning_rate": 9.201294498381877e-06, "logits/chosen": -4.373383522033691, "logits/rejected": -4.363102912902832, "logps/chosen": -799.009033203125, "logps/rejected": -789.736572265625, "loss": 0.6564, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5797772407531738, "rewards/margins": 0.6232125163078308, "rewards/rejected": -1.2029898166656494, "step": 5030 }, { "epoch": 1.1741409435061154, "grad_norm": 9.547080993652344, "learning_rate": 9.175404530744337e-06, "logits/chosen": -4.302543640136719, "logits/rejected": -4.301863670349121, "logps/chosen": -706.2684936523438, "logps/rejected": -715.9816284179688, "loss": 0.6453, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6682039499282837, "rewards/margins": 0.6009415984153748, "rewards/rejected": -1.2691456079483032, "step": 5040 }, { "epoch": 1.1764705882352942, "grad_norm": 7.015618801116943, "learning_rate": 9.149514563106798e-06, "logits/chosen": -4.382655620574951, "logits/rejected": -4.368760585784912, "logps/chosen": -708.5529174804688, "logps/rejected": -719.41015625, "loss": 0.7583, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7785158157348633, "rewards/margins": 0.46291232109069824, "rewards/rejected": -1.2414281368255615, "step": 5050 }, { "epoch": 1.178800232964473, "grad_norm": 5.636579990386963, "learning_rate": 9.123624595469255e-06, "logits/chosen": -4.289775371551514, "logits/rejected": -4.319928169250488, "logps/chosen": -700.90185546875, "logps/rejected": -759.4402465820312, "loss": 0.6075, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6372637152671814, "rewards/margins": 0.6822826266288757, "rewards/rejected": -1.3195462226867676, "step": 5060 }, { "epoch": 1.1811298776936516, "grad_norm": 7.952759265899658, "learning_rate": 9.097734627831716e-06, "logits/chosen": -4.374382972717285, "logits/rejected": -4.308639049530029, "logps/chosen": -736.1868896484375, "logps/rejected": -727.0337524414062, "loss": 0.7677, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8247160911560059, "rewards/margins": 0.45139646530151367, "rewards/rejected": -1.2761125564575195, "step": 5070 }, { "epoch": 1.1834595224228306, "grad_norm": 5.8634185791015625, "learning_rate": 9.071844660194176e-06, "logits/chosen": -4.347293376922607, "logits/rejected": -4.27130651473999, "logps/chosen": -689.748046875, "logps/rejected": -695.448486328125, "loss": 0.6795, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6607910394668579, "rewards/margins": 0.6075721979141235, "rewards/rejected": -1.2683632373809814, "step": 5080 }, { "epoch": 1.1857891671520093, "grad_norm": 6.076456069946289, "learning_rate": 9.045954692556635e-06, "logits/chosen": -4.2905659675598145, "logits/rejected": -4.332380771636963, "logps/chosen": -725.4127807617188, "logps/rejected": -792.0933837890625, "loss": 0.7146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6212302446365356, "rewards/margins": 0.39680036902427673, "rewards/rejected": -1.0180305242538452, "step": 5090 }, { "epoch": 1.188118811881188, "grad_norm": 7.59320592880249, "learning_rate": 9.020064724919094e-06, "logits/chosen": -4.304072380065918, "logits/rejected": -4.223092555999756, "logps/chosen": -731.3455810546875, "logps/rejected": -707.1759643554688, "loss": 0.7496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5885947942733765, "rewards/margins": 0.4130372405052185, "rewards/rejected": -1.0016319751739502, "step": 5100 }, { "epoch": 1.188118811881188, "eval_logits/chosen": -4.295443534851074, "eval_logits/rejected": -4.2832560539245605, "eval_logps/chosen": -698.3624267578125, "eval_logps/rejected": -718.23583984375, "eval_loss": 0.6326996088027954, "eval_rewards/accuracies": 0.6375454068183899, "eval_rewards/chosen": -0.753415584564209, "eval_rewards/margins": 0.3902973234653473, "eval_rewards/rejected": -1.1437128782272339, "eval_runtime": 393.3354, "eval_samples_per_second": 18.188, "eval_steps_per_second": 9.094, "step": 5100 }, { "epoch": 1.1904484566103668, "grad_norm": 11.862083435058594, "learning_rate": 8.994174757281555e-06, "logits/chosen": -4.289244174957275, "logits/rejected": -4.383764743804932, "logps/chosen": -713.3644409179688, "logps/rejected": -739.8452758789062, "loss": 0.9078, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.898316502571106, "rewards/margins": 0.10962538421154022, "rewards/rejected": -1.0079419612884521, "step": 5110 }, { "epoch": 1.1927781013395458, "grad_norm": 5.873625755310059, "learning_rate": 8.968284789644013e-06, "logits/chosen": -4.285080909729004, "logits/rejected": -4.272884845733643, "logps/chosen": -688.602294921875, "logps/rejected": -684.8470458984375, "loss": 0.7197, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6948808431625366, "rewards/margins": 0.4366269111633301, "rewards/rejected": -1.1315077543258667, "step": 5120 }, { "epoch": 1.1951077460687245, "grad_norm": 8.76635456085205, "learning_rate": 8.942394822006472e-06, "logits/chosen": -4.330078125, "logits/rejected": -4.282149791717529, "logps/chosen": -771.1861572265625, "logps/rejected": -738.9520874023438, "loss": 0.8948, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7769016027450562, "rewards/margins": 0.17473702132701874, "rewards/rejected": -0.9516385793685913, "step": 5130 }, { "epoch": 1.1974373907979032, "grad_norm": 4.928976535797119, "learning_rate": 8.916504854368933e-06, "logits/chosen": -4.3442463874816895, "logits/rejected": -4.3403706550598145, "logps/chosen": -747.8082275390625, "logps/rejected": -774.0421142578125, "loss": 0.6963, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7253668308258057, "rewards/margins": 0.4606569707393646, "rewards/rejected": -1.1860238313674927, "step": 5140 }, { "epoch": 1.1997670355270822, "grad_norm": 10.240233421325684, "learning_rate": 8.890614886731392e-06, "logits/chosen": -4.33738899230957, "logits/rejected": -4.348621368408203, "logps/chosen": -720.0654907226562, "logps/rejected": -741.6688842773438, "loss": 0.7944, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.870683491230011, "rewards/margins": 0.355567067861557, "rewards/rejected": -1.2262506484985352, "step": 5150 }, { "epoch": 1.202096680256261, "grad_norm": 7.294375419616699, "learning_rate": 8.864724919093852e-06, "logits/chosen": -4.395794868469238, "logits/rejected": -4.36327600479126, "logps/chosen": -723.692626953125, "logps/rejected": -736.25927734375, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": -0.5928478240966797, "rewards/margins": 0.5372605919837952, "rewards/rejected": -1.1301084756851196, "step": 5160 }, { "epoch": 1.2044263249854397, "grad_norm": 8.202095031738281, "learning_rate": 8.838834951456311e-06, "logits/chosen": -4.327259540557861, "logits/rejected": -4.300198554992676, "logps/chosen": -770.6749267578125, "logps/rejected": -729.3353271484375, "loss": 0.8911, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.084133505821228, "rewards/margins": 0.147923082113266, "rewards/rejected": -1.2320566177368164, "step": 5170 }, { "epoch": 1.2067559697146186, "grad_norm": 8.397380828857422, "learning_rate": 8.81294498381877e-06, "logits/chosen": -4.301027774810791, "logits/rejected": -4.390149116516113, "logps/chosen": -746.8372802734375, "logps/rejected": -793.6979370117188, "loss": 0.8489, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7529788017272949, "rewards/margins": 0.3230917453765869, "rewards/rejected": -1.0760705471038818, "step": 5180 }, { "epoch": 1.2090856144437974, "grad_norm": 7.202069282531738, "learning_rate": 8.78705501618123e-06, "logits/chosen": -4.267294406890869, "logits/rejected": -4.347865104675293, "logps/chosen": -672.7081298828125, "logps/rejected": -761.9598388671875, "loss": 0.686, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7640711069107056, "rewards/margins": 0.5648372173309326, "rewards/rejected": -1.3289084434509277, "step": 5190 }, { "epoch": 1.211415259172976, "grad_norm": 8.52991771697998, "learning_rate": 8.761165048543691e-06, "logits/chosen": -4.3260908126831055, "logits/rejected": -4.4024786949157715, "logps/chosen": -643.678466796875, "logps/rejected": -768.1748046875, "loss": 0.7476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5957127213478088, "rewards/margins": 0.46101027727127075, "rewards/rejected": -1.0567229986190796, "step": 5200 }, { "epoch": 1.211415259172976, "eval_logits/chosen": -4.2841081619262695, "eval_logits/rejected": -4.2714667320251465, "eval_logps/chosen": -698.4566040039062, "eval_logps/rejected": -718.3603515625, "eval_loss": 0.632652759552002, "eval_rewards/accuracies": 0.6399217247962952, "eval_rewards/chosen": -0.7628329992294312, "eval_rewards/margins": 0.3933371603488922, "eval_rewards/rejected": -1.156170129776001, "eval_runtime": 392.4296, "eval_samples_per_second": 18.23, "eval_steps_per_second": 9.115, "step": 5200 }, { "epoch": 1.2137449039021548, "grad_norm": 6.904987812042236, "learning_rate": 8.735275080906148e-06, "logits/chosen": -4.297495365142822, "logits/rejected": -4.3596415519714355, "logps/chosen": -615.5276489257812, "logps/rejected": -720.9169921875, "loss": 0.8417, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.8318285942077637, "rewards/margins": 0.24546094238758087, "rewards/rejected": -1.0772894620895386, "step": 5210 }, { "epoch": 1.2160745486313338, "grad_norm": 7.0425639152526855, "learning_rate": 8.709385113268609e-06, "logits/chosen": -4.2223615646362305, "logits/rejected": -4.204460620880127, "logps/chosen": -661.43701171875, "logps/rejected": -671.744140625, "loss": 0.6586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5984729528427124, "rewards/margins": 0.6058539152145386, "rewards/rejected": -1.204326868057251, "step": 5220 }, { "epoch": 1.2184041933605125, "grad_norm": 8.22312068939209, "learning_rate": 8.68349514563107e-06, "logits/chosen": -4.342942714691162, "logits/rejected": -4.427065849304199, "logps/chosen": -734.3280029296875, "logps/rejected": -794.6597900390625, "loss": 0.5786, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.441389262676239, "rewards/margins": 0.841633677482605, "rewards/rejected": -1.2830231189727783, "step": 5230 }, { "epoch": 1.2207338380896913, "grad_norm": 5.765860080718994, "learning_rate": 8.657605177993529e-06, "logits/chosen": -4.316514492034912, "logits/rejected": -4.353440284729004, "logps/chosen": -719.7871704101562, "logps/rejected": -721.1253662109375, "loss": 0.7851, "rewards/accuracies": 0.625, "rewards/chosen": -0.7890719175338745, "rewards/margins": 0.30373963713645935, "rewards/rejected": -1.0928115844726562, "step": 5240 }, { "epoch": 1.22306348281887, "grad_norm": 9.424662590026855, "learning_rate": 8.631715210355987e-06, "logits/chosen": -4.375064373016357, "logits/rejected": -4.371817111968994, "logps/chosen": -712.6149291992188, "logps/rejected": -771.7597045898438, "loss": 0.7467, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7428840398788452, "rewards/margins": 0.35510486364364624, "rewards/rejected": -1.0979888439178467, "step": 5250 }, { "epoch": 1.225393127548049, "grad_norm": 6.406722545623779, "learning_rate": 8.605825242718448e-06, "logits/chosen": -4.307132244110107, "logits/rejected": -4.281327247619629, "logps/chosen": -725.4629516601562, "logps/rejected": -739.6784057617188, "loss": 0.7597, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7313467860221863, "rewards/margins": 0.40912118554115295, "rewards/rejected": -1.140467882156372, "step": 5260 }, { "epoch": 1.2277227722772277, "grad_norm": 8.925617218017578, "learning_rate": 8.579935275080907e-06, "logits/chosen": -4.30813455581665, "logits/rejected": -4.262563228607178, "logps/chosen": -686.4441528320312, "logps/rejected": -630.260009765625, "loss": 0.8031, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6095466613769531, "rewards/margins": 0.4209167957305908, "rewards/rejected": -1.030463457107544, "step": 5270 }, { "epoch": 1.2300524170064064, "grad_norm": 9.995710372924805, "learning_rate": 8.554045307443366e-06, "logits/chosen": -4.355738639831543, "logits/rejected": -4.22121524810791, "logps/chosen": -764.5777587890625, "logps/rejected": -727.5269165039062, "loss": 0.9327, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8444086313247681, "rewards/margins": 0.1093897819519043, "rewards/rejected": -0.9537984132766724, "step": 5280 }, { "epoch": 1.2323820617355854, "grad_norm": 7.851297855377197, "learning_rate": 8.528155339805826e-06, "logits/chosen": -4.309381484985352, "logits/rejected": -4.3855791091918945, "logps/chosen": -706.9483642578125, "logps/rejected": -803.3857421875, "loss": 0.7377, "rewards/accuracies": 0.625, "rewards/chosen": -0.6579431891441345, "rewards/margins": 0.5061560869216919, "rewards/rejected": -1.1640993356704712, "step": 5290 }, { "epoch": 1.2347117064647641, "grad_norm": 8.250238418579102, "learning_rate": 8.502265372168285e-06, "logits/chosen": -4.303959369659424, "logits/rejected": -4.3808698654174805, "logps/chosen": -731.1709594726562, "logps/rejected": -842.6114501953125, "loss": 0.7532, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8347466588020325, "rewards/margins": 0.4443570077419281, "rewards/rejected": -1.2791036367416382, "step": 5300 }, { "epoch": 1.2347117064647641, "eval_logits/chosen": -4.286922454833984, "eval_logits/rejected": -4.274168491363525, "eval_logps/chosen": -698.2924194335938, "eval_logps/rejected": -718.2178344726562, "eval_loss": 0.6320476531982422, "eval_rewards/accuracies": 0.6407604217529297, "eval_rewards/chosen": -0.7464197278022766, "eval_rewards/margins": 0.3955014944076538, "eval_rewards/rejected": -1.1419211626052856, "eval_runtime": 393.3854, "eval_samples_per_second": 18.186, "eval_steps_per_second": 9.093, "step": 5300 }, { "epoch": 1.2370413511939429, "grad_norm": 7.946064472198486, "learning_rate": 8.476375404530744e-06, "logits/chosen": -4.3965559005737305, "logits/rejected": -4.316365718841553, "logps/chosen": -757.570068359375, "logps/rejected": -776.7664794921875, "loss": 0.813, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6333745718002319, "rewards/margins": 0.3966577649116516, "rewards/rejected": -1.0300323963165283, "step": 5310 }, { "epoch": 1.2393709959231218, "grad_norm": 11.394083023071289, "learning_rate": 8.450485436893205e-06, "logits/chosen": -4.389873027801514, "logits/rejected": -4.220327377319336, "logps/chosen": -769.0503540039062, "logps/rejected": -705.5409545898438, "loss": 0.796, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7753839492797852, "rewards/margins": 0.2997626066207886, "rewards/rejected": -1.0751464366912842, "step": 5320 }, { "epoch": 1.2417006406523006, "grad_norm": 9.40004825592041, "learning_rate": 8.424595469255664e-06, "logits/chosen": -4.316657543182373, "logits/rejected": -4.366091728210449, "logps/chosen": -658.1968994140625, "logps/rejected": -729.2615966796875, "loss": 0.7724, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6652747392654419, "rewards/margins": 0.5693114399909973, "rewards/rejected": -1.234586238861084, "step": 5330 }, { "epoch": 1.2440302853814793, "grad_norm": 9.043682098388672, "learning_rate": 8.398705501618124e-06, "logits/chosen": -4.202693939208984, "logits/rejected": -4.343643665313721, "logps/chosen": -636.1541137695312, "logps/rejected": -754.822509765625, "loss": 0.7941, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5894585847854614, "rewards/margins": 0.3509396016597748, "rewards/rejected": -0.9403982162475586, "step": 5340 }, { "epoch": 1.246359930110658, "grad_norm": 7.098946571350098, "learning_rate": 8.372815533980583e-06, "logits/chosen": -4.231969833374023, "logits/rejected": -4.287779808044434, "logps/chosen": -697.3786010742188, "logps/rejected": -752.2830200195312, "loss": 0.7645, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6086194515228271, "rewards/margins": 0.48688268661499023, "rewards/rejected": -1.0955021381378174, "step": 5350 }, { "epoch": 1.248689574839837, "grad_norm": 6.7419753074646, "learning_rate": 8.346925566343042e-06, "logits/chosen": -4.2158942222595215, "logits/rejected": -4.2786431312561035, "logps/chosen": -655.709716796875, "logps/rejected": -727.7312622070312, "loss": 0.8279, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5915407538414001, "rewards/margins": 0.28814780712127686, "rewards/rejected": -0.8796886205673218, "step": 5360 }, { "epoch": 1.2510192195690157, "grad_norm": 8.620369911193848, "learning_rate": 8.321035598705502e-06, "logits/chosen": -4.3344340324401855, "logits/rejected": -4.348869800567627, "logps/chosen": -704.429443359375, "logps/rejected": -764.4039306640625, "loss": 0.7372, "rewards/accuracies": 0.625, "rewards/chosen": -0.7943536639213562, "rewards/margins": 0.44872117042541504, "rewards/rejected": -1.2430747747421265, "step": 5370 }, { "epoch": 1.2533488642981945, "grad_norm": 8.750239372253418, "learning_rate": 8.295145631067963e-06, "logits/chosen": -4.340044021606445, "logits/rejected": -4.353331089019775, "logps/chosen": -712.9949951171875, "logps/rejected": -736.2276000976562, "loss": 0.7095, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6870006322860718, "rewards/margins": 0.5696112513542175, "rewards/rejected": -1.256611943244934, "step": 5380 }, { "epoch": 1.2556785090273732, "grad_norm": 7.095187664031982, "learning_rate": 8.269255663430422e-06, "logits/chosen": -4.3044915199279785, "logits/rejected": -4.3486504554748535, "logps/chosen": -689.8836059570312, "logps/rejected": -756.3995361328125, "loss": 0.6589, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5822979807853699, "rewards/margins": 0.7012337446212769, "rewards/rejected": -1.2835317850112915, "step": 5390 }, { "epoch": 1.2580081537565522, "grad_norm": 7.411107063293457, "learning_rate": 8.24336569579288e-06, "logits/chosen": -4.365941047668457, "logits/rejected": -4.346836566925049, "logps/chosen": -674.5985107421875, "logps/rejected": -732.57568359375, "loss": 0.6924, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6771245002746582, "rewards/margins": 0.7330614328384399, "rewards/rejected": -1.4101858139038086, "step": 5400 }, { "epoch": 1.2580081537565522, "eval_logits/chosen": -4.280876159667969, "eval_logits/rejected": -4.268177509307861, "eval_logps/chosen": -698.876220703125, "eval_logps/rejected": -718.9097900390625, "eval_loss": 0.633796751499176, "eval_rewards/accuracies": 0.6382443308830261, "eval_rewards/chosen": -0.8047946691513062, "eval_rewards/margins": 0.4063267707824707, "eval_rewards/rejected": -1.2111215591430664, "eval_runtime": 393.2724, "eval_samples_per_second": 18.191, "eval_steps_per_second": 9.095, "step": 5400 }, { "epoch": 1.260337798485731, "grad_norm": 12.6148681640625, "learning_rate": 8.217475728155341e-06, "logits/chosen": -4.354096412658691, "logits/rejected": -4.3372802734375, "logps/chosen": -765.8435668945312, "logps/rejected": -756.4341430664062, "loss": 0.8824, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.077000379562378, "rewards/margins": 0.16846218705177307, "rewards/rejected": -1.2454622983932495, "step": 5410 }, { "epoch": 1.2626674432149096, "grad_norm": 7.284420967102051, "learning_rate": 8.1915857605178e-06, "logits/chosen": -4.285361289978027, "logits/rejected": -4.241901397705078, "logps/chosen": -710.8505859375, "logps/rejected": -629.2987060546875, "loss": 0.7785, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8103518486022949, "rewards/margins": 0.3938572406768799, "rewards/rejected": -1.2042090892791748, "step": 5420 }, { "epoch": 1.2649970879440886, "grad_norm": 8.587806701660156, "learning_rate": 8.165695792880259e-06, "logits/chosen": -4.346640110015869, "logits/rejected": -4.296164512634277, "logps/chosen": -808.0670776367188, "logps/rejected": -764.1261596679688, "loss": 0.7684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6092880368232727, "rewards/margins": 0.47000154852867126, "rewards/rejected": -1.0792896747589111, "step": 5430 }, { "epoch": 1.2673267326732673, "grad_norm": 9.06271743774414, "learning_rate": 8.13980582524272e-06, "logits/chosen": -4.3552703857421875, "logits/rejected": -4.35286808013916, "logps/chosen": -676.3934326171875, "logps/rejected": -709.7229614257812, "loss": 0.7631, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4008163511753082, "rewards/margins": 0.5974048376083374, "rewards/rejected": -0.998221218585968, "step": 5440 }, { "epoch": 1.269656377402446, "grad_norm": 5.518189907073975, "learning_rate": 8.113915857605179e-06, "logits/chosen": -4.357514381408691, "logits/rejected": -4.337433338165283, "logps/chosen": -685.5467529296875, "logps/rejected": -711.4341430664062, "loss": 0.7718, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.8264461755752563, "rewards/margins": 0.23178991675376892, "rewards/rejected": -1.0582361221313477, "step": 5450 }, { "epoch": 1.271986022131625, "grad_norm": 10.297592163085938, "learning_rate": 8.088025889967637e-06, "logits/chosen": -4.2873921394348145, "logits/rejected": -4.283249378204346, "logps/chosen": -723.2974853515625, "logps/rejected": -750.7506103515625, "loss": 0.6925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5153021812438965, "rewards/margins": 0.5844428539276123, "rewards/rejected": -1.0997450351715088, "step": 5460 }, { "epoch": 1.2743156668608038, "grad_norm": 9.21555233001709, "learning_rate": 8.062135922330098e-06, "logits/chosen": -4.428861141204834, "logits/rejected": -4.376482009887695, "logps/chosen": -722.2612915039062, "logps/rejected": -748.6749267578125, "loss": 0.7888, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7600167393684387, "rewards/margins": 0.32688966393470764, "rewards/rejected": -1.0869064331054688, "step": 5470 }, { "epoch": 1.2766453115899825, "grad_norm": 9.568215370178223, "learning_rate": 8.036245954692557e-06, "logits/chosen": -4.308640003204346, "logits/rejected": -4.361050128936768, "logps/chosen": -704.77783203125, "logps/rejected": -727.996826171875, "loss": 0.8215, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6832433938980103, "rewards/margins": 0.3008263409137726, "rewards/rejected": -0.9840697050094604, "step": 5480 }, { "epoch": 1.2789749563191612, "grad_norm": 7.80417013168335, "learning_rate": 8.010355987055017e-06, "logits/chosen": -4.305323600769043, "logits/rejected": -4.314250469207764, "logps/chosen": -719.4246826171875, "logps/rejected": -692.815185546875, "loss": 0.6677, "rewards/accuracies": 0.625, "rewards/chosen": -0.642403244972229, "rewards/margins": 0.47839683294296265, "rewards/rejected": -1.1208001375198364, "step": 5490 }, { "epoch": 1.2813046010483402, "grad_norm": 8.392496109008789, "learning_rate": 7.984466019417476e-06, "logits/chosen": -4.330838680267334, "logits/rejected": -4.276543140411377, "logps/chosen": -748.7491455078125, "logps/rejected": -803.5469970703125, "loss": 0.6556, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7700390815734863, "rewards/margins": 0.7721626162528992, "rewards/rejected": -1.5422016382217407, "step": 5500 }, { "epoch": 1.2813046010483402, "eval_logits/chosen": -4.276762962341309, "eval_logits/rejected": -4.264129638671875, "eval_logps/chosen": -698.3977661132812, "eval_logps/rejected": -718.328125, "eval_loss": 0.6312812566757202, "eval_rewards/accuracies": 0.6388034820556641, "eval_rewards/chosen": -0.7569450736045837, "eval_rewards/margins": 0.3960038125514984, "eval_rewards/rejected": -1.1529488563537598, "eval_runtime": 392.8469, "eval_samples_per_second": 18.211, "eval_steps_per_second": 9.105, "step": 5500 }, { "epoch": 1.283634245777519, "grad_norm": 8.474251747131348, "learning_rate": 7.958576051779935e-06, "logits/chosen": -4.332800388336182, "logits/rejected": -4.342696189880371, "logps/chosen": -655.5740966796875, "logps/rejected": -683.3048095703125, "loss": 0.6476, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6026918292045593, "rewards/margins": 0.5270061492919922, "rewards/rejected": -1.1296980381011963, "step": 5510 }, { "epoch": 1.2859638905066977, "grad_norm": 6.1152472496032715, "learning_rate": 7.932686084142396e-06, "logits/chosen": -4.386120319366455, "logits/rejected": -4.363664150238037, "logps/chosen": -736.0418701171875, "logps/rejected": -755.9088745117188, "loss": 0.6813, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7132939100265503, "rewards/margins": 0.961859405040741, "rewards/rejected": -1.675153374671936, "step": 5520 }, { "epoch": 1.2882935352358764, "grad_norm": 7.68399715423584, "learning_rate": 7.906796116504855e-06, "logits/chosen": -4.306368827819824, "logits/rejected": -4.369459629058838, "logps/chosen": -728.9708251953125, "logps/rejected": -740.6129150390625, "loss": 0.8068, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6639933586120605, "rewards/margins": 0.355257123708725, "rewards/rejected": -1.019250512123108, "step": 5530 }, { "epoch": 1.2906231799650554, "grad_norm": 8.179251670837402, "learning_rate": 7.880906148867315e-06, "logits/chosen": -4.3534159660339355, "logits/rejected": -4.320357322692871, "logps/chosen": -719.878173828125, "logps/rejected": -718.3634033203125, "loss": 0.8182, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7298353910446167, "rewards/margins": 0.35824066400527954, "rewards/rejected": -1.088076114654541, "step": 5540 }, { "epoch": 1.2929528246942341, "grad_norm": 9.077390670776367, "learning_rate": 7.855016181229774e-06, "logits/chosen": -4.333242893218994, "logits/rejected": -4.3756327629089355, "logps/chosen": -657.5175170898438, "logps/rejected": -717.9803466796875, "loss": 0.8273, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.892708957195282, "rewards/margins": 0.15217627584934235, "rewards/rejected": -1.044885277748108, "step": 5550 }, { "epoch": 1.2952824694234129, "grad_norm": 9.775418281555176, "learning_rate": 7.829126213592235e-06, "logits/chosen": -4.333688259124756, "logits/rejected": -4.265844821929932, "logps/chosen": -750.9478759765625, "logps/rejected": -772.283203125, "loss": 0.683, "rewards/accuracies": 0.625, "rewards/chosen": -0.9198330044746399, "rewards/margins": 0.5669907927513123, "rewards/rejected": -1.4868237972259521, "step": 5560 }, { "epoch": 1.2976121141525918, "grad_norm": 8.756830215454102, "learning_rate": 7.803236245954694e-06, "logits/chosen": -4.229300498962402, "logits/rejected": -4.275421142578125, "logps/chosen": -665.558349609375, "logps/rejected": -683.5747680664062, "loss": 0.7361, "rewards/accuracies": 0.5625, "rewards/chosen": -0.49589547514915466, "rewards/margins": 0.5741171836853027, "rewards/rejected": -1.0700128078460693, "step": 5570 }, { "epoch": 1.2999417588817705, "grad_norm": 8.244976997375488, "learning_rate": 7.777346278317152e-06, "logits/chosen": -4.362826347351074, "logits/rejected": -4.415011405944824, "logps/chosen": -744.3865356445312, "logps/rejected": -787.7940673828125, "loss": 0.8813, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7486151456832886, "rewards/margins": 0.2536180019378662, "rewards/rejected": -1.0022331476211548, "step": 5580 }, { "epoch": 1.3022714036109493, "grad_norm": 7.764080047607422, "learning_rate": 7.751456310679613e-06, "logits/chosen": -4.275409698486328, "logits/rejected": -4.322786808013916, "logps/chosen": -658.6358032226562, "logps/rejected": -733.7100219726562, "loss": 0.6867, "rewards/accuracies": 0.625, "rewards/chosen": -0.4042404294013977, "rewards/margins": 0.5954650640487671, "rewards/rejected": -0.9997054934501648, "step": 5590 }, { "epoch": 1.3046010483401282, "grad_norm": 6.2695231437683105, "learning_rate": 7.725566343042072e-06, "logits/chosen": -4.200558662414551, "logits/rejected": -4.332554817199707, "logps/chosen": -667.927734375, "logps/rejected": -749.1799926757812, "loss": 0.8593, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.858493983745575, "rewards/margins": 0.11628395318984985, "rewards/rejected": -0.9747779965400696, "step": 5600 }, { "epoch": 1.3046010483401282, "eval_logits/chosen": -4.26957893371582, "eval_logits/rejected": -4.256231307983398, "eval_logps/chosen": -698.6621704101562, "eval_logps/rejected": -718.7105102539062, "eval_loss": 0.6317076683044434, "eval_rewards/accuracies": 0.6399217247962952, "eval_rewards/chosen": -0.7833826541900635, "eval_rewards/margins": 0.4078075885772705, "eval_rewards/rejected": -1.1911901235580444, "eval_runtime": 393.8757, "eval_samples_per_second": 18.163, "eval_steps_per_second": 9.082, "step": 5600 }, { "epoch": 1.306930693069307, "grad_norm": 10.356733322143555, "learning_rate": 7.69967637540453e-06, "logits/chosen": -4.231276512145996, "logits/rejected": -4.262662887573242, "logps/chosen": -749.026123046875, "logps/rejected": -756.8435668945312, "loss": 0.8628, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8626430630683899, "rewards/margins": 0.3443228602409363, "rewards/rejected": -1.2069660425186157, "step": 5610 }, { "epoch": 1.3092603377984857, "grad_norm": 7.466703414916992, "learning_rate": 7.673786407766991e-06, "logits/chosen": -4.292886257171631, "logits/rejected": -4.345783710479736, "logps/chosen": -721.8636474609375, "logps/rejected": -810.0375366210938, "loss": 0.851, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8587177395820618, "rewards/margins": 0.3823612332344055, "rewards/rejected": -1.2410789728164673, "step": 5620 }, { "epoch": 1.3115899825276645, "grad_norm": 6.604434967041016, "learning_rate": 7.64789644012945e-06, "logits/chosen": -4.294118404388428, "logits/rejected": -4.347481727600098, "logps/chosen": -730.0178833007812, "logps/rejected": -833.5206909179688, "loss": 0.5881, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5834293365478516, "rewards/margins": 0.9197348356246948, "rewards/rejected": -1.503164291381836, "step": 5630 }, { "epoch": 1.3139196272568434, "grad_norm": 9.590311050415039, "learning_rate": 7.62200647249191e-06, "logits/chosen": -4.24991512298584, "logits/rejected": -4.206608772277832, "logps/chosen": -727.8446655273438, "logps/rejected": -719.554931640625, "loss": 0.649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.46159330010414124, "rewards/margins": 0.6928114891052246, "rewards/rejected": -1.154404878616333, "step": 5640 }, { "epoch": 1.3162492719860222, "grad_norm": 9.104934692382812, "learning_rate": 7.59611650485437e-06, "logits/chosen": -4.273480415344238, "logits/rejected": -4.279662132263184, "logps/chosen": -702.955322265625, "logps/rejected": -746.53564453125, "loss": 0.7745, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8513115048408508, "rewards/margins": 0.5166622400283813, "rewards/rejected": -1.3679735660552979, "step": 5650 }, { "epoch": 1.3185789167152009, "grad_norm": 9.956319808959961, "learning_rate": 7.570226537216829e-06, "logits/chosen": -4.362860202789307, "logits/rejected": -4.4000349044799805, "logps/chosen": -705.8887939453125, "logps/rejected": -815.6693725585938, "loss": 0.7268, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6892780065536499, "rewards/margins": 0.49222391843795776, "rewards/rejected": -1.181501865386963, "step": 5660 }, { "epoch": 1.3209085614443796, "grad_norm": 10.5054292678833, "learning_rate": 7.544336569579289e-06, "logits/chosen": -4.3153886795043945, "logits/rejected": -4.259528160095215, "logps/chosen": -739.3519287109375, "logps/rejected": -677.4149780273438, "loss": 0.7945, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6772125959396362, "rewards/margins": 0.42796260118484497, "rewards/rejected": -1.105175256729126, "step": 5670 }, { "epoch": 1.3232382061735586, "grad_norm": 5.91664457321167, "learning_rate": 7.518446601941748e-06, "logits/chosen": -4.281999588012695, "logits/rejected": -4.293154716491699, "logps/chosen": -718.5802612304688, "logps/rejected": -746.5780029296875, "loss": 0.7888, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9188039898872375, "rewards/margins": 0.3310539722442627, "rewards/rejected": -1.2498581409454346, "step": 5680 }, { "epoch": 1.3255678509027373, "grad_norm": 10.995230674743652, "learning_rate": 7.492556634304208e-06, "logits/chosen": -4.330352783203125, "logits/rejected": -4.3117570877075195, "logps/chosen": -708.2078857421875, "logps/rejected": -716.5367431640625, "loss": 0.7332, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5375461578369141, "rewards/margins": 0.5031279921531677, "rewards/rejected": -1.0406742095947266, "step": 5690 }, { "epoch": 1.327897495631916, "grad_norm": 9.291178703308105, "learning_rate": 7.4666666666666675e-06, "logits/chosen": -4.3973517417907715, "logits/rejected": -4.330522537231445, "logps/chosen": -761.6275024414062, "logps/rejected": -713.5189208984375, "loss": 0.6934, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5683543682098389, "rewards/margins": 0.4567783772945404, "rewards/rejected": -1.025132656097412, "step": 5700 }, { "epoch": 1.327897495631916, "eval_logits/chosen": -4.26210355758667, "eval_logits/rejected": -4.248154640197754, "eval_logps/chosen": -699.094482421875, "eval_logps/rejected": -719.2811279296875, "eval_loss": 0.6337792277336121, "eval_rewards/accuracies": 0.6406206488609314, "eval_rewards/chosen": -0.826622724533081, "eval_rewards/margins": 0.42162150144577026, "eval_rewards/rejected": -1.248244285583496, "eval_runtime": 394.2579, "eval_samples_per_second": 18.145, "eval_steps_per_second": 9.073, "step": 5700 }, { "epoch": 1.330227140361095, "grad_norm": 7.562618255615234, "learning_rate": 7.440776699029126e-06, "logits/chosen": -4.334902763366699, "logits/rejected": -4.1793293952941895, "logps/chosen": -751.4991455078125, "logps/rejected": -686.8331909179688, "loss": 0.846, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8237320184707642, "rewards/margins": 0.20931318402290344, "rewards/rejected": -1.0330451726913452, "step": 5710 }, { "epoch": 1.3325567850902738, "grad_norm": 10.239123344421387, "learning_rate": 7.414886731391586e-06, "logits/chosen": -4.351064682006836, "logits/rejected": -4.283308982849121, "logps/chosen": -688.5595703125, "logps/rejected": -710.2664794921875, "loss": 0.7362, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8389534950256348, "rewards/margins": 0.5938393473625183, "rewards/rejected": -1.4327926635742188, "step": 5720 }, { "epoch": 1.3348864298194525, "grad_norm": 5.591737270355225, "learning_rate": 7.388996763754046e-06, "logits/chosen": -4.312826156616211, "logits/rejected": -4.2012529373168945, "logps/chosen": -684.7025756835938, "logps/rejected": -589.3084716796875, "loss": 0.7283, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7459465265274048, "rewards/margins": 0.4310365319252014, "rewards/rejected": -1.176983118057251, "step": 5730 }, { "epoch": 1.3372160745486315, "grad_norm": 7.605983257293701, "learning_rate": 7.3631067961165055e-06, "logits/chosen": -4.290006160736084, "logits/rejected": -4.3164496421813965, "logps/chosen": -702.5801391601562, "logps/rejected": -752.56005859375, "loss": 0.5171, "rewards/accuracies": 0.75, "rewards/chosen": -0.5781384706497192, "rewards/margins": 0.9951320886611938, "rewards/rejected": -1.5732704401016235, "step": 5740 }, { "epoch": 1.3395457192778102, "grad_norm": 8.090303421020508, "learning_rate": 7.337216828478964e-06, "logits/chosen": -4.243030071258545, "logits/rejected": -4.308330535888672, "logps/chosen": -722.51953125, "logps/rejected": -750.7996826171875, "loss": 0.7339, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6192070841789246, "rewards/margins": 0.5955747365951538, "rewards/rejected": -1.2147818803787231, "step": 5750 }, { "epoch": 1.341875364006989, "grad_norm": 7.423137187957764, "learning_rate": 7.311326860841424e-06, "logits/chosen": -4.275225639343262, "logits/rejected": -4.218506813049316, "logps/chosen": -705.7161254882812, "logps/rejected": -707.9550170898438, "loss": 0.7737, "rewards/accuracies": 0.625, "rewards/chosen": -0.9287475347518921, "rewards/margins": 0.3816317915916443, "rewards/rejected": -1.3103792667388916, "step": 5760 }, { "epoch": 1.3442050087361677, "grad_norm": 4.06026554107666, "learning_rate": 7.285436893203885e-06, "logits/chosen": -4.278772830963135, "logits/rejected": -4.319589614868164, "logps/chosen": -694.3591918945312, "logps/rejected": -732.3553466796875, "loss": 0.7029, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7285884022712708, "rewards/margins": 0.5107731819152832, "rewards/rejected": -1.2393615245819092, "step": 5770 }, { "epoch": 1.3465346534653464, "grad_norm": 8.069743156433105, "learning_rate": 7.259546925566343e-06, "logits/chosen": -4.411580562591553, "logits/rejected": -4.367345809936523, "logps/chosen": -779.0450439453125, "logps/rejected": -742.70947265625, "loss": 0.7816, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8539965748786926, "rewards/margins": 0.46470707654953003, "rewards/rejected": -1.3187037706375122, "step": 5780 }, { "epoch": 1.3488642981945254, "grad_norm": 7.119553565979004, "learning_rate": 7.233656957928803e-06, "logits/chosen": -4.2892279624938965, "logits/rejected": -4.25607967376709, "logps/chosen": -751.4293823242188, "logps/rejected": -743.5025634765625, "loss": 0.6099, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.47523608803749084, "rewards/margins": 0.7295746803283691, "rewards/rejected": -1.2048107385635376, "step": 5790 }, { "epoch": 1.351193942923704, "grad_norm": 8.696918487548828, "learning_rate": 7.207766990291263e-06, "logits/chosen": -4.378389358520508, "logits/rejected": -4.295948505401611, "logps/chosen": -724.874755859375, "logps/rejected": -703.1808471679688, "loss": 0.6694, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7297112941741943, "rewards/margins": 0.5419555902481079, "rewards/rejected": -1.2716668844223022, "step": 5800 }, { "epoch": 1.351193942923704, "eval_logits/chosen": -4.2629899978637695, "eval_logits/rejected": -4.249340057373047, "eval_logps/chosen": -698.8829956054688, "eval_logps/rejected": -719.0765991210938, "eval_loss": 0.6321995854377747, "eval_rewards/accuracies": 0.6397819519042969, "eval_rewards/chosen": -0.8054772019386292, "eval_rewards/margins": 0.42231690883636475, "eval_rewards/rejected": -1.2277940511703491, "eval_runtime": 394.4648, "eval_samples_per_second": 18.136, "eval_steps_per_second": 9.068, "step": 5800 }, { "epoch": 1.3535235876528828, "grad_norm": 9.419877052307129, "learning_rate": 7.181877022653723e-06, "logits/chosen": -4.269660949707031, "logits/rejected": -4.29043436050415, "logps/chosen": -714.8873291015625, "logps/rejected": -801.2305908203125, "loss": 0.7412, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7010995149612427, "rewards/margins": 0.4792519509792328, "rewards/rejected": -1.1803514957427979, "step": 5810 }, { "epoch": 1.3558532323820618, "grad_norm": 6.639642715454102, "learning_rate": 7.155987055016182e-06, "logits/chosen": -4.356001853942871, "logits/rejected": -4.3863115310668945, "logps/chosen": -656.8035888671875, "logps/rejected": -730.4910278320312, "loss": 0.7787, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9009748697280884, "rewards/margins": 0.3519083857536316, "rewards/rejected": -1.2528831958770752, "step": 5820 }, { "epoch": 1.3581828771112405, "grad_norm": 7.337728500366211, "learning_rate": 7.130097087378641e-06, "logits/chosen": -4.27675724029541, "logits/rejected": -4.264283180236816, "logps/chosen": -730.9502563476562, "logps/rejected": -788.376953125, "loss": 0.7888, "rewards/accuracies": 0.5625, "rewards/chosen": -0.816659152507782, "rewards/margins": 0.521469235420227, "rewards/rejected": -1.3381284475326538, "step": 5830 }, { "epoch": 1.3605125218404193, "grad_norm": 9.205945014953613, "learning_rate": 7.104207119741101e-06, "logits/chosen": -4.330324649810791, "logits/rejected": -4.350374221801758, "logps/chosen": -783.76220703125, "logps/rejected": -805.0286865234375, "loss": 0.6592, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6576730012893677, "rewards/margins": 0.6231470704078674, "rewards/rejected": -1.2808201313018799, "step": 5840 }, { "epoch": 1.3628421665695982, "grad_norm": 10.915939331054688, "learning_rate": 7.078317152103561e-06, "logits/chosen": -4.300547122955322, "logits/rejected": -4.347357749938965, "logps/chosen": -756.61083984375, "logps/rejected": -746.4322509765625, "loss": 0.9299, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.0734272003173828, "rewards/margins": 0.1357240527868271, "rewards/rejected": -1.209151268005371, "step": 5850 }, { "epoch": 1.365171811298777, "grad_norm": 8.515040397644043, "learning_rate": 7.05242718446602e-06, "logits/chosen": -4.35914945602417, "logits/rejected": -4.41499137878418, "logps/chosen": -685.0623779296875, "logps/rejected": -724.2054443359375, "loss": 0.8295, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.790489673614502, "rewards/margins": 0.21192386746406555, "rewards/rejected": -1.0024135112762451, "step": 5860 }, { "epoch": 1.3675014560279557, "grad_norm": 6.500744819641113, "learning_rate": 7.026537216828479e-06, "logits/chosen": -4.287649154663086, "logits/rejected": -4.258206844329834, "logps/chosen": -717.4063720703125, "logps/rejected": -725.9107666015625, "loss": 0.7797, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6877506375312805, "rewards/margins": 0.3528876304626465, "rewards/rejected": -1.0406382083892822, "step": 5870 }, { "epoch": 1.3698311007571347, "grad_norm": 8.362839698791504, "learning_rate": 7.000647249190939e-06, "logits/chosen": -4.286078929901123, "logits/rejected": -4.331356048583984, "logps/chosen": -662.5283813476562, "logps/rejected": -703.88818359375, "loss": 0.8605, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7610023617744446, "rewards/margins": 0.5419550538063049, "rewards/rejected": -1.3029574155807495, "step": 5880 }, { "epoch": 1.3721607454863134, "grad_norm": 7.359226226806641, "learning_rate": 6.974757281553398e-06, "logits/chosen": -4.292226314544678, "logits/rejected": -4.382073402404785, "logps/chosen": -713.1748046875, "logps/rejected": -787.0335693359375, "loss": 0.7733, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7213521003723145, "rewards/margins": 0.5341546535491943, "rewards/rejected": -1.2555067539215088, "step": 5890 }, { "epoch": 1.3744903902154921, "grad_norm": 8.315750122070312, "learning_rate": 6.948867313915858e-06, "logits/chosen": -4.223654747009277, "logits/rejected": -4.363278865814209, "logps/chosen": -695.8765869140625, "logps/rejected": -790.5074462890625, "loss": 0.6437, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.715376079082489, "rewards/margins": 0.5789678692817688, "rewards/rejected": -1.2943440675735474, "step": 5900 }, { "epoch": 1.3744903902154921, "eval_logits/chosen": -4.265625, "eval_logits/rejected": -4.251952648162842, "eval_logps/chosen": -698.7142333984375, "eval_logps/rejected": -718.8829345703125, "eval_loss": 0.6310829520225525, "eval_rewards/accuracies": 0.6404808759689331, "eval_rewards/chosen": -0.7885909676551819, "eval_rewards/margins": 0.4198411703109741, "eval_rewards/rejected": -1.2084320783615112, "eval_runtime": 394.5375, "eval_samples_per_second": 18.133, "eval_steps_per_second": 9.066, "step": 5900 }, { "epoch": 1.3768200349446709, "grad_norm": 10.515283584594727, "learning_rate": 6.9229773462783175e-06, "logits/chosen": -4.387662887573242, "logits/rejected": -4.373385429382324, "logps/chosen": -744.9942626953125, "logps/rejected": -810.0918579101562, "loss": 0.8659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8038939237594604, "rewards/margins": 0.3602578639984131, "rewards/rejected": -1.164151906967163, "step": 5910 }, { "epoch": 1.3791496796738496, "grad_norm": 10.210684776306152, "learning_rate": 6.897087378640778e-06, "logits/chosen": -4.323285102844238, "logits/rejected": -4.2652082443237305, "logps/chosen": -754.8870849609375, "logps/rejected": -789.4259033203125, "loss": 0.8231, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7273879051208496, "rewards/margins": 0.2793354392051697, "rewards/rejected": -1.0067232847213745, "step": 5920 }, { "epoch": 1.3814793244030286, "grad_norm": 10.7938232421875, "learning_rate": 6.871197411003236e-06, "logits/chosen": -4.402263641357422, "logits/rejected": -4.343568801879883, "logps/chosen": -713.602783203125, "logps/rejected": -714.8239135742188, "loss": 0.788, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9156931638717651, "rewards/margins": 0.34777992963790894, "rewards/rejected": -1.2634732723236084, "step": 5930 }, { "epoch": 1.3838089691322073, "grad_norm": 7.41132116317749, "learning_rate": 6.845307443365697e-06, "logits/chosen": -4.270591735839844, "logits/rejected": -4.345918655395508, "logps/chosen": -676.3931884765625, "logps/rejected": -731.2847900390625, "loss": 0.9121, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9126005172729492, "rewards/margins": 0.3571823239326477, "rewards/rejected": -1.2697827816009521, "step": 5940 }, { "epoch": 1.386138613861386, "grad_norm": 8.748828887939453, "learning_rate": 6.819417475728156e-06, "logits/chosen": -4.402093410491943, "logits/rejected": -4.365542888641357, "logps/chosen": -743.6368408203125, "logps/rejected": -806.1112670898438, "loss": 0.8711, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.981920599937439, "rewards/margins": 0.3139951229095459, "rewards/rejected": -1.2959158420562744, "step": 5950 }, { "epoch": 1.388468258590565, "grad_norm": 4.679051399230957, "learning_rate": 6.793527508090615e-06, "logits/chosen": -4.309473991394043, "logits/rejected": -4.248940467834473, "logps/chosen": -681.0128173828125, "logps/rejected": -692.7999877929688, "loss": 0.7612, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6057000160217285, "rewards/margins": 0.34382814168930054, "rewards/rejected": -0.9495280981063843, "step": 5960 }, { "epoch": 1.3907979033197437, "grad_norm": 8.569405555725098, "learning_rate": 6.767637540453075e-06, "logits/chosen": -4.276402473449707, "logits/rejected": -4.360108375549316, "logps/chosen": -722.626953125, "logps/rejected": -727.410888671875, "loss": 0.8777, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8245531320571899, "rewards/margins": 0.1962832510471344, "rewards/rejected": -1.0208362340927124, "step": 5970 }, { "epoch": 1.3931275480489225, "grad_norm": 7.404006481170654, "learning_rate": 6.741747572815535e-06, "logits/chosen": -4.379885196685791, "logits/rejected": -4.29095983505249, "logps/chosen": -767.14208984375, "logps/rejected": -748.4708251953125, "loss": 0.7369, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7033445239067078, "rewards/margins": 0.39107757806777954, "rewards/rejected": -1.0944219827651978, "step": 5980 }, { "epoch": 1.3954571927781014, "grad_norm": 7.095986366271973, "learning_rate": 6.7158576051779944e-06, "logits/chosen": -4.271486282348633, "logits/rejected": -4.316716194152832, "logps/chosen": -722.4144897460938, "logps/rejected": -744.0294799804688, "loss": 0.6972, "rewards/accuracies": 0.625, "rewards/chosen": -0.7354863286018372, "rewards/margins": 0.49441951513290405, "rewards/rejected": -1.2299058437347412, "step": 5990 }, { "epoch": 1.3977868375072802, "grad_norm": 9.544381141662598, "learning_rate": 6.689967637540453e-06, "logits/chosen": -4.179322719573975, "logits/rejected": -4.25494384765625, "logps/chosen": -675.0315551757812, "logps/rejected": -745.0963134765625, "loss": 0.665, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.546692967414856, "rewards/margins": 0.6451650857925415, "rewards/rejected": -1.191857933998108, "step": 6000 }, { "epoch": 1.3977868375072802, "eval_logits/chosen": -4.270597457885742, "eval_logits/rejected": -4.256988525390625, "eval_logps/chosen": -698.1524047851562, "eval_logps/rejected": -718.1962280273438, "eval_loss": 0.6282357573509216, "eval_rewards/accuracies": 0.6407604217529297, "eval_rewards/chosen": -0.7324159145355225, "eval_rewards/margins": 0.4073488712310791, "eval_rewards/rejected": -1.1397647857666016, "eval_runtime": 395.4993, "eval_samples_per_second": 18.089, "eval_steps_per_second": 9.044, "step": 6000 }, { "epoch": 1.400116482236459, "grad_norm": 7.5317206382751465, "learning_rate": 6.664077669902913e-06, "logits/chosen": -4.356226444244385, "logits/rejected": -4.356008529663086, "logps/chosen": -749.1924438476562, "logps/rejected": -852.3751831054688, "loss": 0.6264, "rewards/accuracies": 0.6875, "rewards/chosen": -0.42755842208862305, "rewards/margins": 0.8259701728820801, "rewards/rejected": -1.253528356552124, "step": 6010 }, { "epoch": 1.4024461269656379, "grad_norm": 9.323957443237305, "learning_rate": 6.638187702265373e-06, "logits/chosen": -4.306419849395752, "logits/rejected": -4.26028299331665, "logps/chosen": -713.90380859375, "logps/rejected": -701.1036376953125, "loss": 0.7646, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.72160804271698, "rewards/margins": 0.3279089629650116, "rewards/rejected": -1.049517035484314, "step": 6020 }, { "epoch": 1.4047757716948166, "grad_norm": 6.586495876312256, "learning_rate": 6.6122977346278325e-06, "logits/chosen": -4.380081653594971, "logits/rejected": -4.304697036743164, "logps/chosen": -827.091796875, "logps/rejected": -757.8292846679688, "loss": 0.8853, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7758504748344421, "rewards/margins": 0.2744961678981781, "rewards/rejected": -1.0503467321395874, "step": 6030 }, { "epoch": 1.4071054164239953, "grad_norm": 7.249460697174072, "learning_rate": 6.586407766990291e-06, "logits/chosen": -4.236702919006348, "logits/rejected": -4.3020524978637695, "logps/chosen": -725.0355224609375, "logps/rejected": -752.8726806640625, "loss": 0.7608, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5313434600830078, "rewards/margins": 0.6068240404129028, "rewards/rejected": -1.1381676197052002, "step": 6040 }, { "epoch": 1.409435061153174, "grad_norm": 5.396103858947754, "learning_rate": 6.560517799352751e-06, "logits/chosen": -4.281092166900635, "logits/rejected": -4.332211494445801, "logps/chosen": -697.1739501953125, "logps/rejected": -740.8766479492188, "loss": 0.6829, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4345017373561859, "rewards/margins": 0.5479524731636047, "rewards/rejected": -0.9824541807174683, "step": 6050 }, { "epoch": 1.4117647058823528, "grad_norm": 8.456120491027832, "learning_rate": 6.534627831715211e-06, "logits/chosen": -4.316834449768066, "logits/rejected": -4.33341646194458, "logps/chosen": -708.6388549804688, "logps/rejected": -770.0853271484375, "loss": 0.7611, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7554001212120056, "rewards/margins": 0.42426061630249023, "rewards/rejected": -1.1796607971191406, "step": 6060 }, { "epoch": 1.4140943506115318, "grad_norm": 9.169841766357422, "learning_rate": 6.50873786407767e-06, "logits/chosen": -4.311273097991943, "logits/rejected": -4.331109523773193, "logps/chosen": -690.289306640625, "logps/rejected": -726.1717529296875, "loss": 0.6198, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5135329365730286, "rewards/margins": 0.7094392776489258, "rewards/rejected": -1.2229722738265991, "step": 6070 }, { "epoch": 1.4164239953407105, "grad_norm": 7.389662742614746, "learning_rate": 6.4828478964401294e-06, "logits/chosen": -4.348455905914307, "logits/rejected": -4.332047939300537, "logps/chosen": -701.18701171875, "logps/rejected": -718.71484375, "loss": 0.6092, "rewards/accuracies": 0.625, "rewards/chosen": -0.5786556005477905, "rewards/margins": 0.6604295372962952, "rewards/rejected": -1.239085078239441, "step": 6080 }, { "epoch": 1.4187536400698892, "grad_norm": 9.116060256958008, "learning_rate": 6.45695792880259e-06, "logits/chosen": -4.322030067443848, "logits/rejected": -4.3480024337768555, "logps/chosen": -744.7877197265625, "logps/rejected": -769.6700439453125, "loss": 0.8433, "rewards/accuracies": 0.5, "rewards/chosen": -0.6332119107246399, "rewards/margins": 0.38585028052330017, "rewards/rejected": -1.0190622806549072, "step": 6090 }, { "epoch": 1.4210832847990682, "grad_norm": 9.876167297363281, "learning_rate": 6.43106796116505e-06, "logits/chosen": -4.301876544952393, "logits/rejected": -4.181447982788086, "logps/chosen": -748.7747802734375, "logps/rejected": -744.25, "loss": 0.8143, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5354674458503723, "rewards/margins": 0.28102821111679077, "rewards/rejected": -0.8164957165718079, "step": 6100 }, { "epoch": 1.4210832847990682, "eval_logits/chosen": -4.270403861999512, "eval_logits/rejected": -4.256753444671631, "eval_logps/chosen": -698.1387939453125, "eval_logps/rejected": -718.2033081054688, "eval_loss": 0.6276611685752869, "eval_rewards/accuracies": 0.6421582102775574, "eval_rewards/chosen": -0.7310547828674316, "eval_rewards/margins": 0.4094196856021881, "eval_rewards/rejected": -1.1404744386672974, "eval_runtime": 395.9901, "eval_samples_per_second": 18.066, "eval_steps_per_second": 9.033, "step": 6100 }, { "epoch": 1.423412929528247, "grad_norm": 13.78065299987793, "learning_rate": 6.405177993527509e-06, "logits/chosen": -4.367238521575928, "logits/rejected": -4.370728969573975, "logps/chosen": -732.5328979492188, "logps/rejected": -744.9369506835938, "loss": 0.9071, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.9486724734306335, "rewards/margins": 0.18062777817249298, "rewards/rejected": -1.1293003559112549, "step": 6110 }, { "epoch": 1.4257425742574257, "grad_norm": 8.261577606201172, "learning_rate": 6.379288025889968e-06, "logits/chosen": -4.402305603027344, "logits/rejected": -4.272169589996338, "logps/chosen": -794.4027709960938, "logps/rejected": -672.9693603515625, "loss": 0.9172, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.830672562122345, "rewards/margins": 0.0038339763414114714, "rewards/rejected": -0.834506630897522, "step": 6120 }, { "epoch": 1.4280722189866046, "grad_norm": 6.93900203704834, "learning_rate": 6.353398058252428e-06, "logits/chosen": -4.3805317878723145, "logits/rejected": -4.36024284362793, "logps/chosen": -707.8915405273438, "logps/rejected": -755.4413452148438, "loss": 0.6694, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6830347776412964, "rewards/margins": 0.5677529573440552, "rewards/rejected": -1.2507877349853516, "step": 6130 }, { "epoch": 1.4304018637157834, "grad_norm": 6.082003593444824, "learning_rate": 6.327508090614888e-06, "logits/chosen": -4.284184455871582, "logits/rejected": -4.340609073638916, "logps/chosen": -723.261962890625, "logps/rejected": -769.7706298828125, "loss": 0.6174, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5278186798095703, "rewards/margins": 0.6645825505256653, "rewards/rejected": -1.1924011707305908, "step": 6140 }, { "epoch": 1.432731508444962, "grad_norm": 9.62651252746582, "learning_rate": 6.301618122977347e-06, "logits/chosen": -4.3521809577941895, "logits/rejected": -4.298386096954346, "logps/chosen": -732.197509765625, "logps/rejected": -733.2960205078125, "loss": 0.6925, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7362813353538513, "rewards/margins": 0.5624809265136719, "rewards/rejected": -1.298762321472168, "step": 6150 }, { "epoch": 1.435061153174141, "grad_norm": 3.412282705307007, "learning_rate": 6.275728155339806e-06, "logits/chosen": -4.32267951965332, "logits/rejected": -4.326018810272217, "logps/chosen": -714.3280029296875, "logps/rejected": -781.4513549804688, "loss": 0.7834, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6571785807609558, "rewards/margins": 0.46814459562301636, "rewards/rejected": -1.1253231763839722, "step": 6160 }, { "epoch": 1.4373907979033198, "grad_norm": 6.920522212982178, "learning_rate": 6.249838187702266e-06, "logits/chosen": -4.282754898071289, "logits/rejected": -4.342594623565674, "logps/chosen": -721.2886962890625, "logps/rejected": -721.8851928710938, "loss": 1.0046, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9614690542221069, "rewards/margins": 0.011788728646934032, "rewards/rejected": -0.9732577204704285, "step": 6170 }, { "epoch": 1.4397204426324985, "grad_norm": 6.783796787261963, "learning_rate": 6.223948220064725e-06, "logits/chosen": -4.263086795806885, "logits/rejected": -4.364320755004883, "logps/chosen": -711.2506103515625, "logps/rejected": -793.2518920898438, "loss": 0.7856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8233615756034851, "rewards/margins": 0.4337071478366852, "rewards/rejected": -1.2570687532424927, "step": 6180 }, { "epoch": 1.4420500873616773, "grad_norm": 7.462255954742432, "learning_rate": 6.198058252427185e-06, "logits/chosen": -4.287019729614258, "logits/rejected": -4.253533840179443, "logps/chosen": -726.5230712890625, "logps/rejected": -747.6558837890625, "loss": 0.8425, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6028618812561035, "rewards/margins": 0.3824983239173889, "rewards/rejected": -0.9853602647781372, "step": 6190 }, { "epoch": 1.444379732090856, "grad_norm": 9.85159683227539, "learning_rate": 6.1721682847896445e-06, "logits/chosen": -4.258937835693359, "logits/rejected": -4.259571075439453, "logps/chosen": -709.08642578125, "logps/rejected": -794.2938232421875, "loss": 0.6821, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5764713287353516, "rewards/margins": 0.6002413034439087, "rewards/rejected": -1.1767126321792603, "step": 6200 }, { "epoch": 1.444379732090856, "eval_logits/chosen": -4.2764997482299805, "eval_logits/rejected": -4.263204097747803, "eval_logps/chosen": -697.7610473632812, "eval_logps/rejected": -717.7634887695312, "eval_loss": 0.6264663338661194, "eval_rewards/accuracies": 0.642577588558197, "eval_rewards/chosen": -0.693281888961792, "eval_rewards/margins": 0.4031990170478821, "eval_rewards/rejected": -1.0964809656143188, "eval_runtime": 395.6778, "eval_samples_per_second": 18.08, "eval_steps_per_second": 9.04, "step": 6200 }, { "epoch": 1.446709376820035, "grad_norm": 6.2666707038879395, "learning_rate": 6.146278317152104e-06, "logits/chosen": -4.378452301025391, "logits/rejected": -4.316521644592285, "logps/chosen": -799.1228637695312, "logps/rejected": -763.740478515625, "loss": 0.7032, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6369025707244873, "rewards/margins": 0.4719843864440918, "rewards/rejected": -1.108886957168579, "step": 6210 }, { "epoch": 1.4490390215492137, "grad_norm": 4.337667942047119, "learning_rate": 6.120388349514563e-06, "logits/chosen": -4.4072771072387695, "logits/rejected": -4.218862056732178, "logps/chosen": -709.3562622070312, "logps/rejected": -662.3941040039062, "loss": 0.7151, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6898874640464783, "rewards/margins": 0.465634822845459, "rewards/rejected": -1.155522346496582, "step": 6220 }, { "epoch": 1.4513686662783924, "grad_norm": 8.891453742980957, "learning_rate": 6.094498381877023e-06, "logits/chosen": -4.323766231536865, "logits/rejected": -4.273622512817383, "logps/chosen": -713.9008178710938, "logps/rejected": -749.7387084960938, "loss": 0.7017, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6876183152198792, "rewards/margins": 0.8101609349250793, "rewards/rejected": -1.4977792501449585, "step": 6230 }, { "epoch": 1.4536983110075714, "grad_norm": 11.587925910949707, "learning_rate": 6.068608414239483e-06, "logits/chosen": -4.3129777908325195, "logits/rejected": -4.2286272048950195, "logps/chosen": -740.3646240234375, "logps/rejected": -652.7010498046875, "loss": 0.8272, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6182198524475098, "rewards/margins": 0.3408670425415039, "rewards/rejected": -0.9590870141983032, "step": 6240 }, { "epoch": 1.4560279557367501, "grad_norm": 10.418200492858887, "learning_rate": 6.042718446601941e-06, "logits/chosen": -4.306921482086182, "logits/rejected": -4.259629726409912, "logps/chosen": -726.8382568359375, "logps/rejected": -741.6177978515625, "loss": 0.7956, "rewards/accuracies": 0.5625, "rewards/chosen": -0.702867329120636, "rewards/margins": 0.33800870180130005, "rewards/rejected": -1.040876030921936, "step": 6250 }, { "epoch": 1.4583576004659289, "grad_norm": 9.083958625793457, "learning_rate": 6.016828478964402e-06, "logits/chosen": -4.267938137054443, "logits/rejected": -4.296270847320557, "logps/chosen": -719.9595947265625, "logps/rejected": -721.6478271484375, "loss": 0.8335, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.725295901298523, "rewards/margins": 0.25456786155700684, "rewards/rejected": -0.9798637628555298, "step": 6260 }, { "epoch": 1.4606872451951078, "grad_norm": 10.350333213806152, "learning_rate": 5.990938511326862e-06, "logits/chosen": -4.272916793823242, "logits/rejected": -4.3289313316345215, "logps/chosen": -728.450927734375, "logps/rejected": -767.2869262695312, "loss": 0.9885, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.7057057619094849, "rewards/margins": -0.07528909295797348, "rewards/rejected": -0.6304166913032532, "step": 6270 }, { "epoch": 1.4630168899242866, "grad_norm": 9.632805824279785, "learning_rate": 5.965048543689321e-06, "logits/chosen": -4.298973083496094, "logits/rejected": -4.322752475738525, "logps/chosen": -763.8212280273438, "logps/rejected": -827.498046875, "loss": 0.9871, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.9077277183532715, "rewards/margins": -0.20082445442676544, "rewards/rejected": -0.7069032788276672, "step": 6280 }, { "epoch": 1.4653465346534653, "grad_norm": 8.367243766784668, "learning_rate": 5.93915857605178e-06, "logits/chosen": -4.3294997215271, "logits/rejected": -4.337015628814697, "logps/chosen": -674.9423217773438, "logps/rejected": -705.767333984375, "loss": 0.8136, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8321495056152344, "rewards/margins": 0.2989842891693115, "rewards/rejected": -1.1311339139938354, "step": 6290 }, { "epoch": 1.4676761793826443, "grad_norm": 6.319589614868164, "learning_rate": 5.91326860841424e-06, "logits/chosen": -4.261368751525879, "logits/rejected": -4.256316184997559, "logps/chosen": -703.7994384765625, "logps/rejected": -739.6809692382812, "loss": 0.8507, "rewards/accuracies": 0.5, "rewards/chosen": -0.8882040977478027, "rewards/margins": 0.23814082145690918, "rewards/rejected": -1.1263447999954224, "step": 6300 }, { "epoch": 1.4676761793826443, "eval_logits/chosen": -4.281351089477539, "eval_logits/rejected": -4.268238544464111, "eval_logps/chosen": -698.1597290039062, "eval_logps/rejected": -718.27978515625, "eval_loss": 0.6276564002037048, "eval_rewards/accuracies": 0.6410399675369263, "eval_rewards/chosen": -0.7331462502479553, "eval_rewards/margins": 0.41497042775154114, "eval_rewards/rejected": -1.1481167078018188, "eval_runtime": 395.5859, "eval_samples_per_second": 18.085, "eval_steps_per_second": 9.042, "step": 6300 }, { "epoch": 1.470005824111823, "grad_norm": 6.561951160430908, "learning_rate": 5.8873786407767e-06, "logits/chosen": -4.3774094581604, "logits/rejected": -4.387298107147217, "logps/chosen": -729.2420043945312, "logps/rejected": -758.8972778320312, "loss": 0.9366, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7099128365516663, "rewards/margins": 0.16957201063632965, "rewards/rejected": -0.8794847726821899, "step": 6310 }, { "epoch": 1.4723354688410017, "grad_norm": 7.208823204040527, "learning_rate": 5.8614886731391595e-06, "logits/chosen": -4.253106117248535, "logits/rejected": -4.3132171630859375, "logps/chosen": -681.6314697265625, "logps/rejected": -762.1727294921875, "loss": 0.62, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.39332813024520874, "rewards/margins": 0.6966632008552551, "rewards/rejected": -1.0899913311004639, "step": 6320 }, { "epoch": 1.4746651135701805, "grad_norm": 5.884200096130371, "learning_rate": 5.835598705501618e-06, "logits/chosen": -4.318593502044678, "logits/rejected": -4.255011081695557, "logps/chosen": -756.2564697265625, "logps/rejected": -711.3599853515625, "loss": 0.607, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40394967794418335, "rewards/margins": 0.7803922891616821, "rewards/rejected": -1.1843420267105103, "step": 6330 }, { "epoch": 1.4769947582993592, "grad_norm": 10.386765480041504, "learning_rate": 5.809708737864078e-06, "logits/chosen": -4.3260297775268555, "logits/rejected": -4.328006267547607, "logps/chosen": -685.3865356445312, "logps/rejected": -731.4774169921875, "loss": 0.7761, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8475303649902344, "rewards/margins": 0.36684754490852356, "rewards/rejected": -1.2143778800964355, "step": 6340 }, { "epoch": 1.4793244030285382, "grad_norm": 10.14816951751709, "learning_rate": 5.783818770226538e-06, "logits/chosen": -4.235125541687012, "logits/rejected": -4.296066761016846, "logps/chosen": -704.0057373046875, "logps/rejected": -765.9306640625, "loss": 0.8021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7169125080108643, "rewards/margins": 0.28062504529953003, "rewards/rejected": -0.9975376129150391, "step": 6350 }, { "epoch": 1.481654047757717, "grad_norm": 7.591325759887695, "learning_rate": 5.757928802588997e-06, "logits/chosen": -4.2524213790893555, "logits/rejected": -4.324526309967041, "logps/chosen": -773.524169921875, "logps/rejected": -758.4846801757812, "loss": 0.814, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8144239187240601, "rewards/margins": 0.3293677866458893, "rewards/rejected": -1.143791913986206, "step": 6360 }, { "epoch": 1.4839836924868957, "grad_norm": 6.67929220199585, "learning_rate": 5.732038834951456e-06, "logits/chosen": -4.279242515563965, "logits/rejected": -4.314077854156494, "logps/chosen": -698.8849487304688, "logps/rejected": -761.8951416015625, "loss": 0.6029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5412794947624207, "rewards/margins": 0.7758609056472778, "rewards/rejected": -1.3171404600143433, "step": 6370 }, { "epoch": 1.4863133372160746, "grad_norm": 7.730616092681885, "learning_rate": 5.706148867313916e-06, "logits/chosen": -4.363424301147461, "logits/rejected": -4.256382942199707, "logps/chosen": -703.0626220703125, "logps/rejected": -616.8387451171875, "loss": 0.7749, "rewards/accuracies": 0.625, "rewards/chosen": -0.7838987708091736, "rewards/margins": 0.3033984303474426, "rewards/rejected": -1.0872972011566162, "step": 6380 }, { "epoch": 1.4886429819452534, "grad_norm": 10.048879623413086, "learning_rate": 5.680258899676377e-06, "logits/chosen": -4.266480922698975, "logits/rejected": -4.252657890319824, "logps/chosen": -756.8456420898438, "logps/rejected": -724.2012939453125, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": -0.7446259260177612, "rewards/margins": 0.6785385012626648, "rewards/rejected": -1.4231643676757812, "step": 6390 }, { "epoch": 1.490972626674432, "grad_norm": 9.971052169799805, "learning_rate": 5.654368932038835e-06, "logits/chosen": -4.291266441345215, "logits/rejected": -4.3873162269592285, "logps/chosen": -739.7542724609375, "logps/rejected": -808.8006591796875, "loss": 0.6913, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4845125079154968, "rewards/margins": 0.6399686932563782, "rewards/rejected": -1.1244813203811646, "step": 6400 }, { "epoch": 1.490972626674432, "eval_logits/chosen": -4.279409408569336, "eval_logits/rejected": -4.266495704650879, "eval_logps/chosen": -698.281005859375, "eval_logps/rejected": -718.4324951171875, "eval_loss": 0.6276723742485046, "eval_rewards/accuracies": 0.6434162855148315, "eval_rewards/chosen": -0.7452816367149353, "eval_rewards/margins": 0.4181005358695984, "eval_rewards/rejected": -1.1633821725845337, "eval_runtime": 395.6592, "eval_samples_per_second": 18.081, "eval_steps_per_second": 9.041, "step": 6400 }, { "epoch": 1.493302271403611, "grad_norm": 7.82735538482666, "learning_rate": 5.628478964401295e-06, "logits/chosen": -4.288760185241699, "logits/rejected": -4.4220356941223145, "logps/chosen": -679.4021606445312, "logps/rejected": -795.2906494140625, "loss": 0.8033, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6727933883666992, "rewards/margins": 0.33598798513412476, "rewards/rejected": -1.0087814331054688, "step": 6410 }, { "epoch": 1.4956319161327898, "grad_norm": 4.939048767089844, "learning_rate": 5.602588996763755e-06, "logits/chosen": -4.317221164703369, "logits/rejected": -4.359116077423096, "logps/chosen": -721.6626586914062, "logps/rejected": -742.2684936523438, "loss": 0.8091, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8711726069450378, "rewards/margins": 0.3611958622932434, "rewards/rejected": -1.2323685884475708, "step": 6420 }, { "epoch": 1.4979615608619685, "grad_norm": 9.913616180419922, "learning_rate": 5.576699029126214e-06, "logits/chosen": -4.355402946472168, "logits/rejected": -4.360577583312988, "logps/chosen": -752.7567138671875, "logps/rejected": -770.4042358398438, "loss": 0.7849, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0307178497314453, "rewards/margins": 0.4963906705379486, "rewards/rejected": -1.5271085500717163, "step": 6430 }, { "epoch": 1.5002912055911475, "grad_norm": 6.584352970123291, "learning_rate": 5.550809061488674e-06, "logits/chosen": -4.346511363983154, "logits/rejected": -4.295920372009277, "logps/chosen": -728.0184326171875, "logps/rejected": -761.0966796875, "loss": 0.6471, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5676674842834473, "rewards/margins": 0.6876822113990784, "rewards/rejected": -1.2553496360778809, "step": 6440 }, { "epoch": 1.502620850320326, "grad_norm": 9.470415115356445, "learning_rate": 5.524919093851133e-06, "logits/chosen": -4.299017906188965, "logits/rejected": -4.294720649719238, "logps/chosen": -711.3177490234375, "logps/rejected": -738.9423217773438, "loss": 0.6542, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5640817284584045, "rewards/margins": 0.7242376804351807, "rewards/rejected": -1.2883192300796509, "step": 6450 }, { "epoch": 1.504950495049505, "grad_norm": 11.724821090698242, "learning_rate": 5.499029126213593e-06, "logits/chosen": -4.337367057800293, "logits/rejected": -4.3308539390563965, "logps/chosen": -726.2224731445312, "logps/rejected": -696.56787109375, "loss": 0.7747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6660787463188171, "rewards/margins": 0.44035547971725464, "rewards/rejected": -1.1064343452453613, "step": 6460 }, { "epoch": 1.507280139778684, "grad_norm": 8.660778045654297, "learning_rate": 5.473139158576052e-06, "logits/chosen": -4.300487041473389, "logits/rejected": -4.254570960998535, "logps/chosen": -701.4354858398438, "logps/rejected": -739.0933227539062, "loss": 0.8465, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8152783513069153, "rewards/margins": 0.32741162180900574, "rewards/rejected": -1.1426900625228882, "step": 6470 }, { "epoch": 1.5096097845078624, "grad_norm": 4.783875942230225, "learning_rate": 5.447249190938512e-06, "logits/chosen": -4.258687973022461, "logits/rejected": -4.330063819885254, "logps/chosen": -658.1441650390625, "logps/rejected": -762.2044677734375, "loss": 0.6749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6189396381378174, "rewards/margins": 0.665094256401062, "rewards/rejected": -1.284034013748169, "step": 6480 }, { "epoch": 1.5119394292370414, "grad_norm": 6.743084907531738, "learning_rate": 5.4213592233009714e-06, "logits/chosen": -4.353699684143066, "logits/rejected": -4.270865440368652, "logps/chosen": -750.50341796875, "logps/rejected": -700.8162841796875, "loss": 0.7392, "rewards/accuracies": 0.625, "rewards/chosen": -0.6873252391815186, "rewards/margins": 0.5150385499000549, "rewards/rejected": -1.2023637294769287, "step": 6490 }, { "epoch": 1.5142690739662201, "grad_norm": 6.350234508514404, "learning_rate": 5.395469255663431e-06, "logits/chosen": -4.325373649597168, "logits/rejected": -4.258090972900391, "logps/chosen": -683.0284423828125, "logps/rejected": -685.321533203125, "loss": 0.8345, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6604098081588745, "rewards/margins": 0.30990126729011536, "rewards/rejected": -0.9703109860420227, "step": 6500 }, { "epoch": 1.5142690739662201, "eval_logits/chosen": -4.2847981452941895, "eval_logits/rejected": -4.271369934082031, "eval_logps/chosen": -697.9261474609375, "eval_logps/rejected": -717.9816284179688, "eval_loss": 0.6257911920547485, "eval_rewards/accuracies": 0.6415991187095642, "eval_rewards/chosen": -0.7097821831703186, "eval_rewards/margins": 0.4085138142108917, "eval_rewards/rejected": -1.1182959079742432, "eval_runtime": 396.0331, "eval_samples_per_second": 18.064, "eval_steps_per_second": 9.032, "step": 6500 }, { "epoch": 1.5165987186953989, "grad_norm": 9.322096824645996, "learning_rate": 5.36957928802589e-06, "logits/chosen": -4.336639404296875, "logits/rejected": -4.434247016906738, "logps/chosen": -701.051513671875, "logps/rejected": -772.03662109375, "loss": 0.7471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7475873231887817, "rewards/margins": 0.5629361867904663, "rewards/rejected": -1.3105233907699585, "step": 6510 }, { "epoch": 1.5189283634245778, "grad_norm": 8.181373596191406, "learning_rate": 5.34368932038835e-06, "logits/chosen": -4.339288234710693, "logits/rejected": -4.350064754486084, "logps/chosen": -761.74169921875, "logps/rejected": -785.0966186523438, "loss": 0.7449, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7028181552886963, "rewards/margins": 0.42330655455589294, "rewards/rejected": -1.126124620437622, "step": 6520 }, { "epoch": 1.5212580081537566, "grad_norm": 10.596341133117676, "learning_rate": 5.3177993527508095e-06, "logits/chosen": -4.388852119445801, "logits/rejected": -4.307581424713135, "logps/chosen": -725.6011352539062, "logps/rejected": -678.3147583007812, "loss": 0.7793, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6951478123664856, "rewards/margins": 0.36321547627449036, "rewards/rejected": -1.0583633184432983, "step": 6530 }, { "epoch": 1.5235876528829353, "grad_norm": 6.610289573669434, "learning_rate": 5.291909385113268e-06, "logits/chosen": -4.336356163024902, "logits/rejected": -4.272176742553711, "logps/chosen": -709.7391357421875, "logps/rejected": -714.23876953125, "loss": 0.6984, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.705763578414917, "rewards/margins": 0.55080246925354, "rewards/rejected": -1.2565661668777466, "step": 6540 }, { "epoch": 1.5259172976121143, "grad_norm": 9.951604843139648, "learning_rate": 5.266019417475728e-06, "logits/chosen": -4.248246669769287, "logits/rejected": -4.297746658325195, "logps/chosen": -597.0340576171875, "logps/rejected": -707.5311889648438, "loss": 0.6395, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5449532270431519, "rewards/margins": 0.538214921951294, "rewards/rejected": -1.0831680297851562, "step": 6550 }, { "epoch": 1.528246942341293, "grad_norm": 8.162659645080566, "learning_rate": 5.240129449838189e-06, "logits/chosen": -4.318647861480713, "logits/rejected": -4.250157356262207, "logps/chosen": -699.1119995117188, "logps/rejected": -662.0938720703125, "loss": 0.6704, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.45620036125183105, "rewards/margins": 0.6418908834457397, "rewards/rejected": -1.0980913639068604, "step": 6560 }, { "epoch": 1.5305765870704717, "grad_norm": 11.98337173461914, "learning_rate": 5.214239482200648e-06, "logits/chosen": -4.308518409729004, "logits/rejected": -4.269722938537598, "logps/chosen": -735.4429321289062, "logps/rejected": -759.6636962890625, "loss": 0.7414, "rewards/accuracies": 0.625, "rewards/chosen": -0.5557039976119995, "rewards/margins": 0.6347615122795105, "rewards/rejected": -1.1904654502868652, "step": 6570 }, { "epoch": 1.5329062317996507, "grad_norm": 9.075456619262695, "learning_rate": 5.188349514563107e-06, "logits/chosen": -4.317292213439941, "logits/rejected": -4.296011924743652, "logps/chosen": -758.3450927734375, "logps/rejected": -791.48974609375, "loss": 0.8578, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6727965474128723, "rewards/margins": 0.38544994592666626, "rewards/rejected": -1.058246374130249, "step": 6580 }, { "epoch": 1.5352358765288292, "grad_norm": 8.431157112121582, "learning_rate": 5.162459546925567e-06, "logits/chosen": -4.2513861656188965, "logits/rejected": -4.324321746826172, "logps/chosen": -662.2391967773438, "logps/rejected": -759.1661987304688, "loss": 0.8565, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6914823651313782, "rewards/margins": 0.35625919699668884, "rewards/rejected": -1.0477415323257446, "step": 6590 }, { "epoch": 1.5375655212580082, "grad_norm": 5.4589948654174805, "learning_rate": 5.136569579288027e-06, "logits/chosen": -4.429289817810059, "logits/rejected": -4.362177848815918, "logps/chosen": -710.226806640625, "logps/rejected": -702.2576293945312, "loss": 0.7486, "rewards/accuracies": 0.5625, "rewards/chosen": -0.664710283279419, "rewards/margins": 0.35124653577804565, "rewards/rejected": -1.0159568786621094, "step": 6600 }, { "epoch": 1.5375655212580082, "eval_logits/chosen": -4.283307075500488, "eval_logits/rejected": -4.269985675811768, "eval_logps/chosen": -697.8657836914062, "eval_logps/rejected": -717.916015625, "eval_loss": 0.6255219578742981, "eval_rewards/accuracies": 0.6459323167800903, "eval_rewards/chosen": -0.703760027885437, "eval_rewards/margins": 0.4079740047454834, "eval_rewards/rejected": -1.1117339134216309, "eval_runtime": 396.0706, "eval_samples_per_second": 18.062, "eval_steps_per_second": 9.031, "step": 6600 }, { "epoch": 1.5398951659871871, "grad_norm": 8.257590293884277, "learning_rate": 5.110679611650486e-06, "logits/chosen": -4.323059558868408, "logits/rejected": -4.37637996673584, "logps/chosen": -688.7907104492188, "logps/rejected": -743.0214233398438, "loss": 0.7344, "rewards/accuracies": 0.625, "rewards/chosen": -0.6487873792648315, "rewards/margins": 0.36815351247787476, "rewards/rejected": -1.0169408321380615, "step": 6610 }, { "epoch": 1.5422248107163656, "grad_norm": 7.590610980987549, "learning_rate": 5.084789644012945e-06, "logits/chosen": -4.344366073608398, "logits/rejected": -4.286723613739014, "logps/chosen": -717.1707763671875, "logps/rejected": -748.5653076171875, "loss": 0.7652, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5456770658493042, "rewards/margins": 0.4762745797634125, "rewards/rejected": -1.021951675415039, "step": 6620 }, { "epoch": 1.5445544554455446, "grad_norm": 9.014822006225586, "learning_rate": 5.058899676375405e-06, "logits/chosen": -4.227101802825928, "logits/rejected": -4.350574493408203, "logps/chosen": -655.3963012695312, "logps/rejected": -781.3607788085938, "loss": 0.7472, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8425167798995972, "rewards/margins": 0.4325069785118103, "rewards/rejected": -1.2750238180160522, "step": 6630 }, { "epoch": 1.5468841001747233, "grad_norm": 7.267883777618408, "learning_rate": 5.033009708737865e-06, "logits/chosen": -4.215926170349121, "logits/rejected": -4.334687232971191, "logps/chosen": -692.3230590820312, "logps/rejected": -734.2679443359375, "loss": 0.7137, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5044487714767456, "rewards/margins": 0.4881749749183655, "rewards/rejected": -0.9926236867904663, "step": 6640 }, { "epoch": 1.549213744903902, "grad_norm": 7.1380462646484375, "learning_rate": 5.007119741100324e-06, "logits/chosen": -4.351950168609619, "logits/rejected": -4.302297592163086, "logps/chosen": -639.7730712890625, "logps/rejected": -675.5047607421875, "loss": 0.7411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6441572308540344, "rewards/margins": 0.49536651372909546, "rewards/rejected": -1.1395237445831299, "step": 6650 }, { "epoch": 1.551543389633081, "grad_norm": 9.367436408996582, "learning_rate": 4.981229773462783e-06, "logits/chosen": -4.354855537414551, "logits/rejected": -4.330399513244629, "logps/chosen": -732.99462890625, "logps/rejected": -765.6302490234375, "loss": 0.8679, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8751713633537292, "rewards/margins": 0.20318977534770966, "rewards/rejected": -1.0783611536026, "step": 6660 }, { "epoch": 1.5538730343622598, "grad_norm": 8.631954193115234, "learning_rate": 4.955339805825243e-06, "logits/chosen": -4.407146453857422, "logits/rejected": -4.191717624664307, "logps/chosen": -781.7088623046875, "logps/rejected": -701.8729248046875, "loss": 0.7618, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.723842442035675, "rewards/margins": 0.3932204246520996, "rewards/rejected": -1.1170628070831299, "step": 6670 }, { "epoch": 1.5562026790914385, "grad_norm": 7.060195446014404, "learning_rate": 4.929449838187703e-06, "logits/chosen": -4.288878440856934, "logits/rejected": -4.307755470275879, "logps/chosen": -694.4677734375, "logps/rejected": -699.7354736328125, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": -0.5265144109725952, "rewards/margins": 0.6340707540512085, "rewards/rejected": -1.1605851650238037, "step": 6680 }, { "epoch": 1.5585323238206175, "grad_norm": 9.471017837524414, "learning_rate": 4.9035598705501626e-06, "logits/chosen": -4.301576137542725, "logits/rejected": -4.327295303344727, "logps/chosen": -667.992431640625, "logps/rejected": -734.1383056640625, "loss": 0.7076, "rewards/accuracies": 0.625, "rewards/chosen": -0.743894636631012, "rewards/margins": 0.48800763487815857, "rewards/rejected": -1.2319023609161377, "step": 6690 }, { "epoch": 1.5608619685497962, "grad_norm": 8.635560989379883, "learning_rate": 4.8776699029126215e-06, "logits/chosen": -4.34967565536499, "logits/rejected": -4.292224407196045, "logps/chosen": -729.954833984375, "logps/rejected": -719.730224609375, "loss": 0.761, "rewards/accuracies": 0.625, "rewards/chosen": -0.784434974193573, "rewards/margins": 0.4822344183921814, "rewards/rejected": -1.2666693925857544, "step": 6700 }, { "epoch": 1.5608619685497962, "eval_logits/chosen": -4.2828216552734375, "eval_logits/rejected": -4.269512176513672, "eval_logps/chosen": -698.0747680664062, "eval_logps/rejected": -718.1797485351562, "eval_loss": 0.6258891224861145, "eval_rewards/accuracies": 0.6420184373855591, "eval_rewards/chosen": -0.7246599793434143, "eval_rewards/margins": 0.41345661878585815, "eval_rewards/rejected": -1.1381165981292725, "eval_runtime": 396.3176, "eval_samples_per_second": 18.051, "eval_steps_per_second": 9.026, "step": 6700 }, { "epoch": 1.563191613278975, "grad_norm": 6.1233696937561035, "learning_rate": 4.851779935275081e-06, "logits/chosen": -4.277688503265381, "logits/rejected": -4.283129692077637, "logps/chosen": -703.0496215820312, "logps/rejected": -707.1187133789062, "loss": 0.7647, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5167033076286316, "rewards/margins": 0.5566205978393555, "rewards/rejected": -1.0733239650726318, "step": 6710 }, { "epoch": 1.565521258008154, "grad_norm": 8.156778335571289, "learning_rate": 4.825889967637541e-06, "logits/chosen": -4.336854934692383, "logits/rejected": -4.288191795349121, "logps/chosen": -755.5333251953125, "logps/rejected": -736.971435546875, "loss": 0.7263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9055421948432922, "rewards/margins": 0.5150974988937378, "rewards/rejected": -1.4206396341323853, "step": 6720 }, { "epoch": 1.5678509027373324, "grad_norm": 6.012303352355957, "learning_rate": 4.800000000000001e-06, "logits/chosen": -4.293099880218506, "logits/rejected": -4.312102317810059, "logps/chosen": -667.4832763671875, "logps/rejected": -746.6400756835938, "loss": 0.8205, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8755755424499512, "rewards/margins": 0.29335370659828186, "rewards/rejected": -1.1689293384552002, "step": 6730 }, { "epoch": 1.5701805474665114, "grad_norm": 9.780920028686523, "learning_rate": 4.77411003236246e-06, "logits/chosen": -4.192702770233154, "logits/rejected": -4.291769504547119, "logps/chosen": -641.2814331054688, "logps/rejected": -755.7752075195312, "loss": 0.7455, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.675727367401123, "rewards/margins": 0.45784205198287964, "rewards/rejected": -1.133569359779358, "step": 6740 }, { "epoch": 1.57251019219569, "grad_norm": 10.279903411865234, "learning_rate": 4.748220064724919e-06, "logits/chosen": -4.267851829528809, "logits/rejected": -4.342855930328369, "logps/chosen": -712.3040161132812, "logps/rejected": -765.2335205078125, "loss": 0.8372, "rewards/accuracies": 0.625, "rewards/chosen": -0.7413617372512817, "rewards/margins": 0.4191419184207916, "rewards/rejected": -1.160503625869751, "step": 6750 }, { "epoch": 1.5748398369248688, "grad_norm": 8.927513122558594, "learning_rate": 4.722330097087379e-06, "logits/chosen": -4.281728267669678, "logits/rejected": -4.379133701324463, "logps/chosen": -722.7548828125, "logps/rejected": -774.9669189453125, "loss": 0.9477, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.046169400215149, "rewards/margins": 0.12018339335918427, "rewards/rejected": -1.1663528680801392, "step": 6760 }, { "epoch": 1.5771694816540478, "grad_norm": 8.109047889709473, "learning_rate": 4.696440129449839e-06, "logits/chosen": -4.362876892089844, "logits/rejected": -4.314282417297363, "logps/chosen": -742.7166137695312, "logps/rejected": -780.0384521484375, "loss": 0.651, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6960076093673706, "rewards/margins": 0.5484358072280884, "rewards/rejected": -1.244443416595459, "step": 6770 }, { "epoch": 1.5794991263832265, "grad_norm": 8.884431838989258, "learning_rate": 4.670550161812298e-06, "logits/chosen": -4.268649578094482, "logits/rejected": -4.254140377044678, "logps/chosen": -707.0760498046875, "logps/rejected": -735.7363891601562, "loss": 0.8273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8003918528556824, "rewards/margins": 0.3524617552757263, "rewards/rejected": -1.1528536081314087, "step": 6780 }, { "epoch": 1.5818287711124053, "grad_norm": 10.445850372314453, "learning_rate": 4.644660194174757e-06, "logits/chosen": -4.2838969230651855, "logits/rejected": -4.417786121368408, "logps/chosen": -738.2225341796875, "logps/rejected": -828.3897705078125, "loss": 0.7626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6615802049636841, "rewards/margins": 0.6080848574638367, "rewards/rejected": -1.2696651220321655, "step": 6790 }, { "epoch": 1.5841584158415842, "grad_norm": 12.222297668457031, "learning_rate": 4.618770226537217e-06, "logits/chosen": -4.329405784606934, "logits/rejected": -4.318453788757324, "logps/chosen": -767.93994140625, "logps/rejected": -750.4871215820312, "loss": 0.7829, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7184725999832153, "rewards/margins": 0.2945941388607025, "rewards/rejected": -1.0130667686462402, "step": 6800 }, { "epoch": 1.5841584158415842, "eval_logits/chosen": -4.285313606262207, "eval_logits/rejected": -4.272164821624756, "eval_logps/chosen": -698.1854248046875, "eval_logps/rejected": -718.3281860351562, "eval_loss": 0.6260075569152832, "eval_rewards/accuracies": 0.6421582102775574, "eval_rewards/chosen": -0.7357184886932373, "eval_rewards/margins": 0.41724127531051636, "eval_rewards/rejected": -1.1529598236083984, "eval_runtime": 396.851, "eval_samples_per_second": 18.027, "eval_steps_per_second": 9.013, "step": 6800 }, { "epoch": 1.586488060570763, "grad_norm": 10.387283325195312, "learning_rate": 4.592880258899677e-06, "logits/chosen": -4.342694282531738, "logits/rejected": -4.3356614112854, "logps/chosen": -735.6070556640625, "logps/rejected": -787.474609375, "loss": 0.7371, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5960084795951843, "rewards/margins": 0.5805218815803528, "rewards/rejected": -1.176530361175537, "step": 6810 }, { "epoch": 1.5888177052999417, "grad_norm": 6.978819370269775, "learning_rate": 4.5669902912621365e-06, "logits/chosen": -4.330822944641113, "logits/rejected": -4.264742851257324, "logps/chosen": -742.8681640625, "logps/rejected": -699.9263305664062, "loss": 0.7347, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5250498056411743, "rewards/margins": 0.4952210783958435, "rewards/rejected": -1.0202710628509521, "step": 6820 }, { "epoch": 1.5911473500291207, "grad_norm": 4.977511405944824, "learning_rate": 4.541100323624596e-06, "logits/chosen": -4.301724433898926, "logits/rejected": -4.31328821182251, "logps/chosen": -704.4278564453125, "logps/rejected": -805.1658935546875, "loss": 0.5991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6408155560493469, "rewards/margins": 0.7181216478347778, "rewards/rejected": -1.3589370250701904, "step": 6830 }, { "epoch": 1.5934769947582994, "grad_norm": 7.5867180824279785, "learning_rate": 4.515210355987055e-06, "logits/chosen": -4.225648880004883, "logits/rejected": -4.192324161529541, "logps/chosen": -701.4697265625, "logps/rejected": -730.7188720703125, "loss": 0.7539, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7625960111618042, "rewards/margins": 0.5121796131134033, "rewards/rejected": -1.274775743484497, "step": 6840 }, { "epoch": 1.5958066394874781, "grad_norm": 8.60496997833252, "learning_rate": 4.489320388349515e-06, "logits/chosen": -4.283195495605469, "logits/rejected": -4.2724738121032715, "logps/chosen": -681.2093505859375, "logps/rejected": -659.5936279296875, "loss": 0.7666, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7397498488426208, "rewards/margins": 0.3033349812030792, "rewards/rejected": -1.0430848598480225, "step": 6850 }, { "epoch": 1.598136284216657, "grad_norm": 8.380082130432129, "learning_rate": 4.4634304207119745e-06, "logits/chosen": -4.3215131759643555, "logits/rejected": -4.331038475036621, "logps/chosen": -709.4156494140625, "logps/rejected": -726.0703125, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -0.5304920673370361, "rewards/margins": 0.7836783528327942, "rewards/rejected": -1.3141703605651855, "step": 6860 }, { "epoch": 1.6004659289458356, "grad_norm": 7.335846424102783, "learning_rate": 4.437540453074434e-06, "logits/chosen": -4.3939008712768555, "logits/rejected": -4.39631986618042, "logps/chosen": -753.7726440429688, "logps/rejected": -834.2058715820312, "loss": 0.7744, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8983514904975891, "rewards/margins": 0.40852493047714233, "rewards/rejected": -1.306876540184021, "step": 6870 }, { "epoch": 1.6027955736750146, "grad_norm": 9.488310813903809, "learning_rate": 4.411650485436894e-06, "logits/chosen": -4.334609031677246, "logits/rejected": -4.30482816696167, "logps/chosen": -828.6798706054688, "logps/rejected": -817.4603271484375, "loss": 0.8288, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7392042875289917, "rewards/margins": 0.388349711894989, "rewards/rejected": -1.1275540590286255, "step": 6880 }, { "epoch": 1.6051252184041933, "grad_norm": 7.569668769836426, "learning_rate": 4.385760517799353e-06, "logits/chosen": -4.326897621154785, "logits/rejected": -4.352515697479248, "logps/chosen": -705.760498046875, "logps/rejected": -750.1553955078125, "loss": 0.86, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9272357225418091, "rewards/margins": 0.24375076591968536, "rewards/rejected": -1.170986294746399, "step": 6890 }, { "epoch": 1.607454863133372, "grad_norm": 6.373630046844482, "learning_rate": 4.359870550161813e-06, "logits/chosen": -4.288449764251709, "logits/rejected": -4.232588768005371, "logps/chosen": -704.5266723632812, "logps/rejected": -696.7781982421875, "loss": 0.7887, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7161403298377991, "rewards/margins": 0.3344431519508362, "rewards/rejected": -1.0505833625793457, "step": 6900 }, { "epoch": 1.607454863133372, "eval_logits/chosen": -4.281885623931885, "eval_logits/rejected": -4.268438816070557, "eval_logps/chosen": -698.2005004882812, "eval_logps/rejected": -718.3382568359375, "eval_loss": 0.6262277960777283, "eval_rewards/accuracies": 0.6422980427742004, "eval_rewards/chosen": -0.7372271418571472, "eval_rewards/margins": 0.4167364835739136, "eval_rewards/rejected": -1.1539636850357056, "eval_runtime": 397.1792, "eval_samples_per_second": 18.012, "eval_steps_per_second": 9.006, "step": 6900 }, { "epoch": 1.609784507862551, "grad_norm": 4.869718074798584, "learning_rate": 4.333980582524272e-06, "logits/chosen": -4.281427383422852, "logits/rejected": -4.302354335784912, "logps/chosen": -655.8043823242188, "logps/rejected": -685.4343872070312, "loss": 0.8487, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.6849377155303955, "rewards/margins": 0.14450839161872864, "rewards/rejected": -0.8294461369514465, "step": 6910 }, { "epoch": 1.6121141525917297, "grad_norm": 10.79990291595459, "learning_rate": 4.308090614886732e-06, "logits/chosen": -4.325753688812256, "logits/rejected": -4.275383472442627, "logps/chosen": -808.7792358398438, "logps/rejected": -778.7926025390625, "loss": 0.6496, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6239713430404663, "rewards/margins": 0.6055768132209778, "rewards/rejected": -1.2295480966567993, "step": 6920 }, { "epoch": 1.6144437973209085, "grad_norm": 12.391186714172363, "learning_rate": 4.282200647249191e-06, "logits/chosen": -4.316169738769531, "logits/rejected": -4.227480888366699, "logps/chosen": -763.8555297851562, "logps/rejected": -705.6220703125, "loss": 0.7934, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7102519273757935, "rewards/margins": 0.34832531213760376, "rewards/rejected": -1.058577299118042, "step": 6930 }, { "epoch": 1.6167734420500874, "grad_norm": 11.898706436157227, "learning_rate": 4.256310679611651e-06, "logits/chosen": -4.290870189666748, "logits/rejected": -4.266106605529785, "logps/chosen": -713.439208984375, "logps/rejected": -743.7091064453125, "loss": 0.8143, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.699730396270752, "rewards/margins": 0.231022909283638, "rewards/rejected": -0.9307533502578735, "step": 6940 }, { "epoch": 1.6191030867792662, "grad_norm": 6.399176597595215, "learning_rate": 4.23042071197411e-06, "logits/chosen": -4.361333847045898, "logits/rejected": -4.301661491394043, "logps/chosen": -744.58544921875, "logps/rejected": -732.2003784179688, "loss": 0.771, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7607227563858032, "rewards/margins": 0.3533969521522522, "rewards/rejected": -1.1141196489334106, "step": 6950 }, { "epoch": 1.621432731508445, "grad_norm": 9.512768745422363, "learning_rate": 4.20453074433657e-06, "logits/chosen": -4.269883632659912, "logits/rejected": -4.253101825714111, "logps/chosen": -680.302734375, "logps/rejected": -736.2039794921875, "loss": 0.7223, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5698976516723633, "rewards/margins": 0.6293027997016907, "rewards/rejected": -1.1992003917694092, "step": 6960 }, { "epoch": 1.6237623762376239, "grad_norm": 9.72691822052002, "learning_rate": 4.17864077669903e-06, "logits/chosen": -4.406013488769531, "logits/rejected": -4.377242565155029, "logps/chosen": -785.4564208984375, "logps/rejected": -820.7072143554688, "loss": 0.8597, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7639799118041992, "rewards/margins": 0.27192139625549316, "rewards/rejected": -1.0359013080596924, "step": 6970 }, { "epoch": 1.6260920209668026, "grad_norm": 6.968569755554199, "learning_rate": 4.152750809061489e-06, "logits/chosen": -4.263640403747559, "logits/rejected": -4.292579650878906, "logps/chosen": -717.577880859375, "logps/rejected": -740.34375, "loss": 0.6937, "rewards/accuracies": 0.625, "rewards/chosen": -0.5662204623222351, "rewards/margins": 0.4708368182182312, "rewards/rejected": -1.0370572805404663, "step": 6980 }, { "epoch": 1.6284216656959813, "grad_norm": 4.735540866851807, "learning_rate": 4.1268608414239484e-06, "logits/chosen": -4.283355236053467, "logits/rejected": -4.369661331176758, "logps/chosen": -646.6312866210938, "logps/rejected": -711.9313354492188, "loss": 0.5925, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5355406999588013, "rewards/margins": 0.7102679014205933, "rewards/rejected": -1.245808720588684, "step": 6990 }, { "epoch": 1.6307513104251603, "grad_norm": 9.391236305236816, "learning_rate": 4.100970873786408e-06, "logits/chosen": -4.427331447601318, "logits/rejected": -4.339338779449463, "logps/chosen": -737.71728515625, "logps/rejected": -694.397705078125, "loss": 0.8044, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6445012092590332, "rewards/margins": 0.38664406538009644, "rewards/rejected": -1.0311453342437744, "step": 7000 }, { "epoch": 1.6307513104251603, "eval_logits/chosen": -4.281728744506836, "eval_logits/rejected": -4.268459320068359, "eval_logps/chosen": -698.0704956054688, "eval_logps/rejected": -718.185546875, "eval_loss": 0.6258257031440735, "eval_rewards/accuracies": 0.6424378156661987, "eval_rewards/chosen": -0.7242283225059509, "eval_rewards/margins": 0.4144580364227295, "eval_rewards/rejected": -1.1386864185333252, "eval_runtime": 398.3664, "eval_samples_per_second": 17.958, "eval_steps_per_second": 8.979, "step": 7000 }, { "epoch": 1.6330809551543388, "grad_norm": 6.797269344329834, "learning_rate": 4.075080906148868e-06, "logits/chosen": -4.291042327880859, "logits/rejected": -4.331503391265869, "logps/chosen": -685.22900390625, "logps/rejected": -719.4610595703125, "loss": 0.8001, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6552774310112, "rewards/margins": 0.33681371808052063, "rewards/rejected": -0.9920912981033325, "step": 7010 }, { "epoch": 1.6354105998835178, "grad_norm": 7.896794319152832, "learning_rate": 4.049190938511327e-06, "logits/chosen": -4.361865043640137, "logits/rejected": -4.35894775390625, "logps/chosen": -734.9173583984375, "logps/rejected": -729.639892578125, "loss": 0.7177, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.46472835540771484, "rewards/margins": 0.5228708982467651, "rewards/rejected": -0.9875991940498352, "step": 7020 }, { "epoch": 1.6377402446126965, "grad_norm": 9.912988662719727, "learning_rate": 4.0233009708737865e-06, "logits/chosen": -4.3807172775268555, "logits/rejected": -4.355521202087402, "logps/chosen": -731.7893676757812, "logps/rejected": -743.5462646484375, "loss": 0.769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4980197846889496, "rewards/margins": 0.5245866775512695, "rewards/rejected": -1.0226064920425415, "step": 7030 }, { "epoch": 1.6400698893418753, "grad_norm": 8.826336860656738, "learning_rate": 3.997411003236246e-06, "logits/chosen": -4.4199957847595215, "logits/rejected": -4.296575546264648, "logps/chosen": -777.4635009765625, "logps/rejected": -732.2686767578125, "loss": 0.7529, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5620446801185608, "rewards/margins": 0.4427622854709625, "rewards/rejected": -1.0048068761825562, "step": 7040 }, { "epoch": 1.6423995340710542, "grad_norm": 4.805436611175537, "learning_rate": 3.971521035598706e-06, "logits/chosen": -4.415337562561035, "logits/rejected": -4.307534217834473, "logps/chosen": -772.3611450195312, "logps/rejected": -743.7156982421875, "loss": 0.8022, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6573207378387451, "rewards/margins": 0.4766790270805359, "rewards/rejected": -1.1339998245239258, "step": 7050 }, { "epoch": 1.644729178800233, "grad_norm": 8.641572952270508, "learning_rate": 3.945631067961166e-06, "logits/chosen": -4.261378288269043, "logits/rejected": -4.3650617599487305, "logps/chosen": -692.762451171875, "logps/rejected": -797.03271484375, "loss": 0.7324, "rewards/accuracies": 0.625, "rewards/chosen": -0.6822121143341064, "rewards/margins": 0.6177128553390503, "rewards/rejected": -1.2999250888824463, "step": 7060 }, { "epoch": 1.6470588235294117, "grad_norm": 7.119178295135498, "learning_rate": 3.9197411003236245e-06, "logits/chosen": -4.267707824707031, "logits/rejected": -4.342516899108887, "logps/chosen": -768.018310546875, "logps/rejected": -775.420166015625, "loss": 0.6835, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6481823325157166, "rewards/margins": 0.6533300280570984, "rewards/rejected": -1.3015124797821045, "step": 7070 }, { "epoch": 1.6493884682585906, "grad_norm": 5.9756035804748535, "learning_rate": 3.893851132686084e-06, "logits/chosen": -4.280825614929199, "logits/rejected": -4.224474906921387, "logps/chosen": -766.645263671875, "logps/rejected": -708.4898681640625, "loss": 0.7056, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6119120717048645, "rewards/margins": 0.5710783004760742, "rewards/rejected": -1.182990312576294, "step": 7080 }, { "epoch": 1.6517181129877694, "grad_norm": 7.022614479064941, "learning_rate": 3.867961165048544e-06, "logits/chosen": -4.209417819976807, "logits/rejected": -4.280752182006836, "logps/chosen": -692.9827880859375, "logps/rejected": -725.9755859375, "loss": 0.9008, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8311317563056946, "rewards/margins": 0.08656143397092819, "rewards/rejected": -0.9176931381225586, "step": 7090 }, { "epoch": 1.6540477577169481, "grad_norm": 10.223764419555664, "learning_rate": 3.842071197411004e-06, "logits/chosen": -4.297172546386719, "logits/rejected": -4.260180950164795, "logps/chosen": -718.2964477539062, "logps/rejected": -706.2061767578125, "loss": 0.7227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6614014506340027, "rewards/margins": 0.4238288998603821, "rewards/rejected": -1.0852304697036743, "step": 7100 }, { "epoch": 1.6540477577169481, "eval_logits/chosen": -4.282748699188232, "eval_logits/rejected": -4.269398212432861, "eval_logps/chosen": -698.1327514648438, "eval_logps/rejected": -718.2734985351562, "eval_loss": 0.6262640357017517, "eval_rewards/accuracies": 0.6439753770828247, "eval_rewards/chosen": -0.7304497957229614, "eval_rewards/margins": 0.4170399308204651, "eval_rewards/rejected": -1.1474899053573608, "eval_runtime": 397.9053, "eval_samples_per_second": 17.979, "eval_steps_per_second": 8.99, "step": 7100 }, { "epoch": 1.656377402446127, "grad_norm": 9.430929183959961, "learning_rate": 3.816181229773463e-06, "logits/chosen": -4.280282974243164, "logits/rejected": -4.2974395751953125, "logps/chosen": -671.2548828125, "logps/rejected": -727.61474609375, "loss": 0.8933, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.820625901222229, "rewards/margins": 0.22736160457134247, "rewards/rejected": -1.0479875802993774, "step": 7110 }, { "epoch": 1.6587070471753058, "grad_norm": 8.540874481201172, "learning_rate": 3.7902912621359228e-06, "logits/chosen": -4.318397045135498, "logits/rejected": -4.397231578826904, "logps/chosen": -662.1047973632812, "logps/rejected": -736.2491455078125, "loss": 0.589, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.46827468276023865, "rewards/margins": 0.6185547113418579, "rewards/rejected": -1.0868293046951294, "step": 7120 }, { "epoch": 1.6610366919044846, "grad_norm": 5.626376628875732, "learning_rate": 3.764401294498382e-06, "logits/chosen": -4.4004225730896, "logits/rejected": -4.287680625915527, "logps/chosen": -790.2825317382812, "logps/rejected": -747.4461669921875, "loss": 0.739, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6833652853965759, "rewards/margins": 0.49871358275413513, "rewards/rejected": -1.1820788383483887, "step": 7130 }, { "epoch": 1.6633663366336635, "grad_norm": 9.346349716186523, "learning_rate": 3.7385113268608418e-06, "logits/chosen": -4.3877363204956055, "logits/rejected": -4.267838478088379, "logps/chosen": -756.852783203125, "logps/rejected": -692.7822875976562, "loss": 0.8934, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7731833457946777, "rewards/margins": -0.02572050131857395, "rewards/rejected": -0.7474628686904907, "step": 7140 }, { "epoch": 1.665695981362842, "grad_norm": 6.2495856285095215, "learning_rate": 3.712621359223301e-06, "logits/chosen": -4.388154029846191, "logits/rejected": -4.3741984367370605, "logps/chosen": -692.0726318359375, "logps/rejected": -711.5775146484375, "loss": 0.6488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.49499768018722534, "rewards/margins": 0.555439829826355, "rewards/rejected": -1.050437569618225, "step": 7150 }, { "epoch": 1.668025626092021, "grad_norm": 9.950132369995117, "learning_rate": 3.6867313915857604e-06, "logits/chosen": -4.390973091125488, "logits/rejected": -4.378559112548828, "logps/chosen": -736.49951171875, "logps/rejected": -717.4456176757812, "loss": 0.9051, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7832115292549133, "rewards/margins": -0.003316342830657959, "rewards/rejected": -0.7798951864242554, "step": 7160 }, { "epoch": 1.6703552708211997, "grad_norm": 8.273674964904785, "learning_rate": 3.6608414239482205e-06, "logits/chosen": -4.308285713195801, "logits/rejected": -4.307774543762207, "logps/chosen": -732.8416137695312, "logps/rejected": -751.5208740234375, "loss": 0.8783, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9038097262382507, "rewards/margins": 0.08503206819295883, "rewards/rejected": -0.9888418316841125, "step": 7170 }, { "epoch": 1.6726849155503785, "grad_norm": 6.736724376678467, "learning_rate": 3.63495145631068e-06, "logits/chosen": -4.277791976928711, "logits/rejected": -4.248794078826904, "logps/chosen": -731.4321899414062, "logps/rejected": -688.3527221679688, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6009598970413208, "rewards/margins": 0.6401046514511108, "rewards/rejected": -1.241064429283142, "step": 7180 }, { "epoch": 1.6750145602795574, "grad_norm": 6.992065906524658, "learning_rate": 3.6090614886731396e-06, "logits/chosen": -4.259005069732666, "logits/rejected": -4.273360729217529, "logps/chosen": -694.7288818359375, "logps/rejected": -755.7852783203125, "loss": 0.7376, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6417701244354248, "rewards/margins": 0.5308027267456055, "rewards/rejected": -1.1725728511810303, "step": 7190 }, { "epoch": 1.6773442050087362, "grad_norm": 10.378262519836426, "learning_rate": 3.583171521035599e-06, "logits/chosen": -4.214404106140137, "logits/rejected": -4.259570121765137, "logps/chosen": -692.5252685546875, "logps/rejected": -770.9401245117188, "loss": 0.6745, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.46339020133018494, "rewards/margins": 0.5105647444725037, "rewards/rejected": -0.9739548563957214, "step": 7200 }, { "epoch": 1.6773442050087362, "eval_logits/chosen": -4.2833170890808105, "eval_logits/rejected": -4.27020788192749, "eval_logps/chosen": -698.0100708007812, "eval_logps/rejected": -718.1314697265625, "eval_loss": 0.6253830194473267, "eval_rewards/accuracies": 0.6422980427742004, "eval_rewards/chosen": -0.7181804180145264, "eval_rewards/margins": 0.41510212421417236, "eval_rewards/rejected": -1.1332825422286987, "eval_runtime": 397.8645, "eval_samples_per_second": 17.981, "eval_steps_per_second": 8.99, "step": 7200 }, { "epoch": 1.679673849737915, "grad_norm": 10.800102233886719, "learning_rate": 3.5572815533980586e-06, "logits/chosen": -4.311060905456543, "logits/rejected": -4.251940727233887, "logps/chosen": -732.6629638671875, "logps/rejected": -724.3796997070312, "loss": 0.7655, "rewards/accuracies": 0.625, "rewards/chosen": -0.6294369101524353, "rewards/margins": 0.5156153440475464, "rewards/rejected": -1.1450523138046265, "step": 7210 }, { "epoch": 1.6820034944670939, "grad_norm": 10.474735260009766, "learning_rate": 3.531391585760518e-06, "logits/chosen": -4.366720676422119, "logits/rejected": -4.3180999755859375, "logps/chosen": -747.0120239257812, "logps/rejected": -736.7986450195312, "loss": 0.8329, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5397108793258667, "rewards/margins": 0.27118149399757385, "rewards/rejected": -0.8108924627304077, "step": 7220 }, { "epoch": 1.6843331391962726, "grad_norm": 9.822484016418457, "learning_rate": 3.505501618122978e-06, "logits/chosen": -4.2457780838012695, "logits/rejected": -4.331841945648193, "logps/chosen": -668.4440307617188, "logps/rejected": -744.3336181640625, "loss": 0.6862, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6030920147895813, "rewards/margins": 0.6354323625564575, "rewards/rejected": -1.238524317741394, "step": 7230 }, { "epoch": 1.6866627839254513, "grad_norm": 9.211170196533203, "learning_rate": 3.4796116504854374e-06, "logits/chosen": -4.304722785949707, "logits/rejected": -4.387798309326172, "logps/chosen": -735.7977294921875, "logps/rejected": -770.9636840820312, "loss": 0.8774, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.776853084564209, "rewards/margins": 0.18464542925357819, "rewards/rejected": -0.9614984393119812, "step": 7240 }, { "epoch": 1.6889924286546303, "grad_norm": 11.63446044921875, "learning_rate": 3.453721682847897e-06, "logits/chosen": -4.30993127822876, "logits/rejected": -4.366557598114014, "logps/chosen": -655.0383911132812, "logps/rejected": -833.9552612304688, "loss": 0.7086, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5186705589294434, "rewards/margins": 0.5775880813598633, "rewards/rejected": -1.0962587594985962, "step": 7250 }, { "epoch": 1.691322073383809, "grad_norm": 10.69422721862793, "learning_rate": 3.4278317152103564e-06, "logits/chosen": -4.38188362121582, "logits/rejected": -4.391608238220215, "logps/chosen": -738.8275756835938, "logps/rejected": -759.1577758789062, "loss": 0.9575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.0043985843658447, "rewards/margins": 0.1158156618475914, "rewards/rejected": -1.1202142238616943, "step": 7260 }, { "epoch": 1.6936517181129878, "grad_norm": 7.140145778656006, "learning_rate": 3.4019417475728157e-06, "logits/chosen": -4.351529598236084, "logits/rejected": -4.38330602645874, "logps/chosen": -733.5669555664062, "logps/rejected": -773.0108642578125, "loss": 0.6557, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6187713146209717, "rewards/margins": 0.7960120439529419, "rewards/rejected": -1.4147834777832031, "step": 7270 }, { "epoch": 1.6959813628421667, "grad_norm": 8.086371421813965, "learning_rate": 3.3760517799352754e-06, "logits/chosen": -4.3373541831970215, "logits/rejected": -4.3250932693481445, "logps/chosen": -676.114013671875, "logps/rejected": -698.8336181640625, "loss": 0.7978, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7031859159469604, "rewards/margins": 0.3312601149082184, "rewards/rejected": -1.034446120262146, "step": 7280 }, { "epoch": 1.6983110075713452, "grad_norm": 4.94987154006958, "learning_rate": 3.3501618122977347e-06, "logits/chosen": -4.358597755432129, "logits/rejected": -4.308444023132324, "logps/chosen": -703.7459716796875, "logps/rejected": -711.6162109375, "loss": 0.7399, "rewards/accuracies": 0.625, "rewards/chosen": -0.640599250793457, "rewards/margins": 0.48218145966529846, "rewards/rejected": -1.122780680656433, "step": 7290 }, { "epoch": 1.7006406523005242, "grad_norm": 8.307604789733887, "learning_rate": 3.3242718446601944e-06, "logits/chosen": -4.2530412673950195, "logits/rejected": -4.3351922035217285, "logps/chosen": -679.9979858398438, "logps/rejected": -757.0491943359375, "loss": 0.7378, "rewards/accuracies": 0.625, "rewards/chosen": -0.6946650743484497, "rewards/margins": 0.634982705116272, "rewards/rejected": -1.3296477794647217, "step": 7300 }, { "epoch": 1.7006406523005242, "eval_logits/chosen": -4.2809062004089355, "eval_logits/rejected": -4.267802715301514, "eval_logps/chosen": -698.1642456054688, "eval_logps/rejected": -718.3338012695312, "eval_loss": 0.6260935664176941, "eval_rewards/accuracies": 0.6396421790122986, "eval_rewards/chosen": -0.7335991263389587, "eval_rewards/margins": 0.41992077231407166, "eval_rewards/rejected": -1.153519868850708, "eval_runtime": 398.5397, "eval_samples_per_second": 17.951, "eval_steps_per_second": 8.975, "step": 7300 }, { "epoch": 1.702970297029703, "grad_norm": 5.429368495941162, "learning_rate": 3.2983818770226537e-06, "logits/chosen": -4.3033037185668945, "logits/rejected": -4.257104396820068, "logps/chosen": -705.1695556640625, "logps/rejected": -694.6905517578125, "loss": 0.6325, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6281101703643799, "rewards/margins": 0.6659101247787476, "rewards/rejected": -1.294020414352417, "step": 7310 }, { "epoch": 1.7052999417588817, "grad_norm": 10.893732070922852, "learning_rate": 3.272491909385114e-06, "logits/chosen": -4.365532875061035, "logits/rejected": -4.285029411315918, "logps/chosen": -743.0709228515625, "logps/rejected": -727.3438720703125, "loss": 0.7859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7658766508102417, "rewards/margins": 0.39953237771987915, "rewards/rejected": -1.165408968925476, "step": 7320 }, { "epoch": 1.7076295864880606, "grad_norm": 7.524363994598389, "learning_rate": 3.246601941747573e-06, "logits/chosen": -4.205938339233398, "logits/rejected": -4.287212371826172, "logps/chosen": -620.0014038085938, "logps/rejected": -696.42529296875, "loss": 0.6766, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5650202631950378, "rewards/margins": 0.5094878077507019, "rewards/rejected": -1.0745080709457397, "step": 7330 }, { "epoch": 1.7099592312172394, "grad_norm": 7.964355945587158, "learning_rate": 3.220711974110033e-06, "logits/chosen": -4.312127590179443, "logits/rejected": -4.2499098777771, "logps/chosen": -713.2406005859375, "logps/rejected": -692.626953125, "loss": 0.825, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8016546964645386, "rewards/margins": 0.2324962317943573, "rewards/rejected": -1.0341508388519287, "step": 7340 }, { "epoch": 1.712288875946418, "grad_norm": 10.5994291305542, "learning_rate": 3.1948220064724922e-06, "logits/chosen": -4.278526306152344, "logits/rejected": -4.249955654144287, "logps/chosen": -762.7586059570312, "logps/rejected": -740.9987182617188, "loss": 0.8548, "rewards/accuracies": 0.5625, "rewards/chosen": -0.647429883480072, "rewards/margins": 0.18259665369987488, "rewards/rejected": -0.8300265073776245, "step": 7350 }, { "epoch": 1.714618520675597, "grad_norm": 9.083148956298828, "learning_rate": 3.1689320388349515e-06, "logits/chosen": -4.3152174949646, "logits/rejected": -4.333291053771973, "logps/chosen": -673.6456298828125, "logps/rejected": -751.5562133789062, "loss": 0.7587, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7572881579399109, "rewards/margins": 0.3388776183128357, "rewards/rejected": -1.0961657762527466, "step": 7360 }, { "epoch": 1.7169481654047758, "grad_norm": 8.386931419372559, "learning_rate": 3.1430420711974113e-06, "logits/chosen": -4.2550249099731445, "logits/rejected": -4.318543434143066, "logps/chosen": -631.4505615234375, "logps/rejected": -726.1788940429688, "loss": 0.6494, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7165141105651855, "rewards/margins": 0.6534533500671387, "rewards/rejected": -1.3699674606323242, "step": 7370 }, { "epoch": 1.7192778101339545, "grad_norm": 7.15736722946167, "learning_rate": 3.1171521035598706e-06, "logits/chosen": -4.390912055969238, "logits/rejected": -4.2898688316345215, "logps/chosen": -764.761474609375, "logps/rejected": -714.9124145507812, "loss": 0.6744, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.49627989530563354, "rewards/margins": 0.760360836982727, "rewards/rejected": -1.256640911102295, "step": 7380 }, { "epoch": 1.7216074548631335, "grad_norm": 8.566767692565918, "learning_rate": 3.0912621359223303e-06, "logits/chosen": -4.31394100189209, "logits/rejected": -4.317699432373047, "logps/chosen": -742.6536865234375, "logps/rejected": -742.4452514648438, "loss": 0.8337, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9110280871391296, "rewards/margins": 0.31222930550575256, "rewards/rejected": -1.2232574224472046, "step": 7390 }, { "epoch": 1.7239370995923122, "grad_norm": 10.37869644165039, "learning_rate": 3.0653721682847896e-06, "logits/chosen": -4.3188958168029785, "logits/rejected": -4.317343711853027, "logps/chosen": -710.0857543945312, "logps/rejected": -701.8760986328125, "loss": 0.6513, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6056315302848816, "rewards/margins": 0.6209887862205505, "rewards/rejected": -1.2266203165054321, "step": 7400 }, { "epoch": 1.7239370995923122, "eval_logits/chosen": -4.277156352996826, "eval_logits/rejected": -4.264052867889404, "eval_logps/chosen": -698.2481079101562, "eval_logps/rejected": -718.45166015625, "eval_loss": 0.62615567445755, "eval_rewards/accuracies": 0.6424378156661987, "eval_rewards/chosen": -0.7419866919517517, "eval_rewards/margins": 0.42331522703170776, "eval_rewards/rejected": -1.1653019189834595, "eval_runtime": 398.4685, "eval_samples_per_second": 17.954, "eval_steps_per_second": 8.977, "step": 7400 }, { "epoch": 1.726266744321491, "grad_norm": 7.4377665519714355, "learning_rate": 3.0394822006472497e-06, "logits/chosen": -4.296648979187012, "logits/rejected": -4.405775547027588, "logps/chosen": -651.9999389648438, "logps/rejected": -759.31640625, "loss": 0.7337, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7278409004211426, "rewards/margins": 0.5400255918502808, "rewards/rejected": -1.267866611480713, "step": 7410 }, { "epoch": 1.72859638905067, "grad_norm": 7.177879810333252, "learning_rate": 3.013592233009709e-06, "logits/chosen": -4.278170585632324, "logits/rejected": -4.323129653930664, "logps/chosen": -729.5637817382812, "logps/rejected": -774.7483520507812, "loss": 0.7374, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7349106669425964, "rewards/margins": 0.5853937864303589, "rewards/rejected": -1.3203043937683105, "step": 7420 }, { "epoch": 1.7309260337798484, "grad_norm": 7.8414154052734375, "learning_rate": 2.9877022653721688e-06, "logits/chosen": -4.2496867179870605, "logits/rejected": -4.125625133514404, "logps/chosen": -684.25439453125, "logps/rejected": -632.718017578125, "loss": 0.9269, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7515495419502258, "rewards/margins": 0.010830598883330822, "rewards/rejected": -0.7623801231384277, "step": 7430 }, { "epoch": 1.7332556785090274, "grad_norm": 6.723340034484863, "learning_rate": 2.961812297734628e-06, "logits/chosen": -4.267904281616211, "logits/rejected": -4.327345848083496, "logps/chosen": -729.6060180664062, "logps/rejected": -797.0587768554688, "loss": 0.706, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7860048413276672, "rewards/margins": 0.5039297342300415, "rewards/rejected": -1.289934515953064, "step": 7440 }, { "epoch": 1.7355853232382061, "grad_norm": 9.484472274780273, "learning_rate": 2.9359223300970874e-06, "logits/chosen": -4.306872844696045, "logits/rejected": -4.2969560623168945, "logps/chosen": -731.2567138671875, "logps/rejected": -699.7662963867188, "loss": 0.7022, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4795847535133362, "rewards/margins": 0.6933883428573608, "rewards/rejected": -1.1729731559753418, "step": 7450 }, { "epoch": 1.7379149679673849, "grad_norm": 8.210362434387207, "learning_rate": 2.910032362459547e-06, "logits/chosen": -4.308070659637451, "logits/rejected": -4.306254863739014, "logps/chosen": -738.3827514648438, "logps/rejected": -738.0606689453125, "loss": 0.8305, "rewards/accuracies": 0.625, "rewards/chosen": -0.6364607810974121, "rewards/margins": 0.3409319221973419, "rewards/rejected": -0.9773927927017212, "step": 7460 }, { "epoch": 1.7402446126965638, "grad_norm": 6.760463237762451, "learning_rate": 2.8841423948220064e-06, "logits/chosen": -4.3071770668029785, "logits/rejected": -4.340117454528809, "logps/chosen": -691.4385375976562, "logps/rejected": -691.5602416992188, "loss": 0.7247, "rewards/accuracies": 0.625, "rewards/chosen": -0.44640105962753296, "rewards/margins": 0.4229421615600586, "rewards/rejected": -0.8693434000015259, "step": 7470 }, { "epoch": 1.7425742574257426, "grad_norm": 8.015266418457031, "learning_rate": 2.8582524271844665e-06, "logits/chosen": -4.395244598388672, "logits/rejected": -4.3816423416137695, "logps/chosen": -722.7725830078125, "logps/rejected": -766.9971923828125, "loss": 0.6531, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49877268075942993, "rewards/margins": 0.8005030751228333, "rewards/rejected": -1.2992757558822632, "step": 7480 }, { "epoch": 1.7449039021549213, "grad_norm": 5.885158538818359, "learning_rate": 2.832362459546926e-06, "logits/chosen": -4.314398765563965, "logits/rejected": -4.305683135986328, "logps/chosen": -717.0361328125, "logps/rejected": -767.6456909179688, "loss": 0.7443, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6509785652160645, "rewards/margins": 0.6104000806808472, "rewards/rejected": -1.2613786458969116, "step": 7490 }, { "epoch": 1.7472335468841003, "grad_norm": 9.181596755981445, "learning_rate": 2.8064724919093856e-06, "logits/chosen": -4.307205677032471, "logits/rejected": -4.181800842285156, "logps/chosen": -753.4910888671875, "logps/rejected": -698.2742309570312, "loss": 0.8344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.74730384349823, "rewards/margins": 0.28025683760643005, "rewards/rejected": -1.0275605916976929, "step": 7500 }, { "epoch": 1.7472335468841003, "eval_logits/chosen": -4.275453567504883, "eval_logits/rejected": -4.262080669403076, "eval_logps/chosen": -698.1603393554688, "eval_logps/rejected": -718.3576049804688, "eval_loss": 0.6255109310150146, "eval_rewards/accuracies": 0.6420184373855591, "eval_rewards/chosen": -0.7332110404968262, "eval_rewards/margins": 0.4226882755756378, "eval_rewards/rejected": -1.1558992862701416, "eval_runtime": 398.5378, "eval_samples_per_second": 17.951, "eval_steps_per_second": 8.975, "step": 7500 }, { "epoch": 1.749563191613279, "grad_norm": 9.900083541870117, "learning_rate": 2.780582524271845e-06, "logits/chosen": -4.25106143951416, "logits/rejected": -4.330036640167236, "logps/chosen": -697.1131591796875, "logps/rejected": -813.1942138671875, "loss": 0.8211, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.755881130695343, "rewards/margins": 0.3740493059158325, "rewards/rejected": -1.1299302577972412, "step": 7510 }, { "epoch": 1.7518928363424577, "grad_norm": 5.772104263305664, "learning_rate": 2.7546925566343046e-06, "logits/chosen": -4.359757423400879, "logits/rejected": -4.24202299118042, "logps/chosen": -744.2601318359375, "logps/rejected": -661.6866455078125, "loss": 0.9188, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.7953876852989197, "rewards/margins": 0.12192598730325699, "rewards/rejected": -0.9173136949539185, "step": 7520 }, { "epoch": 1.7542224810716367, "grad_norm": 7.5138258934021, "learning_rate": 2.728802588996764e-06, "logits/chosen": -4.272562503814697, "logits/rejected": -4.372793674468994, "logps/chosen": -690.07763671875, "logps/rejected": -707.4510498046875, "loss": 0.6859, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6146534085273743, "rewards/margins": 0.5785967111587524, "rewards/rejected": -1.1932501792907715, "step": 7530 }, { "epoch": 1.7565521258008152, "grad_norm": 9.83190631866455, "learning_rate": 2.702912621359223e-06, "logits/chosen": -4.3367462158203125, "logits/rejected": -4.351749897003174, "logps/chosen": -745.0218505859375, "logps/rejected": -784.8512573242188, "loss": 0.7967, "rewards/accuracies": 0.625, "rewards/chosen": -0.7240052819252014, "rewards/margins": 0.3044678866863251, "rewards/rejected": -1.028473138809204, "step": 7540 }, { "epoch": 1.7588817705299942, "grad_norm": 5.653636455535889, "learning_rate": 2.677022653721683e-06, "logits/chosen": -4.269984245300293, "logits/rejected": -4.253081321716309, "logps/chosen": -725.6463623046875, "logps/rejected": -775.1932373046875, "loss": 0.745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7236585021018982, "rewards/margins": 0.5556066036224365, "rewards/rejected": -1.2792651653289795, "step": 7550 }, { "epoch": 1.7612114152591731, "grad_norm": 8.759044647216797, "learning_rate": 2.6511326860841422e-06, "logits/chosen": -4.290041923522949, "logits/rejected": -4.351320266723633, "logps/chosen": -702.0689697265625, "logps/rejected": -783.780517578125, "loss": 0.6961, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7005491256713867, "rewards/margins": 0.575498104095459, "rewards/rejected": -1.2760472297668457, "step": 7560 }, { "epoch": 1.7635410599883516, "grad_norm": 9.059598922729492, "learning_rate": 2.6252427184466024e-06, "logits/chosen": -4.330138683319092, "logits/rejected": -4.341468811035156, "logps/chosen": -710.1514892578125, "logps/rejected": -788.5646362304688, "loss": 0.7336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6530278921127319, "rewards/margins": 0.44766393303871155, "rewards/rejected": -1.1006916761398315, "step": 7570 }, { "epoch": 1.7658707047175306, "grad_norm": 8.274618148803711, "learning_rate": 2.5993527508090617e-06, "logits/chosen": -4.33445930480957, "logits/rejected": -4.335292816162109, "logps/chosen": -665.8831176757812, "logps/rejected": -717.4337158203125, "loss": 0.7093, "rewards/accuracies": 0.625, "rewards/chosen": -0.7828375101089478, "rewards/margins": 0.5649530291557312, "rewards/rejected": -1.3477904796600342, "step": 7580 }, { "epoch": 1.7682003494467093, "grad_norm": 9.58355712890625, "learning_rate": 2.5734627831715214e-06, "logits/chosen": -4.344472408294678, "logits/rejected": -4.394615650177002, "logps/chosen": -649.8656616210938, "logps/rejected": -706.46435546875, "loss": 0.7247, "rewards/accuracies": 0.625, "rewards/chosen": -0.6893410086631775, "rewards/margins": 0.4330008029937744, "rewards/rejected": -1.1223418712615967, "step": 7590 }, { "epoch": 1.770529994175888, "grad_norm": 9.934365272521973, "learning_rate": 2.5475728155339807e-06, "logits/chosen": -4.332821846008301, "logits/rejected": -4.36389684677124, "logps/chosen": -700.098876953125, "logps/rejected": -759.6007690429688, "loss": 0.7871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6798760890960693, "rewards/margins": 0.39228060841560364, "rewards/rejected": -1.0721566677093506, "step": 7600 }, { "epoch": 1.770529994175888, "eval_logits/chosen": -4.275769233703613, "eval_logits/rejected": -4.262385368347168, "eval_logps/chosen": -698.19287109375, "eval_logps/rejected": -718.3925170898438, "eval_loss": 0.6256328225135803, "eval_rewards/accuracies": 0.642577588558197, "eval_rewards/chosen": -0.7364568114280701, "eval_rewards/margins": 0.4229297339916229, "eval_rewards/rejected": -1.1593865156173706, "eval_runtime": 399.4902, "eval_samples_per_second": 17.908, "eval_steps_per_second": 8.954, "step": 7600 }, { "epoch": 1.772859638905067, "grad_norm": 3.8265748023986816, "learning_rate": 2.5216828478964404e-06, "logits/chosen": -4.340038299560547, "logits/rejected": -4.323204040527344, "logps/chosen": -684.2075805664062, "logps/rejected": -782.0001220703125, "loss": 0.6874, "rewards/accuracies": 0.625, "rewards/chosen": -0.5509809255599976, "rewards/margins": 0.7414697408676147, "rewards/rejected": -1.2924506664276123, "step": 7610 }, { "epoch": 1.7751892836342458, "grad_norm": 8.738218307495117, "learning_rate": 2.4957928802588998e-06, "logits/chosen": -4.31033992767334, "logits/rejected": -4.3488922119140625, "logps/chosen": -703.8746948242188, "logps/rejected": -761.0021362304688, "loss": 0.7514, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7934512495994568, "rewards/margins": 0.521425724029541, "rewards/rejected": -1.314876914024353, "step": 7620 }, { "epoch": 1.7775189283634245, "grad_norm": 6.381500720977783, "learning_rate": 2.4699029126213595e-06, "logits/chosen": -4.314557075500488, "logits/rejected": -4.293426513671875, "logps/chosen": -685.4746704101562, "logps/rejected": -710.4097900390625, "loss": 0.7379, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48360586166381836, "rewards/margins": 0.5202947854995728, "rewards/rejected": -1.0039006471633911, "step": 7630 }, { "epoch": 1.7798485730926035, "grad_norm": 5.309506416320801, "learning_rate": 2.444012944983819e-06, "logits/chosen": -4.322248935699463, "logits/rejected": -4.2495527267456055, "logps/chosen": -675.5975952148438, "logps/rejected": -633.2434692382812, "loss": 0.9243, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7094519138336182, "rewards/margins": 0.12580251693725586, "rewards/rejected": -0.8352544903755188, "step": 7640 }, { "epoch": 1.7821782178217822, "grad_norm": 6.463253021240234, "learning_rate": 2.4181229773462785e-06, "logits/chosen": -4.193962097167969, "logits/rejected": -4.322009086608887, "logps/chosen": -679.2346801757812, "logps/rejected": -781.1619873046875, "loss": 0.7621, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6658368706703186, "rewards/margins": 0.421161025762558, "rewards/rejected": -1.0869977474212646, "step": 7650 }, { "epoch": 1.784507862550961, "grad_norm": 7.450245380401611, "learning_rate": 2.3922330097087382e-06, "logits/chosen": -4.363302707672119, "logits/rejected": -4.293172836303711, "logps/chosen": -732.4378051757812, "logps/rejected": -791.06884765625, "loss": 0.8789, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.0643038749694824, "rewards/margins": 0.19379642605781555, "rewards/rejected": -1.2581002712249756, "step": 7660 }, { "epoch": 1.78683750728014, "grad_norm": 9.34884262084961, "learning_rate": 2.3663430420711975e-06, "logits/chosen": -4.283385276794434, "logits/rejected": -4.333686351776123, "logps/chosen": -695.9212646484375, "logps/rejected": -753.2310791015625, "loss": 0.7264, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.694754958152771, "rewards/margins": 0.5363430380821228, "rewards/rejected": -1.231097936630249, "step": 7670 }, { "epoch": 1.7891671520093184, "grad_norm": 8.882437705993652, "learning_rate": 2.340453074433657e-06, "logits/chosen": -4.361789226531982, "logits/rejected": -4.341314792633057, "logps/chosen": -699.6898803710938, "logps/rejected": -758.737548828125, "loss": 0.7417, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7322936058044434, "rewards/margins": 0.5006028413772583, "rewards/rejected": -1.2328965663909912, "step": 7680 }, { "epoch": 1.7914967967384974, "grad_norm": 5.45563268661499, "learning_rate": 2.3145631067961166e-06, "logits/chosen": -4.283277988433838, "logits/rejected": -4.236169815063477, "logps/chosen": -698.2086181640625, "logps/rejected": -688.638671875, "loss": 0.6836, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6562092304229736, "rewards/margins": 0.5385984778404236, "rewards/rejected": -1.194807767868042, "step": 7690 }, { "epoch": 1.7938264414676763, "grad_norm": 10.295047760009766, "learning_rate": 2.2886731391585763e-06, "logits/chosen": -4.275271415710449, "logits/rejected": -4.330410003662109, "logps/chosen": -690.9629516601562, "logps/rejected": -742.60791015625, "loss": 0.775, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6221812963485718, "rewards/margins": 0.4292815327644348, "rewards/rejected": -1.0514627695083618, "step": 7700 }, { "epoch": 1.7938264414676763, "eval_logits/chosen": -4.2768354415893555, "eval_logits/rejected": -4.263593673706055, "eval_logps/chosen": -698.334228515625, "eval_logps/rejected": -718.576416015625, "eval_loss": 0.626298725605011, "eval_rewards/accuracies": 0.6427173614501953, "eval_rewards/chosen": -0.7505965828895569, "eval_rewards/margins": 0.4271797239780426, "eval_rewards/rejected": -1.1777764558792114, "eval_runtime": 398.6638, "eval_samples_per_second": 17.945, "eval_steps_per_second": 8.972, "step": 7700 }, { "epoch": 1.7961560861968549, "grad_norm": 7.378889560699463, "learning_rate": 2.2627831715210356e-06, "logits/chosen": -4.222140312194824, "logits/rejected": -4.275264263153076, "logps/chosen": -627.9237060546875, "logps/rejected": -636.3966064453125, "loss": 0.6343, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.20627155900001526, "rewards/margins": 0.7409781217575073, "rewards/rejected": -0.9472497701644897, "step": 7710 }, { "epoch": 1.7984857309260338, "grad_norm": 9.264955520629883, "learning_rate": 2.2368932038834953e-06, "logits/chosen": -4.308340072631836, "logits/rejected": -4.277257442474365, "logps/chosen": -746.6083374023438, "logps/rejected": -733.2451782226562, "loss": 0.8311, "rewards/accuracies": 0.5625, "rewards/chosen": -0.899739146232605, "rewards/margins": 0.07021939754486084, "rewards/rejected": -0.969958484172821, "step": 7720 }, { "epoch": 1.8008153756552125, "grad_norm": 5.251680374145508, "learning_rate": 2.211003236245955e-06, "logits/chosen": -4.405594825744629, "logits/rejected": -4.321906089782715, "logps/chosen": -751.5158081054688, "logps/rejected": -801.6177368164062, "loss": 0.8219, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8107985258102417, "rewards/margins": 0.6600787043571472, "rewards/rejected": -1.4708774089813232, "step": 7730 }, { "epoch": 1.8031450203843913, "grad_norm": 8.310518264770508, "learning_rate": 2.1851132686084143e-06, "logits/chosen": -4.286992073059082, "logits/rejected": -4.3545308113098145, "logps/chosen": -703.4472045898438, "logps/rejected": -783.1981811523438, "loss": 0.8341, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8503850102424622, "rewards/margins": 0.3780274987220764, "rewards/rejected": -1.228412389755249, "step": 7740 }, { "epoch": 1.8054746651135702, "grad_norm": 8.352068901062012, "learning_rate": 2.159223300970874e-06, "logits/chosen": -4.327363014221191, "logits/rejected": -4.168322563171387, "logps/chosen": -769.9176025390625, "logps/rejected": -680.9280395507812, "loss": 0.8691, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8926218152046204, "rewards/margins": 0.17855612933635712, "rewards/rejected": -1.0711780786514282, "step": 7750 }, { "epoch": 1.807804309842749, "grad_norm": 8.405198097229004, "learning_rate": 2.133333333333334e-06, "logits/chosen": -4.351681709289551, "logits/rejected": -4.3126020431518555, "logps/chosen": -745.2198486328125, "logps/rejected": -738.9443969726562, "loss": 0.6288, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6772146821022034, "rewards/margins": 0.5970760583877563, "rewards/rejected": -1.274290680885315, "step": 7760 }, { "epoch": 1.8101339545719277, "grad_norm": 7.872401714324951, "learning_rate": 2.107443365695793e-06, "logits/chosen": -4.302443027496338, "logits/rejected": -4.289053440093994, "logps/chosen": -640.4171752929688, "logps/rejected": -674.5543212890625, "loss": 0.7753, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8442346453666687, "rewards/margins": 0.3581005036830902, "rewards/rejected": -1.2023351192474365, "step": 7770 }, { "epoch": 1.8124635993011067, "grad_norm": 4.250477313995361, "learning_rate": 2.0815533980582524e-06, "logits/chosen": -4.271813869476318, "logits/rejected": -4.248841285705566, "logps/chosen": -711.6376342773438, "logps/rejected": -629.3045043945312, "loss": 0.8603, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8051861524581909, "rewards/margins": 0.3091093599796295, "rewards/rejected": -1.114295482635498, "step": 7780 }, { "epoch": 1.8147932440302854, "grad_norm": 8.479049682617188, "learning_rate": 2.055663430420712e-06, "logits/chosen": -4.2787370681762695, "logits/rejected": -4.315961837768555, "logps/chosen": -670.7811279296875, "logps/rejected": -719.6918334960938, "loss": 0.756, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5828672647476196, "rewards/margins": 0.5051405429840088, "rewards/rejected": -1.0880076885223389, "step": 7790 }, { "epoch": 1.8171228887594641, "grad_norm": 13.810235977172852, "learning_rate": 2.029773462783172e-06, "logits/chosen": -4.362678527832031, "logits/rejected": -4.296935081481934, "logps/chosen": -753.6568603515625, "logps/rejected": -747.1669921875, "loss": 0.9258, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.816769003868103, "rewards/margins": 0.12891487777233124, "rewards/rejected": -0.9456839561462402, "step": 7800 }, { "epoch": 1.8171228887594641, "eval_logits/chosen": -4.273819923400879, "eval_logits/rejected": -4.260204315185547, "eval_logps/chosen": -698.2659301757812, "eval_logps/rejected": -718.5016479492188, "eval_loss": 0.6260212063789368, "eval_rewards/accuracies": 0.6434162855148315, "eval_rewards/chosen": -0.7437689304351807, "eval_rewards/margins": 0.42653757333755493, "eval_rewards/rejected": -1.1703065633773804, "eval_runtime": 400.821, "eval_samples_per_second": 17.848, "eval_steps_per_second": 8.924, "step": 7800 }, { "epoch": 1.819452533488643, "grad_norm": 9.626912117004395, "learning_rate": 2.003883495145631e-06, "logits/chosen": -4.315255165100098, "logits/rejected": -4.34834098815918, "logps/chosen": -734.5718383789062, "logps/rejected": -787.1271362304688, "loss": 0.7108, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7061156034469604, "rewards/margins": 0.692793071269989, "rewards/rejected": -1.3989086151123047, "step": 7810 }, { "epoch": 1.8217821782178216, "grad_norm": 8.975574493408203, "learning_rate": 1.977993527508091e-06, "logits/chosen": -4.334473133087158, "logits/rejected": -4.307866096496582, "logps/chosen": -764.4278564453125, "logps/rejected": -787.1414794921875, "loss": 0.7731, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5991033315658569, "rewards/margins": 0.35862669348716736, "rewards/rejected": -0.9577299952507019, "step": 7820 }, { "epoch": 1.8241118229470006, "grad_norm": 4.2930908203125, "learning_rate": 1.95210355987055e-06, "logits/chosen": -4.372093200683594, "logits/rejected": -4.383025169372559, "logps/chosen": -773.4154663085938, "logps/rejected": -833.3216552734375, "loss": 0.8166, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8959034085273743, "rewards/margins": 0.4735845923423767, "rewards/rejected": -1.369488000869751, "step": 7830 }, { "epoch": 1.8264414676761795, "grad_norm": 7.499445915222168, "learning_rate": 1.92621359223301e-06, "logits/chosen": -4.287027359008789, "logits/rejected": -4.379525184631348, "logps/chosen": -655.5816650390625, "logps/rejected": -765.8964233398438, "loss": 0.7044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6393209099769592, "rewards/margins": 0.46456795930862427, "rewards/rejected": -1.103888750076294, "step": 7840 }, { "epoch": 1.828771112405358, "grad_norm": 5.650694847106934, "learning_rate": 1.9003236245954696e-06, "logits/chosen": -4.295722007751465, "logits/rejected": -4.299536228179932, "logps/chosen": -736.7391357421875, "logps/rejected": -712.6022338867188, "loss": 0.7138, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5302426218986511, "rewards/margins": 0.5727535486221313, "rewards/rejected": -1.1029961109161377, "step": 7850 }, { "epoch": 1.831100757134537, "grad_norm": 11.053319931030273, "learning_rate": 1.8744336569579287e-06, "logits/chosen": -4.285447120666504, "logits/rejected": -4.212759494781494, "logps/chosen": -772.167236328125, "logps/rejected": -714.1157836914062, "loss": 0.7928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6176115870475769, "rewards/margins": 0.4766506552696228, "rewards/rejected": -1.0942623615264893, "step": 7860 }, { "epoch": 1.8334304018637158, "grad_norm": 8.503183364868164, "learning_rate": 1.8485436893203885e-06, "logits/chosen": -4.3330230712890625, "logits/rejected": -4.239395618438721, "logps/chosen": -706.237060546875, "logps/rejected": -652.3427124023438, "loss": 0.7318, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6463378667831421, "rewards/margins": 0.46101585030555725, "rewards/rejected": -1.107353687286377, "step": 7870 }, { "epoch": 1.8357600465928945, "grad_norm": 8.738791465759277, "learning_rate": 1.822653721682848e-06, "logits/chosen": -4.2857489585876465, "logits/rejected": -4.343883991241455, "logps/chosen": -678.5247802734375, "logps/rejected": -713.4631958007812, "loss": 0.7368, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6008560657501221, "rewards/margins": 0.5160638093948364, "rewards/rejected": -1.116919755935669, "step": 7880 }, { "epoch": 1.8380896913220734, "grad_norm": 5.170600414276123, "learning_rate": 1.7967637540453075e-06, "logits/chosen": -4.332296371459961, "logits/rejected": -4.3902058601379395, "logps/chosen": -717.0233764648438, "logps/rejected": -801.6724243164062, "loss": 0.8154, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7391607165336609, "rewards/margins": 0.354645311832428, "rewards/rejected": -1.0938060283660889, "step": 7890 }, { "epoch": 1.8404193360512522, "grad_norm": 7.153270244598389, "learning_rate": 1.7708737864077672e-06, "logits/chosen": -4.206341743469238, "logits/rejected": -4.272002220153809, "logps/chosen": -659.7720947265625, "logps/rejected": -721.71044921875, "loss": 0.7981, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6344069242477417, "rewards/margins": 0.3891531825065613, "rewards/rejected": -1.0235600471496582, "step": 7900 }, { "epoch": 1.8404193360512522, "eval_logits/chosen": -4.273324489593506, "eval_logits/rejected": -4.259591579437256, "eval_logps/chosen": -698.2616577148438, "eval_logps/rejected": -718.4935913085938, "eval_loss": 0.6259632110595703, "eval_rewards/accuracies": 0.6406206488609314, "eval_rewards/chosen": -0.7433328032493591, "eval_rewards/margins": 0.42616304755210876, "eval_rewards/rejected": -1.1694958209991455, "eval_runtime": 399.2234, "eval_samples_per_second": 17.92, "eval_steps_per_second": 8.96, "step": 7900 }, { "epoch": 1.842748980780431, "grad_norm": 5.014840602874756, "learning_rate": 1.7449838187702267e-06, "logits/chosen": -4.308610916137695, "logits/rejected": -4.290936470031738, "logps/chosen": -711.2816162109375, "logps/rejected": -699.1672973632812, "loss": 0.633, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5748124718666077, "rewards/margins": 0.48621124029159546, "rewards/rejected": -1.0610238313674927, "step": 7910 }, { "epoch": 1.8450786255096099, "grad_norm": 2.524541139602661, "learning_rate": 1.7190938511326862e-06, "logits/chosen": -4.281649589538574, "logits/rejected": -4.29948091506958, "logps/chosen": -682.9008178710938, "logps/rejected": -737.8621215820312, "loss": 0.6611, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.45142731070518494, "rewards/margins": 0.7023296356201172, "rewards/rejected": -1.153756856918335, "step": 7920 }, { "epoch": 1.8474082702387886, "grad_norm": 7.655969619750977, "learning_rate": 1.6932038834951458e-06, "logits/chosen": -4.2819390296936035, "logits/rejected": -4.311500549316406, "logps/chosen": -703.700439453125, "logps/rejected": -756.3419189453125, "loss": 0.7514, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7004653811454773, "rewards/margins": 0.49804216623306274, "rewards/rejected": -1.1985076665878296, "step": 7930 }, { "epoch": 1.8497379149679674, "grad_norm": 10.318922996520996, "learning_rate": 1.6673139158576055e-06, "logits/chosen": -4.371182441711426, "logits/rejected": -4.295583248138428, "logps/chosen": -759.6697998046875, "logps/rejected": -713.5474853515625, "loss": 0.7842, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7593971490859985, "rewards/margins": 0.34532198309898376, "rewards/rejected": -1.1047192811965942, "step": 7940 }, { "epoch": 1.8520675596971463, "grad_norm": 7.683895587921143, "learning_rate": 1.6414239482200648e-06, "logits/chosen": -4.274389266967773, "logits/rejected": -4.26663064956665, "logps/chosen": -701.9673461914062, "logps/rejected": -762.0696411132812, "loss": 0.6948, "rewards/accuracies": 0.625, "rewards/chosen": -0.5486912131309509, "rewards/margins": 0.6160353422164917, "rewards/rejected": -1.1647266149520874, "step": 7950 }, { "epoch": 1.8543972044263248, "grad_norm": 5.8917694091796875, "learning_rate": 1.6155339805825243e-06, "logits/chosen": -4.227823257446289, "logits/rejected": -4.251143455505371, "logps/chosen": -699.05908203125, "logps/rejected": -751.8037719726562, "loss": 0.7264, "rewards/accuracies": 0.625, "rewards/chosen": -0.6425224542617798, "rewards/margins": 0.3867589831352234, "rewards/rejected": -1.0292813777923584, "step": 7960 }, { "epoch": 1.8567268491555038, "grad_norm": 8.299991607666016, "learning_rate": 1.5896440129449838e-06, "logits/chosen": -4.296449184417725, "logits/rejected": -4.244095802307129, "logps/chosen": -728.1547241210938, "logps/rejected": -708.0887451171875, "loss": 0.7491, "rewards/accuracies": 0.625, "rewards/chosen": -0.6678886413574219, "rewards/margins": 0.49384164810180664, "rewards/rejected": -1.1617302894592285, "step": 7970 }, { "epoch": 1.8590564938846827, "grad_norm": 8.590520858764648, "learning_rate": 1.5637540453074435e-06, "logits/chosen": -4.296567440032959, "logits/rejected": -4.204328536987305, "logps/chosen": -734.1271362304688, "logps/rejected": -685.5897827148438, "loss": 0.6546, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6129536628723145, "rewards/margins": 0.6994892358779907, "rewards/rejected": -1.3124430179595947, "step": 7980 }, { "epoch": 1.8613861386138613, "grad_norm": 8.054755210876465, "learning_rate": 1.537864077669903e-06, "logits/chosen": -4.2982258796691895, "logits/rejected": -4.374433994293213, "logps/chosen": -710.0291748046875, "logps/rejected": -802.5664672851562, "loss": 0.7795, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6659218668937683, "rewards/margins": 0.44334450364112854, "rewards/rejected": -1.1092662811279297, "step": 7990 }, { "epoch": 1.8637157833430402, "grad_norm": 8.272866249084473, "learning_rate": 1.5119741100323626e-06, "logits/chosen": -4.2993011474609375, "logits/rejected": -4.279488563537598, "logps/chosen": -712.0296630859375, "logps/rejected": -736.62060546875, "loss": 0.7925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5873527526855469, "rewards/margins": 0.4799245297908783, "rewards/rejected": -1.0672773122787476, "step": 8000 }, { "epoch": 1.8637157833430402, "eval_logits/chosen": -4.2714433670043945, "eval_logits/rejected": -4.257770538330078, "eval_logps/chosen": -698.35888671875, "eval_logps/rejected": -718.630615234375, "eval_loss": 0.62626051902771, "eval_rewards/accuracies": 0.6418786644935608, "eval_rewards/chosen": -0.7530632615089417, "eval_rewards/margins": 0.4301401674747467, "eval_rewards/rejected": -1.1832033395767212, "eval_runtime": 400.3459, "eval_samples_per_second": 17.87, "eval_steps_per_second": 8.935, "step": 8000 }, { "epoch": 1.866045428072219, "grad_norm": 8.103987693786621, "learning_rate": 1.486084142394822e-06, "logits/chosen": -4.286310195922852, "logits/rejected": -4.278182506561279, "logps/chosen": -654.7042846679688, "logps/rejected": -674.2049560546875, "loss": 0.6231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4982284903526306, "rewards/margins": 0.7050498127937317, "rewards/rejected": -1.2032783031463623, "step": 8010 }, { "epoch": 1.8683750728013977, "grad_norm": 10.897774696350098, "learning_rate": 1.4601941747572818e-06, "logits/chosen": -4.338923454284668, "logits/rejected": -4.356582164764404, "logps/chosen": -692.2127685546875, "logps/rejected": -728.3756103515625, "loss": 0.7361, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.737460196018219, "rewards/margins": 0.5897017121315002, "rewards/rejected": -1.3271619081497192, "step": 8020 }, { "epoch": 1.8707047175305767, "grad_norm": 6.542692184448242, "learning_rate": 1.4343042071197413e-06, "logits/chosen": -4.281493663787842, "logits/rejected": -4.266264915466309, "logps/chosen": -690.42529296875, "logps/rejected": -724.5419921875, "loss": 0.7727, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7195274233818054, "rewards/margins": 0.527152419090271, "rewards/rejected": -1.2466800212860107, "step": 8030 }, { "epoch": 1.8730343622597554, "grad_norm": 5.282635688781738, "learning_rate": 1.4084142394822006e-06, "logits/chosen": -4.363732814788818, "logits/rejected": -4.371499538421631, "logps/chosen": -707.1060791015625, "logps/rejected": -674.9572143554688, "loss": 0.7736, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6483897566795349, "rewards/margins": 0.3561496436595917, "rewards/rejected": -1.0045394897460938, "step": 8040 }, { "epoch": 1.8753640069889341, "grad_norm": 7.903660774230957, "learning_rate": 1.3825242718446601e-06, "logits/chosen": -4.357583522796631, "logits/rejected": -4.340155124664307, "logps/chosen": -715.6651611328125, "logps/rejected": -782.0966796875, "loss": 0.5883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6292458772659302, "rewards/margins": 0.7687798738479614, "rewards/rejected": -1.3980258703231812, "step": 8050 }, { "epoch": 1.877693651718113, "grad_norm": 7.870277404785156, "learning_rate": 1.3566343042071199e-06, "logits/chosen": -4.245657444000244, "logits/rejected": -4.232403755187988, "logps/chosen": -734.6783447265625, "logps/rejected": -739.2367553710938, "loss": 0.8023, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.762460470199585, "rewards/margins": 0.41493305563926697, "rewards/rejected": -1.1773935556411743, "step": 8060 }, { "epoch": 1.8800232964472918, "grad_norm": 4.446013927459717, "learning_rate": 1.3307443365695794e-06, "logits/chosen": -4.216975212097168, "logits/rejected": -4.27420711517334, "logps/chosen": -696.5819091796875, "logps/rejected": -726.9581909179688, "loss": 0.7313, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6386895775794983, "rewards/margins": 0.49886369705200195, "rewards/rejected": -1.137553334236145, "step": 8070 }, { "epoch": 1.8823529411764706, "grad_norm": 8.197561264038086, "learning_rate": 1.304854368932039e-06, "logits/chosen": -4.304692268371582, "logits/rejected": -4.309518337249756, "logps/chosen": -668.96728515625, "logps/rejected": -743.6773681640625, "loss": 0.8273, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6109591722488403, "rewards/margins": 0.3413018584251404, "rewards/rejected": -0.9522610902786255, "step": 8080 }, { "epoch": 1.8846825859056495, "grad_norm": 6.251431941986084, "learning_rate": 1.2789644012944984e-06, "logits/chosen": -4.228099346160889, "logits/rejected": -4.20468807220459, "logps/chosen": -669.6293334960938, "logps/rejected": -689.2266845703125, "loss": 0.633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6801440119743347, "rewards/margins": 0.6941016316413879, "rewards/rejected": -1.3742456436157227, "step": 8090 }, { "epoch": 1.887012230634828, "grad_norm": 11.473604202270508, "learning_rate": 1.2530744336569581e-06, "logits/chosen": -4.333575248718262, "logits/rejected": -4.392006874084473, "logps/chosen": -756.8983154296875, "logps/rejected": -833.8203125, "loss": 0.7757, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7302811741828918, "rewards/margins": 0.4153829514980316, "rewards/rejected": -1.1456642150878906, "step": 8100 }, { "epoch": 1.887012230634828, "eval_logits/chosen": -4.270134925842285, "eval_logits/rejected": -4.256483554840088, "eval_logps/chosen": -698.4719848632812, "eval_logps/rejected": -718.76123046875, "eval_loss": 0.6265602707862854, "eval_rewards/accuracies": 0.6420184373855591, "eval_rewards/chosen": -0.7643771767616272, "eval_rewards/margins": 0.43188703060150146, "eval_rewards/rejected": -1.1962642669677734, "eval_runtime": 401.3482, "eval_samples_per_second": 17.825, "eval_steps_per_second": 8.912, "step": 8100 }, { "epoch": 1.889341875364007, "grad_norm": 6.380511283874512, "learning_rate": 1.2271844660194174e-06, "logits/chosen": -4.341069221496582, "logits/rejected": -4.404160499572754, "logps/chosen": -696.8298950195312, "logps/rejected": -806.1851806640625, "loss": 0.5914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5037180781364441, "rewards/margins": 0.8781551122665405, "rewards/rejected": -1.3818731307983398, "step": 8110 }, { "epoch": 1.891671520093186, "grad_norm": 7.608849048614502, "learning_rate": 1.2012944983818772e-06, "logits/chosen": -4.330622673034668, "logits/rejected": -4.2414679527282715, "logps/chosen": -784.8829956054688, "logps/rejected": -726.9249877929688, "loss": 0.7297, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6424816250801086, "rewards/margins": 0.39398622512817383, "rewards/rejected": -1.0364679098129272, "step": 8120 }, { "epoch": 1.8940011648223645, "grad_norm": 9.878482818603516, "learning_rate": 1.1754045307443367e-06, "logits/chosen": -4.255199432373047, "logits/rejected": -4.249835014343262, "logps/chosen": -713.4352416992188, "logps/rejected": -753.8508911132812, "loss": 0.6942, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8424083590507507, "rewards/margins": 0.5438202619552612, "rewards/rejected": -1.3862287998199463, "step": 8130 }, { "epoch": 1.8963308095515434, "grad_norm": 9.539684295654297, "learning_rate": 1.1495145631067962e-06, "logits/chosen": -4.222280025482178, "logits/rejected": -4.286808013916016, "logps/chosen": -668.6334838867188, "logps/rejected": -658.0465087890625, "loss": 0.8308, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8567003011703491, "rewards/margins": 0.35510557889938354, "rewards/rejected": -1.2118059396743774, "step": 8140 }, { "epoch": 1.8986604542807222, "grad_norm": 6.965002059936523, "learning_rate": 1.1236245954692557e-06, "logits/chosen": -4.31527853012085, "logits/rejected": -4.246081352233887, "logps/chosen": -706.160400390625, "logps/rejected": -683.8353271484375, "loss": 0.795, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6172300577163696, "rewards/margins": 0.39958369731903076, "rewards/rejected": -1.0168136358261108, "step": 8150 }, { "epoch": 1.900990099009901, "grad_norm": 6.807827472686768, "learning_rate": 1.0977346278317152e-06, "logits/chosen": -4.384181022644043, "logits/rejected": -4.358831882476807, "logps/chosen": -772.3015747070312, "logps/rejected": -818.3670043945312, "loss": 0.8398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7376090288162231, "rewards/margins": 0.4424554705619812, "rewards/rejected": -1.1800644397735596, "step": 8160 }, { "epoch": 1.9033197437390799, "grad_norm": 8.301105499267578, "learning_rate": 1.0718446601941747e-06, "logits/chosen": -4.291867256164551, "logits/rejected": -4.323934078216553, "logps/chosen": -726.5836791992188, "logps/rejected": -758.00048828125, "loss": 0.7862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.764946699142456, "rewards/margins": 0.43642458319664, "rewards/rejected": -1.2013713121414185, "step": 8170 }, { "epoch": 1.9056493884682586, "grad_norm": 7.553655624389648, "learning_rate": 1.0459546925566345e-06, "logits/chosen": -4.2980146408081055, "logits/rejected": -4.247636795043945, "logps/chosen": -691.2302856445312, "logps/rejected": -720.0042114257812, "loss": 0.784, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8119754791259766, "rewards/margins": 0.34611016511917114, "rewards/rejected": -1.1580857038497925, "step": 8180 }, { "epoch": 1.9079790331974373, "grad_norm": 5.689741611480713, "learning_rate": 1.020064724919094e-06, "logits/chosen": -4.381438732147217, "logits/rejected": -4.324999809265137, "logps/chosen": -739.9146728515625, "logps/rejected": -710.13623046875, "loss": 0.7337, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.577957272529602, "rewards/margins": 0.40901726484298706, "rewards/rejected": -0.9869745373725891, "step": 8190 }, { "epoch": 1.9103086779266163, "grad_norm": 11.527932167053223, "learning_rate": 9.941747572815535e-07, "logits/chosen": -4.309118747711182, "logits/rejected": -4.265748977661133, "logps/chosen": -731.2278442382812, "logps/rejected": -681.3240966796875, "loss": 0.8045, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7814499139785767, "rewards/margins": 0.3624773621559143, "rewards/rejected": -1.1439272165298462, "step": 8200 }, { "epoch": 1.9103086779266163, "eval_logits/chosen": -4.268999099731445, "eval_logits/rejected": -4.255213737487793, "eval_logps/chosen": -698.4553833007812, "eval_logps/rejected": -718.7557373046875, "eval_loss": 0.6263387203216553, "eval_rewards/accuracies": 0.6413195133209229, "eval_rewards/chosen": -0.762718677520752, "eval_rewards/margins": 0.4329899847507477, "eval_rewards/rejected": -1.1957087516784668, "eval_runtime": 401.6446, "eval_samples_per_second": 17.812, "eval_steps_per_second": 8.906, "step": 8200 }, { "epoch": 1.912638322655795, "grad_norm": 9.736942291259766, "learning_rate": 9.68284789644013e-07, "logits/chosen": -4.324416160583496, "logits/rejected": -4.226147174835205, "logps/chosen": -716.7040405273438, "logps/rejected": -628.514404296875, "loss": 0.67, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5403128862380981, "rewards/margins": 0.6297000646591187, "rewards/rejected": -1.1700130701065063, "step": 8210 }, { "epoch": 1.9149679673849738, "grad_norm": 7.084342956542969, "learning_rate": 9.423948220064725e-07, "logits/chosen": -4.212306022644043, "logits/rejected": -4.274476051330566, "logps/chosen": -632.3684692382812, "logps/rejected": -746.298828125, "loss": 0.7419, "rewards/accuracies": 0.625, "rewards/chosen": -0.607592761516571, "rewards/margins": 0.5596817135810852, "rewards/rejected": -1.1672742366790771, "step": 8220 }, { "epoch": 1.9172976121141527, "grad_norm": 7.207754135131836, "learning_rate": 9.165048543689321e-07, "logits/chosen": -4.234009265899658, "logits/rejected": -4.276724338531494, "logps/chosen": -658.24609375, "logps/rejected": -690.4041748046875, "loss": 0.7392, "rewards/accuracies": 0.625, "rewards/chosen": -0.6036561727523804, "rewards/margins": 0.697953462600708, "rewards/rejected": -1.301609754562378, "step": 8230 }, { "epoch": 1.9196272568433312, "grad_norm": 9.125251770019531, "learning_rate": 8.906148867313917e-07, "logits/chosen": -4.313952922821045, "logits/rejected": -4.3028974533081055, "logps/chosen": -757.9784545898438, "logps/rejected": -726.7279052734375, "loss": 0.7373, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6254912614822388, "rewards/margins": 0.4764394164085388, "rewards/rejected": -1.1019306182861328, "step": 8240 }, { "epoch": 1.9219569015725102, "grad_norm": 6.093334197998047, "learning_rate": 8.647249190938512e-07, "logits/chosen": -4.276091575622559, "logits/rejected": -4.3135552406311035, "logps/chosen": -667.789306640625, "logps/rejected": -731.2029418945312, "loss": 0.7606, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8004811406135559, "rewards/margins": 0.5769751667976379, "rewards/rejected": -1.3774563074111938, "step": 8250 }, { "epoch": 1.924286546301689, "grad_norm": 7.825791358947754, "learning_rate": 8.388349514563107e-07, "logits/chosen": -4.3161139488220215, "logits/rejected": -4.360238075256348, "logps/chosen": -693.4849853515625, "logps/rejected": -744.6207275390625, "loss": 0.7171, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5785298347473145, "rewards/margins": 0.4514383375644684, "rewards/rejected": -1.0299681425094604, "step": 8260 }, { "epoch": 1.9266161910308677, "grad_norm": 9.916013717651367, "learning_rate": 8.129449838187703e-07, "logits/chosen": -4.298556327819824, "logits/rejected": -4.30155611038208, "logps/chosen": -658.8529663085938, "logps/rejected": -694.6532592773438, "loss": 0.8381, "rewards/accuracies": 0.5, "rewards/chosen": -1.0241538286209106, "rewards/margins": 0.14310523867607117, "rewards/rejected": -1.1672589778900146, "step": 8270 }, { "epoch": 1.9289458357600466, "grad_norm": 8.170594215393066, "learning_rate": 7.870550161812298e-07, "logits/chosen": -4.3301568031311035, "logits/rejected": -4.340358734130859, "logps/chosen": -723.5537109375, "logps/rejected": -732.6986083984375, "loss": 0.7501, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.59930020570755, "rewards/margins": 0.5064767599105835, "rewards/rejected": -1.1057769060134888, "step": 8280 }, { "epoch": 1.9312754804892254, "grad_norm": 8.464248657226562, "learning_rate": 7.611650485436894e-07, "logits/chosen": -4.264374732971191, "logits/rejected": -4.301041603088379, "logps/chosen": -712.2825927734375, "logps/rejected": -738.6800537109375, "loss": 0.7616, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6949116587638855, "rewards/margins": 0.40924954414367676, "rewards/rejected": -1.1041611433029175, "step": 8290 }, { "epoch": 1.933605125218404, "grad_norm": 5.5404486656188965, "learning_rate": 7.352750809061489e-07, "logits/chosen": -4.312140464782715, "logits/rejected": -4.395899772644043, "logps/chosen": -734.2998657226562, "logps/rejected": -804.1611328125, "loss": 0.7502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4405832290649414, "rewards/margins": 0.5064100027084351, "rewards/rejected": -0.9469932317733765, "step": 8300 }, { "epoch": 1.933605125218404, "eval_logits/chosen": -4.269779205322266, "eval_logits/rejected": -4.255950450897217, "eval_logps/chosen": -698.4453735351562, "eval_logps/rejected": -718.7404174804688, "eval_loss": 0.6262587308883667, "eval_rewards/accuracies": 0.6404808759689331, "eval_rewards/chosen": -0.7617153525352478, "eval_rewards/margins": 0.43246886134147644, "eval_rewards/rejected": -1.1941843032836914, "eval_runtime": 402.0528, "eval_samples_per_second": 17.794, "eval_steps_per_second": 8.897, "step": 8300 }, { "epoch": 1.935934769947583, "grad_norm": 10.476713180541992, "learning_rate": 7.093851132686085e-07, "logits/chosen": -4.345536231994629, "logits/rejected": -4.35883092880249, "logps/chosen": -726.04248046875, "logps/rejected": -762.2106323242188, "loss": 0.6994, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6188130378723145, "rewards/margins": 0.6291608214378357, "rewards/rejected": -1.2479736804962158, "step": 8310 }, { "epoch": 1.9382644146767618, "grad_norm": 6.145667552947998, "learning_rate": 6.83495145631068e-07, "logits/chosen": -4.3391337394714355, "logits/rejected": -4.24570894241333, "logps/chosen": -774.0887451171875, "logps/rejected": -720.0848388671875, "loss": 0.7267, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5717106461524963, "rewards/margins": 0.6787561774253845, "rewards/rejected": -1.2504669427871704, "step": 8320 }, { "epoch": 1.9405940594059405, "grad_norm": 8.16446304321289, "learning_rate": 6.576051779935276e-07, "logits/chosen": -4.334807395935059, "logits/rejected": -4.253960132598877, "logps/chosen": -725.342041015625, "logps/rejected": -719.07568359375, "loss": 0.8266, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.633865475654602, "rewards/margins": 0.2745392322540283, "rewards/rejected": -0.9084047079086304, "step": 8330 }, { "epoch": 1.9429237041351195, "grad_norm": 10.712969779968262, "learning_rate": 6.31715210355987e-07, "logits/chosen": -4.292527675628662, "logits/rejected": -4.2442946434021, "logps/chosen": -761.6686401367188, "logps/rejected": -714.60009765625, "loss": 0.8968, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9049726724624634, "rewards/margins": 0.27385425567626953, "rewards/rejected": -1.1788270473480225, "step": 8340 }, { "epoch": 1.9452533488642982, "grad_norm": 6.336966514587402, "learning_rate": 6.058252427184466e-07, "logits/chosen": -4.317821025848389, "logits/rejected": -4.242893218994141, "logps/chosen": -717.8641967773438, "logps/rejected": -699.2586059570312, "loss": 0.8501, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7709203958511353, "rewards/margins": 0.3294094204902649, "rewards/rejected": -1.1003297567367554, "step": 8350 }, { "epoch": 1.947582993593477, "grad_norm": 10.864871978759766, "learning_rate": 5.799352750809062e-07, "logits/chosen": -4.350275993347168, "logits/rejected": -4.370471000671387, "logps/chosen": -731.3067626953125, "logps/rejected": -769.8187255859375, "loss": 0.6432, "rewards/accuracies": 0.6875, "rewards/chosen": -0.558472752571106, "rewards/margins": 0.7509238123893738, "rewards/rejected": -1.309396505355835, "step": 8360 }, { "epoch": 1.949912638322656, "grad_norm": 8.846085548400879, "learning_rate": 5.540453074433658e-07, "logits/chosen": -4.353498458862305, "logits/rejected": -4.3908796310424805, "logps/chosen": -741.0128173828125, "logps/rejected": -692.1842651367188, "loss": 0.8107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7740066647529602, "rewards/margins": 0.366254597902298, "rewards/rejected": -1.1402612924575806, "step": 8370 }, { "epoch": 1.9522422830518344, "grad_norm": 7.637683868408203, "learning_rate": 5.281553398058253e-07, "logits/chosen": -4.264584541320801, "logits/rejected": -4.2607526779174805, "logps/chosen": -735.846923828125, "logps/rejected": -754.1130981445312, "loss": 0.7077, "rewards/accuracies": 0.625, "rewards/chosen": -0.7124431729316711, "rewards/margins": 0.6570056676864624, "rewards/rejected": -1.3694486618041992, "step": 8380 }, { "epoch": 1.9545719277810134, "grad_norm": 4.9761223793029785, "learning_rate": 5.022653721682848e-07, "logits/chosen": -4.241456985473633, "logits/rejected": -4.229941368103027, "logps/chosen": -664.3502197265625, "logps/rejected": -715.2335205078125, "loss": 0.8032, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8136642575263977, "rewards/margins": 0.39051347970962524, "rewards/rejected": -1.2041776180267334, "step": 8390 }, { "epoch": 1.9569015725101921, "grad_norm": 10.152214050292969, "learning_rate": 4.7637540453074437e-07, "logits/chosen": -4.321287631988525, "logits/rejected": -4.322064399719238, "logps/chosen": -711.2864990234375, "logps/rejected": -713.7066650390625, "loss": 0.8314, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9399154782295227, "rewards/margins": 0.31622153520584106, "rewards/rejected": -1.2561370134353638, "step": 8400 }, { "epoch": 1.9569015725101921, "eval_logits/chosen": -4.2698798179626465, "eval_logits/rejected": -4.2562713623046875, "eval_logps/chosen": -698.441162109375, "eval_logps/rejected": -718.7234497070312, "eval_loss": 0.6265471577644348, "eval_rewards/accuracies": 0.6420184373855591, "eval_rewards/chosen": -0.7612878084182739, "eval_rewards/margins": 0.4311898946762085, "eval_rewards/rejected": -1.1924777030944824, "eval_runtime": 402.4783, "eval_samples_per_second": 17.775, "eval_steps_per_second": 8.887, "step": 8400 }, { "epoch": 1.9592312172393709, "grad_norm": 8.497055053710938, "learning_rate": 4.5048543689320394e-07, "logits/chosen": -4.284560203552246, "logits/rejected": -4.310437202453613, "logps/chosen": -723.8456420898438, "logps/rejected": -715.3678588867188, "loss": 0.7216, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8485382795333862, "rewards/margins": 0.5425747036933899, "rewards/rejected": -1.3911129236221313, "step": 8410 }, { "epoch": 1.9615608619685498, "grad_norm": 6.9082255363464355, "learning_rate": 4.2459546925566345e-07, "logits/chosen": -4.283592700958252, "logits/rejected": -4.316073417663574, "logps/chosen": -669.37109375, "logps/rejected": -663.8726806640625, "loss": 0.8655, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.8164242506027222, "rewards/margins": 0.10506206750869751, "rewards/rejected": -0.9214862585067749, "step": 8420 }, { "epoch": 1.9638905066977286, "grad_norm": 8.472549438476562, "learning_rate": 3.98705501618123e-07, "logits/chosen": -4.299191474914551, "logits/rejected": -4.256920337677002, "logps/chosen": -747.3541259765625, "logps/rejected": -740.9989013671875, "loss": 0.8372, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6805824041366577, "rewards/margins": 0.365847647190094, "rewards/rejected": -1.046429991722107, "step": 8430 }, { "epoch": 1.9662201514269073, "grad_norm": 7.3164167404174805, "learning_rate": 3.728155339805826e-07, "logits/chosen": -4.24942684173584, "logits/rejected": -4.248818397521973, "logps/chosen": -681.0838623046875, "logps/rejected": -723.9698486328125, "loss": 0.8116, "rewards/accuracies": 0.5, "rewards/chosen": -0.8195317983627319, "rewards/margins": 0.21901898086071014, "rewards/rejected": -1.0385507345199585, "step": 8440 }, { "epoch": 1.9685497961560863, "grad_norm": 7.188282489776611, "learning_rate": 3.469255663430421e-07, "logits/chosen": -4.256657600402832, "logits/rejected": -4.36815881729126, "logps/chosen": -641.541748046875, "logps/rejected": -759.9830322265625, "loss": 0.6642, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5071582794189453, "rewards/margins": 0.6715329885482788, "rewards/rejected": -1.1786913871765137, "step": 8450 }, { "epoch": 1.970879440885265, "grad_norm": 10.558123588562012, "learning_rate": 3.2103559870550167e-07, "logits/chosen": -4.252476692199707, "logits/rejected": -4.316245079040527, "logps/chosen": -730.2252197265625, "logps/rejected": -751.7780151367188, "loss": 0.7471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7278391718864441, "rewards/margins": 0.45042625069618225, "rewards/rejected": -1.1782654523849487, "step": 8460 }, { "epoch": 1.9732090856144437, "grad_norm": 5.863772869110107, "learning_rate": 2.951456310679612e-07, "logits/chosen": -4.232786178588867, "logits/rejected": -4.32352876663208, "logps/chosen": -692.9447021484375, "logps/rejected": -767.4256591796875, "loss": 0.6891, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6030572652816772, "rewards/margins": 0.5774694085121155, "rewards/rejected": -1.1805267333984375, "step": 8470 }, { "epoch": 1.9755387303436227, "grad_norm": 3.489633083343506, "learning_rate": 2.6925566343042075e-07, "logits/chosen": -4.237580299377441, "logits/rejected": -4.371821403503418, "logps/chosen": -693.5853881835938, "logps/rejected": -830.3731689453125, "loss": 0.7209, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7704498767852783, "rewards/margins": 0.7803723216056824, "rewards/rejected": -1.550822138786316, "step": 8480 }, { "epoch": 1.9778683750728014, "grad_norm": 8.350820541381836, "learning_rate": 2.4336569579288027e-07, "logits/chosen": -4.262223243713379, "logits/rejected": -4.329285621643066, "logps/chosen": -667.3316040039062, "logps/rejected": -720.9274291992188, "loss": 0.8041, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7020472288131714, "rewards/margins": 0.3439479470252991, "rewards/rejected": -1.0459951162338257, "step": 8490 }, { "epoch": 1.9801980198019802, "grad_norm": 7.101047992706299, "learning_rate": 2.1747572815533983e-07, "logits/chosen": -4.292538166046143, "logits/rejected": -4.3668742179870605, "logps/chosen": -746.7476806640625, "logps/rejected": -765.1817626953125, "loss": 0.694, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6788529753684998, "rewards/margins": 0.801043689250946, "rewards/rejected": -1.4798967838287354, "step": 8500 }, { "epoch": 1.9801980198019802, "eval_logits/chosen": -4.269947528839111, "eval_logits/rejected": -4.256167888641357, "eval_logps/chosen": -698.446044921875, "eval_logps/rejected": -718.7388916015625, "eval_loss": 0.6263306140899658, "eval_rewards/accuracies": 0.6432765126228333, "eval_rewards/chosen": -0.7617831826210022, "eval_rewards/margins": 0.4322440028190613, "eval_rewards/rejected": -1.1940271854400635, "eval_runtime": 401.1668, "eval_samples_per_second": 17.833, "eval_steps_per_second": 8.916, "step": 8500 } ], "logging_steps": 10, "max_steps": 8584, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }