diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14143 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9801980198019802, + "eval_steps": 100, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0023296447291788003, + "grad_norm": 3.924811840057373, + "learning_rate": 2.3282887077997674e-07, + "logits/chosen": -4.522591590881348, + "logits/rejected": -4.452101707458496, + "logps/chosen": -684.3924560546875, + "logps/rejected": -665.7002563476562, + "loss": 0.878, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.12848183512687683, + "rewards/margins": -0.08472710102796555, + "rewards/rejected": -0.04375474527478218, + "step": 10 + }, + { + "epoch": 0.004659289458357601, + "grad_norm": 4.744262218475342, + "learning_rate": 4.656577415599535e-07, + "logits/chosen": -4.606254577636719, + "logits/rejected": -4.580027103424072, + "logps/chosen": -706.7974243164062, + "logps/rejected": -657.6036987304688, + "loss": 0.8944, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.10672809183597565, + "rewards/margins": -0.03675349801778793, + "rewards/rejected": -0.06997456401586533, + "step": 20 + }, + { + "epoch": 0.006988934187536401, + "grad_norm": 5.417810440063477, + "learning_rate": 6.984866123399302e-07, + "logits/chosen": -4.553195476531982, + "logits/rejected": -4.513618469238281, + "logps/chosen": -703.244873046875, + "logps/rejected": -695.6546630859375, + "loss": 0.9522, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.12389123439788818, + "rewards/margins": -0.1695944219827652, + "rewards/rejected": 0.045703209936618805, + "step": 30 + }, + { + "epoch": 0.009318578916715201, + "grad_norm": 3.657839775085449, + "learning_rate": 9.31315483119907e-07, + "logits/chosen": -4.52838134765625, + "logits/rejected": -4.499939441680908, + "logps/chosen": -709.3988037109375, + "logps/rejected": -729.5408935546875, + "loss": 0.7638, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.016293197870254517, + "rewards/margins": 0.1943705528974533, + "rewards/rejected": -0.21066375076770782, + "step": 40 + }, + { + "epoch": 0.011648223645894, + "grad_norm": 4.347109317779541, + "learning_rate": 1.1641443538998836e-06, + "logits/chosen": -4.601327419281006, + "logits/rejected": -4.51685094833374, + "logps/chosen": -728.6759643554688, + "logps/rejected": -757.50390625, + "loss": 0.8954, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.026629159227013588, + "rewards/margins": -0.0508374348282814, + "rewards/rejected": 0.07746660709381104, + "step": 50 + }, + { + "epoch": 0.013977868375072802, + "grad_norm": 4.963409900665283, + "learning_rate": 1.3969732246798604e-06, + "logits/chosen": -4.556868076324463, + "logits/rejected": -4.478787422180176, + "logps/chosen": -754.9805908203125, + "logps/rejected": -714.3489379882812, + "loss": 1.0465, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.1778850257396698, + "rewards/margins": -0.26734185218811035, + "rewards/rejected": 0.08945684880018234, + "step": 60 + }, + { + "epoch": 0.016307513104251603, + "grad_norm": 4.398076057434082, + "learning_rate": 1.629802095459837e-06, + "logits/chosen": -4.506744861602783, + "logits/rejected": -4.536816596984863, + "logps/chosen": -714.246337890625, + "logps/rejected": -728.5725708007812, + "loss": 0.8227, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009435917250812054, + "rewards/margins": 0.08285114914178848, + "rewards/rejected": -0.07341523468494415, + "step": 70 + }, + { + "epoch": 0.018637157833430402, + "grad_norm": 3.7324328422546387, + "learning_rate": 1.862630966239814e-06, + "logits/chosen": -4.5511274337768555, + "logits/rejected": -4.537388801574707, + "logps/chosen": -686.8524780273438, + "logps/rejected": -745.160888671875, + "loss": 0.8821, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.02854597568511963, + "rewards/margins": -0.06114887073636055, + "rewards/rejected": 0.08969485759735107, + "step": 80 + }, + { + "epoch": 0.020966802562609202, + "grad_norm": 5.837793350219727, + "learning_rate": 2.0954598370197905e-06, + "logits/chosen": -4.489697456359863, + "logits/rejected": -4.449906826019287, + "logps/chosen": -680.8670654296875, + "logps/rejected": -650.242431640625, + "loss": 0.8567, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.08860909938812256, + "rewards/margins": 0.003706153482198715, + "rewards/rejected": -0.09231523424386978, + "step": 90 + }, + { + "epoch": 0.023296447291788, + "grad_norm": 5.7243475914001465, + "learning_rate": 2.3282887077997673e-06, + "logits/chosen": -4.5048298835754395, + "logits/rejected": -4.545318603515625, + "logps/chosen": -686.437744140625, + "logps/rejected": -765.6935424804688, + "loss": 0.7578, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.1273827850818634, + "rewards/margins": 0.3406364917755127, + "rewards/rejected": -0.21325376629829407, + "step": 100 + }, + { + "epoch": 0.023296447291788, + "eval_logits/chosen": -4.505529403686523, + "eval_logits/rejected": -4.499370574951172, + "eval_logps/chosen": -690.83056640625, + "eval_logps/rejected": -706.8093872070312, + "eval_loss": 0.6929930448532104, + "eval_rewards/accuracies": 0.5116019248962402, + "eval_rewards/chosen": -0.0002367756824241951, + "eval_rewards/margins": 0.0008354606688953936, + "eval_rewards/rejected": -0.0010722363367676735, + "eval_runtime": 384.5796, + "eval_samples_per_second": 18.602, + "eval_steps_per_second": 9.301, + "step": 100 + }, + { + "epoch": 0.0256260920209668, + "grad_norm": 4.138678550720215, + "learning_rate": 2.5611175785797445e-06, + "logits/chosen": -4.5441060066223145, + "logits/rejected": -4.443483829498291, + "logps/chosen": -748.9180908203125, + "logps/rejected": -738.2803955078125, + "loss": 0.7429, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06179947406053543, + "rewards/margins": 0.23742561042308807, + "rewards/rejected": -0.17562614381313324, + "step": 110 + }, + { + "epoch": 0.027955736750145604, + "grad_norm": 3.8973371982574463, + "learning_rate": 2.793946449359721e-06, + "logits/chosen": -4.614588260650635, + "logits/rejected": -4.612898826599121, + "logps/chosen": -732.0816650390625, + "logps/rejected": -785.8519287109375, + "loss": 0.7942, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07624070346355438, + "rewards/margins": 0.12703315913677216, + "rewards/rejected": -0.05079244449734688, + "step": 120 + }, + { + "epoch": 0.030285381479324403, + "grad_norm": 2.5106780529022217, + "learning_rate": 3.0267753201396976e-06, + "logits/chosen": -4.505324363708496, + "logits/rejected": -4.493563652038574, + "logps/chosen": -688.0574951171875, + "logps/rejected": -751.806396484375, + "loss": 0.9648, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05348503589630127, + "rewards/margins": -0.17224565148353577, + "rewards/rejected": 0.1187606006860733, + "step": 130 + }, + { + "epoch": 0.032615026208503206, + "grad_norm": 3.486295700073242, + "learning_rate": 3.259604190919674e-06, + "logits/chosen": -4.501693248748779, + "logits/rejected": -4.5221099853515625, + "logps/chosen": -692.7454223632812, + "logps/rejected": -702.983154296875, + "loss": 0.8493, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.02032635547220707, + "rewards/margins": 0.046482861042022705, + "rewards/rejected": -0.06680919975042343, + "step": 140 + }, + { + "epoch": 0.034944670937682006, + "grad_norm": 4.175703048706055, + "learning_rate": 3.492433061699651e-06, + "logits/chosen": -4.6017351150512695, + "logits/rejected": -4.442572593688965, + "logps/chosen": -810.1216430664062, + "logps/rejected": -699.7447509765625, + "loss": 0.9037, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.06848535686731339, + "rewards/margins": -0.10816104710102081, + "rewards/rejected": 0.03967570140957832, + "step": 150 + }, + { + "epoch": 0.037274315666860805, + "grad_norm": 4.84321403503418, + "learning_rate": 3.725261932479628e-06, + "logits/chosen": -4.5791015625, + "logits/rejected": -4.534665107727051, + "logps/chosen": -746.9636840820312, + "logps/rejected": -760.25390625, + "loss": 0.8184, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.015708060935139656, + "rewards/margins": 0.1453014314174652, + "rewards/rejected": -0.16100946068763733, + "step": 160 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 5.143755912780762, + "learning_rate": 3.958090803259605e-06, + "logits/chosen": -4.579737663269043, + "logits/rejected": -4.561814785003662, + "logps/chosen": -731.2387084960938, + "logps/rejected": -708.92578125, + "loss": 0.7407, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.1105731949210167, + "rewards/margins": 0.251499742269516, + "rewards/rejected": -0.1409265697002411, + "step": 170 + }, + { + "epoch": 0.041933605125218404, + "grad_norm": 3.8264622688293457, + "learning_rate": 4.190919674039581e-06, + "logits/chosen": -4.5193586349487305, + "logits/rejected": -4.51457405090332, + "logps/chosen": -698.3035888671875, + "logps/rejected": -717.1882934570312, + "loss": 0.762, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.054043758660554886, + "rewards/margins": 0.21840786933898926, + "rewards/rejected": -0.16436411440372467, + "step": 180 + }, + { + "epoch": 0.0442632498543972, + "grad_norm": 4.408588409423828, + "learning_rate": 4.423748544819557e-06, + "logits/chosen": -4.572482585906982, + "logits/rejected": -4.569614410400391, + "logps/chosen": -784.6768188476562, + "logps/rejected": -776.5426635742188, + "loss": 1.1467, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.06866107136011124, + "rewards/margins": -0.35513216257095337, + "rewards/rejected": 0.28647106885910034, + "step": 190 + }, + { + "epoch": 0.046592894583576, + "grad_norm": 4.938295364379883, + "learning_rate": 4.6565774155995345e-06, + "logits/chosen": -4.534358024597168, + "logits/rejected": -4.555275917053223, + "logps/chosen": -691.1871337890625, + "logps/rejected": -718.7325439453125, + "loss": 0.7219, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.007454717066138983, + "rewards/margins": 0.2289915531873703, + "rewards/rejected": -0.22153684496879578, + "step": 200 + }, + { + "epoch": 0.046592894583576, + "eval_logits/chosen": -4.502413749694824, + "eval_logits/rejected": -4.496292591094971, + "eval_logps/chosen": -690.849365234375, + "eval_logps/rejected": -706.842041015625, + "eval_loss": 0.6923176050186157, + "eval_rewards/accuracies": 0.517892062664032, + "eval_rewards/chosen": -0.0021104670595377684, + "eval_rewards/margins": 0.0022353504318743944, + "eval_rewards/rejected": -0.004345817491412163, + "eval_runtime": 385.229, + "eval_samples_per_second": 18.571, + "eval_steps_per_second": 9.285, + "step": 200 + }, + { + "epoch": 0.0489225393127548, + "grad_norm": 4.715427875518799, + "learning_rate": 4.889406286379512e-06, + "logits/chosen": -4.474791526794434, + "logits/rejected": -4.529447555541992, + "logps/chosen": -701.8544921875, + "logps/rejected": -734.6244506835938, + "loss": 0.7614, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.10978059470653534, + "rewards/margins": 0.19003726541996002, + "rewards/rejected": -0.08025668561458588, + "step": 210 + }, + { + "epoch": 0.0512521840419336, + "grad_norm": 5.457529067993164, + "learning_rate": 5.122235157159489e-06, + "logits/chosen": -4.573043346405029, + "logits/rejected": -4.5491943359375, + "logps/chosen": -690.1962280273438, + "logps/rejected": -711.5494384765625, + "loss": 0.9218, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.1155082955956459, + "rewards/margins": -0.14690880477428436, + "rewards/rejected": 0.03140052407979965, + "step": 220 + }, + { + "epoch": 0.05358182877111241, + "grad_norm": 4.8636555671691895, + "learning_rate": 5.355064027939465e-06, + "logits/chosen": -4.635704040527344, + "logits/rejected": -4.550039768218994, + "logps/chosen": -756.4803466796875, + "logps/rejected": -737.9407958984375, + "loss": 0.8103, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03032105602324009, + "rewards/margins": 0.0932706966996193, + "rewards/rejected": -0.06294964253902435, + "step": 230 + }, + { + "epoch": 0.05591147350029121, + "grad_norm": 5.851677417755127, + "learning_rate": 5.587892898719442e-06, + "logits/chosen": -4.482217788696289, + "logits/rejected": -4.481154441833496, + "logps/chosen": -727.5157470703125, + "logps/rejected": -705.8177490234375, + "loss": 0.8402, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.07104573398828506, + "rewards/margins": 0.024619558826088905, + "rewards/rejected": 0.04642615467309952, + "step": 240 + }, + { + "epoch": 0.05824111822947001, + "grad_norm": 3.755674123764038, + "learning_rate": 5.820721769499419e-06, + "logits/chosen": -4.555675506591797, + "logits/rejected": -4.55229377746582, + "logps/chosen": -688.3670654296875, + "logps/rejected": -759.0899047851562, + "loss": 0.9507, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.029390472918748856, + "rewards/margins": -0.06705763190984726, + "rewards/rejected": 0.09644811600446701, + "step": 250 + }, + { + "epoch": 0.060570762958648806, + "grad_norm": 4.303655624389648, + "learning_rate": 6.053550640279395e-06, + "logits/chosen": -4.517312049865723, + "logits/rejected": -4.544795036315918, + "logps/chosen": -664.3435668945312, + "logps/rejected": -723.6569213867188, + "loss": 0.8181, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.03719661012291908, + "rewards/margins": 0.10673441737890244, + "rewards/rejected": -0.1439310610294342, + "step": 260 + }, + { + "epoch": 0.0629004076878276, + "grad_norm": 3.8483376502990723, + "learning_rate": 6.2863795110593715e-06, + "logits/chosen": -4.524532318115234, + "logits/rejected": -4.585890293121338, + "logps/chosen": -664.9467163085938, + "logps/rejected": -704.1334228515625, + "loss": 0.8362, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1777094155550003, + "rewards/margins": 0.05187971517443657, + "rewards/rejected": -0.22958913445472717, + "step": 270 + }, + { + "epoch": 0.06523005241700641, + "grad_norm": 5.691915988922119, + "learning_rate": 6.519208381839348e-06, + "logits/chosen": -4.5398993492126465, + "logits/rejected": -4.574265480041504, + "logps/chosen": -702.3248291015625, + "logps/rejected": -724.2568359375, + "loss": 0.9145, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.08214785158634186, + "rewards/margins": -0.08197866380214691, + "rewards/rejected": -0.00016917586617637426, + "step": 280 + }, + { + "epoch": 0.0675596971461852, + "grad_norm": 6.575222015380859, + "learning_rate": 6.752037252619326e-06, + "logits/chosen": -4.494467735290527, + "logits/rejected": -4.485854148864746, + "logps/chosen": -725.3032836914062, + "logps/rejected": -746.1402587890625, + "loss": 0.8225, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.014149573631584644, + "rewards/margins": 0.014778072014451027, + "rewards/rejected": -0.0006285011768341064, + "step": 290 + }, + { + "epoch": 0.06988934187536401, + "grad_norm": 4.541840076446533, + "learning_rate": 6.984866123399302e-06, + "logits/chosen": -4.506882190704346, + "logits/rejected": -4.480749607086182, + "logps/chosen": -778.0560302734375, + "logps/rejected": -752.30224609375, + "loss": 0.9151, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.11194051802158356, + "rewards/margins": -0.137613907456398, + "rewards/rejected": 0.0256733950227499, + "step": 300 + }, + { + "epoch": 0.06988934187536401, + "eval_logits/chosen": -4.500364780426025, + "eval_logits/rejected": -4.49424934387207, + "eval_logps/chosen": -690.8859252929688, + "eval_logps/rejected": -706.9114990234375, + "eval_loss": 0.6907868981361389, + "eval_rewards/accuracies": 0.5388593673706055, + "eval_rewards/chosen": -0.005771713797003031, + "eval_rewards/margins": 0.005515825469046831, + "eval_rewards/rejected": -0.011287540197372437, + "eval_runtime": 385.5275, + "eval_samples_per_second": 18.556, + "eval_steps_per_second": 9.278, + "step": 300 + }, + { + "epoch": 0.0722189866045428, + "grad_norm": 5.339130401611328, + "learning_rate": 7.2176949941792785e-06, + "logits/chosen": -4.623049259185791, + "logits/rejected": -4.607933044433594, + "logps/chosen": -787.4456176757812, + "logps/rejected": -809.6871337890625, + "loss": 1.0983, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.2957928776741028, + "rewards/margins": -0.4004906117916107, + "rewards/rejected": 0.10469770431518555, + "step": 310 + }, + { + "epoch": 0.07454863133372161, + "grad_norm": 4.564884662628174, + "learning_rate": 7.450523864959256e-06, + "logits/chosen": -4.502755165100098, + "logits/rejected": -4.475649833679199, + "logps/chosen": -669.8948364257812, + "logps/rejected": -723.6424560546875, + "loss": 0.8954, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05094115808606148, + "rewards/margins": -0.03177493438124657, + "rewards/rejected": -0.019166234880685806, + "step": 320 + }, + { + "epoch": 0.0768782760629004, + "grad_norm": 5.733292102813721, + "learning_rate": 7.683352735739232e-06, + "logits/chosen": -4.540478706359863, + "logits/rejected": -4.566623210906982, + "logps/chosen": -757.8765258789062, + "logps/rejected": -708.9313354492188, + "loss": 0.8511, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.047536421567201614, + "rewards/margins": 0.053884364664554596, + "rewards/rejected": -0.006347939372062683, + "step": 330 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 5.753773212432861, + "learning_rate": 7.91618160651921e-06, + "logits/chosen": -4.459725856781006, + "logits/rejected": -4.57793664932251, + "logps/chosen": -723.3109741210938, + "logps/rejected": -762.54736328125, + "loss": 1.0541, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.049885571002960205, + "rewards/margins": -0.25494030117988586, + "rewards/rejected": 0.20505471527576447, + "step": 340 + }, + { + "epoch": 0.081537565521258, + "grad_norm": 5.136266231536865, + "learning_rate": 8.149010477299186e-06, + "logits/chosen": -4.529461860656738, + "logits/rejected": -4.453719615936279, + "logps/chosen": -711.1002197265625, + "logps/rejected": -640.2122802734375, + "loss": 0.8314, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0677490234375, + "rewards/margins": 0.18676519393920898, + "rewards/rejected": -0.11901617050170898, + "step": 350 + }, + { + "epoch": 0.08386721025043681, + "grad_norm": 5.856282711029053, + "learning_rate": 8.381839348079162e-06, + "logits/chosen": -4.503441333770752, + "logits/rejected": -4.504217624664307, + "logps/chosen": -777.5228271484375, + "logps/rejected": -750.1853637695312, + "loss": 0.9783, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.07102885842323303, + "rewards/margins": -0.1274302750825882, + "rewards/rejected": 0.19845914840698242, + "step": 360 + }, + { + "epoch": 0.08619685497961561, + "grad_norm": 5.638661861419678, + "learning_rate": 8.61466821885914e-06, + "logits/chosen": -4.5200605392456055, + "logits/rejected": -4.6175665855407715, + "logps/chosen": -752.9575805664062, + "logps/rejected": -788.6130981445312, + "loss": 0.8494, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14627352356910706, + "rewards/margins": 0.04493988677859306, + "rewards/rejected": 0.10133364051580429, + "step": 370 + }, + { + "epoch": 0.0885264997087944, + "grad_norm": 6.1573357582092285, + "learning_rate": 8.847497089639115e-06, + "logits/chosen": -4.608702182769775, + "logits/rejected": -4.623651027679443, + "logps/chosen": -706.5177612304688, + "logps/rejected": -751.2601318359375, + "loss": 1.0472, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.27272742986679077, + "rewards/margins": -0.3680197596549988, + "rewards/rejected": 0.09529231488704681, + "step": 380 + }, + { + "epoch": 0.09085614443797321, + "grad_norm": 6.762535095214844, + "learning_rate": 9.080325960419094e-06, + "logits/chosen": -4.571442604064941, + "logits/rejected": -4.51932430267334, + "logps/chosen": -751.0111083984375, + "logps/rejected": -743.8901977539062, + "loss": 0.9392, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.057555168867111206, + "rewards/margins": -0.0428026020526886, + "rewards/rejected": 0.100357785820961, + "step": 390 + }, + { + "epoch": 0.093185789167152, + "grad_norm": 5.95464563369751, + "learning_rate": 9.313154831199069e-06, + "logits/chosen": -4.542258262634277, + "logits/rejected": -4.514950752258301, + "logps/chosen": -790.67919921875, + "logps/rejected": -716.7508544921875, + "loss": 0.8007, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12491132318973541, + "rewards/margins": 0.12783382833003998, + "rewards/rejected": -0.0029225251637399197, + "step": 400 + }, + { + "epoch": 0.093185789167152, + "eval_logits/chosen": -4.494009017944336, + "eval_logits/rejected": -4.487839698791504, + "eval_logps/chosen": -690.876220703125, + "eval_logps/rejected": -706.9720458984375, + "eval_loss": 0.6875714063644409, + "eval_rewards/accuracies": 0.5735253095626831, + "eval_rewards/chosen": -0.004798768553882837, + "eval_rewards/margins": 0.012539190240204334, + "eval_rewards/rejected": -0.01733795739710331, + "eval_runtime": 386.3479, + "eval_samples_per_second": 18.517, + "eval_steps_per_second": 9.258, + "step": 400 + }, + { + "epoch": 0.09551543389633081, + "grad_norm": 6.097142696380615, + "learning_rate": 9.545983701979046e-06, + "logits/chosen": -4.555731773376465, + "logits/rejected": -4.547555923461914, + "logps/chosen": -764.7968139648438, + "logps/rejected": -813.4968872070312, + "loss": 1.0184, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14817802608013153, + "rewards/margins": -0.21674151718616486, + "rewards/rejected": 0.06856345385313034, + "step": 410 + }, + { + "epoch": 0.0978450786255096, + "grad_norm": 3.8165435791015625, + "learning_rate": 9.778812572759023e-06, + "logits/chosen": -4.537326812744141, + "logits/rejected": -4.496391296386719, + "logps/chosen": -661.8091430664062, + "logps/rejected": -675.4200439453125, + "loss": 0.8522, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1329130232334137, + "rewards/margins": -0.04306463152170181, + "rewards/rejected": -0.08984839916229248, + "step": 420 + }, + { + "epoch": 0.10017472335468841, + "grad_norm": 6.400147914886475, + "learning_rate": 1.0011641443538999e-05, + "logits/chosen": -4.467156410217285, + "logits/rejected": -4.5243730545043945, + "logps/chosen": -700.6357421875, + "logps/rejected": -794.3414306640625, + "loss": 0.949, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.02168646827340126, + "rewards/margins": -0.14708887040615082, + "rewards/rejected": 0.12540242075920105, + "step": 430 + }, + { + "epoch": 0.1025043680838672, + "grad_norm": 5.348326206207275, + "learning_rate": 1.0244470314318978e-05, + "logits/chosen": -4.550189018249512, + "logits/rejected": -4.601241588592529, + "logps/chosen": -675.608154296875, + "logps/rejected": -734.4439086914062, + "loss": 0.8471, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15092246234416962, + "rewards/margins": 0.09153494983911514, + "rewards/rejected": 0.05938751623034477, + "step": 440 + }, + { + "epoch": 0.10483401281304601, + "grad_norm": 5.341457843780518, + "learning_rate": 1.0477299185098953e-05, + "logits/chosen": -4.568203449249268, + "logits/rejected": -4.5304179191589355, + "logps/chosen": -729.2503662109375, + "logps/rejected": -744.8440551757812, + "loss": 0.8296, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.04322073608636856, + "rewards/margins": 0.1277836114168167, + "rewards/rejected": -0.08456286042928696, + "step": 450 + }, + { + "epoch": 0.10716365754222482, + "grad_norm": 4.877196788787842, + "learning_rate": 1.071012805587893e-05, + "logits/chosen": -4.477053165435791, + "logits/rejected": -4.52747106552124, + "logps/chosen": -678.0662231445312, + "logps/rejected": -736.2633056640625, + "loss": 0.8902, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.03617674857378006, + "rewards/margins": -0.0766257494688034, + "rewards/rejected": 0.040448982268571854, + "step": 460 + }, + { + "epoch": 0.10949330227140361, + "grad_norm": 6.649731159210205, + "learning_rate": 1.0942956926658908e-05, + "logits/chosen": -4.47160005569458, + "logits/rejected": -4.600485801696777, + "logps/chosen": -767.81103515625, + "logps/rejected": -851.0872802734375, + "loss": 0.8232, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.023301953449845314, + "rewards/margins": -0.006503301672637463, + "rewards/rejected": 0.029805243015289307, + "step": 470 + }, + { + "epoch": 0.11182294700058241, + "grad_norm": 5.362112998962402, + "learning_rate": 1.1175785797438883e-05, + "logits/chosen": -4.62471866607666, + "logits/rejected": -4.54784631729126, + "logps/chosen": -733.701416015625, + "logps/rejected": -679.8546142578125, + "loss": 0.7379, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1373155266046524, + "rewards/margins": 0.20806872844696045, + "rewards/rejected": -0.07075319439172745, + "step": 480 + }, + { + "epoch": 0.11415259172976121, + "grad_norm": 4.938389301300049, + "learning_rate": 1.140861466821886e-05, + "logits/chosen": -4.493394374847412, + "logits/rejected": -4.541499137878418, + "logps/chosen": -714.9727783203125, + "logps/rejected": -744.9597778320312, + "loss": 0.9243, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.053753603249788284, + "rewards/margins": -0.20864422619342804, + "rewards/rejected": 0.15489062666893005, + "step": 490 + }, + { + "epoch": 0.11648223645894001, + "grad_norm": 3.43468976020813, + "learning_rate": 1.1641443538998838e-05, + "logits/chosen": -4.504472732543945, + "logits/rejected": -4.528023719787598, + "logps/chosen": -671.1829833984375, + "logps/rejected": -717.5294189453125, + "loss": 0.8537, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0035562929697334766, + "rewards/margins": 0.06256623566150665, + "rewards/rejected": -0.05900995805859566, + "step": 500 + }, + { + "epoch": 0.11648223645894001, + "eval_logits/chosen": -4.482612609863281, + "eval_logits/rejected": -4.475794315338135, + "eval_logps/chosen": -690.953369140625, + "eval_logps/rejected": -707.1657104492188, + "eval_loss": 0.6825693249702454, + "eval_rewards/accuracies": 0.5932345390319824, + "eval_rewards/chosen": -0.012506458908319473, + "eval_rewards/margins": 0.024205248802900314, + "eval_rewards/rejected": -0.03671170771121979, + "eval_runtime": 386.3263, + "eval_samples_per_second": 18.518, + "eval_steps_per_second": 9.259, + "step": 500 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 5.996777534484863, + "learning_rate": 1.1874272409778813e-05, + "logits/chosen": -4.582979202270508, + "logits/rejected": -4.537229061126709, + "logps/chosen": -756.7322998046875, + "logps/rejected": -711.3515014648438, + "loss": 0.8802, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.016579514369368553, + "rewards/margins": -0.010887527838349342, + "rewards/rejected": 0.02746700681746006, + "step": 510 + }, + { + "epoch": 0.12114152591729761, + "grad_norm": 5.092260837554932, + "learning_rate": 1.210710128055879e-05, + "logits/chosen": -4.395134925842285, + "logits/rejected": -4.46746826171875, + "logps/chosen": -605.616455078125, + "logps/rejected": -695.7340698242188, + "loss": 0.8504, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05290902778506279, + "rewards/margins": 0.08201033622026443, + "rewards/rejected": -0.13491936028003693, + "step": 520 + }, + { + "epoch": 0.12347117064647642, + "grad_norm": 6.615758895874023, + "learning_rate": 1.2339930151338766e-05, + "logits/chosen": -4.505391597747803, + "logits/rejected": -4.579988479614258, + "logps/chosen": -662.4857177734375, + "logps/rejected": -739.6719970703125, + "loss": 0.8742, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.057824719697237015, + "rewards/margins": -0.011621838435530663, + "rewards/rejected": -0.0462028793990612, + "step": 530 + }, + { + "epoch": 0.1258008153756552, + "grad_norm": 7.350104808807373, + "learning_rate": 1.2572759022118743e-05, + "logits/chosen": -4.466317176818848, + "logits/rejected": -4.424395561218262, + "logps/chosen": -771.0465087890625, + "logps/rejected": -732.0314331054688, + "loss": 0.9882, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07970836013555527, + "rewards/margins": -0.20637159049510956, + "rewards/rejected": 0.1266632378101349, + "step": 540 + }, + { + "epoch": 0.128130460104834, + "grad_norm": 6.6570281982421875, + "learning_rate": 1.280558789289872e-05, + "logits/chosen": -4.606103897094727, + "logits/rejected": -4.553511619567871, + "logps/chosen": -730.1910400390625, + "logps/rejected": -739.556884765625, + "loss": 0.9002, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.08168536424636841, + "rewards/margins": -0.03502635285258293, + "rewards/rejected": -0.04665901139378548, + "step": 550 + }, + { + "epoch": 0.13046010483401282, + "grad_norm": 4.95475435256958, + "learning_rate": 1.3038416763678696e-05, + "logits/chosen": -4.475506782531738, + "logits/rejected": -4.3944501876831055, + "logps/chosen": -758.3416137695312, + "logps/rejected": -779.4380493164062, + "loss": 0.9269, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.09176072478294373, + "rewards/margins": -0.047757431864738464, + "rewards/rejected": -0.04400332272052765, + "step": 560 + }, + { + "epoch": 0.13278974956319162, + "grad_norm": 4.766490936279297, + "learning_rate": 1.3271245634458675e-05, + "logits/chosen": -4.471567153930664, + "logits/rejected": -4.58807373046875, + "logps/chosen": -716.8036499023438, + "logps/rejected": -793.9383544921875, + "loss": 0.839, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04037877544760704, + "rewards/margins": 0.085518978536129, + "rewards/rejected": -0.04514019936323166, + "step": 570 + }, + { + "epoch": 0.1351193942923704, + "grad_norm": 6.618696689605713, + "learning_rate": 1.3504074505238652e-05, + "logits/chosen": -4.571255683898926, + "logits/rejected": -4.508147239685059, + "logps/chosen": -782.623046875, + "logps/rejected": -730.2835693359375, + "loss": 0.9174, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.12230058014392853, + "rewards/margins": -0.1568373292684555, + "rewards/rejected": 0.03453673794865608, + "step": 580 + }, + { + "epoch": 0.1374490390215492, + "grad_norm": 8.756173133850098, + "learning_rate": 1.3736903376018627e-05, + "logits/chosen": -4.579248905181885, + "logits/rejected": -4.511783599853516, + "logps/chosen": -723.1712646484375, + "logps/rejected": -692.4654541015625, + "loss": 0.8997, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20503079891204834, + "rewards/margins": -0.061770010739564896, + "rewards/rejected": -0.14326077699661255, + "step": 590 + }, + { + "epoch": 0.13977868375072802, + "grad_norm": 4.448201656341553, + "learning_rate": 1.3969732246798604e-05, + "logits/chosen": -4.505821704864502, + "logits/rejected": -4.529115676879883, + "logps/chosen": -720.322509765625, + "logps/rejected": -713.7578735351562, + "loss": 0.8746, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.03187558054924011, + "rewards/margins": -0.010967904701828957, + "rewards/rejected": 0.042843498289585114, + "step": 600 + }, + { + "epoch": 0.13977868375072802, + "eval_logits/chosen": -4.465523719787598, + "eval_logits/rejected": -4.4580254554748535, + "eval_logps/chosen": -691.2229614257812, + "eval_logps/rejected": -707.6290283203125, + "eval_loss": 0.6756829619407654, + "eval_rewards/accuracies": 0.5942130088806152, + "eval_rewards/chosen": -0.03947289660573006, + "eval_rewards/margins": 0.043567754328250885, + "eval_rewards/rejected": -0.08304064720869064, + "eval_runtime": 386.2804, + "eval_samples_per_second": 18.52, + "eval_steps_per_second": 9.26, + "step": 600 + }, + { + "epoch": 0.14210832847990681, + "grad_norm": 3.8562746047973633, + "learning_rate": 1.4202561117578582e-05, + "logits/chosen": -4.432703971862793, + "logits/rejected": -4.496449947357178, + "logps/chosen": -671.8670043945312, + "logps/rejected": -723.2869262695312, + "loss": 0.8301, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.08208735287189484, + "rewards/margins": 0.1055004820227623, + "rewards/rejected": -0.023413140326738358, + "step": 610 + }, + { + "epoch": 0.1444379732090856, + "grad_norm": 5.218752384185791, + "learning_rate": 1.4435389988358557e-05, + "logits/chosen": -4.540579795837402, + "logits/rejected": -4.502984046936035, + "logps/chosen": -732.4451904296875, + "logps/rejected": -736.8656005859375, + "loss": 0.8271, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07555864006280899, + "rewards/margins": 0.08679278939962387, + "rewards/rejected": -0.16235145926475525, + "step": 620 + }, + { + "epoch": 0.14676761793826443, + "grad_norm": 4.187116622924805, + "learning_rate": 1.4668218859138534e-05, + "logits/chosen": -4.4309468269348145, + "logits/rejected": -4.475230693817139, + "logps/chosen": -670.060302734375, + "logps/rejected": -692.5626831054688, + "loss": 0.7864, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09609910100698471, + "rewards/margins": 0.13632231950759888, + "rewards/rejected": -0.04022323340177536, + "step": 630 + }, + { + "epoch": 0.14909726266744322, + "grad_norm": 6.015897750854492, + "learning_rate": 1.4901047729918511e-05, + "logits/chosen": -4.529510498046875, + "logits/rejected": -4.5059356689453125, + "logps/chosen": -695.6485595703125, + "logps/rejected": -638.5431518554688, + "loss": 0.8831, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.07902977615594864, + "rewards/margins": -0.08549060672521591, + "rewards/rejected": 0.006460844073444605, + "step": 640 + }, + { + "epoch": 0.151426907396622, + "grad_norm": 9.856568336486816, + "learning_rate": 1.5133876600698487e-05, + "logits/chosen": -4.480548858642578, + "logits/rejected": -4.469473838806152, + "logps/chosen": -748.206787109375, + "logps/rejected": -737.4120483398438, + "loss": 0.9739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.027322787791490555, + "rewards/margins": -0.0710093080997467, + "rewards/rejected": 0.04368652403354645, + "step": 650 + }, + { + "epoch": 0.1537565521258008, + "grad_norm": 5.658621311187744, + "learning_rate": 1.5366705471478464e-05, + "logits/chosen": -4.546767234802246, + "logits/rejected": -4.459005355834961, + "logps/chosen": -669.4649658203125, + "logps/rejected": -639.6283569335938, + "loss": 0.8724, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.019634447991847992, + "rewards/margins": -0.0433480404317379, + "rewards/rejected": 0.023713573813438416, + "step": 660 + }, + { + "epoch": 0.15608619685497963, + "grad_norm": 6.9683308601379395, + "learning_rate": 1.5599534342258443e-05, + "logits/chosen": -4.432524681091309, + "logits/rejected": -4.509583473205566, + "logps/chosen": -714.3270874023438, + "logps/rejected": -806.752197265625, + "loss": 0.7997, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.03766901418566704, + "rewards/margins": 0.21293482184410095, + "rewards/rejected": -0.1752658188343048, + "step": 670 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 6.2949113845825195, + "learning_rate": 1.583236321303842e-05, + "logits/chosen": -4.499951362609863, + "logits/rejected": -4.426943302154541, + "logps/chosen": -678.3032836914062, + "logps/rejected": -684.98388671875, + "loss": 0.8157, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.06583984196186066, + "rewards/margins": 0.10869854688644409, + "rewards/rejected": -0.17453840374946594, + "step": 680 + }, + { + "epoch": 0.1607454863133372, + "grad_norm": 6.752503871917725, + "learning_rate": 1.6065192083818394e-05, + "logits/chosen": -4.490216255187988, + "logits/rejected": -4.514633655548096, + "logps/chosen": -717.6605224609375, + "logps/rejected": -744.0989990234375, + "loss": 0.9619, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.10855916887521744, + "rewards/margins": -0.13426534831523895, + "rewards/rejected": 0.025706171989440918, + "step": 690 + }, + { + "epoch": 0.163075131042516, + "grad_norm": 6.129384517669678, + "learning_rate": 1.6298020954598373e-05, + "logits/chosen": -4.400869846343994, + "logits/rejected": -4.518251895904541, + "logps/chosen": -754.826416015625, + "logps/rejected": -817.2966918945312, + "loss": 0.7814, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.015134274959564209, + "rewards/margins": 0.2830790877342224, + "rewards/rejected": -0.29821330308914185, + "step": 700 + }, + { + "epoch": 0.163075131042516, + "eval_logits/chosen": -4.4405694007873535, + "eval_logits/rejected": -4.4323883056640625, + "eval_logps/chosen": -691.7077026367188, + "eval_logps/rejected": -708.324951171875, + "eval_loss": 0.6701375246047974, + "eval_rewards/accuracies": 0.5978473424911499, + "eval_rewards/chosen": -0.08795131742954254, + "eval_rewards/margins": 0.06467774510383606, + "eval_rewards/rejected": -0.1526290476322174, + "eval_runtime": 388.0467, + "eval_samples_per_second": 18.436, + "eval_steps_per_second": 9.218, + "step": 700 + }, + { + "epoch": 0.16540477577169482, + "grad_norm": 6.498562335968018, + "learning_rate": 1.653084982537835e-05, + "logits/chosen": -4.449611663818359, + "logits/rejected": -4.466498851776123, + "logps/chosen": -664.6646728515625, + "logps/rejected": -748.9295654296875, + "loss": 0.8716, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.07674837857484818, + "rewards/margins": 0.03639410063624382, + "rewards/rejected": -0.1131424754858017, + "step": 710 + }, + { + "epoch": 0.16773442050087362, + "grad_norm": 7.23454475402832, + "learning_rate": 1.6763678696158324e-05, + "logits/chosen": -4.440798759460449, + "logits/rejected": -4.442746162414551, + "logps/chosen": -708.8589477539062, + "logps/rejected": -670.4962158203125, + "loss": 0.6813, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.19437508285045624, + "rewards/margins": 0.3670928478240967, + "rewards/rejected": -0.17271773517131805, + "step": 720 + }, + { + "epoch": 0.1700640652300524, + "grad_norm": 4.659682273864746, + "learning_rate": 1.6996507566938303e-05, + "logits/chosen": -4.551013946533203, + "logits/rejected": -4.534272193908691, + "logps/chosen": -711.7779541015625, + "logps/rejected": -745.0709228515625, + "loss": 0.9104, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1599648892879486, + "rewards/margins": 0.05072777345776558, + "rewards/rejected": -0.2106926441192627, + "step": 730 + }, + { + "epoch": 0.17239370995923123, + "grad_norm": 6.002203941345215, + "learning_rate": 1.722933643771828e-05, + "logits/chosen": -4.513463497161865, + "logits/rejected": -4.531512260437012, + "logps/chosen": -743.213623046875, + "logps/rejected": -761.3690185546875, + "loss": 0.8078, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0007909566047601402, + "rewards/margins": 0.25189337134361267, + "rewards/rejected": -0.2511024475097656, + "step": 740 + }, + { + "epoch": 0.17472335468841002, + "grad_norm": 4.882469654083252, + "learning_rate": 1.7462165308498257e-05, + "logits/chosen": -4.452869415283203, + "logits/rejected": -4.476421356201172, + "logps/chosen": -705.3482055664062, + "logps/rejected": -641.9478759765625, + "loss": 0.7559, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00530981132760644, + "rewards/margins": 0.2093144953250885, + "rewards/rejected": -0.2040046900510788, + "step": 750 + }, + { + "epoch": 0.1770529994175888, + "grad_norm": 8.57348918914795, + "learning_rate": 1.769499417927823e-05, + "logits/chosen": -4.47825813293457, + "logits/rejected": -4.470673561096191, + "logps/chosen": -696.1239013671875, + "logps/rejected": -751.4468383789062, + "loss": 0.8814, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.01002002228051424, + "rewards/margins": -0.008996338583528996, + "rewards/rejected": 0.019016362726688385, + "step": 760 + }, + { + "epoch": 0.1793826441467676, + "grad_norm": 7.979362964630127, + "learning_rate": 1.7927823050058208e-05, + "logits/chosen": -4.551217079162598, + "logits/rejected": -4.480177402496338, + "logps/chosen": -750.54052734375, + "logps/rejected": -755.7467041015625, + "loss": 0.9642, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.27815550565719604, + "rewards/margins": -0.1696508228778839, + "rewards/rejected": -0.10850467532873154, + "step": 770 + }, + { + "epoch": 0.18171228887594643, + "grad_norm": 9.473190307617188, + "learning_rate": 1.8160651920838187e-05, + "logits/chosen": -4.4927873611450195, + "logits/rejected": -4.439538955688477, + "logps/chosen": -681.1729125976562, + "logps/rejected": -659.6249389648438, + "loss": 0.9542, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.21827229857444763, + "rewards/margins": -0.15694120526313782, + "rewards/rejected": -0.061331115663051605, + "step": 780 + }, + { + "epoch": 0.18404193360512522, + "grad_norm": 8.782654762268066, + "learning_rate": 1.8393480791618163e-05, + "logits/chosen": -4.477587699890137, + "logits/rejected": -4.4893479347229, + "logps/chosen": -691.896484375, + "logps/rejected": -744.2025756835938, + "loss": 0.8238, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.06819736957550049, + "rewards/margins": 0.09529562294483185, + "rewards/rejected": -0.16349297761917114, + "step": 790 + }, + { + "epoch": 0.186371578334304, + "grad_norm": 4.834448337554932, + "learning_rate": 1.8626309662398138e-05, + "logits/chosen": -4.495875835418701, + "logits/rejected": -4.446606159210205, + "logps/chosen": -712.1375732421875, + "logps/rejected": -698.7463989257812, + "loss": 0.8807, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06593946367502213, + "rewards/margins": 0.09279344230890274, + "rewards/rejected": -0.15873286128044128, + "step": 800 + }, + { + "epoch": 0.186371578334304, + "eval_logits/chosen": -4.416188716888428, + "eval_logits/rejected": -4.407541275024414, + "eval_logps/chosen": -692.6242065429688, + "eval_logps/rejected": -709.51318359375, + "eval_loss": 0.6647844314575195, + "eval_rewards/accuracies": 0.5993849635124207, + "eval_rewards/chosen": -0.1795974224805832, + "eval_rewards/margins": 0.09186282753944397, + "eval_rewards/rejected": -0.27146023511886597, + "eval_runtime": 386.9115, + "eval_samples_per_second": 18.49, + "eval_steps_per_second": 9.245, + "step": 800 + }, + { + "epoch": 0.18870122306348283, + "grad_norm": 6.349125862121582, + "learning_rate": 1.8859138533178117e-05, + "logits/chosen": -4.330979824066162, + "logits/rejected": -4.421043872833252, + "logps/chosen": -646.6573486328125, + "logps/rejected": -706.3137817382812, + "loss": 0.907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03287108615040779, + "rewards/margins": 0.07876729965209961, + "rewards/rejected": -0.1116383820772171, + "step": 810 + }, + { + "epoch": 0.19103086779266162, + "grad_norm": 6.316599369049072, + "learning_rate": 1.9091967403958092e-05, + "logits/chosen": -4.432641506195068, + "logits/rejected": -4.4769206047058105, + "logps/chosen": -679.0020751953125, + "logps/rejected": -746.960205078125, + "loss": 0.8263, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.20592470467090607, + "rewards/margins": 0.02772539295256138, + "rewards/rejected": -0.2336500585079193, + "step": 820 + }, + { + "epoch": 0.19336051252184042, + "grad_norm": 8.07143783569336, + "learning_rate": 1.9324796274738068e-05, + "logits/chosen": -4.513623237609863, + "logits/rejected": -4.509900093078613, + "logps/chosen": -767.2298583984375, + "logps/rejected": -722.4786376953125, + "loss": 1.1006, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.43160420656204224, + "rewards/margins": -0.44003787636756897, + "rewards/rejected": 0.008433675393462181, + "step": 830 + }, + { + "epoch": 0.1956901572510192, + "grad_norm": 6.413561820983887, + "learning_rate": 1.9557625145518047e-05, + "logits/chosen": -4.438241958618164, + "logits/rejected": -4.43107795715332, + "logps/chosen": -717.0433959960938, + "logps/rejected": -775.4155883789062, + "loss": 0.774, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01133380550891161, + "rewards/margins": 0.15130652487277985, + "rewards/rejected": -0.1399727165699005, + "step": 840 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 7.26607608795166, + "learning_rate": 1.9790454016298022e-05, + "logits/chosen": -4.511332988739014, + "logits/rejected": -4.516010761260986, + "logps/chosen": -741.4783935546875, + "logps/rejected": -765.6262817382812, + "loss": 0.7566, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04076741635799408, + "rewards/margins": 0.17717300355434418, + "rewards/rejected": -0.21794047951698303, + "step": 850 + }, + { + "epoch": 0.20034944670937682, + "grad_norm": 7.267500877380371, + "learning_rate": 1.9997411003236248e-05, + "logits/chosen": -4.396910667419434, + "logits/rejected": -4.434980869293213, + "logps/chosen": -721.0802612304688, + "logps/rejected": -759.048828125, + "loss": 0.8579, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2259695827960968, + "rewards/margins": -0.0007159143569879234, + "rewards/rejected": -0.225253626704216, + "step": 860 + }, + { + "epoch": 0.20267909143855561, + "grad_norm": 5.612873554229736, + "learning_rate": 1.9971521035598705e-05, + "logits/chosen": -4.474349021911621, + "logits/rejected": -4.497475624084473, + "logps/chosen": -735.9964599609375, + "logps/rejected": -749.2137451171875, + "loss": 0.7599, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.16218391060829163, + "rewards/margins": 0.2070654332637787, + "rewards/rejected": -0.3692493438720703, + "step": 870 + }, + { + "epoch": 0.2050087361677344, + "grad_norm": 6.136228084564209, + "learning_rate": 1.9945631067961166e-05, + "logits/chosen": -4.436854839324951, + "logits/rejected": -4.380222797393799, + "logps/chosen": -713.2384643554688, + "logps/rejected": -718.5233154296875, + "loss": 0.8263, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.14623989164829254, + "rewards/margins": 0.1452106237411499, + "rewards/rejected": -0.291450560092926, + "step": 880 + }, + { + "epoch": 0.20733838089691323, + "grad_norm": 8.56090259552002, + "learning_rate": 1.9919741100323626e-05, + "logits/chosen": -4.393536567687988, + "logits/rejected": -4.48923921585083, + "logps/chosen": -723.2747192382812, + "logps/rejected": -769.8920288085938, + "loss": 0.7963, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.07917286455631256, + "rewards/margins": 0.2582031786441803, + "rewards/rejected": -0.33737602829933167, + "step": 890 + }, + { + "epoch": 0.20966802562609202, + "grad_norm": 7.170731544494629, + "learning_rate": 1.9893851132686087e-05, + "logits/chosen": -4.412181854248047, + "logits/rejected": -4.461060523986816, + "logps/chosen": -682.1312255859375, + "logps/rejected": -784.7744750976562, + "loss": 0.8967, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2536042332649231, + "rewards/margins": 0.0834280326962471, + "rewards/rejected": -0.3370322585105896, + "step": 900 + }, + { + "epoch": 0.20966802562609202, + "eval_logits/chosen": -4.388455390930176, + "eval_logits/rejected": -4.379168510437012, + "eval_logps/chosen": -693.9192504882812, + "eval_logps/rejected": -711.1783447265625, + "eval_loss": 0.662997305393219, + "eval_rewards/accuracies": 0.5950517058372498, + "eval_rewards/chosen": -0.30910125374794006, + "eval_rewards/margins": 0.12887336313724518, + "eval_rewards/rejected": -0.43797463178634644, + "eval_runtime": 386.1394, + "eval_samples_per_second": 18.527, + "eval_steps_per_second": 9.263, + "step": 900 + }, + { + "epoch": 0.2119976703552708, + "grad_norm": 6.341297149658203, + "learning_rate": 1.9867961165048548e-05, + "logits/chosen": -4.427966594696045, + "logits/rejected": -4.400241374969482, + "logps/chosen": -660.0777587890625, + "logps/rejected": -687.676513671875, + "loss": 0.7655, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24212121963500977, + "rewards/margins": 0.15326662361621857, + "rewards/rejected": -0.39538782835006714, + "step": 910 + }, + { + "epoch": 0.21432731508444963, + "grad_norm": 9.103260040283203, + "learning_rate": 1.9842071197411005e-05, + "logits/chosen": -4.519369125366211, + "logits/rejected": -4.517698764801025, + "logps/chosen": -740.9689331054688, + "logps/rejected": -760.4479370117188, + "loss": 0.8808, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2737390100955963, + "rewards/margins": 0.04373549669981003, + "rewards/rejected": -0.31747445464134216, + "step": 920 + }, + { + "epoch": 0.21665695981362842, + "grad_norm": 9.210978507995605, + "learning_rate": 1.9816181229773462e-05, + "logits/chosen": -4.456936359405518, + "logits/rejected": -4.353301048278809, + "logps/chosen": -755.6140747070312, + "logps/rejected": -709.889404296875, + "loss": 0.7476, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18943250179290771, + "rewards/margins": 0.25421398878097534, + "rewards/rejected": -0.44364649057388306, + "step": 930 + }, + { + "epoch": 0.21898660454280722, + "grad_norm": 9.391003608703613, + "learning_rate": 1.9790291262135922e-05, + "logits/chosen": -4.424135684967041, + "logits/rejected": -4.408591270446777, + "logps/chosen": -738.0665893554688, + "logps/rejected": -708.8335571289062, + "loss": 0.8815, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3330017924308777, + "rewards/margins": 0.11240987479686737, + "rewards/rejected": -0.445411741733551, + "step": 940 + }, + { + "epoch": 0.221316249271986, + "grad_norm": 9.435540199279785, + "learning_rate": 1.9764401294498383e-05, + "logits/chosen": -4.3817338943481445, + "logits/rejected": -4.482665538787842, + "logps/chosen": -679.2781982421875, + "logps/rejected": -762.5584716796875, + "loss": 0.8862, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4497762620449066, + "rewards/margins": -0.009173527359962463, + "rewards/rejected": -0.44060271978378296, + "step": 950 + }, + { + "epoch": 0.22364589400116483, + "grad_norm": 5.925865173339844, + "learning_rate": 1.9738511326860844e-05, + "logits/chosen": -4.449588298797607, + "logits/rejected": -4.440318584442139, + "logps/chosen": -747.8040771484375, + "logps/rejected": -748.3953857421875, + "loss": 0.9623, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.4708067774772644, + "rewards/margins": -0.09435828030109406, + "rewards/rejected": -0.37644851207733154, + "step": 960 + }, + { + "epoch": 0.22597553873034362, + "grad_norm": 9.26547908782959, + "learning_rate": 1.9712621359223304e-05, + "logits/chosen": -4.4216814041137695, + "logits/rejected": -4.480114936828613, + "logps/chosen": -650.800537109375, + "logps/rejected": -739.6800537109375, + "loss": 0.9505, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.33942896127700806, + "rewards/margins": -0.065024234354496, + "rewards/rejected": -0.27440470457077026, + "step": 970 + }, + { + "epoch": 0.22830518345952241, + "grad_norm": 6.078829765319824, + "learning_rate": 1.968673139158576e-05, + "logits/chosen": -4.370000839233398, + "logits/rejected": -4.366150856018066, + "logps/chosen": -676.8948974609375, + "logps/rejected": -675.8610229492188, + "loss": 0.8804, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.319058895111084, + "rewards/margins": -0.07290103286504745, + "rewards/rejected": -0.24615785479545593, + "step": 980 + }, + { + "epoch": 0.23063482818870124, + "grad_norm": 7.866084098815918, + "learning_rate": 1.9660841423948222e-05, + "logits/chosen": -4.496729850769043, + "logits/rejected": -4.4081621170043945, + "logps/chosen": -695.5631103515625, + "logps/rejected": -689.4572143554688, + "loss": 0.8705, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.31703701615333557, + "rewards/margins": 0.10829310119152069, + "rewards/rejected": -0.42533010244369507, + "step": 990 + }, + { + "epoch": 0.23296447291788003, + "grad_norm": 6.092545032501221, + "learning_rate": 1.963495145631068e-05, + "logits/chosen": -4.45365571975708, + "logits/rejected": -4.482058525085449, + "logps/chosen": -757.77294921875, + "logps/rejected": -785.2354736328125, + "loss": 0.7651, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5000513792037964, + "rewards/margins": 0.3355986475944519, + "rewards/rejected": -0.8356500864028931, + "step": 1000 + }, + { + "epoch": 0.23296447291788003, + "eval_logits/chosen": -4.350008010864258, + "eval_logits/rejected": -4.339516639709473, + "eval_logps/chosen": -694.97314453125, + "eval_logps/rejected": -712.5101928710938, + "eval_loss": 0.6633332967758179, + "eval_rewards/accuracies": 0.5963097810745239, + "eval_rewards/chosen": -0.4144977927207947, + "eval_rewards/margins": 0.1566552221775055, + "eval_rewards/rejected": -0.5711529850959778, + "eval_runtime": 386.6444, + "eval_samples_per_second": 18.503, + "eval_steps_per_second": 9.251, + "step": 1000 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 8.298626899719238, + "learning_rate": 1.960906148867314e-05, + "logits/chosen": -4.350672245025635, + "logits/rejected": -4.346297740936279, + "logps/chosen": -716.8140258789062, + "logps/rejected": -699.349365234375, + "loss": 0.8517, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.43861478567123413, + "rewards/margins": 0.12198150157928467, + "rewards/rejected": -0.5605962872505188, + "step": 1010 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 8.617300033569336, + "learning_rate": 1.95831715210356e-05, + "logits/chosen": -4.392172813415527, + "logits/rejected": -4.394693851470947, + "logps/chosen": -718.5160522460938, + "logps/rejected": -735.8982543945312, + "loss": 0.8677, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.34239235520362854, + "rewards/margins": 0.07352287322282791, + "rewards/rejected": -0.415915310382843, + "step": 1020 + }, + { + "epoch": 0.23995340710541643, + "grad_norm": 7.848562717437744, + "learning_rate": 1.955728155339806e-05, + "logits/chosen": -4.374752998352051, + "logits/rejected": -4.420811653137207, + "logps/chosen": -707.7733154296875, + "logps/rejected": -725.6073608398438, + "loss": 0.8094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.25753355026245117, + "rewards/margins": 0.20300650596618652, + "rewards/rejected": -0.4605400562286377, + "step": 1030 + }, + { + "epoch": 0.24228305183459523, + "grad_norm": 9.19383430480957, + "learning_rate": 1.9531391585760518e-05, + "logits/chosen": -4.342906475067139, + "logits/rejected": -4.432077884674072, + "logps/chosen": -674.3995361328125, + "logps/rejected": -725.902099609375, + "loss": 0.847, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.30617013573646545, + "rewards/margins": 0.07979961484670639, + "rewards/rejected": -0.38596972823143005, + "step": 1040 + }, + { + "epoch": 0.24461269656377402, + "grad_norm": 7.746910095214844, + "learning_rate": 1.950550161812298e-05, + "logits/chosen": -4.319817543029785, + "logits/rejected": -4.350963592529297, + "logps/chosen": -718.1383666992188, + "logps/rejected": -722.8746948242188, + "loss": 0.9053, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.3487966060638428, + "rewards/margins": 0.08206790685653687, + "rewards/rejected": -0.43086448311805725, + "step": 1050 + }, + { + "epoch": 0.24694234129295284, + "grad_norm": 11.534335136413574, + "learning_rate": 1.947961165048544e-05, + "logits/chosen": -4.359479904174805, + "logits/rejected": -4.37454891204834, + "logps/chosen": -709.6820068359375, + "logps/rejected": -742.3812255859375, + "loss": 0.8113, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.40116387605667114, + "rewards/margins": 0.22071246802806854, + "rewards/rejected": -0.6218763589859009, + "step": 1060 + }, + { + "epoch": 0.24927198602213163, + "grad_norm": 8.217765808105469, + "learning_rate": 1.9453721682847896e-05, + "logits/chosen": -4.362360954284668, + "logits/rejected": -4.356196403503418, + "logps/chosen": -716.28076171875, + "logps/rejected": -748.5253295898438, + "loss": 0.8515, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.47076430916786194, + "rewards/margins": 0.22061574459075928, + "rewards/rejected": -0.6913800239562988, + "step": 1070 + }, + { + "epoch": 0.2516016307513104, + "grad_norm": 6.56362247467041, + "learning_rate": 1.9427831715210357e-05, + "logits/chosen": -4.443305492401123, + "logits/rejected": -4.4894914627075195, + "logps/chosen": -729.105712890625, + "logps/rejected": -731.7708129882812, + "loss": 0.8697, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.4395231306552887, + "rewards/margins": 0.029054516926407814, + "rewards/rejected": -0.46857762336730957, + "step": 1080 + }, + { + "epoch": 0.2539312754804892, + "grad_norm": 7.56439733505249, + "learning_rate": 1.9401941747572818e-05, + "logits/chosen": -4.284507751464844, + "logits/rejected": -4.338918209075928, + "logps/chosen": -698.3824462890625, + "logps/rejected": -763.0474853515625, + "loss": 0.745, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.100661501288414, + "rewards/margins": 0.35776785016059875, + "rewards/rejected": -0.45842933654785156, + "step": 1090 + }, + { + "epoch": 0.256260920209668, + "grad_norm": 8.469297409057617, + "learning_rate": 1.9376051779935278e-05, + "logits/chosen": -4.2608771324157715, + "logits/rejected": -4.317883491516113, + "logps/chosen": -715.9654541015625, + "logps/rejected": -795.2291259765625, + "loss": 0.6523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29540759325027466, + "rewards/margins": 0.6277150511741638, + "rewards/rejected": -0.9231227040290833, + "step": 1100 + }, + { + "epoch": 0.256260920209668, + "eval_logits/chosen": -4.31129789352417, + "eval_logits/rejected": -4.299421310424805, + "eval_logps/chosen": -695.878173828125, + "eval_logps/rejected": -713.7141723632812, + "eval_loss": 0.6638560891151428, + "eval_rewards/accuracies": 0.5961699485778809, + "eval_rewards/chosen": -0.5049953460693359, + "eval_rewards/margins": 0.18655292689800262, + "eval_rewards/rejected": -0.691548228263855, + "eval_runtime": 386.1093, + "eval_samples_per_second": 18.528, + "eval_steps_per_second": 9.264, + "step": 1100 + }, + { + "epoch": 0.2585905649388468, + "grad_norm": 11.331911087036133, + "learning_rate": 1.9350161812297735e-05, + "logits/chosen": -4.392155170440674, + "logits/rejected": -4.309884548187256, + "logps/chosen": -723.4649658203125, + "logps/rejected": -722.3359985351562, + "loss": 0.8422, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.35999637842178345, + "rewards/margins": 0.22510388493537903, + "rewards/rejected": -0.5851002931594849, + "step": 1110 + }, + { + "epoch": 0.26092020966802565, + "grad_norm": 9.10745906829834, + "learning_rate": 1.9324271844660196e-05, + "logits/chosen": -4.381644248962402, + "logits/rejected": -4.304900169372559, + "logps/chosen": -765.546875, + "logps/rejected": -742.8046875, + "loss": 0.8466, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6179436445236206, + "rewards/margins": 0.15039575099945068, + "rewards/rejected": -0.7683394551277161, + "step": 1120 + }, + { + "epoch": 0.26324985439720444, + "grad_norm": 6.49190616607666, + "learning_rate": 1.9298381877022656e-05, + "logits/chosen": -4.226799488067627, + "logits/rejected": -4.376645088195801, + "logps/chosen": -648.4716796875, + "logps/rejected": -772.0549926757812, + "loss": 0.726, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18986962735652924, + "rewards/margins": 0.5118517875671387, + "rewards/rejected": -0.7017214894294739, + "step": 1130 + }, + { + "epoch": 0.26557949912638323, + "grad_norm": 7.391536235809326, + "learning_rate": 1.9272491909385117e-05, + "logits/chosen": -4.360841274261475, + "logits/rejected": -4.35951042175293, + "logps/chosen": -775.7753295898438, + "logps/rejected": -796.6776733398438, + "loss": 0.7789, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.459010511636734, + "rewards/margins": 0.3028544783592224, + "rewards/rejected": -0.7618650197982788, + "step": 1140 + }, + { + "epoch": 0.267909143855562, + "grad_norm": 7.595261573791504, + "learning_rate": 1.9246601941747574e-05, + "logits/chosen": -4.29518461227417, + "logits/rejected": -4.352611064910889, + "logps/chosen": -710.1934814453125, + "logps/rejected": -733.4653930664062, + "loss": 0.9506, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.4916602671146393, + "rewards/margins": -0.07938651740550995, + "rewards/rejected": -0.4122737944126129, + "step": 1150 + }, + { + "epoch": 0.2702387885847408, + "grad_norm": 7.323908805847168, + "learning_rate": 1.9220711974110035e-05, + "logits/chosen": -4.297641754150391, + "logits/rejected": -4.3460564613342285, + "logps/chosen": -671.62548828125, + "logps/rejected": -700.6619873046875, + "loss": 0.7593, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.31447774171829224, + "rewards/margins": 0.2587481141090393, + "rewards/rejected": -0.5732258558273315, + "step": 1160 + }, + { + "epoch": 0.2725684333139196, + "grad_norm": 4.596531867980957, + "learning_rate": 1.9194822006472492e-05, + "logits/chosen": -4.439651966094971, + "logits/rejected": -4.424775123596191, + "logps/chosen": -746.04248046875, + "logps/rejected": -760.7760009765625, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24861034750938416, + "rewards/margins": 0.4928106367588043, + "rewards/rejected": -0.7414209842681885, + "step": 1170 + }, + { + "epoch": 0.2748980780430984, + "grad_norm": 6.0940423011779785, + "learning_rate": 1.9168932038834952e-05, + "logits/chosen": -4.387946128845215, + "logits/rejected": -4.351739883422852, + "logps/chosen": -769.9338989257812, + "logps/rejected": -746.4041748046875, + "loss": 0.7558, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3385796546936035, + "rewards/margins": 0.41992324590682983, + "rewards/rejected": -0.7585029602050781, + "step": 1180 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 7.024069309234619, + "learning_rate": 1.9143042071197413e-05, + "logits/chosen": -4.372438907623291, + "logits/rejected": -4.313912868499756, + "logps/chosen": -724.6370849609375, + "logps/rejected": -753.0245361328125, + "loss": 0.8215, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.5210503935813904, + "rewards/margins": 0.0856110230088234, + "rewards/rejected": -0.6066614389419556, + "step": 1190 + }, + { + "epoch": 0.27955736750145604, + "grad_norm": 9.920272827148438, + "learning_rate": 1.9117152103559874e-05, + "logits/chosen": -4.4457221031188965, + "logits/rejected": -4.441542148590088, + "logps/chosen": -741.6122436523438, + "logps/rejected": -730.5281372070312, + "loss": 0.8034, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3603318929672241, + "rewards/margins": 0.28044313192367554, + "rewards/rejected": -0.6407750844955444, + "step": 1200 + }, + { + "epoch": 0.27955736750145604, + "eval_logits/chosen": -4.304655075073242, + "eval_logits/rejected": -4.292832374572754, + "eval_logps/chosen": -695.743896484375, + "eval_logps/rejected": -713.6777954101562, + "eval_loss": 0.6612324714660645, + "eval_rewards/accuracies": 0.6017612814903259, + "eval_rewards/chosen": -0.49156203866004944, + "eval_rewards/margins": 0.19635748863220215, + "eval_rewards/rejected": -0.6879194974899292, + "eval_runtime": 386.8833, + "eval_samples_per_second": 18.491, + "eval_steps_per_second": 9.246, + "step": 1200 + }, + { + "epoch": 0.28188701223063484, + "grad_norm": 7.033170223236084, + "learning_rate": 1.9091262135922334e-05, + "logits/chosen": -4.416142463684082, + "logits/rejected": -4.44868803024292, + "logps/chosen": -772.4678955078125, + "logps/rejected": -797.7364501953125, + "loss": 0.9886, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.44758549332618713, + "rewards/margins": -0.10882000625133514, + "rewards/rejected": -0.3387654423713684, + "step": 1210 + }, + { + "epoch": 0.28421665695981363, + "grad_norm": 8.389505386352539, + "learning_rate": 1.906537216828479e-05, + "logits/chosen": -4.408908843994141, + "logits/rejected": -4.390933036804199, + "logps/chosen": -770.1661987304688, + "logps/rejected": -758.59521484375, + "loss": 0.764, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4254428744316101, + "rewards/margins": 0.4820783734321594, + "rewards/rejected": -0.9075212478637695, + "step": 1220 + }, + { + "epoch": 0.2865463016889924, + "grad_norm": 8.975470542907715, + "learning_rate": 1.903948220064725e-05, + "logits/chosen": -4.323489189147949, + "logits/rejected": -4.383948802947998, + "logps/chosen": -663.6719970703125, + "logps/rejected": -720.05322265625, + "loss": 0.7602, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2482331544160843, + "rewards/margins": 0.3147343397140503, + "rewards/rejected": -0.5629674792289734, + "step": 1230 + }, + { + "epoch": 0.2888759464181712, + "grad_norm": 10.377336502075195, + "learning_rate": 1.901359223300971e-05, + "logits/chosen": -4.380834102630615, + "logits/rejected": -4.364059925079346, + "logps/chosen": -719.26904296875, + "logps/rejected": -734.4669799804688, + "loss": 0.93, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.5593451261520386, + "rewards/margins": 0.018338533118367195, + "rewards/rejected": -0.577683687210083, + "step": 1240 + }, + { + "epoch": 0.29120559114735, + "grad_norm": 7.079504013061523, + "learning_rate": 1.898770226537217e-05, + "logits/chosen": -4.397341728210449, + "logits/rejected": -4.27084493637085, + "logps/chosen": -759.1246948242188, + "logps/rejected": -689.4752197265625, + "loss": 0.6636, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.29703229665756226, + "rewards/margins": 0.3597315549850464, + "rewards/rejected": -0.6567639112472534, + "step": 1250 + }, + { + "epoch": 0.29353523587652885, + "grad_norm": 10.312458992004395, + "learning_rate": 1.896181229773463e-05, + "logits/chosen": -4.25052547454834, + "logits/rejected": -4.339222431182861, + "logps/chosen": -708.4290771484375, + "logps/rejected": -756.5723876953125, + "loss": 0.8946, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.4672269821166992, + "rewards/margins": 0.03996283560991287, + "rewards/rejected": -0.5071898698806763, + "step": 1260 + }, + { + "epoch": 0.29586488060570765, + "grad_norm": 7.909558296203613, + "learning_rate": 1.893592233009709e-05, + "logits/chosen": -4.474128723144531, + "logits/rejected": -4.338606834411621, + "logps/chosen": -749.0723876953125, + "logps/rejected": -767.09912109375, + "loss": 0.7758, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.48534131050109863, + "rewards/margins": 0.2608849108219147, + "rewards/rejected": -0.7462261915206909, + "step": 1270 + }, + { + "epoch": 0.29819452533488644, + "grad_norm": 7.5413737297058105, + "learning_rate": 1.8910032362459548e-05, + "logits/chosen": -4.267604351043701, + "logits/rejected": -4.25099515914917, + "logps/chosen": -688.8858642578125, + "logps/rejected": -764.21484375, + "loss": 0.8028, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4112478196620941, + "rewards/margins": 0.2967201769351959, + "rewards/rejected": -0.7079680562019348, + "step": 1280 + }, + { + "epoch": 0.30052417006406523, + "grad_norm": 12.668158531188965, + "learning_rate": 1.888414239482201e-05, + "logits/chosen": -4.369383811950684, + "logits/rejected": -4.310977935791016, + "logps/chosen": -717.2327880859375, + "logps/rejected": -691.5540161132812, + "loss": 0.791, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4307662844657898, + "rewards/margins": 0.2471897304058075, + "rewards/rejected": -0.6779559254646301, + "step": 1290 + }, + { + "epoch": 0.302853814793244, + "grad_norm": 7.491385459899902, + "learning_rate": 1.8858252427184466e-05, + "logits/chosen": -4.3847270011901855, + "logits/rejected": -4.315208435058594, + "logps/chosen": -688.0221557617188, + "logps/rejected": -711.5299682617188, + "loss": 0.7325, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.23917031288146973, + "rewards/margins": 0.3942749500274658, + "rewards/rejected": -0.6334452629089355, + "step": 1300 + }, + { + "epoch": 0.302853814793244, + "eval_logits/chosen": -4.31157922744751, + "eval_logits/rejected": -4.299931049346924, + "eval_logps/chosen": -695.6024780273438, + "eval_logps/rejected": -713.5684204101562, + "eval_loss": 0.6586904525756836, + "eval_rewards/accuracies": 0.6052557826042175, + "eval_rewards/chosen": -0.4774321913719177, + "eval_rewards/margins": 0.19954435527324677, + "eval_rewards/rejected": -0.6769765019416809, + "eval_runtime": 386.9714, + "eval_samples_per_second": 18.487, + "eval_steps_per_second": 9.244, + "step": 1300 + }, + { + "epoch": 0.3051834595224228, + "grad_norm": 7.889120578765869, + "learning_rate": 1.8832362459546926e-05, + "logits/chosen": -4.3173346519470215, + "logits/rejected": -4.450728416442871, + "logps/chosen": -675.0162963867188, + "logps/rejected": -716.7359008789062, + "loss": 0.8424, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.42047327756881714, + "rewards/margins": 0.21612660586833954, + "rewards/rejected": -0.6365998983383179, + "step": 1310 + }, + { + "epoch": 0.3075131042516016, + "grad_norm": 8.503684043884277, + "learning_rate": 1.8806472491909387e-05, + "logits/chosen": -4.400944232940674, + "logits/rejected": -4.452167987823486, + "logps/chosen": -692.8150024414062, + "logps/rejected": -786.423828125, + "loss": 0.7357, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4632536768913269, + "rewards/margins": 0.29990696907043457, + "rewards/rejected": -0.7631607055664062, + "step": 1320 + }, + { + "epoch": 0.30984274898078046, + "grad_norm": 8.10945987701416, + "learning_rate": 1.8780582524271848e-05, + "logits/chosen": -4.360897541046143, + "logits/rejected": -4.393882751464844, + "logps/chosen": -682.2403564453125, + "logps/rejected": -772.7156372070312, + "loss": 0.7315, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2753627896308899, + "rewards/margins": 0.5017082095146179, + "rewards/rejected": -0.7770709991455078, + "step": 1330 + }, + { + "epoch": 0.31217239370995925, + "grad_norm": 7.530776023864746, + "learning_rate": 1.8754692556634305e-05, + "logits/chosen": -4.405557155609131, + "logits/rejected": -4.374571800231934, + "logps/chosen": -734.5709228515625, + "logps/rejected": -706.4002685546875, + "loss": 0.8656, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5671267509460449, + "rewards/margins": 0.08109476417303085, + "rewards/rejected": -0.6482214331626892, + "step": 1340 + }, + { + "epoch": 0.31450203843913804, + "grad_norm": 11.22525405883789, + "learning_rate": 1.8728802588996765e-05, + "logits/chosen": -4.321959018707275, + "logits/rejected": -4.386361122131348, + "logps/chosen": -718.044677734375, + "logps/rejected": -772.4830932617188, + "loss": 0.9531, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.7779480218887329, + "rewards/margins": -0.03689789026975632, + "rewards/rejected": -0.741050124168396, + "step": 1350 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 10.8184232711792, + "learning_rate": 1.8702912621359222e-05, + "logits/chosen": -4.334450721740723, + "logits/rejected": -4.335959434509277, + "logps/chosen": -746.68798828125, + "logps/rejected": -753.0299072265625, + "loss": 0.9349, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5748792886734009, + "rewards/margins": 0.11975729465484619, + "rewards/rejected": -0.6946366429328918, + "step": 1360 + }, + { + "epoch": 0.31916132789749563, + "grad_norm": 6.547021389007568, + "learning_rate": 1.8677022653721683e-05, + "logits/chosen": -4.310304164886475, + "logits/rejected": -4.271243095397949, + "logps/chosen": -731.6600341796875, + "logps/rejected": -681.7969970703125, + "loss": 0.8502, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.48259344696998596, + "rewards/margins": 0.028724532574415207, + "rewards/rejected": -0.5113179087638855, + "step": 1370 + }, + { + "epoch": 0.3214909726266744, + "grad_norm": 8.706934928894043, + "learning_rate": 1.8651132686084144e-05, + "logits/chosen": -4.324995994567871, + "logits/rejected": -4.29234504699707, + "logps/chosen": -676.2664184570312, + "logps/rejected": -738.682373046875, + "loss": 0.8718, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.5631632804870605, + "rewards/margins": 0.12288354337215424, + "rewards/rejected": -0.6860467791557312, + "step": 1380 + }, + { + "epoch": 0.3238206173558532, + "grad_norm": 7.237886428833008, + "learning_rate": 1.8625242718446604e-05, + "logits/chosen": -4.271629333496094, + "logits/rejected": -4.291550636291504, + "logps/chosen": -673.0308837890625, + "logps/rejected": -716.6060791015625, + "loss": 0.8182, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4574052691459656, + "rewards/margins": 0.1533111035823822, + "rewards/rejected": -0.6107163429260254, + "step": 1390 + }, + { + "epoch": 0.326150262085032, + "grad_norm": 7.548694610595703, + "learning_rate": 1.8599352750809065e-05, + "logits/chosen": -4.321394920349121, + "logits/rejected": -4.348613739013672, + "logps/chosen": -730.42578125, + "logps/rejected": -778.80615234375, + "loss": 0.8771, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4148942828178406, + "rewards/margins": 0.17481055855751038, + "rewards/rejected": -0.5897048711776733, + "step": 1400 + }, + { + "epoch": 0.326150262085032, + "eval_logits/chosen": -4.305843353271484, + "eval_logits/rejected": -4.295498847961426, + "eval_logps/chosen": -696.2532958984375, + "eval_logps/rejected": -714.38623046875, + "eval_loss": 0.6589279770851135, + "eval_rewards/accuracies": 0.6079116463661194, + "eval_rewards/chosen": -0.5424960255622864, + "eval_rewards/margins": 0.21626760065555573, + "eval_rewards/rejected": -0.7587636113166809, + "eval_runtime": 386.9373, + "eval_samples_per_second": 18.489, + "eval_steps_per_second": 9.244, + "step": 1400 + }, + { + "epoch": 0.32847990681421085, + "grad_norm": 4.411736965179443, + "learning_rate": 1.8573462783171522e-05, + "logits/chosen": -4.300525665283203, + "logits/rejected": -4.326880931854248, + "logps/chosen": -726.6177978515625, + "logps/rejected": -726.2149047851562, + "loss": 0.8027, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3517182171344757, + "rewards/margins": 0.3191562294960022, + "rewards/rejected": -0.6708744168281555, + "step": 1410 + }, + { + "epoch": 0.33080955154338965, + "grad_norm": 9.20610523223877, + "learning_rate": 1.8547572815533983e-05, + "logits/chosen": -4.3987040519714355, + "logits/rejected": -4.352162837982178, + "logps/chosen": -703.8623046875, + "logps/rejected": -684.692626953125, + "loss": 0.8377, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5054625272750854, + "rewards/margins": 0.1009974479675293, + "rewards/rejected": -0.6064599752426147, + "step": 1420 + }, + { + "epoch": 0.33313919627256844, + "grad_norm": 9.253528594970703, + "learning_rate": 1.852168284789644e-05, + "logits/chosen": -4.3750433921813965, + "logits/rejected": -4.303120136260986, + "logps/chosen": -717.990966796875, + "logps/rejected": -651.62841796875, + "loss": 0.8924, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.5147886276245117, + "rewards/margins": 0.1544334441423416, + "rewards/rejected": -0.6692220568656921, + "step": 1430 + }, + { + "epoch": 0.33546884100174723, + "grad_norm": 10.970667839050293, + "learning_rate": 1.84957928802589e-05, + "logits/chosen": -4.304279327392578, + "logits/rejected": -4.316771507263184, + "logps/chosen": -666.4049072265625, + "logps/rejected": -702.8588256835938, + "loss": 0.9584, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.6840143799781799, + "rewards/margins": 0.024236023426055908, + "rewards/rejected": -0.7082504034042358, + "step": 1440 + }, + { + "epoch": 0.337798485730926, + "grad_norm": 7.63844108581543, + "learning_rate": 1.846990291262136e-05, + "logits/chosen": -4.291044235229492, + "logits/rejected": -4.272721290588379, + "logps/chosen": -723.7853393554688, + "logps/rejected": -716.9478149414062, + "loss": 0.845, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.3475242257118225, + "rewards/margins": 0.28106704354286194, + "rewards/rejected": -0.6285912394523621, + "step": 1450 + }, + { + "epoch": 0.3401281304601048, + "grad_norm": 6.421882629394531, + "learning_rate": 1.844401294498382e-05, + "logits/chosen": -4.327022552490234, + "logits/rejected": -4.355108261108398, + "logps/chosen": -661.6111450195312, + "logps/rejected": -672.7149658203125, + "loss": 0.7624, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23156344890594482, + "rewards/margins": 0.21851006150245667, + "rewards/rejected": -0.4500734806060791, + "step": 1460 + }, + { + "epoch": 0.3424577751892836, + "grad_norm": 3.8347349166870117, + "learning_rate": 1.841812297734628e-05, + "logits/chosen": -4.450470924377441, + "logits/rejected": -4.4103875160217285, + "logps/chosen": -746.5904541015625, + "logps/rejected": -708.0609130859375, + "loss": 0.7367, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.27077916264533997, + "rewards/margins": 0.39619284868240356, + "rewards/rejected": -0.6669719815254211, + "step": 1470 + }, + { + "epoch": 0.34478741991846246, + "grad_norm": 7.18529748916626, + "learning_rate": 1.839223300970874e-05, + "logits/chosen": -4.261987209320068, + "logits/rejected": -4.328606128692627, + "logps/chosen": -706.8624267578125, + "logps/rejected": -720.5556640625, + "loss": 0.8931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.40044647455215454, + "rewards/margins": 0.04966818541288376, + "rewards/rejected": -0.4501146674156189, + "step": 1480 + }, + { + "epoch": 0.34711706464764125, + "grad_norm": 8.048136711120605, + "learning_rate": 1.83663430420712e-05, + "logits/chosen": -4.311758041381836, + "logits/rejected": -4.399693012237549, + "logps/chosen": -728.1452026367188, + "logps/rejected": -779.4000244140625, + "loss": 0.7912, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4728098511695862, + "rewards/margins": 0.32029253244400024, + "rewards/rejected": -0.7931022644042969, + "step": 1490 + }, + { + "epoch": 0.34944670937682004, + "grad_norm": 10.150632858276367, + "learning_rate": 1.834045307443366e-05, + "logits/chosen": -4.398844242095947, + "logits/rejected": -4.398817539215088, + "logps/chosen": -764.0753173828125, + "logps/rejected": -768.4854125976562, + "loss": 0.8794, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3697890341281891, + "rewards/margins": 0.22263407707214355, + "rewards/rejected": -0.5924230813980103, + "step": 1500 + }, + { + "epoch": 0.34944670937682004, + "eval_logits/chosen": -4.326058864593506, + "eval_logits/rejected": -4.316102981567383, + "eval_logps/chosen": -695.7061767578125, + "eval_logps/rejected": -713.8124389648438, + "eval_loss": 0.6541875600814819, + "eval_rewards/accuracies": 0.6108470559120178, + "eval_rewards/chosen": -0.4877876341342926, + "eval_rewards/margins": 0.2135990709066391, + "eval_rewards/rejected": -0.7013866901397705, + "eval_runtime": 386.448, + "eval_samples_per_second": 18.512, + "eval_steps_per_second": 9.256, + "step": 1500 + }, + { + "epoch": 0.35177635410599883, + "grad_norm": 8.593611717224121, + "learning_rate": 1.8314563106796118e-05, + "logits/chosen": -4.412143707275391, + "logits/rejected": -4.4477434158325195, + "logps/chosen": -721.7753295898438, + "logps/rejected": -744.5516967773438, + "loss": 0.7167, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3608551621437073, + "rewards/margins": 0.30276018381118774, + "rewards/rejected": -0.6636154055595398, + "step": 1510 + }, + { + "epoch": 0.3541059988351776, + "grad_norm": 8.981035232543945, + "learning_rate": 1.8288673139158578e-05, + "logits/chosen": -4.372857093811035, + "logits/rejected": -4.306519508361816, + "logps/chosen": -817.6216430664062, + "logps/rejected": -790.0299682617188, + "loss": 0.9705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6924741864204407, + "rewards/margins": 0.003534305142238736, + "rewards/rejected": -0.6960083842277527, + "step": 1520 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 6.777766227722168, + "learning_rate": 1.8262783171521035e-05, + "logits/chosen": -4.304599761962891, + "logits/rejected": -4.376704216003418, + "logps/chosen": -679.5525512695312, + "logps/rejected": -784.6156005859375, + "loss": 0.761, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.41890984773635864, + "rewards/margins": 0.4356445372104645, + "rewards/rejected": -0.854554295539856, + "step": 1530 + }, + { + "epoch": 0.3587652882935352, + "grad_norm": 9.232786178588867, + "learning_rate": 1.8236893203883496e-05, + "logits/chosen": -4.431849956512451, + "logits/rejected": -4.395738124847412, + "logps/chosen": -705.1184692382812, + "logps/rejected": -734.9397583007812, + "loss": 0.9316, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.665607213973999, + "rewards/margins": -0.041909217834472656, + "rewards/rejected": -0.6236980557441711, + "step": 1540 + }, + { + "epoch": 0.36109493302271406, + "grad_norm": 10.691422462463379, + "learning_rate": 1.8211003236245956e-05, + "logits/chosen": -4.34941291809082, + "logits/rejected": -4.335809230804443, + "logps/chosen": -755.9302978515625, + "logps/rejected": -768.3907470703125, + "loss": 0.8211, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.5321193933486938, + "rewards/margins": 0.3216237425804138, + "rewards/rejected": -0.8537429571151733, + "step": 1550 + }, + { + "epoch": 0.36342457775189285, + "grad_norm": 6.296148777008057, + "learning_rate": 1.8185113268608417e-05, + "logits/chosen": -4.34104061126709, + "logits/rejected": -4.358175754547119, + "logps/chosen": -736.5698852539062, + "logps/rejected": -756.3456420898438, + "loss": 0.7405, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.39380258321762085, + "rewards/margins": 0.2697417140007019, + "rewards/rejected": -0.6635442972183228, + "step": 1560 + }, + { + "epoch": 0.36575422248107164, + "grad_norm": 6.31267786026001, + "learning_rate": 1.8159223300970878e-05, + "logits/chosen": -4.363784313201904, + "logits/rejected": -4.2965593338012695, + "logps/chosen": -703.5390014648438, + "logps/rejected": -704.9164428710938, + "loss": 0.7817, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.47610020637512207, + "rewards/margins": 0.39951229095458984, + "rewards/rejected": -0.8756124377250671, + "step": 1570 + }, + { + "epoch": 0.36808386721025044, + "grad_norm": 7.0956130027771, + "learning_rate": 1.8133333333333335e-05, + "logits/chosen": -4.416982173919678, + "logits/rejected": -4.3702006340026855, + "logps/chosen": -671.6224365234375, + "logps/rejected": -645.9537963867188, + "loss": 0.906, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6286529302597046, + "rewards/margins": -0.013773918151855469, + "rewards/rejected": -0.6148789525032043, + "step": 1580 + }, + { + "epoch": 0.37041351193942923, + "grad_norm": 6.213327884674072, + "learning_rate": 1.8107443365695795e-05, + "logits/chosen": -4.385059356689453, + "logits/rejected": -4.5222296714782715, + "logps/chosen": -697.7130737304688, + "logps/rejected": -820.3958740234375, + "loss": 0.7444, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5424093008041382, + "rewards/margins": 0.4374067187309265, + "rewards/rejected": -0.9798160791397095, + "step": 1590 + }, + { + "epoch": 0.372743156668608, + "grad_norm": 8.027915000915527, + "learning_rate": 1.8081553398058253e-05, + "logits/chosen": -4.39331579208374, + "logits/rejected": -4.453400611877441, + "logps/chosen": -701.2443237304688, + "logps/rejected": -756.2605590820312, + "loss": 0.772, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.38883525133132935, + "rewards/margins": 0.19603821635246277, + "rewards/rejected": -0.5848734974861145, + "step": 1600 + }, + { + "epoch": 0.372743156668608, + "eval_logits/chosen": -4.31981086730957, + "eval_logits/rejected": -4.309142112731934, + "eval_logps/chosen": -696.2880859375, + "eval_logps/rejected": -714.566650390625, + "eval_loss": 0.655998170375824, + "eval_rewards/accuracies": 0.6118255257606506, + "eval_rewards/chosen": -0.5459803938865662, + "eval_rewards/margins": 0.23082096874713898, + "eval_rewards/rejected": -0.7768014073371887, + "eval_runtime": 387.0177, + "eval_samples_per_second": 18.485, + "eval_steps_per_second": 9.242, + "step": 1600 + }, + { + "epoch": 0.3750728013977868, + "grad_norm": 6.432879447937012, + "learning_rate": 1.8055663430420713e-05, + "logits/chosen": -4.410266399383545, + "logits/rejected": -4.367775917053223, + "logps/chosen": -733.0140380859375, + "logps/rejected": -738.000244140625, + "loss": 0.7472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3916794955730438, + "rewards/margins": 0.4530234932899475, + "rewards/rejected": -0.8447030186653137, + "step": 1610 + }, + { + "epoch": 0.37740244612696566, + "grad_norm": 10.104506492614746, + "learning_rate": 1.8029773462783174e-05, + "logits/chosen": -4.375548362731934, + "logits/rejected": -4.408837795257568, + "logps/chosen": -729.4613037109375, + "logps/rejected": -811.3401489257812, + "loss": 0.9296, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3973383903503418, + "rewards/margins": 0.030383765697479248, + "rewards/rejected": -0.42772215604782104, + "step": 1620 + }, + { + "epoch": 0.37973209085614446, + "grad_norm": 8.92159366607666, + "learning_rate": 1.8003883495145634e-05, + "logits/chosen": -4.325889587402344, + "logits/rejected": -4.390286445617676, + "logps/chosen": -704.9832763671875, + "logps/rejected": -721.9518432617188, + "loss": 0.7323, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4084460139274597, + "rewards/margins": 0.39792656898498535, + "rewards/rejected": -0.8063725233078003, + "step": 1630 + }, + { + "epoch": 0.38206173558532325, + "grad_norm": 9.660841941833496, + "learning_rate": 1.797799352750809e-05, + "logits/chosen": -4.30095911026001, + "logits/rejected": -4.326530456542969, + "logps/chosen": -711.6383666992188, + "logps/rejected": -741.7811279296875, + "loss": 0.8577, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4106476902961731, + "rewards/margins": 0.06246136501431465, + "rewards/rejected": -0.47310906648635864, + "step": 1640 + }, + { + "epoch": 0.38439138031450204, + "grad_norm": 5.930005073547363, + "learning_rate": 1.7952103559870552e-05, + "logits/chosen": -4.294757843017578, + "logits/rejected": -4.306758880615234, + "logps/chosen": -688.84375, + "logps/rejected": -730.2039794921875, + "loss": 0.7176, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3881547451019287, + "rewards/margins": 0.5018659830093384, + "rewards/rejected": -0.8900208473205566, + "step": 1650 + }, + { + "epoch": 0.38672102504368083, + "grad_norm": 7.299777507781982, + "learning_rate": 1.792621359223301e-05, + "logits/chosen": -4.340909957885742, + "logits/rejected": -4.285897254943848, + "logps/chosen": -688.5858154296875, + "logps/rejected": -659.2593383789062, + "loss": 0.9392, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.6448443531990051, + "rewards/margins": -0.16130118072032928, + "rewards/rejected": -0.4835430979728699, + "step": 1660 + }, + { + "epoch": 0.3890506697728596, + "grad_norm": 7.8056182861328125, + "learning_rate": 1.790032362459547e-05, + "logits/chosen": -4.302353858947754, + "logits/rejected": -4.317826271057129, + "logps/chosen": -735.2644653320312, + "logps/rejected": -726.2462158203125, + "loss": 0.8632, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4602840840816498, + "rewards/margins": 0.10094530880451202, + "rewards/rejected": -0.5612293481826782, + "step": 1670 + }, + { + "epoch": 0.3913803145020384, + "grad_norm": 6.735440254211426, + "learning_rate": 1.787443365695793e-05, + "logits/chosen": -4.3868608474731445, + "logits/rejected": -4.315372943878174, + "logps/chosen": -763.2335815429688, + "logps/rejected": -739.8739013671875, + "loss": 0.8695, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.4911941885948181, + "rewards/margins": 0.14432446658611298, + "rewards/rejected": -0.6355187296867371, + "step": 1680 + }, + { + "epoch": 0.39370995923121727, + "grad_norm": 9.822698593139648, + "learning_rate": 1.784854368932039e-05, + "logits/chosen": -4.311770439147949, + "logits/rejected": -4.3782219886779785, + "logps/chosen": -653.6517333984375, + "logps/rejected": -746.6046142578125, + "loss": 0.7848, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.33290696144104004, + "rewards/margins": 0.3551305830478668, + "rewards/rejected": -0.6880375146865845, + "step": 1690 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 6.626972198486328, + "learning_rate": 1.782265372168285e-05, + "logits/chosen": -4.282130241394043, + "logits/rejected": -4.3887200355529785, + "logps/chosen": -716.3925170898438, + "logps/rejected": -813.790771484375, + "loss": 0.965, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6478350758552551, + "rewards/margins": 0.02205513045191765, + "rewards/rejected": -0.6698901653289795, + "step": 1700 + }, + { + "epoch": 0.39603960396039606, + "eval_logits/chosen": -4.333280563354492, + "eval_logits/rejected": -4.32305383682251, + "eval_logps/chosen": -695.7999877929688, + "eval_logps/rejected": -714.0227661132812, + "eval_loss": 0.6525019407272339, + "eval_rewards/accuracies": 0.6146211624145508, + "eval_rewards/chosen": -0.49716758728027344, + "eval_rewards/margins": 0.22524842619895935, + "eval_rewards/rejected": -0.7224159836769104, + "eval_runtime": 386.661, + "eval_samples_per_second": 18.502, + "eval_steps_per_second": 9.251, + "step": 1700 + }, + { + "epoch": 0.39836924868957485, + "grad_norm": 9.437028884887695, + "learning_rate": 1.779676375404531e-05, + "logits/chosen": -4.359824180603027, + "logits/rejected": -4.38698148727417, + "logps/chosen": -721.0726318359375, + "logps/rejected": -744.590087890625, + "loss": 0.7858, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.35493481159210205, + "rewards/margins": 0.24104556441307068, + "rewards/rejected": -0.5959803462028503, + "step": 1710 + }, + { + "epoch": 0.40069889341875364, + "grad_norm": 4.35906457901001, + "learning_rate": 1.7770873786407766e-05, + "logits/chosen": -4.450234413146973, + "logits/rejected": -4.379258155822754, + "logps/chosen": -700.4639892578125, + "logps/rejected": -632.0845947265625, + "loss": 0.7615, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5291283130645752, + "rewards/margins": 0.2273644655942917, + "rewards/rejected": -0.7564927935600281, + "step": 1720 + }, + { + "epoch": 0.40302853814793244, + "grad_norm": 4.904123783111572, + "learning_rate": 1.7744983818770226e-05, + "logits/chosen": -4.306130409240723, + "logits/rejected": -4.339006423950195, + "logps/chosen": -698.2767944335938, + "logps/rejected": -711.9406127929688, + "loss": 0.7953, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3669876754283905, + "rewards/margins": 0.28200289607048035, + "rewards/rejected": -0.6489905714988708, + "step": 1730 + }, + { + "epoch": 0.40535818287711123, + "grad_norm": 6.090320587158203, + "learning_rate": 1.7719093851132687e-05, + "logits/chosen": -4.347052574157715, + "logits/rejected": -4.323563098907471, + "logps/chosen": -722.5578002929688, + "logps/rejected": -726.0507202148438, + "loss": 0.7954, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4122798442840576, + "rewards/margins": 0.28659650683403015, + "rewards/rejected": -0.6988764405250549, + "step": 1740 + }, + { + "epoch": 0.40768782760629, + "grad_norm": 8.839376449584961, + "learning_rate": 1.7693203883495148e-05, + "logits/chosen": -4.381305694580078, + "logits/rejected": -4.384965896606445, + "logps/chosen": -760.0098876953125, + "logps/rejected": -822.9794921875, + "loss": 0.6897, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36683160066604614, + "rewards/margins": 0.5118778944015503, + "rewards/rejected": -0.8787094354629517, + "step": 1750 + }, + { + "epoch": 0.4100174723354688, + "grad_norm": 9.633819580078125, + "learning_rate": 1.7667313915857608e-05, + "logits/chosen": -4.34865665435791, + "logits/rejected": -4.264157295227051, + "logps/chosen": -707.0336303710938, + "logps/rejected": -721.6454467773438, + "loss": 0.7819, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.34074828028678894, + "rewards/margins": 0.33113110065460205, + "rewards/rejected": -0.6718794107437134, + "step": 1760 + }, + { + "epoch": 0.41234711706464766, + "grad_norm": 6.981215000152588, + "learning_rate": 1.7641423948220065e-05, + "logits/chosen": -4.279094696044922, + "logits/rejected": -4.388249397277832, + "logps/chosen": -693.2166748046875, + "logps/rejected": -756.8175048828125, + "loss": 0.6648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4187663495540619, + "rewards/margins": 0.4338448941707611, + "rewards/rejected": -0.852611243724823, + "step": 1770 + }, + { + "epoch": 0.41467676179382645, + "grad_norm": 9.351151466369629, + "learning_rate": 1.7615533980582526e-05, + "logits/chosen": -4.388346195220947, + "logits/rejected": -4.325381278991699, + "logps/chosen": -734.6055908203125, + "logps/rejected": -761.28955078125, + "loss": 0.8392, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3040839433670044, + "rewards/margins": 0.2513015866279602, + "rewards/rejected": -0.5553855895996094, + "step": 1780 + }, + { + "epoch": 0.41700640652300525, + "grad_norm": 9.658149719238281, + "learning_rate": 1.7589644012944986e-05, + "logits/chosen": -4.396363258361816, + "logits/rejected": -4.409237861633301, + "logps/chosen": -751.6793823242188, + "logps/rejected": -754.2955932617188, + "loss": 0.7963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.565523624420166, + "rewards/margins": 0.3098762333393097, + "rewards/rejected": -0.8753998875617981, + "step": 1790 + }, + { + "epoch": 0.41933605125218404, + "grad_norm": 8.52037525177002, + "learning_rate": 1.7563754045307444e-05, + "logits/chosen": -4.327354431152344, + "logits/rejected": -4.397865295410156, + "logps/chosen": -631.0447998046875, + "logps/rejected": -733.0093994140625, + "loss": 0.7763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38907307386398315, + "rewards/margins": 0.29196518659591675, + "rewards/rejected": -0.6810382604598999, + "step": 1800 + }, + { + "epoch": 0.41933605125218404, + "eval_logits/chosen": -4.320523738861084, + "eval_logits/rejected": -4.310983657836914, + "eval_logps/chosen": -696.6057739257812, + "eval_logps/rejected": -715.0689086914062, + "eval_loss": 0.6532381176948547, + "eval_rewards/accuracies": 0.6126642227172852, + "eval_rewards/chosen": -0.577749490737915, + "eval_rewards/margins": 0.24928320944309235, + "eval_rewards/rejected": -0.8270328044891357, + "eval_runtime": 386.9896, + "eval_samples_per_second": 18.486, + "eval_steps_per_second": 9.243, + "step": 1800 + }, + { + "epoch": 0.42166569598136283, + "grad_norm": 6.045749664306641, + "learning_rate": 1.7537864077669904e-05, + "logits/chosen": -4.361016273498535, + "logits/rejected": -4.3705153465271, + "logps/chosen": -728.2724609375, + "logps/rejected": -753.5090942382812, + "loss": 0.802, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5839508771896362, + "rewards/margins": 0.3277234435081482, + "rewards/rejected": -0.9116743206977844, + "step": 1810 + }, + { + "epoch": 0.4239953407105416, + "grad_norm": 8.700886726379395, + "learning_rate": 1.7511974110032365e-05, + "logits/chosen": -4.295271873474121, + "logits/rejected": -4.3172783851623535, + "logps/chosen": -739.1973266601562, + "logps/rejected": -734.2899169921875, + "loss": 0.9811, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.833113968372345, + "rewards/margins": -0.08772550523281097, + "rewards/rejected": -0.745388388633728, + "step": 1820 + }, + { + "epoch": 0.4263249854397204, + "grad_norm": 6.315260887145996, + "learning_rate": 1.7486084142394822e-05, + "logits/chosen": -4.33939790725708, + "logits/rejected": -4.366152763366699, + "logps/chosen": -759.6776123046875, + "logps/rejected": -831.0908203125, + "loss": 0.7208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4980863928794861, + "rewards/margins": 0.3789387345314026, + "rewards/rejected": -0.8770251274108887, + "step": 1830 + }, + { + "epoch": 0.42865463016889926, + "grad_norm": 5.997265815734863, + "learning_rate": 1.7460194174757283e-05, + "logits/chosen": -4.340653896331787, + "logits/rejected": -4.308244705200195, + "logps/chosen": -675.6088256835938, + "logps/rejected": -747.6590576171875, + "loss": 0.8243, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.451867014169693, + "rewards/margins": 0.26603466272354126, + "rewards/rejected": -0.7179016470909119, + "step": 1840 + }, + { + "epoch": 0.43098427489807806, + "grad_norm": 11.125731468200684, + "learning_rate": 1.7434304207119743e-05, + "logits/chosen": -4.373724937438965, + "logits/rejected": -4.415951251983643, + "logps/chosen": -709.854248046875, + "logps/rejected": -764.4318237304688, + "loss": 0.867, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5079065561294556, + "rewards/margins": 0.07335279881954193, + "rewards/rejected": -0.5812593698501587, + "step": 1850 + }, + { + "epoch": 0.43331391962725685, + "grad_norm": 5.7267279624938965, + "learning_rate": 1.7408414239482204e-05, + "logits/chosen": -4.354807376861572, + "logits/rejected": -4.387138366699219, + "logps/chosen": -755.1207275390625, + "logps/rejected": -792.3892822265625, + "loss": 0.8665, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6459492444992065, + "rewards/margins": 0.4302903711795807, + "rewards/rejected": -1.0762395858764648, + "step": 1860 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 8.538694381713867, + "learning_rate": 1.738252427184466e-05, + "logits/chosen": -4.39115047454834, + "logits/rejected": -4.409638404846191, + "logps/chosen": -694.1473388671875, + "logps/rejected": -731.4945068359375, + "loss": 0.7961, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44678935408592224, + "rewards/margins": 0.20480577647686005, + "rewards/rejected": -0.6515951156616211, + "step": 1870 + }, + { + "epoch": 0.43797320908561443, + "grad_norm": 8.02554702758789, + "learning_rate": 1.735663430420712e-05, + "logits/chosen": -4.342083930969238, + "logits/rejected": -4.343569278717041, + "logps/chosen": -730.3948364257812, + "logps/rejected": -747.2578735351562, + "loss": 0.8439, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6137405633926392, + "rewards/margins": 0.35736626386642456, + "rewards/rejected": -0.9711068272590637, + "step": 1880 + }, + { + "epoch": 0.4403028538147932, + "grad_norm": 5.830707550048828, + "learning_rate": 1.7330744336569582e-05, + "logits/chosen": -4.3233842849731445, + "logits/rejected": -4.377291202545166, + "logps/chosen": -675.5960083007812, + "logps/rejected": -787.6995849609375, + "loss": 0.6188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21142582595348358, + "rewards/margins": 0.599100649356842, + "rewards/rejected": -0.8105264902114868, + "step": 1890 + }, + { + "epoch": 0.442632498543972, + "grad_norm": 7.2594194412231445, + "learning_rate": 1.730485436893204e-05, + "logits/chosen": -4.270476341247559, + "logits/rejected": -4.268796920776367, + "logps/chosen": -724.4967651367188, + "logps/rejected": -682.5066528320312, + "loss": 0.7585, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5578378438949585, + "rewards/margins": 0.33382734656333923, + "rewards/rejected": -0.8916652798652649, + "step": 1900 + }, + { + "epoch": 0.442632498543972, + "eval_logits/chosen": -4.3057966232299805, + "eval_logits/rejected": -4.29526424407959, + "eval_logps/chosen": -696.5594482421875, + "eval_logps/rejected": -715.0735473632812, + "eval_loss": 0.653139054775238, + "eval_rewards/accuracies": 0.6157394647598267, + "eval_rewards/chosen": -0.5731170773506165, + "eval_rewards/margins": 0.2543674111366272, + "eval_rewards/rejected": -0.8274844884872437, + "eval_runtime": 387.1244, + "eval_samples_per_second": 18.48, + "eval_steps_per_second": 9.24, + "step": 1900 + }, + { + "epoch": 0.44496214327315087, + "grad_norm": 8.877416610717773, + "learning_rate": 1.72789644012945e-05, + "logits/chosen": -4.295590400695801, + "logits/rejected": -4.402998447418213, + "logps/chosen": -655.2674560546875, + "logps/rejected": -729.1608276367188, + "loss": 0.7584, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.356262743473053, + "rewards/margins": 0.38617458939552307, + "rewards/rejected": -0.7424373626708984, + "step": 1910 + }, + { + "epoch": 0.44729178800232966, + "grad_norm": 5.766246795654297, + "learning_rate": 1.725307443365696e-05, + "logits/chosen": -4.296099662780762, + "logits/rejected": -4.2631754875183105, + "logps/chosen": -700.16455078125, + "logps/rejected": -688.3130493164062, + "loss": 0.7649, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5256198048591614, + "rewards/margins": 0.33562612533569336, + "rewards/rejected": -0.8612459301948547, + "step": 1920 + }, + { + "epoch": 0.44962143273150845, + "grad_norm": 10.782840728759766, + "learning_rate": 1.722718446601942e-05, + "logits/chosen": -4.388967037200928, + "logits/rejected": -4.391193866729736, + "logps/chosen": -786.0943603515625, + "logps/rejected": -793.2239379882812, + "loss": 0.945, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6996707320213318, + "rewards/margins": 0.10727129131555557, + "rewards/rejected": -0.8069421052932739, + "step": 1930 + }, + { + "epoch": 0.45195107746068724, + "grad_norm": 7.510255336761475, + "learning_rate": 1.7201294498381878e-05, + "logits/chosen": -4.252111911773682, + "logits/rejected": -4.227831840515137, + "logps/chosen": -678.5157470703125, + "logps/rejected": -728.3531494140625, + "loss": 0.864, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.39262905716896057, + "rewards/margins": 0.1792100965976715, + "rewards/rejected": -0.5718391537666321, + "step": 1940 + }, + { + "epoch": 0.45428072218986604, + "grad_norm": 10.387371063232422, + "learning_rate": 1.717540453074434e-05, + "logits/chosen": -4.272866249084473, + "logits/rejected": -4.370764255523682, + "logps/chosen": -711.3887939453125, + "logps/rejected": -780.9873657226562, + "loss": 0.7402, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.4937829077243805, + "rewards/margins": 0.5505653619766235, + "rewards/rejected": -1.044348120689392, + "step": 1950 + }, + { + "epoch": 0.45661036691904483, + "grad_norm": 8.56853199005127, + "learning_rate": 1.7149514563106796e-05, + "logits/chosen": -4.274458885192871, + "logits/rejected": -4.352829933166504, + "logps/chosen": -661.9822998046875, + "logps/rejected": -747.2185668945312, + "loss": 0.6744, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.460103839635849, + "rewards/margins": 0.5049347281455994, + "rewards/rejected": -0.9650384783744812, + "step": 1960 + }, + { + "epoch": 0.4589400116482236, + "grad_norm": 7.608241081237793, + "learning_rate": 1.7123624595469256e-05, + "logits/chosen": -4.223053932189941, + "logits/rejected": -4.191582202911377, + "logps/chosen": -636.3646240234375, + "logps/rejected": -671.3651123046875, + "loss": 0.8721, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.595376193523407, + "rewards/margins": 0.1826825588941574, + "rewards/rejected": -0.7780587673187256, + "step": 1970 + }, + { + "epoch": 0.46126965637740247, + "grad_norm": 7.9133734703063965, + "learning_rate": 1.7097734627831717e-05, + "logits/chosen": -4.371428489685059, + "logits/rejected": -4.285247325897217, + "logps/chosen": -673.6553955078125, + "logps/rejected": -660.1832275390625, + "loss": 0.849, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.640204906463623, + "rewards/margins": 0.06438927352428436, + "rewards/rejected": -0.7045941948890686, + "step": 1980 + }, + { + "epoch": 0.46359930110658126, + "grad_norm": 5.946269512176514, + "learning_rate": 1.7071844660194178e-05, + "logits/chosen": -4.309154510498047, + "logits/rejected": -4.383447647094727, + "logps/chosen": -661.7271728515625, + "logps/rejected": -756.4297485351562, + "loss": 0.6573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39469021558761597, + "rewards/margins": 0.5698471069335938, + "rewards/rejected": -0.9645372629165649, + "step": 1990 + }, + { + "epoch": 0.46592894583576006, + "grad_norm": 9.132586479187012, + "learning_rate": 1.7045954692556638e-05, + "logits/chosen": -4.325804710388184, + "logits/rejected": -4.374610900878906, + "logps/chosen": -753.6336669921875, + "logps/rejected": -821.5224609375, + "loss": 0.7913, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.43599969148635864, + "rewards/margins": 0.2553943991661072, + "rewards/rejected": -0.691394031047821, + "step": 2000 + }, + { + "epoch": 0.46592894583576006, + "eval_logits/chosen": -4.316074848175049, + "eval_logits/rejected": -4.305654525756836, + "eval_logps/chosen": -696.9720458984375, + "eval_logps/rejected": -715.6248779296875, + "eval_loss": 0.6540352702140808, + "eval_rewards/accuracies": 0.6111266613006592, + "eval_rewards/chosen": -0.6143832206726074, + "eval_rewards/margins": 0.2682493329048157, + "eval_rewards/rejected": -0.8826324939727783, + "eval_runtime": 387.4872, + "eval_samples_per_second": 18.463, + "eval_steps_per_second": 9.231, + "step": 2000 + }, + { + "epoch": 0.46825859056493885, + "grad_norm": 9.916934967041016, + "learning_rate": 1.7020064724919095e-05, + "logits/chosen": -4.3210673332214355, + "logits/rejected": -4.371387481689453, + "logps/chosen": -659.3184204101562, + "logps/rejected": -739.4029541015625, + "loss": 1.0101, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5967248678207397, + "rewards/margins": -0.05980793386697769, + "rewards/rejected": -0.5369168519973755, + "step": 2010 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 11.016702651977539, + "learning_rate": 1.6994174757281553e-05, + "logits/chosen": -4.28005313873291, + "logits/rejected": -4.314024925231934, + "logps/chosen": -630.3978271484375, + "logps/rejected": -720.5894775390625, + "loss": 0.8639, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6581310033798218, + "rewards/margins": 0.1440017968416214, + "rewards/rejected": -0.802132785320282, + "step": 2020 + }, + { + "epoch": 0.47291788002329643, + "grad_norm": 9.603341102600098, + "learning_rate": 1.6968284789644013e-05, + "logits/chosen": -4.329268455505371, + "logits/rejected": -4.366645336151123, + "logps/chosen": -709.0260620117188, + "logps/rejected": -751.087158203125, + "loss": 0.7811, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4101952612400055, + "rewards/margins": 0.3048393130302429, + "rewards/rejected": -0.7150346040725708, + "step": 2030 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 6.993385314941406, + "learning_rate": 1.6942394822006474e-05, + "logits/chosen": -4.323848247528076, + "logits/rejected": -4.403807640075684, + "logps/chosen": -671.4868774414062, + "logps/rejected": -710.583984375, + "loss": 0.7097, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5717336535453796, + "rewards/margins": 0.4448729157447815, + "rewards/rejected": -1.0166065692901611, + "step": 2040 + }, + { + "epoch": 0.4775771694816541, + "grad_norm": 9.222648620605469, + "learning_rate": 1.6916504854368934e-05, + "logits/chosen": -4.329625129699707, + "logits/rejected": -4.326584815979004, + "logps/chosen": -730.6510009765625, + "logps/rejected": -748.1927490234375, + "loss": 0.8364, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6884124279022217, + "rewards/margins": 0.2536119520664215, + "rewards/rejected": -0.9420243501663208, + "step": 2050 + }, + { + "epoch": 0.47990681421083287, + "grad_norm": 10.766345977783203, + "learning_rate": 1.6890614886731395e-05, + "logits/chosen": -4.456717491149902, + "logits/rejected": -4.4114227294921875, + "logps/chosen": -745.6207275390625, + "logps/rejected": -780.1121826171875, + "loss": 0.8846, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6652731895446777, + "rewards/margins": 0.17692777514457703, + "rewards/rejected": -0.8422010540962219, + "step": 2060 + }, + { + "epoch": 0.48223645894001166, + "grad_norm": 9.230855941772461, + "learning_rate": 1.6864724919093852e-05, + "logits/chosen": -4.3310747146606445, + "logits/rejected": -4.304535388946533, + "logps/chosen": -767.2239990234375, + "logps/rejected": -702.9869384765625, + "loss": 0.8106, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.44257909059524536, + "rewards/margins": 0.2606015205383301, + "rewards/rejected": -0.7031804919242859, + "step": 2070 + }, + { + "epoch": 0.48456610366919045, + "grad_norm": 7.361287593841553, + "learning_rate": 1.6838834951456313e-05, + "logits/chosen": -4.302729606628418, + "logits/rejected": -4.313250541687012, + "logps/chosen": -648.447998046875, + "logps/rejected": -700.7833862304688, + "loss": 0.7302, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.41339558362960815, + "rewards/margins": 0.25599178671836853, + "rewards/rejected": -0.6693874001502991, + "step": 2080 + }, + { + "epoch": 0.48689574839836924, + "grad_norm": 8.901455879211426, + "learning_rate": 1.681294498381877e-05, + "logits/chosen": -4.450462818145752, + "logits/rejected": -4.434903621673584, + "logps/chosen": -756.1082763671875, + "logps/rejected": -768.9176635742188, + "loss": 0.8059, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6943671107292175, + "rewards/margins": 0.14816538989543915, + "rewards/rejected": -0.8425325155258179, + "step": 2090 + }, + { + "epoch": 0.48922539312754804, + "grad_norm": 11.805319786071777, + "learning_rate": 1.678705501618123e-05, + "logits/chosen": -4.2671709060668945, + "logits/rejected": -4.283616542816162, + "logps/chosen": -706.508056640625, + "logps/rejected": -705.4736328125, + "loss": 0.8142, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6520582437515259, + "rewards/margins": 0.3638521134853363, + "rewards/rejected": -1.0159103870391846, + "step": 2100 + }, + { + "epoch": 0.48922539312754804, + "eval_logits/chosen": -4.311291217803955, + "eval_logits/rejected": -4.300279140472412, + "eval_logps/chosen": -697.2134399414062, + "eval_logps/rejected": -715.9606323242188, + "eval_loss": 0.6529696583747864, + "eval_rewards/accuracies": 0.6108470559120178, + "eval_rewards/chosen": -0.6385191679000854, + "eval_rewards/margins": 0.27768629789352417, + "eval_rewards/rejected": -0.9162055253982544, + "eval_runtime": 387.8096, + "eval_samples_per_second": 18.447, + "eval_steps_per_second": 9.224, + "step": 2100 + }, + { + "epoch": 0.49155503785672683, + "grad_norm": 7.226107120513916, + "learning_rate": 1.676116504854369e-05, + "logits/chosen": -4.375695705413818, + "logits/rejected": -4.270474433898926, + "logps/chosen": -694.0037841796875, + "logps/rejected": -699.7886352539062, + "loss": 0.7989, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6933953166007996, + "rewards/margins": 0.17309093475341797, + "rewards/rejected": -0.8664861917495728, + "step": 2110 + }, + { + "epoch": 0.4938846825859057, + "grad_norm": 7.346497058868408, + "learning_rate": 1.673527508090615e-05, + "logits/chosen": -4.305628776550293, + "logits/rejected": -4.333715438842773, + "logps/chosen": -724.8486328125, + "logps/rejected": -760.1463623046875, + "loss": 0.8177, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.38773879408836365, + "rewards/margins": 0.37163084745407104, + "rewards/rejected": -0.7593695521354675, + "step": 2120 + }, + { + "epoch": 0.49621432731508447, + "grad_norm": 5.669642925262451, + "learning_rate": 1.670938511326861e-05, + "logits/chosen": -4.388348579406738, + "logits/rejected": -4.364192962646484, + "logps/chosen": -783.6923828125, + "logps/rejected": -751.2498779296875, + "loss": 0.8655, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.48794180154800415, + "rewards/margins": 0.24359111487865448, + "rewards/rejected": -0.7315329313278198, + "step": 2130 + }, + { + "epoch": 0.49854397204426326, + "grad_norm": 7.022860050201416, + "learning_rate": 1.668349514563107e-05, + "logits/chosen": -4.392494201660156, + "logits/rejected": -4.390700340270996, + "logps/chosen": -687.4622802734375, + "logps/rejected": -636.7962646484375, + "loss": 0.9793, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.7259301543235779, + "rewards/margins": -0.1531188040971756, + "rewards/rejected": -0.5728113055229187, + "step": 2140 + }, + { + "epoch": 0.500873616773442, + "grad_norm": 4.802661895751953, + "learning_rate": 1.665760517799353e-05, + "logits/chosen": -4.416029930114746, + "logits/rejected": -4.297064304351807, + "logps/chosen": -699.6199951171875, + "logps/rejected": -611.7633666992188, + "loss": 0.8991, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4440249502658844, + "rewards/margins": 0.1489962339401245, + "rewards/rejected": -0.5930211544036865, + "step": 2150 + }, + { + "epoch": 0.5032032615026208, + "grad_norm": 9.9832181930542, + "learning_rate": 1.6631715210355987e-05, + "logits/chosen": -4.275888919830322, + "logits/rejected": -4.357396602630615, + "logps/chosen": -705.728271484375, + "logps/rejected": -773.0623779296875, + "loss": 0.7735, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7002065777778625, + "rewards/margins": 0.29984602332115173, + "rewards/rejected": -1.0000526905059814, + "step": 2160 + }, + { + "epoch": 0.5055329062317997, + "grad_norm": 7.250500679016113, + "learning_rate": 1.6605825242718448e-05, + "logits/chosen": -4.39943790435791, + "logits/rejected": -4.3512492179870605, + "logps/chosen": -666.4620971679688, + "logps/rejected": -649.422607421875, + "loss": 0.8384, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.4334799647331238, + "rewards/margins": 0.11691157519817352, + "rewards/rejected": -0.5503915548324585, + "step": 2170 + }, + { + "epoch": 0.5078625509609784, + "grad_norm": 11.566901206970215, + "learning_rate": 1.6579935275080908e-05, + "logits/chosen": -4.273348808288574, + "logits/rejected": -4.383965969085693, + "logps/chosen": -741.3567504882812, + "logps/rejected": -802.16552734375, + "loss": 1.0295, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.6346859335899353, + "rewards/margins": -0.07815258949995041, + "rewards/rejected": -0.5565333962440491, + "step": 2180 + }, + { + "epoch": 0.5101921956901573, + "grad_norm": 8.966931343078613, + "learning_rate": 1.655404530744337e-05, + "logits/chosen": -4.340353965759277, + "logits/rejected": -4.431331634521484, + "logps/chosen": -661.3947143554688, + "logps/rejected": -730.6881103515625, + "loss": 0.797, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5236181020736694, + "rewards/margins": 0.3085421919822693, + "rewards/rejected": -0.8321603536605835, + "step": 2190 + }, + { + "epoch": 0.512521840419336, + "grad_norm": 7.0853776931762695, + "learning_rate": 1.6528155339805826e-05, + "logits/chosen": -4.33907413482666, + "logits/rejected": -4.302447319030762, + "logps/chosen": -730.8140869140625, + "logps/rejected": -779.0943603515625, + "loss": 0.7641, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3492526710033417, + "rewards/margins": 0.4285813271999359, + "rewards/rejected": -0.7778339982032776, + "step": 2200 + }, + { + "epoch": 0.512521840419336, + "eval_logits/chosen": -4.323455810546875, + "eval_logits/rejected": -4.312369346618652, + "eval_logps/chosen": -695.7344360351562, + "eval_logps/rejected": -714.2130737304688, + "eval_loss": 0.6444052457809448, + "eval_rewards/accuracies": 0.6178361773490906, + "eval_rewards/chosen": -0.4906233847141266, + "eval_rewards/margins": 0.2508200705051422, + "eval_rewards/rejected": -0.7414434552192688, + "eval_runtime": 387.8429, + "eval_samples_per_second": 18.446, + "eval_steps_per_second": 9.223, + "step": 2200 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 8.635801315307617, + "learning_rate": 1.6502265372168287e-05, + "logits/chosen": -4.311502456665039, + "logits/rejected": -4.282750129699707, + "logps/chosen": -681.6749877929688, + "logps/rejected": -721.6594848632812, + "loss": 0.7712, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4766474664211273, + "rewards/margins": 0.3283555507659912, + "rewards/rejected": -0.8050029873847961, + "step": 2210 + }, + { + "epoch": 0.5171811298776936, + "grad_norm": 7.167092800140381, + "learning_rate": 1.6476375404530747e-05, + "logits/chosen": -4.369795799255371, + "logits/rejected": -4.381664276123047, + "logps/chosen": -661.0784912109375, + "logps/rejected": -720.4771728515625, + "loss": 0.7849, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22546792030334473, + "rewards/margins": 0.4140700399875641, + "rewards/rejected": -0.6395379304885864, + "step": 2220 + }, + { + "epoch": 0.5195107746068724, + "grad_norm": 6.891871929168701, + "learning_rate": 1.6450485436893204e-05, + "logits/chosen": -4.3150835037231445, + "logits/rejected": -4.312820911407471, + "logps/chosen": -732.7314453125, + "logps/rejected": -758.8406982421875, + "loss": 0.8171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5067173838615417, + "rewards/margins": 0.19723649322986603, + "rewards/rejected": -0.7039539813995361, + "step": 2230 + }, + { + "epoch": 0.5218404193360513, + "grad_norm": 6.103372097015381, + "learning_rate": 1.6424595469255665e-05, + "logits/chosen": -4.419613838195801, + "logits/rejected": -4.384373188018799, + "logps/chosen": -695.9058837890625, + "logps/rejected": -728.0184326171875, + "loss": 0.7623, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5310667157173157, + "rewards/margins": 0.3870542645454407, + "rewards/rejected": -0.9181209802627563, + "step": 2240 + }, + { + "epoch": 0.52417006406523, + "grad_norm": 7.377810955047607, + "learning_rate": 1.6398705501618125e-05, + "logits/chosen": -4.379231929779053, + "logits/rejected": -4.402649879455566, + "logps/chosen": -715.0233154296875, + "logps/rejected": -724.0562133789062, + "loss": 0.7516, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.4546116888523102, + "rewards/margins": 0.2904551029205322, + "rewards/rejected": -0.74506676197052, + "step": 2250 + }, + { + "epoch": 0.5264997087944089, + "grad_norm": 9.442928314208984, + "learning_rate": 1.6372815533980583e-05, + "logits/chosen": -4.354384422302246, + "logits/rejected": -4.3555216789245605, + "logps/chosen": -670.1593627929688, + "logps/rejected": -718.1063232421875, + "loss": 0.819, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.39854809641838074, + "rewards/margins": 0.2609342634677887, + "rewards/rejected": -0.6594824194908142, + "step": 2260 + }, + { + "epoch": 0.5288293535235876, + "grad_norm": 10.922154426574707, + "learning_rate": 1.6346925566343043e-05, + "logits/chosen": -4.298701763153076, + "logits/rejected": -4.350190162658691, + "logps/chosen": -669.03173828125, + "logps/rejected": -745.9063720703125, + "loss": 0.8177, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5612319707870483, + "rewards/margins": 0.216721773147583, + "rewards/rejected": -0.7779537439346313, + "step": 2270 + }, + { + "epoch": 0.5311589982527665, + "grad_norm": 7.369058609008789, + "learning_rate": 1.6321035598705504e-05, + "logits/chosen": -4.28975772857666, + "logits/rejected": -4.325669765472412, + "logps/chosen": -692.7982177734375, + "logps/rejected": -773.2313232421875, + "loss": 0.7424, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41624245047569275, + "rewards/margins": 0.27775830030441284, + "rewards/rejected": -0.6940008401870728, + "step": 2280 + }, + { + "epoch": 0.5334886429819452, + "grad_norm": 7.548649787902832, + "learning_rate": 1.6295145631067964e-05, + "logits/chosen": -4.324959754943848, + "logits/rejected": -4.336320877075195, + "logps/chosen": -721.7525634765625, + "logps/rejected": -764.1182250976562, + "loss": 0.6853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4422667622566223, + "rewards/margins": 0.5799342393875122, + "rewards/rejected": -1.0222009420394897, + "step": 2290 + }, + { + "epoch": 0.535818287711124, + "grad_norm": 8.149725914001465, + "learning_rate": 1.626925566343042e-05, + "logits/chosen": -4.364233016967773, + "logits/rejected": -4.299471855163574, + "logps/chosen": -732.43701171875, + "logps/rejected": -720.9097900390625, + "loss": 0.8921, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6769243478775024, + "rewards/margins": 0.19919486343860626, + "rewards/rejected": -0.8761193156242371, + "step": 2300 + }, + { + "epoch": 0.535818287711124, + "eval_logits/chosen": -4.308585166931152, + "eval_logits/rejected": -4.2970194816589355, + "eval_logps/chosen": -696.9252319335938, + "eval_logps/rejected": -715.663330078125, + "eval_loss": 0.6475224494934082, + "eval_rewards/accuracies": 0.6199328899383545, + "eval_rewards/chosen": -0.60969078540802, + "eval_rewards/margins": 0.2767795920372009, + "eval_rewards/rejected": -0.8864704370498657, + "eval_runtime": 387.8643, + "eval_samples_per_second": 18.445, + "eval_steps_per_second": 9.222, + "step": 2300 + }, + { + "epoch": 0.5381479324403029, + "grad_norm": 8.612531661987305, + "learning_rate": 1.6243365695792882e-05, + "logits/chosen": -4.266479969024658, + "logits/rejected": -4.361386775970459, + "logps/chosen": -668.0935668945312, + "logps/rejected": -717.5763549804688, + "loss": 0.8491, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5396800637245178, + "rewards/margins": 0.3209841251373291, + "rewards/rejected": -0.8606641888618469, + "step": 2310 + }, + { + "epoch": 0.5404775771694816, + "grad_norm": 7.291008472442627, + "learning_rate": 1.621747572815534e-05, + "logits/chosen": -4.2777605056762695, + "logits/rejected": -4.271374702453613, + "logps/chosen": -713.1057739257812, + "logps/rejected": -694.7840576171875, + "loss": 0.8052, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5852731466293335, + "rewards/margins": 0.13405922055244446, + "rewards/rejected": -0.7193323969841003, + "step": 2320 + }, + { + "epoch": 0.5428072218986605, + "grad_norm": 9.134176254272461, + "learning_rate": 1.61915857605178e-05, + "logits/chosen": -4.368586540222168, + "logits/rejected": -4.426856994628906, + "logps/chosen": -713.0442504882812, + "logps/rejected": -725.6788940429688, + "loss": 0.7815, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5852844715118408, + "rewards/margins": 0.25622040033340454, + "rewards/rejected": -0.8415048718452454, + "step": 2330 + }, + { + "epoch": 0.5451368666278392, + "grad_norm": 6.4649763107299805, + "learning_rate": 1.616569579288026e-05, + "logits/chosen": -4.385904788970947, + "logits/rejected": -4.340734481811523, + "logps/chosen": -698.6243286132812, + "logps/rejected": -749.6116943359375, + "loss": 0.7694, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5972713232040405, + "rewards/margins": 0.24341309070587158, + "rewards/rejected": -0.8406842947006226, + "step": 2340 + }, + { + "epoch": 0.5474665113570181, + "grad_norm": 8.067676544189453, + "learning_rate": 1.613980582524272e-05, + "logits/chosen": -4.363903999328613, + "logits/rejected": -4.423806667327881, + "logps/chosen": -663.022705078125, + "logps/rejected": -722.1543579101562, + "loss": 0.7407, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.49775949120521545, + "rewards/margins": 0.2423984259366989, + "rewards/rejected": -0.7401579022407532, + "step": 2350 + }, + { + "epoch": 0.5497961560861968, + "grad_norm": 5.057220935821533, + "learning_rate": 1.611391585760518e-05, + "logits/chosen": -4.31404972076416, + "logits/rejected": -4.347018718719482, + "logps/chosen": -756.7380981445312, + "logps/rejected": -747.169189453125, + "loss": 0.7591, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5640712976455688, + "rewards/margins": 0.3368191719055176, + "rewards/rejected": -0.9008904695510864, + "step": 2360 + }, + { + "epoch": 0.5521258008153757, + "grad_norm": 10.371302604675293, + "learning_rate": 1.608802588996764e-05, + "logits/chosen": -4.346640586853027, + "logits/rejected": -4.344121932983398, + "logps/chosen": -713.8607177734375, + "logps/rejected": -741.2636108398438, + "loss": 0.8372, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5607599020004272, + "rewards/margins": 0.2859343886375427, + "rewards/rejected": -0.84669429063797, + "step": 2370 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 8.637513160705566, + "learning_rate": 1.60621359223301e-05, + "logits/chosen": -4.243861198425293, + "logits/rejected": -4.345170021057129, + "logps/chosen": -642.0486450195312, + "logps/rejected": -795.27294921875, + "loss": 0.7589, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.675322413444519, + "rewards/margins": 0.38120463490486145, + "rewards/rejected": -1.056527018547058, + "step": 2380 + }, + { + "epoch": 0.5567850902737332, + "grad_norm": 9.141393661499023, + "learning_rate": 1.6036245954692557e-05, + "logits/chosen": -4.281918525695801, + "logits/rejected": -4.320727825164795, + "logps/chosen": -729.5989990234375, + "logps/rejected": -715.9414672851562, + "loss": 0.8928, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7688027024269104, + "rewards/margins": 0.02522212825715542, + "rewards/rejected": -0.7940248250961304, + "step": 2390 + }, + { + "epoch": 0.5591147350029121, + "grad_norm": 5.217013359069824, + "learning_rate": 1.6010355987055017e-05, + "logits/chosen": -4.283066272735596, + "logits/rejected": -4.255189895629883, + "logps/chosen": -767.1123657226562, + "logps/rejected": -819.5071411132812, + "loss": 0.6825, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5336806178092957, + "rewards/margins": 0.6160721778869629, + "rewards/rejected": -1.1497528553009033, + "step": 2400 + }, + { + "epoch": 0.5591147350029121, + "eval_logits/chosen": -4.297215461730957, + "eval_logits/rejected": -4.28573751449585, + "eval_logps/chosen": -697.8079833984375, + "eval_logps/rejected": -716.7228393554688, + "eval_loss": 0.6530700922012329, + "eval_rewards/accuracies": 0.6114062070846558, + "eval_rewards/chosen": -0.697973906993866, + "eval_rewards/margins": 0.2944377660751343, + "eval_rewards/rejected": -0.9924116134643555, + "eval_runtime": 388.0812, + "eval_samples_per_second": 18.434, + "eval_steps_per_second": 9.217, + "step": 2400 + }, + { + "epoch": 0.5614443797320908, + "grad_norm": 10.828152656555176, + "learning_rate": 1.5984466019417478e-05, + "logits/chosen": -4.3832502365112305, + "logits/rejected": -4.312473297119141, + "logps/chosen": -733.7899169921875, + "logps/rejected": -697.7799072265625, + "loss": 0.9236, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6746879816055298, + "rewards/margins": 0.05621149390935898, + "rewards/rejected": -0.7308995127677917, + "step": 2410 + }, + { + "epoch": 0.5637740244612697, + "grad_norm": 11.380895614624023, + "learning_rate": 1.5958576051779938e-05, + "logits/chosen": -4.3725385665893555, + "logits/rejected": -4.367467403411865, + "logps/chosen": -758.9491577148438, + "logps/rejected": -784.2022094726562, + "loss": 0.8901, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.6533423066139221, + "rewards/margins": 0.22708959877490997, + "rewards/rejected": -0.8804319500923157, + "step": 2420 + }, + { + "epoch": 0.5661036691904484, + "grad_norm": 9.553884506225586, + "learning_rate": 1.5932686084142395e-05, + "logits/chosen": -4.271600246429443, + "logits/rejected": -4.332309722900391, + "logps/chosen": -682.1741943359375, + "logps/rejected": -768.156494140625, + "loss": 0.7798, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.528712272644043, + "rewards/margins": 0.4405105710029602, + "rewards/rejected": -0.9692228436470032, + "step": 2430 + }, + { + "epoch": 0.5684333139196273, + "grad_norm": 8.388798713684082, + "learning_rate": 1.5906796116504856e-05, + "logits/chosen": -4.364581108093262, + "logits/rejected": -4.264370441436768, + "logps/chosen": -755.1317138671875, + "logps/rejected": -676.2997436523438, + "loss": 0.7807, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5193127393722534, + "rewards/margins": 0.5082674026489258, + "rewards/rejected": -1.0275801420211792, + "step": 2440 + }, + { + "epoch": 0.5707629586488061, + "grad_norm": 9.449752807617188, + "learning_rate": 1.5880906148867313e-05, + "logits/chosen": -4.360546588897705, + "logits/rejected": -4.376889705657959, + "logps/chosen": -746.4757690429688, + "logps/rejected": -775.843505859375, + "loss": 0.8107, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5038831830024719, + "rewards/margins": 0.48872479796409607, + "rewards/rejected": -0.9926078915596008, + "step": 2450 + }, + { + "epoch": 0.5730926033779848, + "grad_norm": 8.81622314453125, + "learning_rate": 1.5855016181229774e-05, + "logits/chosen": -4.269045829772949, + "logits/rejected": -4.288808345794678, + "logps/chosen": -723.4371948242188, + "logps/rejected": -742.6324462890625, + "loss": 0.9359, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.5083256959915161, + "rewards/margins": 0.05839193984866142, + "rewards/rejected": -0.5667175650596619, + "step": 2460 + }, + { + "epoch": 0.5754222481071637, + "grad_norm": 7.394127368927002, + "learning_rate": 1.5829126213592234e-05, + "logits/chosen": -4.28656005859375, + "logits/rejected": -4.279847621917725, + "logps/chosen": -719.2637329101562, + "logps/rejected": -764.5406494140625, + "loss": 0.6321, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6179152727127075, + "rewards/margins": 0.6699618101119995, + "rewards/rejected": -1.287877082824707, + "step": 2470 + }, + { + "epoch": 0.5777518928363424, + "grad_norm": 7.791938781738281, + "learning_rate": 1.5803236245954695e-05, + "logits/chosen": -4.400963306427002, + "logits/rejected": -4.369585990905762, + "logps/chosen": -738.0977783203125, + "logps/rejected": -776.5066528320312, + "loss": 0.7441, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5543312430381775, + "rewards/margins": 0.5328670740127563, + "rewards/rejected": -1.087198257446289, + "step": 2480 + }, + { + "epoch": 0.5800815375655213, + "grad_norm": 7.783450603485107, + "learning_rate": 1.5777346278317155e-05, + "logits/chosen": -4.258950233459473, + "logits/rejected": -4.367677688598633, + "logps/chosen": -688.4526977539062, + "logps/rejected": -737.1357421875, + "loss": 0.7498, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5971588492393494, + "rewards/margins": 0.4435792565345764, + "rewards/rejected": -1.0407381057739258, + "step": 2490 + }, + { + "epoch": 0.5824111822947, + "grad_norm": 8.986100196838379, + "learning_rate": 1.5751456310679613e-05, + "logits/chosen": -4.3454484939575195, + "logits/rejected": -4.2997660636901855, + "logps/chosen": -679.8911743164062, + "logps/rejected": -682.4745483398438, + "loss": 0.8481, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.6458913683891296, + "rewards/margins": 0.020166147500276566, + "rewards/rejected": -0.6660575270652771, + "step": 2500 + }, + { + "epoch": 0.5824111822947, + "eval_logits/chosen": -4.292790412902832, + "eval_logits/rejected": -4.281363010406494, + "eval_logps/chosen": -696.9899291992188, + "eval_logps/rejected": -715.8394165039062, + "eval_loss": 0.6466853618621826, + "eval_rewards/accuracies": 0.6182554960250854, + "eval_rewards/chosen": -0.6161715388298035, + "eval_rewards/margins": 0.28790563344955444, + "eval_rewards/rejected": -0.9040771722793579, + "eval_runtime": 388.0903, + "eval_samples_per_second": 18.434, + "eval_steps_per_second": 9.217, + "step": 2500 + }, + { + "epoch": 0.5847408270238789, + "grad_norm": 5.886007308959961, + "learning_rate": 1.5725566343042073e-05, + "logits/chosen": -4.2544379234313965, + "logits/rejected": -4.31434440612793, + "logps/chosen": -690.2918090820312, + "logps/rejected": -674.09375, + "loss": 0.6412, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.342994749546051, + "rewards/margins": 0.6141099333763123, + "rewards/rejected": -0.9571046829223633, + "step": 2510 + }, + { + "epoch": 0.5870704717530577, + "grad_norm": 7.043492794036865, + "learning_rate": 1.569967637540453e-05, + "logits/chosen": -4.2179460525512695, + "logits/rejected": -4.281126976013184, + "logps/chosen": -636.9339599609375, + "logps/rejected": -724.7083129882812, + "loss": 0.6445, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49156904220581055, + "rewards/margins": 0.5859013795852661, + "rewards/rejected": -1.0774705410003662, + "step": 2520 + }, + { + "epoch": 0.5894001164822364, + "grad_norm": 6.933673858642578, + "learning_rate": 1.567378640776699e-05, + "logits/chosen": -4.389291763305664, + "logits/rejected": -4.375206470489502, + "logps/chosen": -784.4617919921875, + "logps/rejected": -736.8739013671875, + "loss": 0.8139, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5662163496017456, + "rewards/margins": 0.27462419867515564, + "rewards/rejected": -0.8408406376838684, + "step": 2530 + }, + { + "epoch": 0.5917297612114153, + "grad_norm": 8.004260063171387, + "learning_rate": 1.564789644012945e-05, + "logits/chosen": -4.330020904541016, + "logits/rejected": -4.290645599365234, + "logps/chosen": -658.646484375, + "logps/rejected": -650.4393920898438, + "loss": 0.9183, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6325026750564575, + "rewards/margins": 0.08804898709058762, + "rewards/rejected": -0.7205516695976257, + "step": 2540 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 6.865865230560303, + "learning_rate": 1.5622006472491912e-05, + "logits/chosen": -4.3161234855651855, + "logits/rejected": -4.298664093017578, + "logps/chosen": -713.3058471679688, + "logps/rejected": -761.7860107421875, + "loss": 0.8578, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5262347459793091, + "rewards/margins": 0.36612358689308167, + "rewards/rejected": -0.8923583030700684, + "step": 2550 + }, + { + "epoch": 0.5963890506697729, + "grad_norm": 8.332757949829102, + "learning_rate": 1.559611650485437e-05, + "logits/chosen": -4.3331499099731445, + "logits/rejected": -4.270108699798584, + "logps/chosen": -711.1723022460938, + "logps/rejected": -717.375732421875, + "loss": 0.7714, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5592328906059265, + "rewards/margins": 0.31131255626678467, + "rewards/rejected": -0.870545506477356, + "step": 2560 + }, + { + "epoch": 0.5987186953989516, + "grad_norm": 9.551703453063965, + "learning_rate": 1.557022653721683e-05, + "logits/chosen": -4.328275203704834, + "logits/rejected": -4.383193016052246, + "logps/chosen": -684.763671875, + "logps/rejected": -722.69580078125, + "loss": 0.7131, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2863122522830963, + "rewards/margins": 0.4516221880912781, + "rewards/rejected": -0.7379344701766968, + "step": 2570 + }, + { + "epoch": 0.6010483401281305, + "grad_norm": 7.2382402420043945, + "learning_rate": 1.554433656957929e-05, + "logits/chosen": -4.349534511566162, + "logits/rejected": -4.299520015716553, + "logps/chosen": -716.7404174804688, + "logps/rejected": -692.8947143554688, + "loss": 0.9481, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5689787268638611, + "rewards/margins": -0.02323698066174984, + "rewards/rejected": -0.5457417368888855, + "step": 2580 + }, + { + "epoch": 0.6033779848573093, + "grad_norm": 9.520913124084473, + "learning_rate": 1.5518446601941748e-05, + "logits/chosen": -4.272482395172119, + "logits/rejected": -4.3258376121521, + "logps/chosen": -705.7965087890625, + "logps/rejected": -755.6998291015625, + "loss": 0.7989, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5515230894088745, + "rewards/margins": 0.3450368046760559, + "rewards/rejected": -0.8965598344802856, + "step": 2590 + }, + { + "epoch": 0.605707629586488, + "grad_norm": 7.175052642822266, + "learning_rate": 1.5492556634304208e-05, + "logits/chosen": -4.400643825531006, + "logits/rejected": -4.387502193450928, + "logps/chosen": -768.1902465820312, + "logps/rejected": -745.0789184570312, + "loss": 0.7822, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4362802505493164, + "rewards/margins": 0.3764461576938629, + "rewards/rejected": -0.8127263784408569, + "step": 2600 + }, + { + "epoch": 0.605707629586488, + "eval_logits/chosen": -4.292550563812256, + "eval_logits/rejected": -4.280701160430908, + "eval_logps/chosen": -696.8247680664062, + "eval_logps/rejected": -715.6983642578125, + "eval_loss": 0.6448931694030762, + "eval_rewards/accuracies": 0.6211909651756287, + "eval_rewards/chosen": -0.5996540188789368, + "eval_rewards/margins": 0.29031842947006226, + "eval_rewards/rejected": -0.8899723887443542, + "eval_runtime": 388.4993, + "eval_samples_per_second": 18.414, + "eval_steps_per_second": 9.207, + "step": 2600 + }, + { + "epoch": 0.6080372743156669, + "grad_norm": 3.6842851638793945, + "learning_rate": 1.546666666666667e-05, + "logits/chosen": -4.335328102111816, + "logits/rejected": -4.325960159301758, + "logps/chosen": -761.3397216796875, + "logps/rejected": -758.9552001953125, + "loss": 0.6935, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.34092193841934204, + "rewards/margins": 0.4871430993080139, + "rewards/rejected": -0.828065037727356, + "step": 2610 + }, + { + "epoch": 0.6103669190448456, + "grad_norm": 9.210099220275879, + "learning_rate": 1.5440776699029126e-05, + "logits/chosen": -4.376111030578613, + "logits/rejected": -4.404354572296143, + "logps/chosen": -695.9658203125, + "logps/rejected": -763.7271118164062, + "loss": 0.6971, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3182308077812195, + "rewards/margins": 0.586915910243988, + "rewards/rejected": -0.9051467180252075, + "step": 2620 + }, + { + "epoch": 0.6126965637740245, + "grad_norm": 6.681046485900879, + "learning_rate": 1.5414886731391587e-05, + "logits/chosen": -4.412899017333984, + "logits/rejected": -4.380231857299805, + "logps/chosen": -720.5208129882812, + "logps/rejected": -738.626708984375, + "loss": 0.8028, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6240053772926331, + "rewards/margins": 0.2591809630393982, + "rewards/rejected": -0.8831863403320312, + "step": 2630 + }, + { + "epoch": 0.6150262085032032, + "grad_norm": 7.9586968421936035, + "learning_rate": 1.5388996763754047e-05, + "logits/chosen": -4.327067852020264, + "logits/rejected": -4.382328033447266, + "logps/chosen": -682.6461791992188, + "logps/rejected": -751.2008666992188, + "loss": 0.7804, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5338839292526245, + "rewards/margins": 0.25476861000061035, + "rewards/rejected": -0.7886524796485901, + "step": 2640 + }, + { + "epoch": 0.6173558532323821, + "grad_norm": 4.548120021820068, + "learning_rate": 1.5363106796116508e-05, + "logits/chosen": -4.392605781555176, + "logits/rejected": -4.419025421142578, + "logps/chosen": -731.2877197265625, + "logps/rejected": -758.0394897460938, + "loss": 0.7281, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34113675355911255, + "rewards/margins": 0.3821430802345276, + "rewards/rejected": -0.7232798337936401, + "step": 2650 + }, + { + "epoch": 0.6196854979615609, + "grad_norm": 8.346933364868164, + "learning_rate": 1.5337216828478965e-05, + "logits/chosen": -4.402939796447754, + "logits/rejected": -4.375199794769287, + "logps/chosen": -735.7860107421875, + "logps/rejected": -770.9775390625, + "loss": 0.7416, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.42072242498397827, + "rewards/margins": 0.5455988645553589, + "rewards/rejected": -0.9663212895393372, + "step": 2660 + }, + { + "epoch": 0.6220151426907397, + "grad_norm": 5.262242794036865, + "learning_rate": 1.5311326860841425e-05, + "logits/chosen": -4.312819480895996, + "logits/rejected": -4.257102012634277, + "logps/chosen": -670.9732055664062, + "logps/rejected": -666.3642578125, + "loss": 0.7165, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4910193085670471, + "rewards/margins": 0.5100225210189819, + "rewards/rejected": -1.0010417699813843, + "step": 2670 + }, + { + "epoch": 0.6243447874199185, + "grad_norm": 7.206348896026611, + "learning_rate": 1.5285436893203886e-05, + "logits/chosen": -4.353832244873047, + "logits/rejected": -4.302159309387207, + "logps/chosen": -736.9912109375, + "logps/rejected": -758.8922119140625, + "loss": 0.8675, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5955030918121338, + "rewards/margins": 0.2457858771085739, + "rewards/rejected": -0.8412889242172241, + "step": 2680 + }, + { + "epoch": 0.6266744321490972, + "grad_norm": 7.944403648376465, + "learning_rate": 1.5259546925566343e-05, + "logits/chosen": -4.326117992401123, + "logits/rejected": -4.392238616943359, + "logps/chosen": -655.5966796875, + "logps/rejected": -687.850341796875, + "loss": 0.6854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4322779178619385, + "rewards/margins": 0.4284347593784332, + "rewards/rejected": -0.8607127070426941, + "step": 2690 + }, + { + "epoch": 0.6290040768782761, + "grad_norm": 8.794605255126953, + "learning_rate": 1.5233656957928804e-05, + "logits/chosen": -4.378158092498779, + "logits/rejected": -4.327710151672363, + "logps/chosen": -737.3948364257812, + "logps/rejected": -742.9942626953125, + "loss": 0.9305, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.6186449527740479, + "rewards/margins": -0.0013167411088943481, + "rewards/rejected": -0.6173282861709595, + "step": 2700 + }, + { + "epoch": 0.6290040768782761, + "eval_logits/chosen": -4.296804904937744, + "eval_logits/rejected": -4.284645080566406, + "eval_logps/chosen": -696.439697265625, + "eval_logps/rejected": -715.352294921875, + "eval_loss": 0.6431933641433716, + "eval_rewards/accuracies": 0.621330738067627, + "eval_rewards/chosen": -0.5611439347267151, + "eval_rewards/margins": 0.29422974586486816, + "eval_rewards/rejected": -0.8553736805915833, + "eval_runtime": 388.3906, + "eval_samples_per_second": 18.42, + "eval_steps_per_second": 9.21, + "step": 2700 + }, + { + "epoch": 0.6313337216074548, + "grad_norm": 5.812639236450195, + "learning_rate": 1.5207766990291264e-05, + "logits/chosen": -4.331504821777344, + "logits/rejected": -4.388894081115723, + "logps/chosen": -657.0352172851562, + "logps/rejected": -717.141845703125, + "loss": 0.6821, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.497232586145401, + "rewards/margins": 0.3659915328025818, + "rewards/rejected": -0.8632240295410156, + "step": 2710 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 8.673256874084473, + "learning_rate": 1.5181877022653723e-05, + "logits/chosen": -4.275472164154053, + "logits/rejected": -4.281885623931885, + "logps/chosen": -668.5069580078125, + "logps/rejected": -711.5523071289062, + "loss": 0.7998, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4561544954776764, + "rewards/margins": 0.27673688530921936, + "rewards/rejected": -0.7328914403915405, + "step": 2720 + }, + { + "epoch": 0.6359930110658125, + "grad_norm": 7.681230545043945, + "learning_rate": 1.5155987055016182e-05, + "logits/chosen": -4.440752983093262, + "logits/rejected": -4.328701019287109, + "logps/chosen": -738.4246215820312, + "logps/rejected": -693.4755249023438, + "loss": 0.813, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.6215518116950989, + "rewards/margins": 0.08657562732696533, + "rewards/rejected": -0.7081274390220642, + "step": 2730 + }, + { + "epoch": 0.6383226557949913, + "grad_norm": 7.156750679016113, + "learning_rate": 1.5130097087378641e-05, + "logits/chosen": -4.261274814605713, + "logits/rejected": -4.308469295501709, + "logps/chosen": -693.0875244140625, + "logps/rejected": -768.095703125, + "loss": 0.768, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6500027179718018, + "rewards/margins": 0.44288986921310425, + "rewards/rejected": -1.0928925275802612, + "step": 2740 + }, + { + "epoch": 0.6406523005241701, + "grad_norm": 7.899952411651611, + "learning_rate": 1.5104207119741102e-05, + "logits/chosen": -4.327859878540039, + "logits/rejected": -4.248705863952637, + "logps/chosen": -768.5486450195312, + "logps/rejected": -713.5155639648438, + "loss": 0.6868, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.42632707953453064, + "rewards/margins": 0.5251134634017944, + "rewards/rejected": -0.9514405131340027, + "step": 2750 + }, + { + "epoch": 0.6429819452533488, + "grad_norm": 6.4603729248046875, + "learning_rate": 1.507831715210356e-05, + "logits/chosen": -4.355565071105957, + "logits/rejected": -4.352751731872559, + "logps/chosen": -706.5877685546875, + "logps/rejected": -733.3256225585938, + "loss": 0.8891, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.79791259765625, + "rewards/margins": 0.029533350840210915, + "rewards/rejected": -0.8274458646774292, + "step": 2760 + }, + { + "epoch": 0.6453115899825277, + "grad_norm": 7.2228803634643555, + "learning_rate": 1.5052427184466021e-05, + "logits/chosen": -4.348843574523926, + "logits/rejected": -4.36469030380249, + "logps/chosen": -736.6920776367188, + "logps/rejected": -755.9579467773438, + "loss": 0.8354, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5484541058540344, + "rewards/margins": 0.36284321546554565, + "rewards/rejected": -0.9112973213195801, + "step": 2770 + }, + { + "epoch": 0.6476412347117064, + "grad_norm": 9.550434112548828, + "learning_rate": 1.502653721682848e-05, + "logits/chosen": -4.265006065368652, + "logits/rejected": -4.404940128326416, + "logps/chosen": -741.6475830078125, + "logps/rejected": -816.3297119140625, + "loss": 0.7099, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.39252057671546936, + "rewards/margins": 0.39079761505126953, + "rewards/rejected": -0.7833182215690613, + "step": 2780 + }, + { + "epoch": 0.6499708794408853, + "grad_norm": 7.080128192901611, + "learning_rate": 1.500064724919094e-05, + "logits/chosen": -4.357051849365234, + "logits/rejected": -4.326101303100586, + "logps/chosen": -781.0487670898438, + "logps/rejected": -725.7145385742188, + "loss": 0.6983, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22293445467948914, + "rewards/margins": 0.5743099451065063, + "rewards/rejected": -0.7972443699836731, + "step": 2790 + }, + { + "epoch": 0.652300524170064, + "grad_norm": 6.817419052124023, + "learning_rate": 1.4974757281553401e-05, + "logits/chosen": -4.357829570770264, + "logits/rejected": -4.383296489715576, + "logps/chosen": -741.2380981445312, + "logps/rejected": -783.3194580078125, + "loss": 0.8684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.686312735080719, + "rewards/margins": 0.21443429589271545, + "rewards/rejected": -0.9007471203804016, + "step": 2800 + }, + { + "epoch": 0.652300524170064, + "eval_logits/chosen": -4.2776408195495605, + "eval_logits/rejected": -4.264857292175293, + "eval_logps/chosen": -697.18115234375, + "eval_logps/rejected": -716.2308959960938, + "eval_loss": 0.6444410085678101, + "eval_rewards/accuracies": 0.6178361773490906, + "eval_rewards/chosen": -0.6352859735488892, + "eval_rewards/margins": 0.3079439699649811, + "eval_rewards/rejected": -0.9432300329208374, + "eval_runtime": 388.9003, + "eval_samples_per_second": 18.395, + "eval_steps_per_second": 9.198, + "step": 2800 + }, + { + "epoch": 0.6546301688992429, + "grad_norm": 6.708640098571777, + "learning_rate": 1.4948867313915858e-05, + "logits/chosen": -4.2895307540893555, + "logits/rejected": -4.2548699378967285, + "logps/chosen": -744.3851318359375, + "logps/rejected": -756.6453247070312, + "loss": 0.9964, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.845330536365509, + "rewards/margins": -0.13607561588287354, + "rewards/rejected": -0.7092548608779907, + "step": 2810 + }, + { + "epoch": 0.6569598136284217, + "grad_norm": 8.062914848327637, + "learning_rate": 1.4922977346278317e-05, + "logits/chosen": -4.304629802703857, + "logits/rejected": -4.2135725021362305, + "logps/chosen": -758.5049438476562, + "logps/rejected": -712.3289794921875, + "loss": 0.9002, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.6158826947212219, + "rewards/margins": 0.14774423837661743, + "rewards/rejected": -0.7636269330978394, + "step": 2820 + }, + { + "epoch": 0.6592894583576004, + "grad_norm": 8.468124389648438, + "learning_rate": 1.4897087378640778e-05, + "logits/chosen": -4.3212080001831055, + "logits/rejected": -4.310835838317871, + "logps/chosen": -730.5067749023438, + "logps/rejected": -711.48095703125, + "loss": 0.7278, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.46248364448547363, + "rewards/margins": 0.3725832998752594, + "rewards/rejected": -0.8350669145584106, + "step": 2830 + }, + { + "epoch": 0.6616191030867793, + "grad_norm": 5.327014446258545, + "learning_rate": 1.4871197411003238e-05, + "logits/chosen": -4.333258152008057, + "logits/rejected": -4.298372268676758, + "logps/chosen": -703.5050048828125, + "logps/rejected": -738.3985595703125, + "loss": 0.7743, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5580412745475769, + "rewards/margins": 0.3654623031616211, + "rewards/rejected": -0.9235035181045532, + "step": 2840 + }, + { + "epoch": 0.663948747815958, + "grad_norm": 8.886383056640625, + "learning_rate": 1.4845307443365697e-05, + "logits/chosen": -4.331049919128418, + "logits/rejected": -4.31258487701416, + "logps/chosen": -711.8903198242188, + "logps/rejected": -739.1852416992188, + "loss": 0.9072, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.6002537608146667, + "rewards/margins": -0.008847487159073353, + "rewards/rejected": -0.5914062261581421, + "step": 2850 + }, + { + "epoch": 0.6662783925451369, + "grad_norm": 9.955883026123047, + "learning_rate": 1.4819417475728158e-05, + "logits/chosen": -4.360678672790527, + "logits/rejected": -4.346714973449707, + "logps/chosen": -741.66357421875, + "logps/rejected": -740.5900268554688, + "loss": 0.7513, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.43228378891944885, + "rewards/margins": 0.45141610503196716, + "rewards/rejected": -0.883699893951416, + "step": 2860 + }, + { + "epoch": 0.6686080372743156, + "grad_norm": 7.811578750610352, + "learning_rate": 1.4793527508090617e-05, + "logits/chosen": -4.372070789337158, + "logits/rejected": -4.431691646575928, + "logps/chosen": -701.4199829101562, + "logps/rejected": -754.1885986328125, + "loss": 0.7976, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.565507173538208, + "rewards/margins": 0.3819185197353363, + "rewards/rejected": -0.9474257230758667, + "step": 2870 + }, + { + "epoch": 0.6709376820034945, + "grad_norm": 7.534102439880371, + "learning_rate": 1.4767637540453075e-05, + "logits/chosen": -4.34897518157959, + "logits/rejected": -4.343818187713623, + "logps/chosen": -740.5709228515625, + "logps/rejected": -758.922119140625, + "loss": 0.7366, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5281800627708435, + "rewards/margins": 0.488368421792984, + "rewards/rejected": -1.0165483951568604, + "step": 2880 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 4.9338884353637695, + "learning_rate": 1.4741747572815534e-05, + "logits/chosen": -4.355449676513672, + "logits/rejected": -4.404020309448242, + "logps/chosen": -711.3448486328125, + "logps/rejected": -729.7279663085938, + "loss": 0.8034, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5138441920280457, + "rewards/margins": 0.2933233082294464, + "rewards/rejected": -0.8071675300598145, + "step": 2890 + }, + { + "epoch": 0.675596971461852, + "grad_norm": 8.007110595703125, + "learning_rate": 1.4715857605177995e-05, + "logits/chosen": -4.389501094818115, + "logits/rejected": -4.405324935913086, + "logps/chosen": -758.5374755859375, + "logps/rejected": -777.3524169921875, + "loss": 0.7807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41694846749305725, + "rewards/margins": 0.5797191858291626, + "rewards/rejected": -0.9966676831245422, + "step": 2900 + }, + { + "epoch": 0.675596971461852, + "eval_logits/chosen": -4.294885635375977, + "eval_logits/rejected": -4.282512664794922, + "eval_logps/chosen": -696.2392578125, + "eval_logps/rejected": -715.1339111328125, + "eval_loss": 0.6404834389686584, + "eval_rewards/accuracies": 0.6242661476135254, + "eval_rewards/chosen": -0.5410952568054199, + "eval_rewards/margins": 0.29242727160453796, + "eval_rewards/rejected": -0.8335224986076355, + "eval_runtime": 388.2427, + "eval_samples_per_second": 18.427, + "eval_steps_per_second": 9.213, + "step": 2900 + }, + { + "epoch": 0.6779266161910309, + "grad_norm": 8.221184730529785, + "learning_rate": 1.4689967637540454e-05, + "logits/chosen": -4.374337673187256, + "logits/rejected": -4.375265598297119, + "logps/chosen": -738.59228515625, + "logps/rejected": -785.3124389648438, + "loss": 0.7185, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5105136036872864, + "rewards/margins": 0.46654510498046875, + "rewards/rejected": -0.9770587682723999, + "step": 2910 + }, + { + "epoch": 0.6802562609202096, + "grad_norm": 9.547266006469727, + "learning_rate": 1.4664077669902914e-05, + "logits/chosen": -4.404740333557129, + "logits/rejected": -4.382708549499512, + "logps/chosen": -784.3701171875, + "logps/rejected": -764.4552612304688, + "loss": 0.8005, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2633661925792694, + "rewards/margins": 0.3906458020210266, + "rewards/rejected": -0.6540120244026184, + "step": 2920 + }, + { + "epoch": 0.6825859056493885, + "grad_norm": 5.635769844055176, + "learning_rate": 1.4638187702265373e-05, + "logits/chosen": -4.346009731292725, + "logits/rejected": -4.309237003326416, + "logps/chosen": -778.1959228515625, + "logps/rejected": -724.6881713867188, + "loss": 0.8783, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6418018937110901, + "rewards/margins": 0.07551275193691254, + "rewards/rejected": -0.717314600944519, + "step": 2930 + }, + { + "epoch": 0.6849155503785672, + "grad_norm": 8.018805503845215, + "learning_rate": 1.4612297734627834e-05, + "logits/chosen": -4.3149094581604, + "logits/rejected": -4.2781453132629395, + "logps/chosen": -681.9122314453125, + "logps/rejected": -679.7462768554688, + "loss": 0.9115, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.6334387063980103, + "rewards/margins": 0.008119463920593262, + "rewards/rejected": -0.6415581703186035, + "step": 2940 + }, + { + "epoch": 0.6872451951077461, + "grad_norm": 6.547179222106934, + "learning_rate": 1.4586407766990291e-05, + "logits/chosen": -4.2727508544921875, + "logits/rejected": -4.245587348937988, + "logps/chosen": -752.4075927734375, + "logps/rejected": -750.0736083984375, + "loss": 0.6802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3730926513671875, + "rewards/margins": 0.4776512086391449, + "rewards/rejected": -0.8507438898086548, + "step": 2950 + }, + { + "epoch": 0.6895748398369249, + "grad_norm": 6.982612133026123, + "learning_rate": 1.4560517799352752e-05, + "logits/chosen": -4.291788578033447, + "logits/rejected": -4.315970420837402, + "logps/chosen": -657.02978515625, + "logps/rejected": -682.1209716796875, + "loss": 0.6605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3291148841381073, + "rewards/margins": 0.6222742199897766, + "rewards/rejected": -0.9513891339302063, + "step": 2960 + }, + { + "epoch": 0.6919044845661037, + "grad_norm": 8.867607116699219, + "learning_rate": 1.453462783171521e-05, + "logits/chosen": -4.366148471832275, + "logits/rejected": -4.393885612487793, + "logps/chosen": -675.3447265625, + "logps/rejected": -713.319580078125, + "loss": 0.6773, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.28730329871177673, + "rewards/margins": 0.5438534617424011, + "rewards/rejected": -0.8311569094657898, + "step": 2970 + }, + { + "epoch": 0.6942341292952825, + "grad_norm": 8.969156265258789, + "learning_rate": 1.4508737864077671e-05, + "logits/chosen": -4.319128036499023, + "logits/rejected": -4.303523063659668, + "logps/chosen": -755.8877563476562, + "logps/rejected": -751.65087890625, + "loss": 0.7509, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.24285368621349335, + "rewards/margins": 0.549541711807251, + "rewards/rejected": -0.7923953533172607, + "step": 2980 + }, + { + "epoch": 0.6965637740244612, + "grad_norm": 4.678745269775391, + "learning_rate": 1.4482847896440132e-05, + "logits/chosen": -4.411252021789551, + "logits/rejected": -4.31979513168335, + "logps/chosen": -780.011962890625, + "logps/rejected": -769.9260864257812, + "loss": 0.8132, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4631928503513336, + "rewards/margins": 0.34427186846733093, + "rewards/rejected": -0.8074647188186646, + "step": 2990 + }, + { + "epoch": 0.6988934187536401, + "grad_norm": 5.628600120544434, + "learning_rate": 1.445695792880259e-05, + "logits/chosen": -4.367173194885254, + "logits/rejected": -4.255070686340332, + "logps/chosen": -774.8583984375, + "logps/rejected": -759.581298828125, + "loss": 0.7141, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3500550389289856, + "rewards/margins": 0.35884183645248413, + "rewards/rejected": -0.708896815776825, + "step": 3000 + }, + { + "epoch": 0.6988934187536401, + "eval_logits/chosen": -4.2953410148620605, + "eval_logits/rejected": -4.283372402191162, + "eval_logps/chosen": -696.170166015625, + "eval_logps/rejected": -715.119384765625, + "eval_loss": 0.6390213966369629, + "eval_rewards/accuracies": 0.626782238483429, + "eval_rewards/chosen": -0.5341863036155701, + "eval_rewards/margins": 0.2978910207748413, + "eval_rewards/rejected": -0.8320773839950562, + "eval_runtime": 388.9031, + "eval_samples_per_second": 18.395, + "eval_steps_per_second": 9.198, + "step": 3000 + }, + { + "epoch": 0.7012230634828188, + "grad_norm": 8.826177597045898, + "learning_rate": 1.4431067961165051e-05, + "logits/chosen": -4.272387504577637, + "logits/rejected": -4.336935520172119, + "logps/chosen": -697.0843505859375, + "logps/rejected": -790.090576171875, + "loss": 0.8917, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.611141562461853, + "rewards/margins": 0.14556744694709778, + "rewards/rejected": -0.7567089796066284, + "step": 3010 + }, + { + "epoch": 0.7035527082119977, + "grad_norm": 10.059699058532715, + "learning_rate": 1.4405177993527508e-05, + "logits/chosen": -4.255461692810059, + "logits/rejected": -4.3038225173950195, + "logps/chosen": -657.3980712890625, + "logps/rejected": -669.6798095703125, + "loss": 0.8871, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.6021971702575684, + "rewards/margins": 0.05020788311958313, + "rewards/rejected": -0.6524051427841187, + "step": 3020 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 8.71876049041748, + "learning_rate": 1.4379288025889969e-05, + "logits/chosen": -4.3381667137146, + "logits/rejected": -4.318350315093994, + "logps/chosen": -715.66455078125, + "logps/rejected": -674.4814453125, + "loss": 0.7505, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5548633933067322, + "rewards/margins": 0.2882435917854309, + "rewards/rejected": -0.8431070446968079, + "step": 3030 + }, + { + "epoch": 0.7082119976703553, + "grad_norm": 8.695144653320312, + "learning_rate": 1.4353398058252428e-05, + "logits/chosen": -4.258905410766602, + "logits/rejected": -4.291356086730957, + "logps/chosen": -682.4697875976562, + "logps/rejected": -735.2022705078125, + "loss": 0.8223, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5667489767074585, + "rewards/margins": 0.16179592907428741, + "rewards/rejected": -0.7285449504852295, + "step": 3040 + }, + { + "epoch": 0.7105416423995341, + "grad_norm": 8.749677658081055, + "learning_rate": 1.4327508090614888e-05, + "logits/chosen": -4.3128132820129395, + "logits/rejected": -4.305615425109863, + "logps/chosen": -707.4427490234375, + "logps/rejected": -725.2252807617188, + "loss": 0.6859, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2074352204799652, + "rewards/margins": 0.5990522503852844, + "rewards/rejected": -0.8064874410629272, + "step": 3050 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 4.48876953125, + "learning_rate": 1.4301618122977347e-05, + "logits/chosen": -4.397701263427734, + "logits/rejected": -4.405947208404541, + "logps/chosen": -702.0162963867188, + "logps/rejected": -730.04150390625, + "loss": 0.7349, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.35273998975753784, + "rewards/margins": 0.4463699460029602, + "rewards/rejected": -0.799109935760498, + "step": 3060 + }, + { + "epoch": 0.7152009318578917, + "grad_norm": 7.877706050872803, + "learning_rate": 1.4275728155339808e-05, + "logits/chosen": -4.2988691329956055, + "logits/rejected": -4.319767951965332, + "logps/chosen": -718.3621826171875, + "logps/rejected": -700.2260131835938, + "loss": 0.89, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6528037190437317, + "rewards/margins": 0.040874332189559937, + "rewards/rejected": -0.6936780214309692, + "step": 3070 + }, + { + "epoch": 0.7175305765870704, + "grad_norm": 8.941927909851074, + "learning_rate": 1.4249838187702267e-05, + "logits/chosen": -4.259885787963867, + "logits/rejected": -4.335204124450684, + "logps/chosen": -665.210205078125, + "logps/rejected": -732.52978515625, + "loss": 0.8301, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.5433850288391113, + "rewards/margins": 0.19039340317249298, + "rewards/rejected": -0.7337783575057983, + "step": 3080 + }, + { + "epoch": 0.7198602213162493, + "grad_norm": 9.396485328674316, + "learning_rate": 1.4223948220064725e-05, + "logits/chosen": -4.220810890197754, + "logits/rejected": -4.346644878387451, + "logps/chosen": -667.0452880859375, + "logps/rejected": -778.1810913085938, + "loss": 0.666, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5383417010307312, + "rewards/margins": 0.5022939443588257, + "rewards/rejected": -1.0406357049942017, + "step": 3090 + }, + { + "epoch": 0.7221898660454281, + "grad_norm": 10.992788314819336, + "learning_rate": 1.4198058252427184e-05, + "logits/chosen": -4.300179481506348, + "logits/rejected": -4.309948444366455, + "logps/chosen": -678.5414428710938, + "logps/rejected": -774.6533203125, + "loss": 0.6912, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5962709784507751, + "rewards/margins": 0.4656152129173279, + "rewards/rejected": -1.061886191368103, + "step": 3100 + }, + { + "epoch": 0.7221898660454281, + "eval_logits/chosen": -4.290231227874756, + "eval_logits/rejected": -4.278411865234375, + "eval_logps/chosen": -696.8681030273438, + "eval_logps/rejected": -716.00634765625, + "eval_loss": 0.6407871246337891, + "eval_rewards/accuracies": 0.6272015571594238, + "eval_rewards/chosen": -0.603983998298645, + "eval_rewards/margins": 0.3167959451675415, + "eval_rewards/rejected": -0.9207799434661865, + "eval_runtime": 388.9741, + "eval_samples_per_second": 18.392, + "eval_steps_per_second": 9.196, + "step": 3100 + }, + { + "epoch": 0.7245195107746069, + "grad_norm": 9.470890998840332, + "learning_rate": 1.4172168284789645e-05, + "logits/chosen": -4.371718406677246, + "logits/rejected": -4.242143154144287, + "logps/chosen": -762.9887084960938, + "logps/rejected": -700.8438110351562, + "loss": 0.7672, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5022997260093689, + "rewards/margins": 0.3725367486476898, + "rewards/rejected": -0.8748364448547363, + "step": 3110 + }, + { + "epoch": 0.7268491555037857, + "grad_norm": 7.6680521965026855, + "learning_rate": 1.4146278317152104e-05, + "logits/chosen": -4.336404323577881, + "logits/rejected": -4.37836217880249, + "logps/chosen": -727.9071044921875, + "logps/rejected": -763.2144775390625, + "loss": 0.7235, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3655966818332672, + "rewards/margins": 0.3294195234775543, + "rewards/rejected": -0.6950162649154663, + "step": 3120 + }, + { + "epoch": 0.7291788002329644, + "grad_norm": 6.3448991775512695, + "learning_rate": 1.4120388349514564e-05, + "logits/chosen": -4.234147548675537, + "logits/rejected": -4.275125503540039, + "logps/chosen": -657.5973510742188, + "logps/rejected": -754.4498291015625, + "loss": 0.8516, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6424176096916199, + "rewards/margins": 0.21146734058856964, + "rewards/rejected": -0.8538848757743835, + "step": 3130 + }, + { + "epoch": 0.7315084449621433, + "grad_norm": 7.723195552825928, + "learning_rate": 1.4094498381877025e-05, + "logits/chosen": -4.297318458557129, + "logits/rejected": -4.263772964477539, + "logps/chosen": -720.8884887695312, + "logps/rejected": -688.8799438476562, + "loss": 0.9713, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.759177029132843, + "rewards/margins": -0.07643283903598785, + "rewards/rejected": -0.6827441453933716, + "step": 3140 + }, + { + "epoch": 0.733838089691322, + "grad_norm": 8.816356658935547, + "learning_rate": 1.4068608414239484e-05, + "logits/chosen": -4.3540825843811035, + "logits/rejected": -4.330616474151611, + "logps/chosen": -720.6648559570312, + "logps/rejected": -727.0189208984375, + "loss": 0.6476, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4314247667789459, + "rewards/margins": 0.5888901948928833, + "rewards/rejected": -1.0203149318695068, + "step": 3150 + }, + { + "epoch": 0.7361677344205009, + "grad_norm": 7.842738151550293, + "learning_rate": 1.4042718446601944e-05, + "logits/chosen": -4.272993564605713, + "logits/rejected": -4.329249382019043, + "logps/chosen": -707.3156127929688, + "logps/rejected": -747.498291015625, + "loss": 0.6714, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5479978919029236, + "rewards/margins": 0.6574041247367859, + "rewards/rejected": -1.20540189743042, + "step": 3160 + }, + { + "epoch": 0.7384973791496797, + "grad_norm": 6.250893592834473, + "learning_rate": 1.4016828478964402e-05, + "logits/chosen": -4.374791145324707, + "logits/rejected": -4.3477301597595215, + "logps/chosen": -748.3688354492188, + "logps/rejected": -742.9495239257812, + "loss": 0.809, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6412941217422485, + "rewards/margins": 0.2639515995979309, + "rewards/rejected": -0.9052456021308899, + "step": 3170 + }, + { + "epoch": 0.7408270238788585, + "grad_norm": 6.362512588500977, + "learning_rate": 1.3990938511326862e-05, + "logits/chosen": -4.23079252243042, + "logits/rejected": -4.321897506713867, + "logps/chosen": -655.5142822265625, + "logps/rejected": -756.73681640625, + "loss": 0.7603, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6060266494750977, + "rewards/margins": 0.39030686020851135, + "rewards/rejected": -0.9963334798812866, + "step": 3180 + }, + { + "epoch": 0.7431566686080373, + "grad_norm": 8.123212814331055, + "learning_rate": 1.3965048543689321e-05, + "logits/chosen": -4.311104774475098, + "logits/rejected": -4.362451553344727, + "logps/chosen": -683.316162109375, + "logps/rejected": -717.951904296875, + "loss": 0.9672, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7517843246459961, + "rewards/margins": 0.05504413694143295, + "rewards/rejected": -0.8068283796310425, + "step": 3190 + }, + { + "epoch": 0.745486313337216, + "grad_norm": 9.257919311523438, + "learning_rate": 1.3939158576051782e-05, + "logits/chosen": -4.272576808929443, + "logits/rejected": -4.2962846755981445, + "logps/chosen": -701.0122680664062, + "logps/rejected": -724.0266723632812, + "loss": 0.8165, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5627827048301697, + "rewards/margins": 0.30343493819236755, + "rewards/rejected": -0.8662176132202148, + "step": 3200 + }, + { + "epoch": 0.745486313337216, + "eval_logits/chosen": -4.283018112182617, + "eval_logits/rejected": -4.270773410797119, + "eval_logps/chosen": -697.2401733398438, + "eval_logps/rejected": -716.45654296875, + "eval_loss": 0.6407615542411804, + "eval_rewards/accuracies": 0.6269220113754272, + "eval_rewards/chosen": -0.6411851048469543, + "eval_rewards/margins": 0.3246031105518341, + "eval_rewards/rejected": -0.9657881259918213, + "eval_runtime": 388.9661, + "eval_samples_per_second": 18.392, + "eval_steps_per_second": 9.196, + "step": 3200 + }, + { + "epoch": 0.7478159580663949, + "grad_norm": 11.506460189819336, + "learning_rate": 1.391326860841424e-05, + "logits/chosen": -4.346702575683594, + "logits/rejected": -4.339921951293945, + "logps/chosen": -722.6053466796875, + "logps/rejected": -769.1424560546875, + "loss": 0.8317, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7728465795516968, + "rewards/margins": 0.3128949999809265, + "rewards/rejected": -1.085741639137268, + "step": 3210 + }, + { + "epoch": 0.7501456027955736, + "grad_norm": 10.876580238342285, + "learning_rate": 1.3887378640776701e-05, + "logits/chosen": -4.32213020324707, + "logits/rejected": -4.19241189956665, + "logps/chosen": -683.1779174804688, + "logps/rejected": -726.1195068359375, + "loss": 0.8666, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7728645205497742, + "rewards/margins": 0.10151295363903046, + "rewards/rejected": -0.8743775486946106, + "step": 3220 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 5.669135093688965, + "learning_rate": 1.386148867313916e-05, + "logits/chosen": -4.335180282592773, + "logits/rejected": -4.385754108428955, + "logps/chosen": -704.3292236328125, + "logps/rejected": -790.5955200195312, + "loss": 0.7836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5992025136947632, + "rewards/margins": 0.3988838493824005, + "rewards/rejected": -0.9980863332748413, + "step": 3230 + }, + { + "epoch": 0.7548048922539313, + "grad_norm": 11.22128963470459, + "learning_rate": 1.3835598705501619e-05, + "logits/chosen": -4.41721248626709, + "logits/rejected": -4.297575950622559, + "logps/chosen": -755.6207275390625, + "logps/rejected": -690.0484619140625, + "loss": 0.9106, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.6340324878692627, + "rewards/margins": 0.04324830323457718, + "rewards/rejected": -0.6772807836532593, + "step": 3240 + }, + { + "epoch": 0.7571345369831101, + "grad_norm": 7.739163875579834, + "learning_rate": 1.3809708737864078e-05, + "logits/chosen": -4.304110527038574, + "logits/rejected": -4.264829158782959, + "logps/chosen": -683.7374877929688, + "logps/rejected": -689.4002685546875, + "loss": 0.8206, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6143852472305298, + "rewards/margins": 0.10701651871204376, + "rewards/rejected": -0.7214018106460571, + "step": 3250 + }, + { + "epoch": 0.7594641817122889, + "grad_norm": 6.535190105438232, + "learning_rate": 1.3783818770226538e-05, + "logits/chosen": -4.372353553771973, + "logits/rejected": -4.373166561126709, + "logps/chosen": -714.5510864257812, + "logps/rejected": -700.5277099609375, + "loss": 0.9644, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7022789120674133, + "rewards/margins": 0.01919153705239296, + "rewards/rejected": -0.7214704155921936, + "step": 3260 + }, + { + "epoch": 0.7617938264414676, + "grad_norm": 7.50679874420166, + "learning_rate": 1.3757928802588997e-05, + "logits/chosen": -4.37531852722168, + "logits/rejected": -4.408667087554932, + "logps/chosen": -674.136962890625, + "logps/rejected": -773.3675537109375, + "loss": 0.7061, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.518215537071228, + "rewards/margins": 0.5985016822814941, + "rewards/rejected": -1.1167173385620117, + "step": 3270 + }, + { + "epoch": 0.7641234711706465, + "grad_norm": 8.066099166870117, + "learning_rate": 1.3732038834951458e-05, + "logits/chosen": -4.234104156494141, + "logits/rejected": -4.255315780639648, + "logps/chosen": -681.3992919921875, + "logps/rejected": -715.1346435546875, + "loss": 0.7161, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4894762635231018, + "rewards/margins": 0.5325658917427063, + "rewards/rejected": -1.0220420360565186, + "step": 3280 + }, + { + "epoch": 0.7664531158998252, + "grad_norm": 8.645896911621094, + "learning_rate": 1.3706148867313918e-05, + "logits/chosen": -4.340358734130859, + "logits/rejected": -4.369351387023926, + "logps/chosen": -731.498291015625, + "logps/rejected": -842.1834106445312, + "loss": 0.6648, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5124009847640991, + "rewards/margins": 0.6522801518440247, + "rewards/rejected": -1.1646811962127686, + "step": 3290 + }, + { + "epoch": 0.7687827606290041, + "grad_norm": 8.283692359924316, + "learning_rate": 1.3680258899676377e-05, + "logits/chosen": -4.37287712097168, + "logits/rejected": -4.286995887756348, + "logps/chosen": -746.0880126953125, + "logps/rejected": -711.8284301757812, + "loss": 0.7196, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6805142164230347, + "rewards/margins": 0.4170430302619934, + "rewards/rejected": -1.0975573062896729, + "step": 3300 + }, + { + "epoch": 0.7687827606290041, + "eval_logits/chosen": -4.292080879211426, + "eval_logits/rejected": -4.279994010925293, + "eval_logps/chosen": -697.4268798828125, + "eval_logps/rejected": -716.6896362304688, + "eval_loss": 0.6396226286888123, + "eval_rewards/accuracies": 0.6272015571594238, + "eval_rewards/chosen": -0.6598604917526245, + "eval_rewards/margins": 0.32923582196235657, + "eval_rewards/rejected": -0.9890962839126587, + "eval_runtime": 389.3547, + "eval_samples_per_second": 18.374, + "eval_steps_per_second": 9.187, + "step": 3300 + }, + { + "epoch": 0.7711124053581829, + "grad_norm": 7.7540740966796875, + "learning_rate": 1.3654368932038834e-05, + "logits/chosen": -4.363582134246826, + "logits/rejected": -4.354387283325195, + "logps/chosen": -697.3382568359375, + "logps/rejected": -747.3855590820312, + "loss": 0.7666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5363968014717102, + "rewards/margins": 0.40406447649002075, + "rewards/rejected": -0.940461277961731, + "step": 3310 + }, + { + "epoch": 0.7734420500873617, + "grad_norm": 9.846510887145996, + "learning_rate": 1.3628478964401295e-05, + "logits/chosen": -4.361764430999756, + "logits/rejected": -4.345663070678711, + "logps/chosen": -691.598388671875, + "logps/rejected": -733.6590576171875, + "loss": 0.8794, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7674421072006226, + "rewards/margins": 0.21453304588794708, + "rewards/rejected": -0.9819751977920532, + "step": 3320 + }, + { + "epoch": 0.7757716948165405, + "grad_norm": 5.368443489074707, + "learning_rate": 1.3602588996763756e-05, + "logits/chosen": -4.414107799530029, + "logits/rejected": -4.219078540802002, + "logps/chosen": -830.5812377929688, + "logps/rejected": -738.1531982421875, + "loss": 0.779, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6906629204750061, + "rewards/margins": 0.3374762237071991, + "rewards/rejected": -1.0281391143798828, + "step": 3330 + }, + { + "epoch": 0.7781013395457193, + "grad_norm": 8.01265811920166, + "learning_rate": 1.3576699029126214e-05, + "logits/chosen": -4.282806396484375, + "logits/rejected": -4.28933048248291, + "logps/chosen": -692.3564453125, + "logps/rejected": -672.9450073242188, + "loss": 0.8813, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7251057028770447, + "rewards/margins": 0.05957312509417534, + "rewards/rejected": -0.7846788167953491, + "step": 3340 + }, + { + "epoch": 0.7804309842748981, + "grad_norm": 9.943344116210938, + "learning_rate": 1.3550809061488675e-05, + "logits/chosen": -4.298067569732666, + "logits/rejected": -4.34145975112915, + "logps/chosen": -717.462890625, + "logps/rejected": -672.3321533203125, + "loss": 0.7298, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48818421363830566, + "rewards/margins": 0.4551779627799988, + "rewards/rejected": -0.9433622360229492, + "step": 3350 + }, + { + "epoch": 0.7827606290040768, + "grad_norm": 9.782580375671387, + "learning_rate": 1.3524919093851134e-05, + "logits/chosen": -4.281432151794434, + "logits/rejected": -4.373194694519043, + "logps/chosen": -691.5526123046875, + "logps/rejected": -751.7589111328125, + "loss": 0.806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5640980005264282, + "rewards/margins": 0.3025267422199249, + "rewards/rejected": -0.8666247129440308, + "step": 3360 + }, + { + "epoch": 0.7850902737332557, + "grad_norm": 11.208285331726074, + "learning_rate": 1.3499029126213594e-05, + "logits/chosen": -4.372630596160889, + "logits/rejected": -4.34097146987915, + "logps/chosen": -699.0808715820312, + "logps/rejected": -725.1157836914062, + "loss": 0.826, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6243829131126404, + "rewards/margins": 0.31237927079200745, + "rewards/rejected": -0.9367621541023254, + "step": 3370 + }, + { + "epoch": 0.7874199184624345, + "grad_norm": 11.51611614227295, + "learning_rate": 1.3473139158576052e-05, + "logits/chosen": -4.39542293548584, + "logits/rejected": -4.369442939758301, + "logps/chosen": -714.1785278320312, + "logps/rejected": -723.6585083007812, + "loss": 0.7919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3786347508430481, + "rewards/margins": 0.3001879155635834, + "rewards/rejected": -0.6788226962089539, + "step": 3380 + }, + { + "epoch": 0.7897495631916133, + "grad_norm": 8.331809997558594, + "learning_rate": 1.3447249190938512e-05, + "logits/chosen": -4.329986572265625, + "logits/rejected": -4.317460536956787, + "logps/chosen": -695.0831298828125, + "logps/rejected": -723.6203002929688, + "loss": 0.6898, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5646909475326538, + "rewards/margins": 0.5181136727333069, + "rewards/rejected": -1.082804560661316, + "step": 3390 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 8.896123886108398, + "learning_rate": 1.3421359223300971e-05, + "logits/chosen": -4.404135704040527, + "logits/rejected": -4.331064701080322, + "logps/chosen": -750.0504760742188, + "logps/rejected": -708.2687377929688, + "loss": 0.6001, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20269541442394257, + "rewards/margins": 0.699763834476471, + "rewards/rejected": -0.9024591445922852, + "step": 3400 + }, + { + "epoch": 0.7920792079207921, + "eval_logits/chosen": -4.312615394592285, + "eval_logits/rejected": -4.3008599281311035, + "eval_logps/chosen": -696.9474487304688, + "eval_logps/rejected": -716.1688842773438, + "eval_loss": 0.6369568109512329, + "eval_rewards/accuracies": 0.6323735117912292, + "eval_rewards/chosen": -0.6119146347045898, + "eval_rewards/margins": 0.32510802149772644, + "eval_rewards/rejected": -0.9370226263999939, + "eval_runtime": 389.7133, + "eval_samples_per_second": 18.357, + "eval_steps_per_second": 9.179, + "step": 3400 + }, + { + "epoch": 0.7944088526499709, + "grad_norm": 12.664941787719727, + "learning_rate": 1.3395469255663432e-05, + "logits/chosen": -4.360062599182129, + "logits/rejected": -4.405812740325928, + "logps/chosen": -747.7235717773438, + "logps/rejected": -776.3199462890625, + "loss": 0.8625, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7826858758926392, + "rewards/margins": 0.23854394257068634, + "rewards/rejected": -1.0212297439575195, + "step": 3410 + }, + { + "epoch": 0.7967384973791497, + "grad_norm": 6.154604434967041, + "learning_rate": 1.336957928802589e-05, + "logits/chosen": -4.380074501037598, + "logits/rejected": -4.334290504455566, + "logps/chosen": -823.052734375, + "logps/rejected": -751.04443359375, + "loss": 0.883, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6692854166030884, + "rewards/margins": 0.17931106686592102, + "rewards/rejected": -0.8485965728759766, + "step": 3420 + }, + { + "epoch": 0.7990681421083284, + "grad_norm": 9.303884506225586, + "learning_rate": 1.3343689320388351e-05, + "logits/chosen": -4.323749542236328, + "logits/rejected": -4.386262893676758, + "logps/chosen": -687.6109619140625, + "logps/rejected": -718.0032348632812, + "loss": 0.8712, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7120986580848694, + "rewards/margins": 0.1205534115433693, + "rewards/rejected": -0.83265221118927, + "step": 3430 + }, + { + "epoch": 0.8013977868375073, + "grad_norm": 6.182861328125, + "learning_rate": 1.3317799352750812e-05, + "logits/chosen": -4.345360279083252, + "logits/rejected": -4.34683895111084, + "logps/chosen": -705.4597778320312, + "logps/rejected": -755.2239990234375, + "loss": 0.6721, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.40056854486465454, + "rewards/margins": 0.5601202845573425, + "rewards/rejected": -0.9606888890266418, + "step": 3440 + }, + { + "epoch": 0.8037274315666861, + "grad_norm": 10.787919998168945, + "learning_rate": 1.329190938511327e-05, + "logits/chosen": -4.387537956237793, + "logits/rejected": -4.398334503173828, + "logps/chosen": -703.9459838867188, + "logps/rejected": -728.9833374023438, + "loss": 0.8532, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7066787481307983, + "rewards/margins": 0.12415404617786407, + "rewards/rejected": -0.8308327794075012, + "step": 3450 + }, + { + "epoch": 0.8060570762958649, + "grad_norm": 11.048896789550781, + "learning_rate": 1.3266019417475728e-05, + "logits/chosen": -4.34719705581665, + "logits/rejected": -4.3350510597229, + "logps/chosen": -675.7979736328125, + "logps/rejected": -711.3985595703125, + "loss": 0.8836, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6477685570716858, + "rewards/margins": 0.05750611424446106, + "rewards/rejected": -0.7052747011184692, + "step": 3460 + }, + { + "epoch": 0.8083867210250437, + "grad_norm": 7.28444242477417, + "learning_rate": 1.3240129449838188e-05, + "logits/chosen": -4.3619279861450195, + "logits/rejected": -4.376542568206787, + "logps/chosen": -731.7483520507812, + "logps/rejected": -795.55859375, + "loss": 0.8076, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4643213748931885, + "rewards/margins": 0.3656710982322693, + "rewards/rejected": -0.829992413520813, + "step": 3470 + }, + { + "epoch": 0.8107163657542225, + "grad_norm": 6.876546382904053, + "learning_rate": 1.3214239482200649e-05, + "logits/chosen": -4.338783264160156, + "logits/rejected": -4.366893768310547, + "logps/chosen": -692.851806640625, + "logps/rejected": -746.337890625, + "loss": 0.6853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5015130639076233, + "rewards/margins": 0.5323377251625061, + "rewards/rejected": -1.033850908279419, + "step": 3480 + }, + { + "epoch": 0.8130460104834013, + "grad_norm": 7.1680707931518555, + "learning_rate": 1.3188349514563108e-05, + "logits/chosen": -4.326290607452393, + "logits/rejected": -4.401350021362305, + "logps/chosen": -669.87353515625, + "logps/rejected": -743.699462890625, + "loss": 0.7384, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6359532475471497, + "rewards/margins": 0.24711358547210693, + "rewards/rejected": -0.8830667734146118, + "step": 3490 + }, + { + "epoch": 0.81537565521258, + "grad_norm": 7.081179618835449, + "learning_rate": 1.3162459546925568e-05, + "logits/chosen": -4.389810085296631, + "logits/rejected": -4.387526035308838, + "logps/chosen": -713.7439575195312, + "logps/rejected": -774.56298828125, + "loss": 0.8193, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6019744277000427, + "rewards/margins": 0.28494006395339966, + "rewards/rejected": -0.8869145512580872, + "step": 3500 + }, + { + "epoch": 0.81537565521258, + "eval_logits/chosen": -4.3075971603393555, + "eval_logits/rejected": -4.2960638999938965, + "eval_logps/chosen": -696.4827880859375, + "eval_logps/rejected": -715.6641845703125, + "eval_loss": 0.6347267031669617, + "eval_rewards/accuracies": 0.6347497701644897, + "eval_rewards/chosen": -0.5654566884040833, + "eval_rewards/margins": 0.32109642028808594, + "eval_rewards/rejected": -0.8865532279014587, + "eval_runtime": 389.244, + "eval_samples_per_second": 18.379, + "eval_steps_per_second": 9.19, + "step": 3500 + }, + { + "epoch": 0.8177052999417589, + "grad_norm": 8.20933723449707, + "learning_rate": 1.3136569579288027e-05, + "logits/chosen": -4.342705726623535, + "logits/rejected": -4.373048305511475, + "logps/chosen": -630.05078125, + "logps/rejected": -686.9644165039062, + "loss": 0.7903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4095194339752197, + "rewards/margins": 0.2709464430809021, + "rewards/rejected": -0.680465817451477, + "step": 3510 + }, + { + "epoch": 0.8200349446709376, + "grad_norm": 7.260538578033447, + "learning_rate": 1.3110679611650488e-05, + "logits/chosen": -4.398225784301758, + "logits/rejected": -4.343891143798828, + "logps/chosen": -702.5809936523438, + "logps/rejected": -723.8179321289062, + "loss": 0.9465, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.8503254055976868, + "rewards/margins": -0.010509604588150978, + "rewards/rejected": -0.8398159146308899, + "step": 3520 + }, + { + "epoch": 0.8223645894001165, + "grad_norm": 8.280593872070312, + "learning_rate": 1.3084789644012945e-05, + "logits/chosen": -4.371305465698242, + "logits/rejected": -4.4022722244262695, + "logps/chosen": -680.5316162109375, + "logps/rejected": -692.623046875, + "loss": 0.7366, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4734654426574707, + "rewards/margins": 0.33583229780197144, + "rewards/rejected": -0.8092976808547974, + "step": 3530 + }, + { + "epoch": 0.8246942341292953, + "grad_norm": 9.60183048248291, + "learning_rate": 1.3058899676375406e-05, + "logits/chosen": -4.415801048278809, + "logits/rejected": -4.346750259399414, + "logps/chosen": -799.20361328125, + "logps/rejected": -742.3433837890625, + "loss": 0.9304, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6391052603721619, + "rewards/margins": 0.060576051473617554, + "rewards/rejected": -0.6996814012527466, + "step": 3540 + }, + { + "epoch": 0.8270238788584741, + "grad_norm": 6.183422088623047, + "learning_rate": 1.3033009708737864e-05, + "logits/chosen": -4.35550594329834, + "logits/rejected": -4.362612724304199, + "logps/chosen": -713.9013671875, + "logps/rejected": -729.5386352539062, + "loss": 0.7412, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4271429479122162, + "rewards/margins": 0.41919174790382385, + "rewards/rejected": -0.8463346362113953, + "step": 3550 + }, + { + "epoch": 0.8293535235876529, + "grad_norm": 10.527793884277344, + "learning_rate": 1.3007119741100325e-05, + "logits/chosen": -4.403775215148926, + "logits/rejected": -4.371069431304932, + "logps/chosen": -736.4136352539062, + "logps/rejected": -757.3355712890625, + "loss": 0.8426, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5583220720291138, + "rewards/margins": 0.11188054084777832, + "rewards/rejected": -0.6702027320861816, + "step": 3560 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 5.496523857116699, + "learning_rate": 1.2981229773462784e-05, + "logits/chosen": -4.342892646789551, + "logits/rejected": -4.3970513343811035, + "logps/chosen": -684.6142578125, + "logps/rejected": -785.8450927734375, + "loss": 0.76, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.35788896679878235, + "rewards/margins": 0.4052864909172058, + "rewards/rejected": -0.7631754279136658, + "step": 3570 + }, + { + "epoch": 0.8340128130460105, + "grad_norm": 6.381747245788574, + "learning_rate": 1.2955339805825244e-05, + "logits/chosen": -4.360800743103027, + "logits/rejected": -4.352953910827637, + "logps/chosen": -691.8085327148438, + "logps/rejected": -736.9182739257812, + "loss": 0.7469, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47531062364578247, + "rewards/margins": 0.36524698138237, + "rewards/rejected": -0.8405576944351196, + "step": 3580 + }, + { + "epoch": 0.8363424577751892, + "grad_norm": 10.559829711914062, + "learning_rate": 1.2929449838187705e-05, + "logits/chosen": -4.344983100891113, + "logits/rejected": -4.416808128356934, + "logps/chosen": -693.577880859375, + "logps/rejected": -742.8538818359375, + "loss": 0.7386, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4921267628669739, + "rewards/margins": 0.42470502853393555, + "rewards/rejected": -0.9168317914009094, + "step": 3590 + }, + { + "epoch": 0.8386721025043681, + "grad_norm": 4.188501834869385, + "learning_rate": 1.2903559870550162e-05, + "logits/chosen": -4.345074653625488, + "logits/rejected": -4.432697772979736, + "logps/chosen": -746.7095947265625, + "logps/rejected": -737.2679443359375, + "loss": 0.7706, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5077939033508301, + "rewards/margins": 0.286722332239151, + "rewards/rejected": -0.7945162653923035, + "step": 3600 + }, + { + "epoch": 0.8386721025043681, + "eval_logits/chosen": -4.317319393157959, + "eval_logits/rejected": -4.3060455322265625, + "eval_logps/chosen": -696.902587890625, + "eval_logps/rejected": -716.1546020507812, + "eval_loss": 0.6351374387741089, + "eval_rewards/accuracies": 0.6350293755531311, + "eval_rewards/chosen": -0.6074322462081909, + "eval_rewards/margins": 0.3281627297401428, + "eval_rewards/rejected": -0.9355949759483337, + "eval_runtime": 390.3144, + "eval_samples_per_second": 18.329, + "eval_steps_per_second": 9.164, + "step": 3600 + }, + { + "epoch": 0.8410017472335469, + "grad_norm": 7.669933795928955, + "learning_rate": 1.2877669902912621e-05, + "logits/chosen": -4.36743688583374, + "logits/rejected": -4.304625511169434, + "logps/chosen": -701.3809814453125, + "logps/rejected": -658.0457763671875, + "loss": 0.7874, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5979401469230652, + "rewards/margins": 0.18311890959739685, + "rewards/rejected": -0.7810591459274292, + "step": 3610 + }, + { + "epoch": 0.8433313919627257, + "grad_norm": 5.987900733947754, + "learning_rate": 1.2851779935275082e-05, + "logits/chosen": -4.3112688064575195, + "logits/rejected": -4.322850227355957, + "logps/chosen": -705.4276123046875, + "logps/rejected": -719.9669189453125, + "loss": 0.7801, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4701387882232666, + "rewards/margins": 0.3039657473564148, + "rewards/rejected": -0.7741045355796814, + "step": 3620 + }, + { + "epoch": 0.8456610366919045, + "grad_norm": 4.629632949829102, + "learning_rate": 1.2825889967637542e-05, + "logits/chosen": -4.413471221923828, + "logits/rejected": -4.342827320098877, + "logps/chosen": -696.3890991210938, + "logps/rejected": -672.6505126953125, + "loss": 0.8676, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7070828676223755, + "rewards/margins": 0.12928228080272675, + "rewards/rejected": -0.8363651037216187, + "step": 3630 + }, + { + "epoch": 0.8479906814210832, + "grad_norm": 8.022156715393066, + "learning_rate": 1.2800000000000001e-05, + "logits/chosen": -4.351454734802246, + "logits/rejected": -4.433553218841553, + "logps/chosen": -638.5418701171875, + "logps/rejected": -781.3585205078125, + "loss": 0.744, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.428017795085907, + "rewards/margins": 0.3672192096710205, + "rewards/rejected": -0.7952369451522827, + "step": 3640 + }, + { + "epoch": 0.8503203261502621, + "grad_norm": 6.858470439910889, + "learning_rate": 1.2774110032362462e-05, + "logits/chosen": -4.435838222503662, + "logits/rejected": -4.332773208618164, + "logps/chosen": -720.262939453125, + "logps/rejected": -651.93701171875, + "loss": 0.6819, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.562780499458313, + "rewards/margins": 0.36187881231307983, + "rewards/rejected": -0.924659252166748, + "step": 3650 + }, + { + "epoch": 0.8526499708794408, + "grad_norm": 8.157905578613281, + "learning_rate": 1.274822006472492e-05, + "logits/chosen": -4.371639251708984, + "logits/rejected": -4.241855621337891, + "logps/chosen": -706.9839477539062, + "logps/rejected": -693.9249267578125, + "loss": 0.8243, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6309869885444641, + "rewards/margins": 0.22677846252918243, + "rewards/rejected": -0.8577653765678406, + "step": 3660 + }, + { + "epoch": 0.8549796156086197, + "grad_norm": 7.720125198364258, + "learning_rate": 1.272233009708738e-05, + "logits/chosen": -4.362655162811279, + "logits/rejected": -4.343399524688721, + "logps/chosen": -693.0546875, + "logps/rejected": -795.5716552734375, + "loss": 0.7693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6363159418106079, + "rewards/margins": 0.2812911570072174, + "rewards/rejected": -0.9176071286201477, + "step": 3670 + }, + { + "epoch": 0.8573092603377985, + "grad_norm": 4.946926116943359, + "learning_rate": 1.2696440129449838e-05, + "logits/chosen": -4.334009170532227, + "logits/rejected": -4.393346309661865, + "logps/chosen": -661.3502197265625, + "logps/rejected": -723.93505859375, + "loss": 0.6161, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3838418126106262, + "rewards/margins": 0.6862874031066895, + "rewards/rejected": -1.0701292753219604, + "step": 3680 + }, + { + "epoch": 0.8596389050669773, + "grad_norm": 7.175718307495117, + "learning_rate": 1.2670550161812299e-05, + "logits/chosen": -4.439593315124512, + "logits/rejected": -4.346892356872559, + "logps/chosen": -750.6116943359375, + "logps/rejected": -738.089599609375, + "loss": 0.7567, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6906553506851196, + "rewards/margins": 0.38243815302848816, + "rewards/rejected": -1.0730934143066406, + "step": 3690 + }, + { + "epoch": 0.8619685497961561, + "grad_norm": 3.485154867172241, + "learning_rate": 1.2644660194174758e-05, + "logits/chosen": -4.373470783233643, + "logits/rejected": -4.380204200744629, + "logps/chosen": -722.3649291992188, + "logps/rejected": -786.5294189453125, + "loss": 0.7397, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6116902232170105, + "rewards/margins": 0.5988327860832214, + "rewards/rejected": -1.2105228900909424, + "step": 3700 + }, + { + "epoch": 0.8619685497961561, + "eval_logits/chosen": -4.296970367431641, + "eval_logits/rejected": -4.285056114196777, + "eval_logps/chosen": -697.7417602539062, + "eval_logps/rejected": -717.1804809570312, + "eval_loss": 0.6377041935920715, + "eval_rewards/accuracies": 0.6340509057044983, + "eval_rewards/chosen": -0.6913568377494812, + "eval_rewards/margins": 0.34682154655456543, + "eval_rewards/rejected": -1.0381783246994019, + "eval_runtime": 390.2544, + "eval_samples_per_second": 18.332, + "eval_steps_per_second": 9.166, + "step": 3700 + }, + { + "epoch": 0.8642981945253349, + "grad_norm": 9.30069351196289, + "learning_rate": 1.2618770226537218e-05, + "logits/chosen": -4.300488471984863, + "logits/rejected": -4.226680755615234, + "logps/chosen": -686.4176025390625, + "logps/rejected": -642.7655639648438, + "loss": 0.8453, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8403793573379517, + "rewards/margins": 0.18287669122219086, + "rewards/rejected": -1.0232560634613037, + "step": 3710 + }, + { + "epoch": 0.8666278392545137, + "grad_norm": 10.053071975708008, + "learning_rate": 1.2592880258899677e-05, + "logits/chosen": -4.428006172180176, + "logits/rejected": -4.311192512512207, + "logps/chosen": -745.8179931640625, + "logps/rejected": -731.64208984375, + "loss": 0.7535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.61701500415802, + "rewards/margins": 0.3933693468570709, + "rewards/rejected": -1.0103843212127686, + "step": 3720 + }, + { + "epoch": 0.8689574839836924, + "grad_norm": 10.565129280090332, + "learning_rate": 1.2566990291262138e-05, + "logits/chosen": -4.2700724601745605, + "logits/rejected": -4.286831855773926, + "logps/chosen": -701.6298828125, + "logps/rejected": -780.7486572265625, + "loss": 0.8448, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6534292101860046, + "rewards/margins": 0.44209614396095276, + "rewards/rejected": -1.0955253839492798, + "step": 3730 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 6.875129699707031, + "learning_rate": 1.2541100323624595e-05, + "logits/chosen": -4.282231330871582, + "logits/rejected": -4.344172477722168, + "logps/chosen": -733.3506469726562, + "logps/rejected": -777.8792724609375, + "loss": 0.77, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7044860124588013, + "rewards/margins": 0.4850228428840637, + "rewards/rejected": -1.1895086765289307, + "step": 3740 + }, + { + "epoch": 0.8736167734420501, + "grad_norm": 3.8528778553009033, + "learning_rate": 1.2515210355987056e-05, + "logits/chosen": -4.3902411460876465, + "logits/rejected": -4.359185218811035, + "logps/chosen": -715.0982666015625, + "logps/rejected": -738.7059326171875, + "loss": 0.832, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6878668069839478, + "rewards/margins": 0.247682124376297, + "rewards/rejected": -0.9355489015579224, + "step": 3750 + }, + { + "epoch": 0.8759464181712289, + "grad_norm": 11.82832145690918, + "learning_rate": 1.2489320388349514e-05, + "logits/chosen": -4.367011070251465, + "logits/rejected": -4.325486183166504, + "logps/chosen": -674.1287841796875, + "logps/rejected": -743.0347900390625, + "loss": 0.7699, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.662690281867981, + "rewards/margins": 0.4709756374359131, + "rewards/rejected": -1.1336658000946045, + "step": 3760 + }, + { + "epoch": 0.8782760629004077, + "grad_norm": 6.591240406036377, + "learning_rate": 1.2463430420711975e-05, + "logits/chosen": -4.304961204528809, + "logits/rejected": -4.336878776550293, + "logps/chosen": -675.7276000976562, + "logps/rejected": -756.3129272460938, + "loss": 0.738, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5121418833732605, + "rewards/margins": 0.4193623661994934, + "rewards/rejected": -0.9315041303634644, + "step": 3770 + }, + { + "epoch": 0.8806057076295865, + "grad_norm": 8.733416557312012, + "learning_rate": 1.2437540453074436e-05, + "logits/chosen": -4.301700592041016, + "logits/rejected": -4.304194450378418, + "logps/chosen": -664.4769287109375, + "logps/rejected": -685.9580078125, + "loss": 0.8389, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.555719256401062, + "rewards/margins": 0.24690046906471252, + "rewards/rejected": -0.8026197552680969, + "step": 3780 + }, + { + "epoch": 0.8829353523587653, + "grad_norm": 7.504303455352783, + "learning_rate": 1.2411650485436894e-05, + "logits/chosen": -4.322279453277588, + "logits/rejected": -4.386387825012207, + "logps/chosen": -702.5224609375, + "logps/rejected": -734.0633544921875, + "loss": 0.683, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4761506915092468, + "rewards/margins": 0.6575405597686768, + "rewards/rejected": -1.1336911916732788, + "step": 3790 + }, + { + "epoch": 0.885264997087944, + "grad_norm": 4.952593803405762, + "learning_rate": 1.2385760517799355e-05, + "logits/chosen": -4.264891624450684, + "logits/rejected": -4.3359832763671875, + "logps/chosen": -690.0963745117188, + "logps/rejected": -771.6055908203125, + "loss": 0.7332, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6271868944168091, + "rewards/margins": 0.535195529460907, + "rewards/rejected": -1.1623823642730713, + "step": 3800 + }, + { + "epoch": 0.885264997087944, + "eval_logits/chosen": -4.289893627166748, + "eval_logits/rejected": -4.277091979980469, + "eval_logps/chosen": -697.6959228515625, + "eval_logps/rejected": -717.1419067382812, + "eval_loss": 0.637164831161499, + "eval_rewards/accuracies": 0.6327928304672241, + "eval_rewards/chosen": -0.6867589354515076, + "eval_rewards/margins": 0.3475603461265564, + "eval_rewards/rejected": -1.034319281578064, + "eval_runtime": 390.115, + "eval_samples_per_second": 18.338, + "eval_steps_per_second": 9.169, + "step": 3800 + }, + { + "epoch": 0.8875946418171229, + "grad_norm": 11.472143173217773, + "learning_rate": 1.2359870550161814e-05, + "logits/chosen": -4.364859580993652, + "logits/rejected": -4.3472161293029785, + "logps/chosen": -775.5863037109375, + "logps/rejected": -826.8441162109375, + "loss": 0.8701, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7601150870323181, + "rewards/margins": 0.15106992423534393, + "rewards/rejected": -0.911185085773468, + "step": 3810 + }, + { + "epoch": 0.8899242865463017, + "grad_norm": 7.065751075744629, + "learning_rate": 1.2333980582524273e-05, + "logits/chosen": -4.3581085205078125, + "logits/rejected": -4.301484107971191, + "logps/chosen": -761.5343627929688, + "logps/rejected": -733.0517578125, + "loss": 0.7902, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6210321187973022, + "rewards/margins": 0.3615582585334778, + "rewards/rejected": -0.9825904965400696, + "step": 3820 + }, + { + "epoch": 0.8922539312754805, + "grad_norm": 8.05189323425293, + "learning_rate": 1.2308090614886732e-05, + "logits/chosen": -4.315977096557617, + "logits/rejected": -4.266562461853027, + "logps/chosen": -676.1387329101562, + "logps/rejected": -718.7037353515625, + "loss": 0.7636, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.44627365469932556, + "rewards/margins": 0.37797656655311584, + "rewards/rejected": -0.8242501020431519, + "step": 3830 + }, + { + "epoch": 0.8945835760046593, + "grad_norm": 9.934157371520996, + "learning_rate": 1.2282200647249192e-05, + "logits/chosen": -4.352757930755615, + "logits/rejected": -4.351375102996826, + "logps/chosen": -721.6439819335938, + "logps/rejected": -761.50146484375, + "loss": 0.9771, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.9130200147628784, + "rewards/margins": 0.052966028451919556, + "rewards/rejected": -0.9659860730171204, + "step": 3840 + }, + { + "epoch": 0.8969132207338381, + "grad_norm": 6.56084680557251, + "learning_rate": 1.2256310679611651e-05, + "logits/chosen": -4.260069847106934, + "logits/rejected": -4.327987194061279, + "logps/chosen": -745.3139038085938, + "logps/rejected": -761.8585815429688, + "loss": 0.8553, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.9242075085639954, + "rewards/margins": 0.1127476915717125, + "rewards/rejected": -1.0369552373886108, + "step": 3850 + }, + { + "epoch": 0.8992428654630169, + "grad_norm": 7.803834438323975, + "learning_rate": 1.2230420711974112e-05, + "logits/chosen": -4.303910255432129, + "logits/rejected": -4.297524452209473, + "logps/chosen": -769.2096557617188, + "logps/rejected": -766.2592163085938, + "loss": 0.7203, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5617278218269348, + "rewards/margins": 0.6413966417312622, + "rewards/rejected": -1.2031244039535522, + "step": 3860 + }, + { + "epoch": 0.9015725101921956, + "grad_norm": 9.433052062988281, + "learning_rate": 1.220453074433657e-05, + "logits/chosen": -4.305348873138428, + "logits/rejected": -4.278184413909912, + "logps/chosen": -722.5424194335938, + "logps/rejected": -740.0787353515625, + "loss": 0.8749, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7318505644798279, + "rewards/margins": 0.2239595353603363, + "rewards/rejected": -0.9558101892471313, + "step": 3870 + }, + { + "epoch": 0.9039021549213745, + "grad_norm": 4.370671272277832, + "learning_rate": 1.2178640776699031e-05, + "logits/chosen": -4.366055488586426, + "logits/rejected": -4.3162760734558105, + "logps/chosen": -765.7427978515625, + "logps/rejected": -722.709228515625, + "loss": 0.7788, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.49548083543777466, + "rewards/margins": 0.2860412001609802, + "rewards/rejected": -0.7815219759941101, + "step": 3880 + }, + { + "epoch": 0.9062317996505533, + "grad_norm": 7.877103328704834, + "learning_rate": 1.2152750809061488e-05, + "logits/chosen": -4.199087142944336, + "logits/rejected": -4.245467185974121, + "logps/chosen": -690.6033935546875, + "logps/rejected": -745.9832153320312, + "loss": 0.7376, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4985593259334564, + "rewards/margins": 0.4773792624473572, + "rewards/rejected": -0.9759386777877808, + "step": 3890 + }, + { + "epoch": 0.9085614443797321, + "grad_norm": 6.403533935546875, + "learning_rate": 1.2126860841423949e-05, + "logits/chosen": -4.394805431365967, + "logits/rejected": -4.327023506164551, + "logps/chosen": -719.85791015625, + "logps/rejected": -745.8244018554688, + "loss": 0.6818, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5105542540550232, + "rewards/margins": 0.5197178721427917, + "rewards/rejected": -1.030272126197815, + "step": 3900 + }, + { + "epoch": 0.9085614443797321, + "eval_logits/chosen": -4.285029411315918, + "eval_logits/rejected": -4.271745204925537, + "eval_logps/chosen": -697.7550048828125, + "eval_logps/rejected": -717.2332763671875, + "eval_loss": 0.6361972093582153, + "eval_rewards/accuracies": 0.634889543056488, + "eval_rewards/chosen": -0.6926815509796143, + "eval_rewards/margins": 0.35078319907188416, + "eval_rewards/rejected": -1.0434646606445312, + "eval_runtime": 390.7908, + "eval_samples_per_second": 18.306, + "eval_steps_per_second": 9.153, + "step": 3900 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 5.553112030029297, + "learning_rate": 1.2100970873786408e-05, + "logits/chosen": -4.275856018066406, + "logits/rejected": -4.270997047424316, + "logps/chosen": -668.0219116210938, + "logps/rejected": -684.3800048828125, + "loss": 0.7816, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5604637861251831, + "rewards/margins": 0.280496746301651, + "rewards/rejected": -0.8409605026245117, + "step": 3910 + }, + { + "epoch": 0.9132207338380897, + "grad_norm": 7.20487117767334, + "learning_rate": 1.2075080906148868e-05, + "logits/chosen": -4.234086990356445, + "logits/rejected": -4.341015815734863, + "logps/chosen": -652.541259765625, + "logps/rejected": -708.7343139648438, + "loss": 0.7336, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36917343735694885, + "rewards/margins": 0.36028575897216797, + "rewards/rejected": -0.7294591665267944, + "step": 3920 + }, + { + "epoch": 0.9155503785672685, + "grad_norm": 6.224857807159424, + "learning_rate": 1.2049190938511329e-05, + "logits/chosen": -4.316666603088379, + "logits/rejected": -4.31832218170166, + "logps/chosen": -700.3021850585938, + "logps/rejected": -748.1549682617188, + "loss": 0.682, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6771694421768188, + "rewards/margins": 0.3864041566848755, + "rewards/rejected": -1.0635735988616943, + "step": 3930 + }, + { + "epoch": 0.9178800232964472, + "grad_norm": 9.819120407104492, + "learning_rate": 1.2023300970873788e-05, + "logits/chosen": -4.288181304931641, + "logits/rejected": -4.2603254318237305, + "logps/chosen": -701.2803955078125, + "logps/rejected": -651.3552856445312, + "loss": 0.8137, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7791440486907959, + "rewards/margins": 0.28175079822540283, + "rewards/rejected": -1.0608948469161987, + "step": 3940 + }, + { + "epoch": 0.9202096680256261, + "grad_norm": 6.487390995025635, + "learning_rate": 1.1997411003236248e-05, + "logits/chosen": -4.354862689971924, + "logits/rejected": -4.413088798522949, + "logps/chosen": -704.1953125, + "logps/rejected": -725.8970947265625, + "loss": 0.686, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6075937747955322, + "rewards/margins": 0.45311158895492554, + "rewards/rejected": -1.0607054233551025, + "step": 3950 + }, + { + "epoch": 0.9225393127548049, + "grad_norm": 7.161795616149902, + "learning_rate": 1.1971521035598706e-05, + "logits/chosen": -4.353941917419434, + "logits/rejected": -4.300901412963867, + "logps/chosen": -712.5975341796875, + "logps/rejected": -664.6988525390625, + "loss": 0.6314, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5916818380355835, + "rewards/margins": 0.6264206171035767, + "rewards/rejected": -1.2181024551391602, + "step": 3960 + }, + { + "epoch": 0.9248689574839837, + "grad_norm": 7.680700778961182, + "learning_rate": 1.1945631067961166e-05, + "logits/chosen": -4.289122581481934, + "logits/rejected": -4.3174662590026855, + "logps/chosen": -699.622314453125, + "logps/rejected": -808.3905639648438, + "loss": 0.7317, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4722517132759094, + "rewards/margins": 0.5728135704994202, + "rewards/rejected": -1.04506516456604, + "step": 3970 + }, + { + "epoch": 0.9271986022131625, + "grad_norm": 6.7148613929748535, + "learning_rate": 1.1919741100323625e-05, + "logits/chosen": -4.370627403259277, + "logits/rejected": -4.364049434661865, + "logps/chosen": -672.5323486328125, + "logps/rejected": -693.3355712890625, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3713933527469635, + "rewards/margins": 0.619596004486084, + "rewards/rejected": -0.9909893274307251, + "step": 3980 + }, + { + "epoch": 0.9295282469423413, + "grad_norm": 6.711925029754639, + "learning_rate": 1.1893851132686086e-05, + "logits/chosen": -4.313281059265137, + "logits/rejected": -4.3168206214904785, + "logps/chosen": -757.818115234375, + "logps/rejected": -744.3825073242188, + "loss": 0.8777, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6283093690872192, + "rewards/margins": 0.17840756475925446, + "rewards/rejected": -0.8067169189453125, + "step": 3990 + }, + { + "epoch": 0.9318578916715201, + "grad_norm": 9.016495704650879, + "learning_rate": 1.1867961165048544e-05, + "logits/chosen": -4.332929611206055, + "logits/rejected": -4.292237758636475, + "logps/chosen": -754.0440673828125, + "logps/rejected": -729.8211669921875, + "loss": 0.8391, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7084426283836365, + "rewards/margins": 0.30985385179519653, + "rewards/rejected": -1.018296480178833, + "step": 4000 + }, + { + "epoch": 0.9318578916715201, + "eval_logits/chosen": -4.2970099449157715, + "eval_logits/rejected": -4.284469127655029, + "eval_logps/chosen": -697.7953491210938, + "eval_logps/rejected": -717.3357543945312, + "eval_loss": 0.6347742080688477, + "eval_rewards/accuracies": 0.6379647850990295, + "eval_rewards/chosen": -0.6967126131057739, + "eval_rewards/margins": 0.35700222849845886, + "eval_rewards/rejected": -1.0537148714065552, + "eval_runtime": 390.6719, + "eval_samples_per_second": 18.312, + "eval_steps_per_second": 9.156, + "step": 4000 + }, + { + "epoch": 0.9341875364006988, + "grad_norm": 5.923018455505371, + "learning_rate": 1.1842071197411005e-05, + "logits/chosen": -4.305145263671875, + "logits/rejected": -4.294079303741455, + "logps/chosen": -767.0081787109375, + "logps/rejected": -767.0750732421875, + "loss": 0.6031, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5457931756973267, + "rewards/margins": 0.7378371953964233, + "rewards/rejected": -1.28363037109375, + "step": 4010 + }, + { + "epoch": 0.9365171811298777, + "grad_norm": 7.910881519317627, + "learning_rate": 1.1816181229773464e-05, + "logits/chosen": -4.279091835021973, + "logits/rejected": -4.292242527008057, + "logps/chosen": -677.419189453125, + "logps/rejected": -674.3132934570312, + "loss": 0.8111, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.7452019453048706, + "rewards/margins": 0.21577151119709015, + "rewards/rejected": -0.9609734416007996, + "step": 4020 + }, + { + "epoch": 0.9388468258590565, + "grad_norm": 6.735974311828613, + "learning_rate": 1.1790291262135923e-05, + "logits/chosen": -4.291772365570068, + "logits/rejected": -4.343691825866699, + "logps/chosen": -713.7225341796875, + "logps/rejected": -804.0164184570312, + "loss": 0.6981, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5853220224380493, + "rewards/margins": 0.46226024627685547, + "rewards/rejected": -1.0475821495056152, + "step": 4030 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 8.748640060424805, + "learning_rate": 1.1764401294498382e-05, + "logits/chosen": -4.29034423828125, + "logits/rejected": -4.391429901123047, + "logps/chosen": -731.4406127929688, + "logps/rejected": -813.4442138671875, + "loss": 0.8127, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8790253400802612, + "rewards/margins": 0.3499372601509094, + "rewards/rejected": -1.2289626598358154, + "step": 4040 + }, + { + "epoch": 0.9435061153174141, + "grad_norm": 8.355563163757324, + "learning_rate": 1.1738511326860842e-05, + "logits/chosen": -4.349619388580322, + "logits/rejected": -4.397017478942871, + "logps/chosen": -756.1160278320312, + "logps/rejected": -741.4420776367188, + "loss": 0.8698, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5679250359535217, + "rewards/margins": 0.10850824415683746, + "rewards/rejected": -0.676433265209198, + "step": 4050 + }, + { + "epoch": 0.9458357600465929, + "grad_norm": 7.7695088386535645, + "learning_rate": 1.1712621359223301e-05, + "logits/chosen": -4.371559143066406, + "logits/rejected": -4.285519599914551, + "logps/chosen": -667.3077392578125, + "logps/rejected": -681.5567626953125, + "loss": 0.7885, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6184073686599731, + "rewards/margins": 0.35303595662117004, + "rewards/rejected": -0.9714432954788208, + "step": 4060 + }, + { + "epoch": 0.9481654047757717, + "grad_norm": 5.586401462554932, + "learning_rate": 1.1686731391585762e-05, + "logits/chosen": -4.3023362159729, + "logits/rejected": -4.3370680809021, + "logps/chosen": -688.8123779296875, + "logps/rejected": -772.4279174804688, + "loss": 0.8592, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6615897417068481, + "rewards/margins": 0.2921566963195801, + "rewards/rejected": -0.9537464380264282, + "step": 4070 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 9.162353515625, + "learning_rate": 1.1660841423948222e-05, + "logits/chosen": -4.320708751678467, + "logits/rejected": -4.362603187561035, + "logps/chosen": -680.4637451171875, + "logps/rejected": -716.41796875, + "loss": 0.8136, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.632215142250061, + "rewards/margins": 0.2641952633857727, + "rewards/rejected": -0.896410346031189, + "step": 4080 + }, + { + "epoch": 0.9528246942341293, + "grad_norm": 6.786946773529053, + "learning_rate": 1.1634951456310681e-05, + "logits/chosen": -4.182633399963379, + "logits/rejected": -4.321324348449707, + "logps/chosen": -618.3949584960938, + "logps/rejected": -735.050048828125, + "loss": 0.8862, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.725447416305542, + "rewards/margins": 0.29415830969810486, + "rewards/rejected": -1.0196057558059692, + "step": 4090 + }, + { + "epoch": 0.9551543389633081, + "grad_norm": 9.914709091186523, + "learning_rate": 1.1609061488673142e-05, + "logits/chosen": -4.354341983795166, + "logits/rejected": -4.419948577880859, + "logps/chosen": -713.4168090820312, + "logps/rejected": -794.8353271484375, + "loss": 0.9021, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7848163843154907, + "rewards/margins": 0.18717719614505768, + "rewards/rejected": -0.9719935655593872, + "step": 4100 + }, + { + "epoch": 0.9551543389633081, + "eval_logits/chosen": -4.297923564910889, + "eval_logits/rejected": -4.285905838012695, + "eval_logps/chosen": -697.90234375, + "eval_logps/rejected": -717.4815063476562, + "eval_loss": 0.6345070600509644, + "eval_rewards/accuracies": 0.6341906785964966, + "eval_rewards/chosen": -0.7074149250984192, + "eval_rewards/margins": 0.3608627915382385, + "eval_rewards/rejected": -1.0682777166366577, + "eval_runtime": 390.4537, + "eval_samples_per_second": 18.322, + "eval_steps_per_second": 9.161, + "step": 4100 + }, + { + "epoch": 0.9574839836924869, + "grad_norm": 10.379064559936523, + "learning_rate": 1.1583171521035599e-05, + "logits/chosen": -4.362015724182129, + "logits/rejected": -4.387961387634277, + "logps/chosen": -696.187255859375, + "logps/rejected": -740.3745727539062, + "loss": 0.7754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7311137914657593, + "rewards/margins": 0.3941892087459564, + "rewards/rejected": -1.125303030014038, + "step": 4110 + }, + { + "epoch": 0.9598136284216657, + "grad_norm": 7.714394569396973, + "learning_rate": 1.155728155339806e-05, + "logits/chosen": -4.350986003875732, + "logits/rejected": -4.345406532287598, + "logps/chosen": -687.46435546875, + "logps/rejected": -715.8142700195312, + "loss": 0.7417, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8579521179199219, + "rewards/margins": 0.4291927218437195, + "rewards/rejected": -1.2871448993682861, + "step": 4120 + }, + { + "epoch": 0.9621432731508445, + "grad_norm": 10.189233779907227, + "learning_rate": 1.1531391585760518e-05, + "logits/chosen": -4.308093070983887, + "logits/rejected": -4.3064866065979, + "logps/chosen": -720.6092529296875, + "logps/rejected": -737.302490234375, + "loss": 0.8021, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8064984083175659, + "rewards/margins": 0.40783438086509705, + "rewards/rejected": -1.2143328189849854, + "step": 4130 + }, + { + "epoch": 0.9644729178800233, + "grad_norm": 7.011444091796875, + "learning_rate": 1.1505501618122979e-05, + "logits/chosen": -4.417510032653809, + "logits/rejected": -4.422568321228027, + "logps/chosen": -755.2730712890625, + "logps/rejected": -800.2728881835938, + "loss": 0.7996, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7368863821029663, + "rewards/margins": 0.3117212653160095, + "rewards/rejected": -1.048607587814331, + "step": 4140 + }, + { + "epoch": 0.966802562609202, + "grad_norm": 7.239262104034424, + "learning_rate": 1.1479611650485438e-05, + "logits/chosen": -4.340888023376465, + "logits/rejected": -4.3161115646362305, + "logps/chosen": -725.8350830078125, + "logps/rejected": -671.7639770507812, + "loss": 0.7794, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.71551114320755, + "rewards/margins": 0.25095969438552856, + "rewards/rejected": -0.9664708971977234, + "step": 4150 + }, + { + "epoch": 0.9691322073383809, + "grad_norm": 10.932368278503418, + "learning_rate": 1.1453721682847898e-05, + "logits/chosen": -4.345290184020996, + "logits/rejected": -4.2818169593811035, + "logps/chosen": -714.5953369140625, + "logps/rejected": -723.066162109375, + "loss": 0.902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5351029634475708, + "rewards/margins": 0.3501548171043396, + "rewards/rejected": -0.8852578401565552, + "step": 4160 + }, + { + "epoch": 0.9714618520675598, + "grad_norm": 4.409315586090088, + "learning_rate": 1.1427831715210357e-05, + "logits/chosen": -4.325705528259277, + "logits/rejected": -4.331601142883301, + "logps/chosen": -687.9990234375, + "logps/rejected": -684.64013671875, + "loss": 0.7662, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7611755728721619, + "rewards/margins": 0.32491764426231384, + "rewards/rejected": -1.0860933065414429, + "step": 4170 + }, + { + "epoch": 0.9737914967967385, + "grad_norm": 6.8606085777282715, + "learning_rate": 1.1401941747572816e-05, + "logits/chosen": -4.298327445983887, + "logits/rejected": -4.327838897705078, + "logps/chosen": -691.434326171875, + "logps/rejected": -719.2293090820312, + "loss": 0.8632, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6933432817459106, + "rewards/margins": 0.22406530380249023, + "rewards/rejected": -0.9174085855484009, + "step": 4180 + }, + { + "epoch": 0.9761211415259173, + "grad_norm": 8.066191673278809, + "learning_rate": 1.1376051779935275e-05, + "logits/chosen": -4.308099746704102, + "logits/rejected": -4.330200672149658, + "logps/chosen": -724.9734497070312, + "logps/rejected": -770.6451416015625, + "loss": 0.7989, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5899969935417175, + "rewards/margins": 0.49291300773620605, + "rewards/rejected": -1.0829100608825684, + "step": 4190 + }, + { + "epoch": 0.9784507862550961, + "grad_norm": 7.6504597663879395, + "learning_rate": 1.1350161812297736e-05, + "logits/chosen": -4.331873893737793, + "logits/rejected": -4.344123363494873, + "logps/chosen": -722.4224243164062, + "logps/rejected": -752.9412231445312, + "loss": 0.8485, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7468961477279663, + "rewards/margins": 0.22630326449871063, + "rewards/rejected": -0.9731994867324829, + "step": 4200 + }, + { + "epoch": 0.9784507862550961, + "eval_logits/chosen": -4.293464660644531, + "eval_logits/rejected": -4.280642509460449, + "eval_logps/chosen": -697.4835815429688, + "eval_logps/rejected": -717.0183715820312, + "eval_loss": 0.632033109664917, + "eval_rewards/accuracies": 0.6395023465156555, + "eval_rewards/chosen": -0.6655304431915283, + "eval_rewards/margins": 0.3564453721046448, + "eval_rewards/rejected": -1.0219757556915283, + "eval_runtime": 390.3572, + "eval_samples_per_second": 18.327, + "eval_steps_per_second": 9.163, + "step": 4200 + }, + { + "epoch": 0.9807804309842749, + "grad_norm": 8.241952896118164, + "learning_rate": 1.1324271844660195e-05, + "logits/chosen": -4.362992286682129, + "logits/rejected": -4.3978118896484375, + "logps/chosen": -706.3831787109375, + "logps/rejected": -748.598876953125, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5810929536819458, + "rewards/margins": 0.5309965014457703, + "rewards/rejected": -1.1120895147323608, + "step": 4210 + }, + { + "epoch": 0.9831100757134537, + "grad_norm": 5.629496097564697, + "learning_rate": 1.1298381877022655e-05, + "logits/chosen": -4.324032306671143, + "logits/rejected": -4.422591686248779, + "logps/chosen": -654.5308837890625, + "logps/rejected": -739.7088012695312, + "loss": 0.687, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5922080278396606, + "rewards/margins": 0.5977957248687744, + "rewards/rejected": -1.1900036334991455, + "step": 4220 + }, + { + "epoch": 0.9854397204426325, + "grad_norm": 8.188299179077148, + "learning_rate": 1.1272491909385116e-05, + "logits/chosen": -4.344983100891113, + "logits/rejected": -4.238674640655518, + "logps/chosen": -704.5607299804688, + "logps/rejected": -669.7830810546875, + "loss": 0.8422, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7657082080841064, + "rewards/margins": 0.19472898542881012, + "rewards/rejected": -0.9604371190071106, + "step": 4230 + }, + { + "epoch": 0.9877693651718114, + "grad_norm": 10.56550121307373, + "learning_rate": 1.1246601941747575e-05, + "logits/chosen": -4.41368293762207, + "logits/rejected": -4.372306823730469, + "logps/chosen": -759.0302124023438, + "logps/rejected": -758.4656372070312, + "loss": 0.7326, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5605214834213257, + "rewards/margins": 0.5081143379211426, + "rewards/rejected": -1.0686357021331787, + "step": 4240 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 6.3407111167907715, + "learning_rate": 1.1220711974110032e-05, + "logits/chosen": -4.351005554199219, + "logits/rejected": -4.320067405700684, + "logps/chosen": -724.1296997070312, + "logps/rejected": -682.4201049804688, + "loss": 0.8276, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4309804439544678, + "rewards/margins": 0.1828249841928482, + "rewards/rejected": -0.6138054728507996, + "step": 4250 + }, + { + "epoch": 0.9924286546301689, + "grad_norm": 7.637398719787598, + "learning_rate": 1.1194822006472492e-05, + "logits/chosen": -4.266735076904297, + "logits/rejected": -4.263129234313965, + "logps/chosen": -711.1100463867188, + "logps/rejected": -707.9876708984375, + "loss": 0.8207, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7385729551315308, + "rewards/margins": 0.1418248414993286, + "rewards/rejected": -0.8803977966308594, + "step": 4260 + }, + { + "epoch": 0.9947582993593477, + "grad_norm": 5.631221294403076, + "learning_rate": 1.1168932038834953e-05, + "logits/chosen": -4.288881778717041, + "logits/rejected": -4.356893539428711, + "logps/chosen": -738.98046875, + "logps/rejected": -761.4144897460938, + "loss": 0.659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4446313977241516, + "rewards/margins": 0.7082819938659668, + "rewards/rejected": -1.1529133319854736, + "step": 4270 + }, + { + "epoch": 0.9970879440885265, + "grad_norm": 7.247366428375244, + "learning_rate": 1.1143042071197412e-05, + "logits/chosen": -4.375193119049072, + "logits/rejected": -4.431723117828369, + "logps/chosen": -710.0206298828125, + "logps/rejected": -778.4895629882812, + "loss": 0.5791, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.515932023525238, + "rewards/margins": 0.9266525506973267, + "rewards/rejected": -1.4425846338272095, + "step": 4280 + }, + { + "epoch": 0.9994175888177053, + "grad_norm": 7.143087387084961, + "learning_rate": 1.1117152103559872e-05, + "logits/chosen": -4.352514743804932, + "logits/rejected": -4.303081035614014, + "logps/chosen": -749.5734252929688, + "logps/rejected": -768.8510131835938, + "loss": 0.6417, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3860852122306824, + "rewards/margins": 0.6603494882583618, + "rewards/rejected": -1.0464346408843994, + "step": 4290 + }, + { + "epoch": 1.001747233546884, + "grad_norm": 10.907618522644043, + "learning_rate": 1.1091262135922331e-05, + "logits/chosen": -4.354809284210205, + "logits/rejected": -4.3103461265563965, + "logps/chosen": -759.310791015625, + "logps/rejected": -779.1396484375, + "loss": 0.8318, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6837959289550781, + "rewards/margins": 0.14305201172828674, + "rewards/rejected": -0.8268479108810425, + "step": 4300 + }, + { + "epoch": 1.001747233546884, + "eval_logits/chosen": -4.28951358795166, + "eval_logits/rejected": -4.275976657867432, + "eval_logps/chosen": -697.7381591796875, + "eval_logps/rejected": -717.3543701171875, + "eval_loss": 0.6335008144378662, + "eval_rewards/accuracies": 0.6350293755531311, + "eval_rewards/chosen": -0.6909992694854736, + "eval_rewards/margins": 0.36457014083862305, + "eval_rewards/rejected": -1.0555692911148071, + "eval_runtime": 390.7708, + "eval_samples_per_second": 18.307, + "eval_steps_per_second": 9.154, + "step": 4300 + }, + { + "epoch": 1.004076878276063, + "grad_norm": 8.588318824768066, + "learning_rate": 1.1065372168284792e-05, + "logits/chosen": -4.376956939697266, + "logits/rejected": -4.33894157409668, + "logps/chosen": -717.998779296875, + "logps/rejected": -729.5446166992188, + "loss": 0.7683, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6977962255477905, + "rewards/margins": 0.3738870620727539, + "rewards/rejected": -1.0716831684112549, + "step": 4310 + }, + { + "epoch": 1.0064065230052417, + "grad_norm": 6.269148349761963, + "learning_rate": 1.1039482200647249e-05, + "logits/chosen": -4.326693534851074, + "logits/rejected": -4.353607177734375, + "logps/chosen": -755.3944091796875, + "logps/rejected": -790.818115234375, + "loss": 0.7439, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8258703947067261, + "rewards/margins": 0.47228875756263733, + "rewards/rejected": -1.298159122467041, + "step": 4320 + }, + { + "epoch": 1.0087361677344204, + "grad_norm": 10.411598205566406, + "learning_rate": 1.101359223300971e-05, + "logits/chosen": -4.370461940765381, + "logits/rejected": -4.218691825866699, + "logps/chosen": -758.8607177734375, + "logps/rejected": -717.3480224609375, + "loss": 0.6571, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5283089876174927, + "rewards/margins": 0.50538170337677, + "rewards/rejected": -1.0336906909942627, + "step": 4330 + }, + { + "epoch": 1.0110658124635994, + "grad_norm": 5.391559600830078, + "learning_rate": 1.0987702265372168e-05, + "logits/chosen": -4.347881317138672, + "logits/rejected": -4.359528541564941, + "logps/chosen": -708.3019409179688, + "logps/rejected": -712.9531860351562, + "loss": 0.7141, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7340580821037292, + "rewards/margins": 0.37196987867355347, + "rewards/rejected": -1.1060279607772827, + "step": 4340 + }, + { + "epoch": 1.0133954571927781, + "grad_norm": 5.033149242401123, + "learning_rate": 1.0961812297734629e-05, + "logits/chosen": -4.371495246887207, + "logits/rejected": -4.206801891326904, + "logps/chosen": -752.9769287109375, + "logps/rejected": -694.5695190429688, + "loss": 0.8001, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7193650007247925, + "rewards/margins": 0.2597826421260834, + "rewards/rejected": -0.979147732257843, + "step": 4350 + }, + { + "epoch": 1.0157251019219569, + "grad_norm": 7.617324352264404, + "learning_rate": 1.0935922330097088e-05, + "logits/chosen": -4.31460428237915, + "logits/rejected": -4.267983913421631, + "logps/chosen": -691.3458862304688, + "logps/rejected": -702.7252807617188, + "loss": 0.7861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7619749903678894, + "rewards/margins": 0.3660062551498413, + "rewards/rejected": -1.1279813051223755, + "step": 4360 + }, + { + "epoch": 1.0180547466511356, + "grad_norm": 4.813138961791992, + "learning_rate": 1.0910032362459548e-05, + "logits/chosen": -4.37437105178833, + "logits/rejected": -4.39128303527832, + "logps/chosen": -753.4616088867188, + "logps/rejected": -759.0370483398438, + "loss": 0.7151, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6380533576011658, + "rewards/margins": 0.5098127126693726, + "rewards/rejected": -1.1478660106658936, + "step": 4370 + }, + { + "epoch": 1.0203843913803146, + "grad_norm": 7.298201084136963, + "learning_rate": 1.0884142394822009e-05, + "logits/chosen": -4.26786994934082, + "logits/rejected": -4.357375621795654, + "logps/chosen": -680.5759887695312, + "logps/rejected": -726.3565673828125, + "loss": 0.9456, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8680903315544128, + "rewards/margins": 0.08186836540699005, + "rewards/rejected": -0.9499586820602417, + "step": 4380 + }, + { + "epoch": 1.0227140361094933, + "grad_norm": 8.440717697143555, + "learning_rate": 1.0858252427184466e-05, + "logits/chosen": -4.3552470207214355, + "logits/rejected": -4.355259895324707, + "logps/chosen": -598.6309814453125, + "logps/rejected": -722.0578002929688, + "loss": 0.732, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7806925177574158, + "rewards/margins": 0.5147355198860168, + "rewards/rejected": -1.2954281568527222, + "step": 4390 + }, + { + "epoch": 1.025043680838672, + "grad_norm": 8.4296236038208, + "learning_rate": 1.0832362459546925e-05, + "logits/chosen": -4.287355899810791, + "logits/rejected": -4.293883323669434, + "logps/chosen": -703.4840087890625, + "logps/rejected": -756.8140869140625, + "loss": 0.7625, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7262560725212097, + "rewards/margins": 0.43259239196777344, + "rewards/rejected": -1.158848524093628, + "step": 4400 + }, + { + "epoch": 1.025043680838672, + "eval_logits/chosen": -4.290392875671387, + "eval_logits/rejected": -4.277007102966309, + "eval_logps/chosen": -698.2494506835938, + "eval_logps/rejected": -717.9976806640625, + "eval_loss": 0.6358251571655273, + "eval_rewards/accuracies": 0.6337713003158569, + "eval_rewards/chosen": -0.7421231269836426, + "eval_rewards/margins": 0.3777867555618286, + "eval_rewards/rejected": -1.1199098825454712, + "eval_runtime": 392.2028, + "eval_samples_per_second": 18.241, + "eval_steps_per_second": 9.12, + "step": 4400 + }, + { + "epoch": 1.027373325567851, + "grad_norm": 9.419212341308594, + "learning_rate": 1.0806472491909386e-05, + "logits/chosen": -4.247093200683594, + "logits/rejected": -4.255812644958496, + "logps/chosen": -731.6544189453125, + "logps/rejected": -766.1512451171875, + "loss": 0.7624, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5727885961532593, + "rewards/margins": 0.5212075114250183, + "rewards/rejected": -1.0939961671829224, + "step": 4410 + }, + { + "epoch": 1.0297029702970297, + "grad_norm": 6.185367107391357, + "learning_rate": 1.0780582524271846e-05, + "logits/chosen": -4.298975944519043, + "logits/rejected": -4.33304500579834, + "logps/chosen": -684.7571411132812, + "logps/rejected": -752.9080810546875, + "loss": 0.6736, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.694426417350769, + "rewards/margins": 0.5212685465812683, + "rewards/rejected": -1.2156950235366821, + "step": 4420 + }, + { + "epoch": 1.0320326150262085, + "grad_norm": 8.520529747009277, + "learning_rate": 1.0754692556634305e-05, + "logits/chosen": -4.316971778869629, + "logits/rejected": -4.40161657333374, + "logps/chosen": -700.2435302734375, + "logps/rejected": -781.5968627929688, + "loss": 0.8329, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6743239760398865, + "rewards/margins": 0.26052436232566833, + "rewards/rejected": -0.9348483085632324, + "step": 4430 + }, + { + "epoch": 1.0343622597553872, + "grad_norm": 8.941186904907227, + "learning_rate": 1.0728802588996766e-05, + "logits/chosen": -4.344968318939209, + "logits/rejected": -4.354222774505615, + "logps/chosen": -727.3234252929688, + "logps/rejected": -796.7506103515625, + "loss": 0.7991, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6619983911514282, + "rewards/margins": 0.3960326611995697, + "rewards/rejected": -1.0580310821533203, + "step": 4440 + }, + { + "epoch": 1.0366919044845662, + "grad_norm": 9.501112937927246, + "learning_rate": 1.0702912621359225e-05, + "logits/chosen": -4.3322954177856445, + "logits/rejected": -4.332674026489258, + "logps/chosen": -724.5609130859375, + "logps/rejected": -736.2202758789062, + "loss": 0.554, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5000070929527283, + "rewards/margins": 0.7766839265823364, + "rewards/rejected": -1.276691198348999, + "step": 4450 + }, + { + "epoch": 1.039021549213745, + "grad_norm": 8.592558860778809, + "learning_rate": 1.0677022653721685e-05, + "logits/chosen": -4.405747413635254, + "logits/rejected": -4.35235071182251, + "logps/chosen": -729.2918701171875, + "logps/rejected": -749.1801147460938, + "loss": 0.7921, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7027440071105957, + "rewards/margins": 0.24186758697032928, + "rewards/rejected": -0.9446115493774414, + "step": 4460 + }, + { + "epoch": 1.0413511939429236, + "grad_norm": 8.168656349182129, + "learning_rate": 1.0651132686084142e-05, + "logits/chosen": -4.345917701721191, + "logits/rejected": -4.364696025848389, + "logps/chosen": -699.2119140625, + "logps/rejected": -736.5466918945312, + "loss": 0.8149, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6802065968513489, + "rewards/margins": 0.30877023935317993, + "rewards/rejected": -0.9889768362045288, + "step": 4470 + }, + { + "epoch": 1.0436808386721026, + "grad_norm": 5.467567443847656, + "learning_rate": 1.0625242718446603e-05, + "logits/chosen": -4.320838928222656, + "logits/rejected": -4.439824104309082, + "logps/chosen": -714.8374633789062, + "logps/rejected": -820.0277099609375, + "loss": 0.8304, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8643010258674622, + "rewards/margins": 0.19848057627677917, + "rewards/rejected": -1.062781572341919, + "step": 4480 + }, + { + "epoch": 1.0460104834012813, + "grad_norm": 10.389025688171387, + "learning_rate": 1.0599352750809062e-05, + "logits/chosen": -4.241874694824219, + "logits/rejected": -4.292876243591309, + "logps/chosen": -666.0745239257812, + "logps/rejected": -695.1250610351562, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6902895569801331, + "rewards/margins": 0.4029058814048767, + "rewards/rejected": -1.0931955575942993, + "step": 4490 + }, + { + "epoch": 1.04834012813046, + "grad_norm": 8.474845886230469, + "learning_rate": 1.0573462783171522e-05, + "logits/chosen": -4.303086280822754, + "logits/rejected": -4.328011512756348, + "logps/chosen": -727.4135131835938, + "logps/rejected": -791.6959228515625, + "loss": 0.625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5062735676765442, + "rewards/margins": 0.6144469976425171, + "rewards/rejected": -1.120720624923706, + "step": 4500 + }, + { + "epoch": 1.04834012813046, + "eval_logits/chosen": -4.285947799682617, + "eval_logits/rejected": -4.272801399230957, + "eval_logps/chosen": -698.516357421875, + "eval_logps/rejected": -718.3611450195312, + "eval_loss": 0.6368241310119629, + "eval_rewards/accuracies": 0.6340509057044983, + "eval_rewards/chosen": -0.7688109278678894, + "eval_rewards/margins": 0.3874339163303375, + "eval_rewards/rejected": -1.1562447547912598, + "eval_runtime": 391.1455, + "eval_samples_per_second": 18.29, + "eval_steps_per_second": 9.145, + "step": 4500 + }, + { + "epoch": 1.0506697728596388, + "grad_norm": 5.533942699432373, + "learning_rate": 1.0547572815533981e-05, + "logits/chosen": -4.259266376495361, + "logits/rejected": -4.302947521209717, + "logps/chosen": -702.03857421875, + "logps/rejected": -733.381591796875, + "loss": 0.7977, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7158435583114624, + "rewards/margins": 0.3514593541622162, + "rewards/rejected": -1.067302942276001, + "step": 4510 + }, + { + "epoch": 1.0529994175888178, + "grad_norm": 8.337974548339844, + "learning_rate": 1.0521682847896442e-05, + "logits/chosen": -4.3287506103515625, + "logits/rejected": -4.309664249420166, + "logps/chosen": -723.6080322265625, + "logps/rejected": -692.8987426757812, + "loss": 0.7763, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7586453557014465, + "rewards/margins": 0.3815062940120697, + "rewards/rejected": -1.1401516199111938, + "step": 4520 + }, + { + "epoch": 1.0553290623179965, + "grad_norm": 6.677039623260498, + "learning_rate": 1.0495792880258902e-05, + "logits/chosen": -4.355586051940918, + "logits/rejected": -4.369414329528809, + "logps/chosen": -717.1536254882812, + "logps/rejected": -761.757080078125, + "loss": 0.6711, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8186143040657043, + "rewards/margins": 0.6329285502433777, + "rewards/rejected": -1.451542854309082, + "step": 4530 + }, + { + "epoch": 1.0576587070471752, + "grad_norm": 7.527097225189209, + "learning_rate": 1.046990291262136e-05, + "logits/chosen": -4.2965521812438965, + "logits/rejected": -4.304562568664551, + "logps/chosen": -722.9425048828125, + "logps/rejected": -719.3508911132812, + "loss": 0.8067, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.8394044041633606, + "rewards/margins": 0.35650086402893066, + "rewards/rejected": -1.195905327796936, + "step": 4540 + }, + { + "epoch": 1.0599883517763542, + "grad_norm": 6.02723503112793, + "learning_rate": 1.0444012944983818e-05, + "logits/chosen": -4.2509379386901855, + "logits/rejected": -4.236239910125732, + "logps/chosen": -683.6402587890625, + "logps/rejected": -730.5028076171875, + "loss": 0.7061, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.834450900554657, + "rewards/margins": 0.5139259696006775, + "rewards/rejected": -1.348376989364624, + "step": 4550 + }, + { + "epoch": 1.062317996505533, + "grad_norm": 10.399433135986328, + "learning_rate": 1.0418122977346279e-05, + "logits/chosen": -4.382308006286621, + "logits/rejected": -4.351099967956543, + "logps/chosen": -766.7814331054688, + "logps/rejected": -750.8299560546875, + "loss": 0.7899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8868049383163452, + "rewards/margins": 0.38993802666664124, + "rewards/rejected": -1.276742935180664, + "step": 4560 + }, + { + "epoch": 1.0646476412347117, + "grad_norm": 8.421102523803711, + "learning_rate": 1.0392233009708738e-05, + "logits/chosen": -4.400119304656982, + "logits/rejected": -4.347952365875244, + "logps/chosen": -761.1624755859375, + "logps/rejected": -763.1273193359375, + "loss": 0.7957, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9390009641647339, + "rewards/margins": 0.27838605642318726, + "rewards/rejected": -1.217387080192566, + "step": 4570 + }, + { + "epoch": 1.0669772859638904, + "grad_norm": 8.368112564086914, + "learning_rate": 1.0366343042071198e-05, + "logits/chosen": -4.340146064758301, + "logits/rejected": -4.391610145568848, + "logps/chosen": -744.6365966796875, + "logps/rejected": -738.5140991210938, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7065492272377014, + "rewards/margins": 0.4860905706882477, + "rewards/rejected": -1.1926395893096924, + "step": 4580 + }, + { + "epoch": 1.0693069306930694, + "grad_norm": 5.831106185913086, + "learning_rate": 1.0340453074433659e-05, + "logits/chosen": -4.410943031311035, + "logits/rejected": -4.265590667724609, + "logps/chosen": -800.0321044921875, + "logps/rejected": -748.1279907226562, + "loss": 0.7077, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5919022560119629, + "rewards/margins": 0.45542460680007935, + "rewards/rejected": -1.047326922416687, + "step": 4590 + }, + { + "epoch": 1.071636575422248, + "grad_norm": 10.359914779663086, + "learning_rate": 1.0314563106796118e-05, + "logits/chosen": -4.424242973327637, + "logits/rejected": -4.363162040710449, + "logps/chosen": -846.7982177734375, + "logps/rejected": -757.5208129882812, + "loss": 0.8212, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9542733430862427, + "rewards/margins": 0.23269712924957275, + "rewards/rejected": -1.1869704723358154, + "step": 4600 + }, + { + "epoch": 1.071636575422248, + "eval_logits/chosen": -4.292172431945801, + "eval_logits/rejected": -4.279298305511475, + "eval_logps/chosen": -698.2608032226562, + "eval_logps/rejected": -718.0540771484375, + "eval_loss": 0.6358359456062317, + "eval_rewards/accuracies": 0.6313950419425964, + "eval_rewards/chosen": -0.743259608745575, + "eval_rewards/margins": 0.3822806179523468, + "eval_rewards/rejected": -1.1255401372909546, + "eval_runtime": 391.0127, + "eval_samples_per_second": 18.296, + "eval_steps_per_second": 9.148, + "step": 4600 + }, + { + "epoch": 1.0739662201514268, + "grad_norm": 10.056096076965332, + "learning_rate": 1.0288673139158575e-05, + "logits/chosen": -4.429704666137695, + "logits/rejected": -4.434037208557129, + "logps/chosen": -787.2510986328125, + "logps/rejected": -794.8635864257812, + "loss": 0.616, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5970505475997925, + "rewards/margins": 0.8173857927322388, + "rewards/rejected": -1.4144362211227417, + "step": 4610 + }, + { + "epoch": 1.0762958648806058, + "grad_norm": 8.542181968688965, + "learning_rate": 1.0262783171521036e-05, + "logits/chosen": -4.355952262878418, + "logits/rejected": -4.35338020324707, + "logps/chosen": -760.3809814453125, + "logps/rejected": -771.689453125, + "loss": 0.8748, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.9295045733451843, + "rewards/margins": 0.21732263267040253, + "rewards/rejected": -1.1468273401260376, + "step": 4620 + }, + { + "epoch": 1.0786255096097845, + "grad_norm": 9.360555648803711, + "learning_rate": 1.0236893203883496e-05, + "logits/chosen": -4.3113322257995605, + "logits/rejected": -4.3568220138549805, + "logps/chosen": -703.3505859375, + "logps/rejected": -716.5987548828125, + "loss": 0.8138, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5865589380264282, + "rewards/margins": 0.2181408703327179, + "rewards/rejected": -0.8046997785568237, + "step": 4630 + }, + { + "epoch": 1.0809551543389633, + "grad_norm": 6.834812164306641, + "learning_rate": 1.0211003236245955e-05, + "logits/chosen": -4.268773078918457, + "logits/rejected": -4.2996439933776855, + "logps/chosen": -677.6121826171875, + "logps/rejected": -732.3841552734375, + "loss": 0.7718, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.9201027154922485, + "rewards/margins": 0.29583680629730225, + "rewards/rejected": -1.2159394025802612, + "step": 4640 + }, + { + "epoch": 1.083284799068142, + "grad_norm": 9.268996238708496, + "learning_rate": 1.0185113268608416e-05, + "logits/chosen": -4.294376850128174, + "logits/rejected": -4.4644975662231445, + "logps/chosen": -689.6966552734375, + "logps/rejected": -825.1298828125, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5040693879127502, + "rewards/margins": 0.5739471316337585, + "rewards/rejected": -1.0780165195465088, + "step": 4650 + }, + { + "epoch": 1.085614443797321, + "grad_norm": 6.353123188018799, + "learning_rate": 1.0159223300970875e-05, + "logits/chosen": -4.318959712982178, + "logits/rejected": -4.395209312438965, + "logps/chosen": -689.1337280273438, + "logps/rejected": -755.0162353515625, + "loss": 0.6467, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.33966705203056335, + "rewards/margins": 0.7350815534591675, + "rewards/rejected": -1.0747486352920532, + "step": 4660 + }, + { + "epoch": 1.0879440885264997, + "grad_norm": 9.633374214172363, + "learning_rate": 1.0133333333333335e-05, + "logits/chosen": -4.425009727478027, + "logits/rejected": -4.371664524078369, + "logps/chosen": -812.9774169921875, + "logps/rejected": -719.8851928710938, + "loss": 0.6838, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.70526123046875, + "rewards/margins": 0.6213148236274719, + "rewards/rejected": -1.3265759944915771, + "step": 4670 + }, + { + "epoch": 1.0902737332556784, + "grad_norm": 7.010782241821289, + "learning_rate": 1.0107443365695792e-05, + "logits/chosen": -4.395978927612305, + "logits/rejected": -4.355780124664307, + "logps/chosen": -761.0682373046875, + "logps/rejected": -733.3692626953125, + "loss": 0.5467, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5477423071861267, + "rewards/margins": 0.8993877172470093, + "rewards/rejected": -1.4471299648284912, + "step": 4680 + }, + { + "epoch": 1.0926033779848574, + "grad_norm": 10.430981636047363, + "learning_rate": 1.0081553398058253e-05, + "logits/chosen": -4.359240531921387, + "logits/rejected": -4.325311183929443, + "logps/chosen": -746.3953857421875, + "logps/rejected": -761.3292846679688, + "loss": 0.8116, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6973962187767029, + "rewards/margins": 0.5142523646354675, + "rewards/rejected": -1.21164870262146, + "step": 4690 + }, + { + "epoch": 1.0949330227140361, + "grad_norm": 8.63824462890625, + "learning_rate": 1.0055663430420712e-05, + "logits/chosen": -4.394569396972656, + "logits/rejected": -4.347747325897217, + "logps/chosen": -779.9010009765625, + "logps/rejected": -774.9115600585938, + "loss": 0.8804, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8874024152755737, + "rewards/margins": 0.16095098853111267, + "rewards/rejected": -1.0483534336090088, + "step": 4700 + }, + { + "epoch": 1.0949330227140361, + "eval_logits/chosen": -4.293430805206299, + "eval_logits/rejected": -4.280736923217773, + "eval_logps/chosen": -698.022705078125, + "eval_logps/rejected": -717.76416015625, + "eval_loss": 0.6339041590690613, + "eval_rewards/accuracies": 0.6339110732078552, + "eval_rewards/chosen": -0.7194374203681946, + "eval_rewards/margins": 0.3771189749240875, + "eval_rewards/rejected": -1.096556544303894, + "eval_runtime": 392.0836, + "eval_samples_per_second": 18.246, + "eval_steps_per_second": 9.123, + "step": 4700 + }, + { + "epoch": 1.0972626674432149, + "grad_norm": 6.816390037536621, + "learning_rate": 1.0029773462783172e-05, + "logits/chosen": -4.264836311340332, + "logits/rejected": -4.192754745483398, + "logps/chosen": -699.5757446289062, + "logps/rejected": -689.8859252929688, + "loss": 0.7551, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7004824876785278, + "rewards/margins": 0.3938553035259247, + "rewards/rejected": -1.094337821006775, + "step": 4710 + }, + { + "epoch": 1.0995923121723936, + "grad_norm": 6.77658748626709, + "learning_rate": 1.0003883495145631e-05, + "logits/chosen": -4.375441551208496, + "logits/rejected": -4.27400541305542, + "logps/chosen": -765.828369140625, + "logps/rejected": -741.5504150390625, + "loss": 0.7337, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6998693943023682, + "rewards/margins": 0.4171157777309418, + "rewards/rejected": -1.1169852018356323, + "step": 4720 + }, + { + "epoch": 1.1019219569015726, + "grad_norm": 7.092930793762207, + "learning_rate": 9.977993527508092e-06, + "logits/chosen": -4.354989051818848, + "logits/rejected": -4.303897380828857, + "logps/chosen": -743.7265625, + "logps/rejected": -713.6380615234375, + "loss": 0.6894, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5861555933952332, + "rewards/margins": 0.5605015158653259, + "rewards/rejected": -1.1466572284698486, + "step": 4730 + }, + { + "epoch": 1.1042516016307513, + "grad_norm": 8.570871353149414, + "learning_rate": 9.95210355987055e-06, + "logits/chosen": -4.348479747772217, + "logits/rejected": -4.361424922943115, + "logps/chosen": -715.3938598632812, + "logps/rejected": -732.878662109375, + "loss": 0.8737, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7268328666687012, + "rewards/margins": 0.22926625609397888, + "rewards/rejected": -0.9560991525650024, + "step": 4740 + }, + { + "epoch": 1.10658124635993, + "grad_norm": 6.749851703643799, + "learning_rate": 9.926213592233011e-06, + "logits/chosen": -4.329443454742432, + "logits/rejected": -4.359018802642822, + "logps/chosen": -757.46630859375, + "logps/rejected": -846.7803955078125, + "loss": 0.8767, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.8326984643936157, + "rewards/margins": 0.28897660970687866, + "rewards/rejected": -1.1216751337051392, + "step": 4750 + }, + { + "epoch": 1.108910891089109, + "grad_norm": 6.992388725280762, + "learning_rate": 9.90032362459547e-06, + "logits/chosen": -4.314213752746582, + "logits/rejected": -4.2905144691467285, + "logps/chosen": -699.3799438476562, + "logps/rejected": -697.0438232421875, + "loss": 0.8799, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9238218069076538, + "rewards/margins": 0.16288027167320251, + "rewards/rejected": -1.0867021083831787, + "step": 4760 + }, + { + "epoch": 1.1112405358182877, + "grad_norm": 8.790740013122559, + "learning_rate": 9.874433656957929e-06, + "logits/chosen": -4.342259407043457, + "logits/rejected": -4.310133934020996, + "logps/chosen": -740.4301147460938, + "logps/rejected": -730.6539306640625, + "loss": 0.8358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.703795313835144, + "rewards/margins": 0.34042343497276306, + "rewards/rejected": -1.0442187786102295, + "step": 4770 + }, + { + "epoch": 1.1135701805474665, + "grad_norm": 8.854411125183105, + "learning_rate": 9.84854368932039e-06, + "logits/chosen": -4.325991630554199, + "logits/rejected": -4.3423686027526855, + "logps/chosen": -649.7286376953125, + "logps/rejected": -722.8423461914062, + "loss": 0.8412, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6523281335830688, + "rewards/margins": 0.2276729792356491, + "rewards/rejected": -0.8800011873245239, + "step": 4780 + }, + { + "epoch": 1.1158998252766452, + "grad_norm": 9.450496673583984, + "learning_rate": 9.822653721682848e-06, + "logits/chosen": -4.22884464263916, + "logits/rejected": -4.330252170562744, + "logps/chosen": -689.923583984375, + "logps/rejected": -777.794677734375, + "loss": 0.8275, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7842633724212646, + "rewards/margins": 0.30382487177848816, + "rewards/rejected": -1.0880881547927856, + "step": 4790 + }, + { + "epoch": 1.1182294700058242, + "grad_norm": 9.24414348602295, + "learning_rate": 9.796763754045309e-06, + "logits/chosen": -4.3309173583984375, + "logits/rejected": -4.296918869018555, + "logps/chosen": -722.8944091796875, + "logps/rejected": -711.1109619140625, + "loss": 0.7236, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7419768571853638, + "rewards/margins": 0.45678672194480896, + "rewards/rejected": -1.1987636089324951, + "step": 4800 + }, + { + "epoch": 1.1182294700058242, + "eval_logits/chosen": -4.296524524688721, + "eval_logits/rejected": -4.28386116027832, + "eval_logps/chosen": -697.907470703125, + "eval_logps/rejected": -717.6570434570312, + "eval_loss": 0.6322656869888306, + "eval_rewards/accuracies": 0.634889543056488, + "eval_rewards/chosen": -0.7079183459281921, + "eval_rewards/margins": 0.37791895866394043, + "eval_rewards/rejected": -1.0858372449874878, + "eval_runtime": 391.976, + "eval_samples_per_second": 18.251, + "eval_steps_per_second": 9.126, + "step": 4800 + }, + { + "epoch": 1.120559114735003, + "grad_norm": 8.10059928894043, + "learning_rate": 9.770873786407768e-06, + "logits/chosen": -4.268490791320801, + "logits/rejected": -4.312201499938965, + "logps/chosen": -658.3656005859375, + "logps/rejected": -724.805419921875, + "loss": 0.7867, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6494590044021606, + "rewards/margins": 0.30502957105636597, + "rewards/rejected": -0.9544886350631714, + "step": 4810 + }, + { + "epoch": 1.1228887594641817, + "grad_norm": 7.271373271942139, + "learning_rate": 9.744983818770227e-06, + "logits/chosen": -4.365329265594482, + "logits/rejected": -4.415966987609863, + "logps/chosen": -647.1744384765625, + "logps/rejected": -726.2669677734375, + "loss": 0.7327, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4994463324546814, + "rewards/margins": 0.5641657114028931, + "rewards/rejected": -1.0636119842529297, + "step": 4820 + }, + { + "epoch": 1.1252184041933604, + "grad_norm": 9.720874786376953, + "learning_rate": 9.719093851132687e-06, + "logits/chosen": -4.329529762268066, + "logits/rejected": -4.333421230316162, + "logps/chosen": -696.6600341796875, + "logps/rejected": -689.1129760742188, + "loss": 0.7579, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6637765765190125, + "rewards/margins": 0.37587255239486694, + "rewards/rejected": -1.039649248123169, + "step": 4830 + }, + { + "epoch": 1.1275480489225393, + "grad_norm": 5.86013650894165, + "learning_rate": 9.693203883495146e-06, + "logits/chosen": -4.383194923400879, + "logits/rejected": -4.380536079406738, + "logps/chosen": -675.5174560546875, + "logps/rejected": -729.9697875976562, + "loss": 0.567, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.334768146276474, + "rewards/margins": 0.8987871408462524, + "rewards/rejected": -1.2335551977157593, + "step": 4840 + }, + { + "epoch": 1.129877693651718, + "grad_norm": 11.408817291259766, + "learning_rate": 9.667313915857605e-06, + "logits/chosen": -4.298308849334717, + "logits/rejected": -4.3367109298706055, + "logps/chosen": -733.4371948242188, + "logps/rejected": -817.2102661132812, + "loss": 0.7335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5985130071640015, + "rewards/margins": 0.49690619111061096, + "rewards/rejected": -1.09541916847229, + "step": 4850 + }, + { + "epoch": 1.1322073383808968, + "grad_norm": 7.952821731567383, + "learning_rate": 9.641423948220066e-06, + "logits/chosen": -4.365422248840332, + "logits/rejected": -4.373671531677246, + "logps/chosen": -702.8591918945312, + "logps/rejected": -762.954345703125, + "loss": 0.9034, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8020332455635071, + "rewards/margins": 0.20034417510032654, + "rewards/rejected": -1.0023772716522217, + "step": 4860 + }, + { + "epoch": 1.1345369831100758, + "grad_norm": 8.62785816192627, + "learning_rate": 9.615533980582525e-06, + "logits/chosen": -4.355130195617676, + "logits/rejected": -4.3633222579956055, + "logps/chosen": -759.4334106445312, + "logps/rejected": -808.9783935546875, + "loss": 0.7554, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6296737790107727, + "rewards/margins": 0.4100741446018219, + "rewards/rejected": -1.039747953414917, + "step": 4870 + }, + { + "epoch": 1.1368666278392545, + "grad_norm": 8.518671035766602, + "learning_rate": 9.589644012944983e-06, + "logits/chosen": -4.361384391784668, + "logits/rejected": -4.2445831298828125, + "logps/chosen": -756.2235717773438, + "logps/rejected": -749.2499389648438, + "loss": 0.853, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.7251854538917542, + "rewards/margins": 0.17502622306346893, + "rewards/rejected": -0.90021151304245, + "step": 4880 + }, + { + "epoch": 1.1391962725684333, + "grad_norm": 7.055928707122803, + "learning_rate": 9.563754045307444e-06, + "logits/chosen": -4.406973838806152, + "logits/rejected": -4.381006717681885, + "logps/chosen": -711.85888671875, + "logps/rejected": -769.9061889648438, + "loss": 0.6148, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5367127656936646, + "rewards/margins": 0.725053608417511, + "rewards/rejected": -1.2617665529251099, + "step": 4890 + }, + { + "epoch": 1.1415259172976122, + "grad_norm": 6.893064498901367, + "learning_rate": 9.537864077669905e-06, + "logits/chosen": -4.3568034172058105, + "logits/rejected": -4.30020809173584, + "logps/chosen": -746.2471313476562, + "logps/rejected": -722.7155151367188, + "loss": 0.7335, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6146231889724731, + "rewards/margins": 0.5055471658706665, + "rewards/rejected": -1.1201703548431396, + "step": 4900 + }, + { + "epoch": 1.1415259172976122, + "eval_logits/chosen": -4.295858860015869, + "eval_logits/rejected": -4.2832417488098145, + "eval_logps/chosen": -697.7133178710938, + "eval_logps/rejected": -717.448974609375, + "eval_loss": 0.630944013595581, + "eval_rewards/accuracies": 0.64034104347229, + "eval_rewards/chosen": -0.6884997487068176, + "eval_rewards/margins": 0.37653934955596924, + "eval_rewards/rejected": -1.065039038658142, + "eval_runtime": 392.2997, + "eval_samples_per_second": 18.236, + "eval_steps_per_second": 9.118, + "step": 4900 + }, + { + "epoch": 1.143855562026791, + "grad_norm": 11.00304126739502, + "learning_rate": 9.511974110032363e-06, + "logits/chosen": -4.2610039710998535, + "logits/rejected": -4.306243896484375, + "logps/chosen": -724.7777099609375, + "logps/rejected": -772.2752685546875, + "loss": 0.8958, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.7763365507125854, + "rewards/margins": 0.23164916038513184, + "rewards/rejected": -1.0079858303070068, + "step": 4910 + }, + { + "epoch": 1.1461852067559697, + "grad_norm": 5.603542327880859, + "learning_rate": 9.486084142394822e-06, + "logits/chosen": -4.164181232452393, + "logits/rejected": -4.260239601135254, + "logps/chosen": -659.2215576171875, + "logps/rejected": -744.3831787109375, + "loss": 0.7597, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6770327091217041, + "rewards/margins": 0.40782466530799866, + "rewards/rejected": -1.0848573446273804, + "step": 4920 + }, + { + "epoch": 1.1485148514851484, + "grad_norm": 7.558773994445801, + "learning_rate": 9.460194174757283e-06, + "logits/chosen": -4.3481245040893555, + "logits/rejected": -4.423264503479004, + "logps/chosen": -650.3001098632812, + "logps/rejected": -732.3673095703125, + "loss": 0.5359, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27106544375419617, + "rewards/margins": 0.8346785306930542, + "rewards/rejected": -1.1057438850402832, + "step": 4930 + }, + { + "epoch": 1.1508444962143274, + "grad_norm": 6.854055881500244, + "learning_rate": 9.434304207119742e-06, + "logits/chosen": -4.359449863433838, + "logits/rejected": -4.34786319732666, + "logps/chosen": -768.947998046875, + "logps/rejected": -747.8753662109375, + "loss": 0.8995, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.9032365679740906, + "rewards/margins": 0.1830495297908783, + "rewards/rejected": -1.086286187171936, + "step": 4940 + }, + { + "epoch": 1.1531741409435061, + "grad_norm": 10.397383689880371, + "learning_rate": 9.4084142394822e-06, + "logits/chosen": -4.300423622131348, + "logits/rejected": -4.38247537612915, + "logps/chosen": -728.3832397460938, + "logps/rejected": -797.1463623046875, + "loss": 0.8069, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6606366038322449, + "rewards/margins": 0.3577214777469635, + "rewards/rejected": -1.0183582305908203, + "step": 4950 + }, + { + "epoch": 1.1555037856726849, + "grad_norm": 5.43316650390625, + "learning_rate": 9.382524271844661e-06, + "logits/chosen": -4.393747329711914, + "logits/rejected": -4.417483329772949, + "logps/chosen": -713.3004150390625, + "logps/rejected": -757.9130859375, + "loss": 0.8116, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7697121500968933, + "rewards/margins": 0.36362361907958984, + "rewards/rejected": -1.133335828781128, + "step": 4960 + }, + { + "epoch": 1.1578334304018636, + "grad_norm": 7.3963623046875, + "learning_rate": 9.35663430420712e-06, + "logits/chosen": -4.301112174987793, + "logits/rejected": -4.343433380126953, + "logps/chosen": -697.62548828125, + "logps/rejected": -657.8740234375, + "loss": 0.7422, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6323320269584656, + "rewards/margins": 0.5476266741752625, + "rewards/rejected": -1.179958701133728, + "step": 4970 + }, + { + "epoch": 1.1601630751310426, + "grad_norm": 6.423781394958496, + "learning_rate": 9.33074433656958e-06, + "logits/chosen": -4.308186054229736, + "logits/rejected": -4.321841239929199, + "logps/chosen": -713.9429321289062, + "logps/rejected": -762.6380615234375, + "loss": 0.833, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9051560163497925, + "rewards/margins": 0.27821090817451477, + "rewards/rejected": -1.1833668947219849, + "step": 4980 + }, + { + "epoch": 1.1624927198602213, + "grad_norm": 8.114919662475586, + "learning_rate": 9.30485436893204e-06, + "logits/chosen": -4.363166809082031, + "logits/rejected": -4.334114074707031, + "logps/chosen": -734.3272705078125, + "logps/rejected": -783.6725463867188, + "loss": 0.6761, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5971099734306335, + "rewards/margins": 0.5837138891220093, + "rewards/rejected": -1.1808240413665771, + "step": 4990 + }, + { + "epoch": 1.1648223645894, + "grad_norm": 2.8438472747802734, + "learning_rate": 9.278964401294498e-06, + "logits/chosen": -4.292839050292969, + "logits/rejected": -4.3964738845825195, + "logps/chosen": -685.0821533203125, + "logps/rejected": -787.2464599609375, + "loss": 0.6453, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6114527583122253, + "rewards/margins": 0.7760822176933289, + "rewards/rejected": -1.3875350952148438, + "step": 5000 + }, + { + "epoch": 1.1648223645894, + "eval_logits/chosen": -4.296248912811279, + "eval_logits/rejected": -4.2839531898498535, + "eval_logps/chosen": -698.3717651367188, + "eval_logps/rejected": -718.2405395507812, + "eval_loss": 0.6339278817176819, + "eval_rewards/accuracies": 0.6365669369697571, + "eval_rewards/chosen": -0.7543493509292603, + "eval_rewards/margins": 0.38984665274620056, + "eval_rewards/rejected": -1.1441960334777832, + "eval_runtime": 392.4103, + "eval_samples_per_second": 18.231, + "eval_steps_per_second": 9.115, + "step": 5000 + }, + { + "epoch": 1.167152009318579, + "grad_norm": 9.56225872039795, + "learning_rate": 9.253074433656959e-06, + "logits/chosen": -4.369412422180176, + "logits/rejected": -4.308619022369385, + "logps/chosen": -729.121826171875, + "logps/rejected": -766.3821411132812, + "loss": 0.7563, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.717940628528595, + "rewards/margins": 0.4016820788383484, + "rewards/rejected": -1.1196227073669434, + "step": 5010 + }, + { + "epoch": 1.1694816540477577, + "grad_norm": 7.350949287414551, + "learning_rate": 9.227184466019418e-06, + "logits/chosen": -4.340554237365723, + "logits/rejected": -4.403109550476074, + "logps/chosen": -728.0326538085938, + "logps/rejected": -739.9369506835938, + "loss": 0.7311, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7404690384864807, + "rewards/margins": 0.4509666860103607, + "rewards/rejected": -1.1914358139038086, + "step": 5020 + }, + { + "epoch": 1.1718112987769365, + "grad_norm": 4.559691429138184, + "learning_rate": 9.201294498381877e-06, + "logits/chosen": -4.373383522033691, + "logits/rejected": -4.363102912902832, + "logps/chosen": -799.009033203125, + "logps/rejected": -789.736572265625, + "loss": 0.6564, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5797772407531738, + "rewards/margins": 0.6232125163078308, + "rewards/rejected": -1.2029898166656494, + "step": 5030 + }, + { + "epoch": 1.1741409435061154, + "grad_norm": 9.547080993652344, + "learning_rate": 9.175404530744337e-06, + "logits/chosen": -4.302543640136719, + "logits/rejected": -4.301863670349121, + "logps/chosen": -706.2684936523438, + "logps/rejected": -715.9816284179688, + "loss": 0.6453, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6682039499282837, + "rewards/margins": 0.6009415984153748, + "rewards/rejected": -1.2691456079483032, + "step": 5040 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 7.015618801116943, + "learning_rate": 9.149514563106798e-06, + "logits/chosen": -4.382655620574951, + "logits/rejected": -4.368760585784912, + "logps/chosen": -708.5529174804688, + "logps/rejected": -719.41015625, + "loss": 0.7583, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7785158157348633, + "rewards/margins": 0.46291232109069824, + "rewards/rejected": -1.2414281368255615, + "step": 5050 + }, + { + "epoch": 1.178800232964473, + "grad_norm": 5.636579990386963, + "learning_rate": 9.123624595469255e-06, + "logits/chosen": -4.289775371551514, + "logits/rejected": -4.319928169250488, + "logps/chosen": -700.90185546875, + "logps/rejected": -759.4402465820312, + "loss": 0.6075, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6372637152671814, + "rewards/margins": 0.6822826266288757, + "rewards/rejected": -1.3195462226867676, + "step": 5060 + }, + { + "epoch": 1.1811298776936516, + "grad_norm": 7.952759265899658, + "learning_rate": 9.097734627831716e-06, + "logits/chosen": -4.374382972717285, + "logits/rejected": -4.308639049530029, + "logps/chosen": -736.1868896484375, + "logps/rejected": -727.0337524414062, + "loss": 0.7677, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8247160911560059, + "rewards/margins": 0.45139646530151367, + "rewards/rejected": -1.2761125564575195, + "step": 5070 + }, + { + "epoch": 1.1834595224228306, + "grad_norm": 5.8634185791015625, + "learning_rate": 9.071844660194176e-06, + "logits/chosen": -4.347293376922607, + "logits/rejected": -4.27130651473999, + "logps/chosen": -689.748046875, + "logps/rejected": -695.448486328125, + "loss": 0.6795, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6607910394668579, + "rewards/margins": 0.6075721979141235, + "rewards/rejected": -1.2683632373809814, + "step": 5080 + }, + { + "epoch": 1.1857891671520093, + "grad_norm": 6.076456069946289, + "learning_rate": 9.045954692556635e-06, + "logits/chosen": -4.2905659675598145, + "logits/rejected": -4.332380771636963, + "logps/chosen": -725.4127807617188, + "logps/rejected": -792.0933837890625, + "loss": 0.7146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6212302446365356, + "rewards/margins": 0.39680036902427673, + "rewards/rejected": -1.0180305242538452, + "step": 5090 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 7.59320592880249, + "learning_rate": 9.020064724919094e-06, + "logits/chosen": -4.304072380065918, + "logits/rejected": -4.223092555999756, + "logps/chosen": -731.3455810546875, + "logps/rejected": -707.1759643554688, + "loss": 0.7496, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5885947942733765, + "rewards/margins": 0.4130372405052185, + "rewards/rejected": -1.0016319751739502, + "step": 5100 + }, + { + "epoch": 1.188118811881188, + "eval_logits/chosen": -4.295443534851074, + "eval_logits/rejected": -4.2832560539245605, + "eval_logps/chosen": -698.3624267578125, + "eval_logps/rejected": -718.23583984375, + "eval_loss": 0.6326996088027954, + "eval_rewards/accuracies": 0.6375454068183899, + "eval_rewards/chosen": -0.753415584564209, + "eval_rewards/margins": 0.3902973234653473, + "eval_rewards/rejected": -1.1437128782272339, + "eval_runtime": 393.3354, + "eval_samples_per_second": 18.188, + "eval_steps_per_second": 9.094, + "step": 5100 + }, + { + "epoch": 1.1904484566103668, + "grad_norm": 11.862083435058594, + "learning_rate": 8.994174757281555e-06, + "logits/chosen": -4.289244174957275, + "logits/rejected": -4.383764743804932, + "logps/chosen": -713.3644409179688, + "logps/rejected": -739.8452758789062, + "loss": 0.9078, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.898316502571106, + "rewards/margins": 0.10962538421154022, + "rewards/rejected": -1.0079419612884521, + "step": 5110 + }, + { + "epoch": 1.1927781013395458, + "grad_norm": 5.873625755310059, + "learning_rate": 8.968284789644013e-06, + "logits/chosen": -4.285080909729004, + "logits/rejected": -4.272884845733643, + "logps/chosen": -688.602294921875, + "logps/rejected": -684.8470458984375, + "loss": 0.7197, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6948808431625366, + "rewards/margins": 0.4366269111633301, + "rewards/rejected": -1.1315077543258667, + "step": 5120 + }, + { + "epoch": 1.1951077460687245, + "grad_norm": 8.76635456085205, + "learning_rate": 8.942394822006472e-06, + "logits/chosen": -4.330078125, + "logits/rejected": -4.282149791717529, + "logps/chosen": -771.1861572265625, + "logps/rejected": -738.9520874023438, + "loss": 0.8948, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7769016027450562, + "rewards/margins": 0.17473702132701874, + "rewards/rejected": -0.9516385793685913, + "step": 5130 + }, + { + "epoch": 1.1974373907979032, + "grad_norm": 4.928976535797119, + "learning_rate": 8.916504854368933e-06, + "logits/chosen": -4.3442463874816895, + "logits/rejected": -4.3403706550598145, + "logps/chosen": -747.8082275390625, + "logps/rejected": -774.0421142578125, + "loss": 0.6963, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7253668308258057, + "rewards/margins": 0.4606569707393646, + "rewards/rejected": -1.1860238313674927, + "step": 5140 + }, + { + "epoch": 1.1997670355270822, + "grad_norm": 10.240233421325684, + "learning_rate": 8.890614886731392e-06, + "logits/chosen": -4.33738899230957, + "logits/rejected": -4.348621368408203, + "logps/chosen": -720.0654907226562, + "logps/rejected": -741.6688842773438, + "loss": 0.7944, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.870683491230011, + "rewards/margins": 0.355567067861557, + "rewards/rejected": -1.2262506484985352, + "step": 5150 + }, + { + "epoch": 1.202096680256261, + "grad_norm": 7.294375419616699, + "learning_rate": 8.864724919093852e-06, + "logits/chosen": -4.395794868469238, + "logits/rejected": -4.36327600479126, + "logps/chosen": -723.692626953125, + "logps/rejected": -736.25927734375, + "loss": 0.6848, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5928478240966797, + "rewards/margins": 0.5372605919837952, + "rewards/rejected": -1.1301084756851196, + "step": 5160 + }, + { + "epoch": 1.2044263249854397, + "grad_norm": 8.202095031738281, + "learning_rate": 8.838834951456311e-06, + "logits/chosen": -4.327259540557861, + "logits/rejected": -4.300198554992676, + "logps/chosen": -770.6749267578125, + "logps/rejected": -729.3353271484375, + "loss": 0.8911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.084133505821228, + "rewards/margins": 0.147923082113266, + "rewards/rejected": -1.2320566177368164, + "step": 5170 + }, + { + "epoch": 1.2067559697146186, + "grad_norm": 8.397380828857422, + "learning_rate": 8.81294498381877e-06, + "logits/chosen": -4.301027774810791, + "logits/rejected": -4.390149116516113, + "logps/chosen": -746.8372802734375, + "logps/rejected": -793.6979370117188, + "loss": 0.8489, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7529788017272949, + "rewards/margins": 0.3230917453765869, + "rewards/rejected": -1.0760705471038818, + "step": 5180 + }, + { + "epoch": 1.2090856144437974, + "grad_norm": 7.202069282531738, + "learning_rate": 8.78705501618123e-06, + "logits/chosen": -4.267294406890869, + "logits/rejected": -4.347865104675293, + "logps/chosen": -672.7081298828125, + "logps/rejected": -761.9598388671875, + "loss": 0.686, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7640711069107056, + "rewards/margins": 0.5648372173309326, + "rewards/rejected": -1.3289084434509277, + "step": 5190 + }, + { + "epoch": 1.211415259172976, + "grad_norm": 8.52991771697998, + "learning_rate": 8.761165048543691e-06, + "logits/chosen": -4.3260908126831055, + "logits/rejected": -4.4024786949157715, + "logps/chosen": -643.678466796875, + "logps/rejected": -768.1748046875, + "loss": 0.7476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5957127213478088, + "rewards/margins": 0.46101027727127075, + "rewards/rejected": -1.0567229986190796, + "step": 5200 + }, + { + "epoch": 1.211415259172976, + "eval_logits/chosen": -4.2841081619262695, + "eval_logits/rejected": -4.2714667320251465, + "eval_logps/chosen": -698.4566040039062, + "eval_logps/rejected": -718.3603515625, + "eval_loss": 0.632652759552002, + "eval_rewards/accuracies": 0.6399217247962952, + "eval_rewards/chosen": -0.7628329992294312, + "eval_rewards/margins": 0.3933371603488922, + "eval_rewards/rejected": -1.156170129776001, + "eval_runtime": 392.4296, + "eval_samples_per_second": 18.23, + "eval_steps_per_second": 9.115, + "step": 5200 + }, + { + "epoch": 1.2137449039021548, + "grad_norm": 6.904987812042236, + "learning_rate": 8.735275080906148e-06, + "logits/chosen": -4.297495365142822, + "logits/rejected": -4.3596415519714355, + "logps/chosen": -615.5276489257812, + "logps/rejected": -720.9169921875, + "loss": 0.8417, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.8318285942077637, + "rewards/margins": 0.24546094238758087, + "rewards/rejected": -1.0772894620895386, + "step": 5210 + }, + { + "epoch": 1.2160745486313338, + "grad_norm": 7.0425639152526855, + "learning_rate": 8.709385113268609e-06, + "logits/chosen": -4.2223615646362305, + "logits/rejected": -4.204460620880127, + "logps/chosen": -661.43701171875, + "logps/rejected": -671.744140625, + "loss": 0.6586, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5984729528427124, + "rewards/margins": 0.6058539152145386, + "rewards/rejected": -1.204326868057251, + "step": 5220 + }, + { + "epoch": 1.2184041933605125, + "grad_norm": 8.22312068939209, + "learning_rate": 8.68349514563107e-06, + "logits/chosen": -4.342942714691162, + "logits/rejected": -4.427065849304199, + "logps/chosen": -734.3280029296875, + "logps/rejected": -794.6597900390625, + "loss": 0.5786, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.441389262676239, + "rewards/margins": 0.841633677482605, + "rewards/rejected": -1.2830231189727783, + "step": 5230 + }, + { + "epoch": 1.2207338380896913, + "grad_norm": 5.765860080718994, + "learning_rate": 8.657605177993529e-06, + "logits/chosen": -4.316514492034912, + "logits/rejected": -4.353440284729004, + "logps/chosen": -719.7871704101562, + "logps/rejected": -721.1253662109375, + "loss": 0.7851, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7890719175338745, + "rewards/margins": 0.30373963713645935, + "rewards/rejected": -1.0928115844726562, + "step": 5240 + }, + { + "epoch": 1.22306348281887, + "grad_norm": 9.424662590026855, + "learning_rate": 8.631715210355987e-06, + "logits/chosen": -4.375064373016357, + "logits/rejected": -4.371817111968994, + "logps/chosen": -712.6149291992188, + "logps/rejected": -771.7597045898438, + "loss": 0.7467, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7428840398788452, + "rewards/margins": 0.35510486364364624, + "rewards/rejected": -1.0979888439178467, + "step": 5250 + }, + { + "epoch": 1.225393127548049, + "grad_norm": 6.406722545623779, + "learning_rate": 8.605825242718448e-06, + "logits/chosen": -4.307132244110107, + "logits/rejected": -4.281327247619629, + "logps/chosen": -725.4629516601562, + "logps/rejected": -739.6784057617188, + "loss": 0.7597, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7313467860221863, + "rewards/margins": 0.40912118554115295, + "rewards/rejected": -1.140467882156372, + "step": 5260 + }, + { + "epoch": 1.2277227722772277, + "grad_norm": 8.925617218017578, + "learning_rate": 8.579935275080907e-06, + "logits/chosen": -4.30813455581665, + "logits/rejected": -4.262563228607178, + "logps/chosen": -686.4441528320312, + "logps/rejected": -630.260009765625, + "loss": 0.8031, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6095466613769531, + "rewards/margins": 0.4209167957305908, + "rewards/rejected": -1.030463457107544, + "step": 5270 + }, + { + "epoch": 1.2300524170064064, + "grad_norm": 9.995710372924805, + "learning_rate": 8.554045307443366e-06, + "logits/chosen": -4.355738639831543, + "logits/rejected": -4.22121524810791, + "logps/chosen": -764.5777587890625, + "logps/rejected": -727.5269165039062, + "loss": 0.9327, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8444086313247681, + "rewards/margins": 0.1093897819519043, + "rewards/rejected": -0.9537984132766724, + "step": 5280 + }, + { + "epoch": 1.2323820617355854, + "grad_norm": 7.851297855377197, + "learning_rate": 8.528155339805826e-06, + "logits/chosen": -4.309381484985352, + "logits/rejected": -4.3855791091918945, + "logps/chosen": -706.9483642578125, + "logps/rejected": -803.3857421875, + "loss": 0.7377, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6579431891441345, + "rewards/margins": 0.5061560869216919, + "rewards/rejected": -1.1640993356704712, + "step": 5290 + }, + { + "epoch": 1.2347117064647641, + "grad_norm": 8.250238418579102, + "learning_rate": 8.502265372168285e-06, + "logits/chosen": -4.303959369659424, + "logits/rejected": -4.3808698654174805, + "logps/chosen": -731.1709594726562, + "logps/rejected": -842.6114501953125, + "loss": 0.7532, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8347466588020325, + "rewards/margins": 0.4443570077419281, + "rewards/rejected": -1.2791036367416382, + "step": 5300 + }, + { + "epoch": 1.2347117064647641, + "eval_logits/chosen": -4.286922454833984, + "eval_logits/rejected": -4.274168491363525, + "eval_logps/chosen": -698.2924194335938, + "eval_logps/rejected": -718.2178344726562, + "eval_loss": 0.6320476531982422, + "eval_rewards/accuracies": 0.6407604217529297, + "eval_rewards/chosen": -0.7464197278022766, + "eval_rewards/margins": 0.3955014944076538, + "eval_rewards/rejected": -1.1419211626052856, + "eval_runtime": 393.3854, + "eval_samples_per_second": 18.186, + "eval_steps_per_second": 9.093, + "step": 5300 + }, + { + "epoch": 1.2370413511939429, + "grad_norm": 7.946064472198486, + "learning_rate": 8.476375404530744e-06, + "logits/chosen": -4.3965559005737305, + "logits/rejected": -4.316365718841553, + "logps/chosen": -757.570068359375, + "logps/rejected": -776.7664794921875, + "loss": 0.813, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6333745718002319, + "rewards/margins": 0.3966577649116516, + "rewards/rejected": -1.0300323963165283, + "step": 5310 + }, + { + "epoch": 1.2393709959231218, + "grad_norm": 11.394083023071289, + "learning_rate": 8.450485436893205e-06, + "logits/chosen": -4.389873027801514, + "logits/rejected": -4.220327377319336, + "logps/chosen": -769.0503540039062, + "logps/rejected": -705.5409545898438, + "loss": 0.796, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7753839492797852, + "rewards/margins": 0.2997626066207886, + "rewards/rejected": -1.0751464366912842, + "step": 5320 + }, + { + "epoch": 1.2417006406523006, + "grad_norm": 9.40004825592041, + "learning_rate": 8.424595469255664e-06, + "logits/chosen": -4.316657543182373, + "logits/rejected": -4.366091728210449, + "logps/chosen": -658.1968994140625, + "logps/rejected": -729.2615966796875, + "loss": 0.7724, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6652747392654419, + "rewards/margins": 0.5693114399909973, + "rewards/rejected": -1.234586238861084, + "step": 5330 + }, + { + "epoch": 1.2440302853814793, + "grad_norm": 9.043682098388672, + "learning_rate": 8.398705501618124e-06, + "logits/chosen": -4.202693939208984, + "logits/rejected": -4.343643665313721, + "logps/chosen": -636.1541137695312, + "logps/rejected": -754.822509765625, + "loss": 0.7941, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5894585847854614, + "rewards/margins": 0.3509396016597748, + "rewards/rejected": -0.9403982162475586, + "step": 5340 + }, + { + "epoch": 1.246359930110658, + "grad_norm": 7.098946571350098, + "learning_rate": 8.372815533980583e-06, + "logits/chosen": -4.231969833374023, + "logits/rejected": -4.287779808044434, + "logps/chosen": -697.3786010742188, + "logps/rejected": -752.2830200195312, + "loss": 0.7645, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6086194515228271, + "rewards/margins": 0.48688268661499023, + "rewards/rejected": -1.0955021381378174, + "step": 5350 + }, + { + "epoch": 1.248689574839837, + "grad_norm": 6.7419753074646, + "learning_rate": 8.346925566343042e-06, + "logits/chosen": -4.2158942222595215, + "logits/rejected": -4.2786431312561035, + "logps/chosen": -655.709716796875, + "logps/rejected": -727.7312622070312, + "loss": 0.8279, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5915407538414001, + "rewards/margins": 0.28814780712127686, + "rewards/rejected": -0.8796886205673218, + "step": 5360 + }, + { + "epoch": 1.2510192195690157, + "grad_norm": 8.620369911193848, + "learning_rate": 8.321035598705502e-06, + "logits/chosen": -4.3344340324401855, + "logits/rejected": -4.348869800567627, + "logps/chosen": -704.429443359375, + "logps/rejected": -764.4039306640625, + "loss": 0.7372, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7943536639213562, + "rewards/margins": 0.44872117042541504, + "rewards/rejected": -1.2430747747421265, + "step": 5370 + }, + { + "epoch": 1.2533488642981945, + "grad_norm": 8.750239372253418, + "learning_rate": 8.295145631067963e-06, + "logits/chosen": -4.340044021606445, + "logits/rejected": -4.353331089019775, + "logps/chosen": -712.9949951171875, + "logps/rejected": -736.2276000976562, + "loss": 0.7095, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6870006322860718, + "rewards/margins": 0.5696112513542175, + "rewards/rejected": -1.256611943244934, + "step": 5380 + }, + { + "epoch": 1.2556785090273732, + "grad_norm": 7.095187664031982, + "learning_rate": 8.269255663430422e-06, + "logits/chosen": -4.3044915199279785, + "logits/rejected": -4.3486504554748535, + "logps/chosen": -689.8836059570312, + "logps/rejected": -756.3995361328125, + "loss": 0.6589, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5822979807853699, + "rewards/margins": 0.7012337446212769, + "rewards/rejected": -1.2835317850112915, + "step": 5390 + }, + { + "epoch": 1.2580081537565522, + "grad_norm": 7.411107063293457, + "learning_rate": 8.24336569579288e-06, + "logits/chosen": -4.365941047668457, + "logits/rejected": -4.346836566925049, + "logps/chosen": -674.5985107421875, + "logps/rejected": -732.57568359375, + "loss": 0.6924, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6771245002746582, + "rewards/margins": 0.7330614328384399, + "rewards/rejected": -1.4101858139038086, + "step": 5400 + }, + { + "epoch": 1.2580081537565522, + "eval_logits/chosen": -4.280876159667969, + "eval_logits/rejected": -4.268177509307861, + "eval_logps/chosen": -698.876220703125, + "eval_logps/rejected": -718.9097900390625, + "eval_loss": 0.633796751499176, + "eval_rewards/accuracies": 0.6382443308830261, + "eval_rewards/chosen": -0.8047946691513062, + "eval_rewards/margins": 0.4063267707824707, + "eval_rewards/rejected": -1.2111215591430664, + "eval_runtime": 393.2724, + "eval_samples_per_second": 18.191, + "eval_steps_per_second": 9.095, + "step": 5400 + }, + { + "epoch": 1.260337798485731, + "grad_norm": 12.6148681640625, + "learning_rate": 8.217475728155341e-06, + "logits/chosen": -4.354096412658691, + "logits/rejected": -4.3372802734375, + "logps/chosen": -765.8435668945312, + "logps/rejected": -756.4341430664062, + "loss": 0.8824, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.077000379562378, + "rewards/margins": 0.16846218705177307, + "rewards/rejected": -1.2454622983932495, + "step": 5410 + }, + { + "epoch": 1.2626674432149096, + "grad_norm": 7.284420967102051, + "learning_rate": 8.1915857605178e-06, + "logits/chosen": -4.285361289978027, + "logits/rejected": -4.241901397705078, + "logps/chosen": -710.8505859375, + "logps/rejected": -629.2987060546875, + "loss": 0.7785, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8103518486022949, + "rewards/margins": 0.3938572406768799, + "rewards/rejected": -1.2042090892791748, + "step": 5420 + }, + { + "epoch": 1.2649970879440886, + "grad_norm": 8.587806701660156, + "learning_rate": 8.165695792880259e-06, + "logits/chosen": -4.346640110015869, + "logits/rejected": -4.296164512634277, + "logps/chosen": -808.0670776367188, + "logps/rejected": -764.1261596679688, + "loss": 0.7684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6092880368232727, + "rewards/margins": 0.47000154852867126, + "rewards/rejected": -1.0792896747589111, + "step": 5430 + }, + { + "epoch": 1.2673267326732673, + "grad_norm": 9.06271743774414, + "learning_rate": 8.13980582524272e-06, + "logits/chosen": -4.3552703857421875, + "logits/rejected": -4.35286808013916, + "logps/chosen": -676.3934326171875, + "logps/rejected": -709.7229614257812, + "loss": 0.7631, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4008163511753082, + "rewards/margins": 0.5974048376083374, + "rewards/rejected": -0.998221218585968, + "step": 5440 + }, + { + "epoch": 1.269656377402446, + "grad_norm": 5.518189907073975, + "learning_rate": 8.113915857605179e-06, + "logits/chosen": -4.357514381408691, + "logits/rejected": -4.337433338165283, + "logps/chosen": -685.5467529296875, + "logps/rejected": -711.4341430664062, + "loss": 0.7718, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.8264461755752563, + "rewards/margins": 0.23178991675376892, + "rewards/rejected": -1.0582361221313477, + "step": 5450 + }, + { + "epoch": 1.271986022131625, + "grad_norm": 10.297592163085938, + "learning_rate": 8.088025889967637e-06, + "logits/chosen": -4.2873921394348145, + "logits/rejected": -4.283249378204346, + "logps/chosen": -723.2974853515625, + "logps/rejected": -750.7506103515625, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5153021812438965, + "rewards/margins": 0.5844428539276123, + "rewards/rejected": -1.0997450351715088, + "step": 5460 + }, + { + "epoch": 1.2743156668608038, + "grad_norm": 9.21555233001709, + "learning_rate": 8.062135922330098e-06, + "logits/chosen": -4.428861141204834, + "logits/rejected": -4.376482009887695, + "logps/chosen": -722.2612915039062, + "logps/rejected": -748.6749267578125, + "loss": 0.7888, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7600167393684387, + "rewards/margins": 0.32688966393470764, + "rewards/rejected": -1.0869064331054688, + "step": 5470 + }, + { + "epoch": 1.2766453115899825, + "grad_norm": 9.568215370178223, + "learning_rate": 8.036245954692557e-06, + "logits/chosen": -4.308640003204346, + "logits/rejected": -4.361050128936768, + "logps/chosen": -704.77783203125, + "logps/rejected": -727.996826171875, + "loss": 0.8215, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.6832433938980103, + "rewards/margins": 0.3008263409137726, + "rewards/rejected": -0.9840697050094604, + "step": 5480 + }, + { + "epoch": 1.2789749563191612, + "grad_norm": 7.80417013168335, + "learning_rate": 8.010355987055017e-06, + "logits/chosen": -4.305323600769043, + "logits/rejected": -4.314250469207764, + "logps/chosen": -719.4246826171875, + "logps/rejected": -692.815185546875, + "loss": 0.6677, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.642403244972229, + "rewards/margins": 0.47839683294296265, + "rewards/rejected": -1.1208001375198364, + "step": 5490 + }, + { + "epoch": 1.2813046010483402, + "grad_norm": 8.392496109008789, + "learning_rate": 7.984466019417476e-06, + "logits/chosen": -4.330838680267334, + "logits/rejected": -4.276543140411377, + "logps/chosen": -748.7491455078125, + "logps/rejected": -803.5469970703125, + "loss": 0.6556, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7700390815734863, + "rewards/margins": 0.7721626162528992, + "rewards/rejected": -1.5422016382217407, + "step": 5500 + }, + { + "epoch": 1.2813046010483402, + "eval_logits/chosen": -4.276762962341309, + "eval_logits/rejected": -4.264129638671875, + "eval_logps/chosen": -698.3977661132812, + "eval_logps/rejected": -718.328125, + "eval_loss": 0.6312812566757202, + "eval_rewards/accuracies": 0.6388034820556641, + "eval_rewards/chosen": -0.7569450736045837, + "eval_rewards/margins": 0.3960038125514984, + "eval_rewards/rejected": -1.1529488563537598, + "eval_runtime": 392.8469, + "eval_samples_per_second": 18.211, + "eval_steps_per_second": 9.105, + "step": 5500 + }, + { + "epoch": 1.283634245777519, + "grad_norm": 8.474251747131348, + "learning_rate": 7.958576051779935e-06, + "logits/chosen": -4.332800388336182, + "logits/rejected": -4.342696189880371, + "logps/chosen": -655.5740966796875, + "logps/rejected": -683.3048095703125, + "loss": 0.6476, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6026918292045593, + "rewards/margins": 0.5270061492919922, + "rewards/rejected": -1.1296980381011963, + "step": 5510 + }, + { + "epoch": 1.2859638905066977, + "grad_norm": 6.1152472496032715, + "learning_rate": 7.932686084142396e-06, + "logits/chosen": -4.386120319366455, + "logits/rejected": -4.363664150238037, + "logps/chosen": -736.0418701171875, + "logps/rejected": -755.9088745117188, + "loss": 0.6813, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7132939100265503, + "rewards/margins": 0.961859405040741, + "rewards/rejected": -1.675153374671936, + "step": 5520 + }, + { + "epoch": 1.2882935352358764, + "grad_norm": 7.68399715423584, + "learning_rate": 7.906796116504855e-06, + "logits/chosen": -4.306368827819824, + "logits/rejected": -4.369459629058838, + "logps/chosen": -728.9708251953125, + "logps/rejected": -740.6129150390625, + "loss": 0.8068, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6639933586120605, + "rewards/margins": 0.355257123708725, + "rewards/rejected": -1.019250512123108, + "step": 5530 + }, + { + "epoch": 1.2906231799650554, + "grad_norm": 8.179251670837402, + "learning_rate": 7.880906148867315e-06, + "logits/chosen": -4.3534159660339355, + "logits/rejected": -4.320357322692871, + "logps/chosen": -719.878173828125, + "logps/rejected": -718.3634033203125, + "loss": 0.8182, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7298353910446167, + "rewards/margins": 0.35824066400527954, + "rewards/rejected": -1.088076114654541, + "step": 5540 + }, + { + "epoch": 1.2929528246942341, + "grad_norm": 9.077390670776367, + "learning_rate": 7.855016181229774e-06, + "logits/chosen": -4.333242893218994, + "logits/rejected": -4.3756327629089355, + "logps/chosen": -657.5175170898438, + "logps/rejected": -717.9803466796875, + "loss": 0.8273, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.892708957195282, + "rewards/margins": 0.15217627584934235, + "rewards/rejected": -1.044885277748108, + "step": 5550 + }, + { + "epoch": 1.2952824694234129, + "grad_norm": 9.775418281555176, + "learning_rate": 7.829126213592235e-06, + "logits/chosen": -4.333688259124756, + "logits/rejected": -4.265844821929932, + "logps/chosen": -750.9478759765625, + "logps/rejected": -772.283203125, + "loss": 0.683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9198330044746399, + "rewards/margins": 0.5669907927513123, + "rewards/rejected": -1.4868237972259521, + "step": 5560 + }, + { + "epoch": 1.2976121141525918, + "grad_norm": 8.756830215454102, + "learning_rate": 7.803236245954694e-06, + "logits/chosen": -4.229300498962402, + "logits/rejected": -4.275421142578125, + "logps/chosen": -665.558349609375, + "logps/rejected": -683.5747680664062, + "loss": 0.7361, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.49589547514915466, + "rewards/margins": 0.5741171836853027, + "rewards/rejected": -1.0700128078460693, + "step": 5570 + }, + { + "epoch": 1.2999417588817705, + "grad_norm": 8.244976997375488, + "learning_rate": 7.777346278317152e-06, + "logits/chosen": -4.362826347351074, + "logits/rejected": -4.415011405944824, + "logps/chosen": -744.3865356445312, + "logps/rejected": -787.7940673828125, + "loss": 0.8813, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7486151456832886, + "rewards/margins": 0.2536180019378662, + "rewards/rejected": -1.0022331476211548, + "step": 5580 + }, + { + "epoch": 1.3022714036109493, + "grad_norm": 7.764080047607422, + "learning_rate": 7.751456310679613e-06, + "logits/chosen": -4.275409698486328, + "logits/rejected": -4.322786808013916, + "logps/chosen": -658.6358032226562, + "logps/rejected": -733.7100219726562, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4042404294013977, + "rewards/margins": 0.5954650640487671, + "rewards/rejected": -0.9997054934501648, + "step": 5590 + }, + { + "epoch": 1.3046010483401282, + "grad_norm": 6.2695231437683105, + "learning_rate": 7.725566343042072e-06, + "logits/chosen": -4.200558662414551, + "logits/rejected": -4.332554817199707, + "logps/chosen": -667.927734375, + "logps/rejected": -749.1799926757812, + "loss": 0.8593, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.858493983745575, + "rewards/margins": 0.11628395318984985, + "rewards/rejected": -0.9747779965400696, + "step": 5600 + }, + { + "epoch": 1.3046010483401282, + "eval_logits/chosen": -4.26957893371582, + "eval_logits/rejected": -4.256231307983398, + "eval_logps/chosen": -698.6621704101562, + "eval_logps/rejected": -718.7105102539062, + "eval_loss": 0.6317076683044434, + "eval_rewards/accuracies": 0.6399217247962952, + "eval_rewards/chosen": -0.7833826541900635, + "eval_rewards/margins": 0.4078075885772705, + "eval_rewards/rejected": -1.1911901235580444, + "eval_runtime": 393.8757, + "eval_samples_per_second": 18.163, + "eval_steps_per_second": 9.082, + "step": 5600 + }, + { + "epoch": 1.306930693069307, + "grad_norm": 10.356733322143555, + "learning_rate": 7.69967637540453e-06, + "logits/chosen": -4.231276512145996, + "logits/rejected": -4.262662887573242, + "logps/chosen": -749.026123046875, + "logps/rejected": -756.8435668945312, + "loss": 0.8628, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8626430630683899, + "rewards/margins": 0.3443228602409363, + "rewards/rejected": -1.2069660425186157, + "step": 5610 + }, + { + "epoch": 1.3092603377984857, + "grad_norm": 7.466703414916992, + "learning_rate": 7.673786407766991e-06, + "logits/chosen": -4.292886257171631, + "logits/rejected": -4.345783710479736, + "logps/chosen": -721.8636474609375, + "logps/rejected": -810.0375366210938, + "loss": 0.851, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8587177395820618, + "rewards/margins": 0.3823612332344055, + "rewards/rejected": -1.2410789728164673, + "step": 5620 + }, + { + "epoch": 1.3115899825276645, + "grad_norm": 6.604434967041016, + "learning_rate": 7.64789644012945e-06, + "logits/chosen": -4.294118404388428, + "logits/rejected": -4.347481727600098, + "logps/chosen": -730.0178833007812, + "logps/rejected": -833.5206909179688, + "loss": 0.5881, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5834293365478516, + "rewards/margins": 0.9197348356246948, + "rewards/rejected": -1.503164291381836, + "step": 5630 + }, + { + "epoch": 1.3139196272568434, + "grad_norm": 9.590311050415039, + "learning_rate": 7.62200647249191e-06, + "logits/chosen": -4.24991512298584, + "logits/rejected": -4.206608772277832, + "logps/chosen": -727.8446655273438, + "logps/rejected": -719.554931640625, + "loss": 0.649, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46159330010414124, + "rewards/margins": 0.6928114891052246, + "rewards/rejected": -1.154404878616333, + "step": 5640 + }, + { + "epoch": 1.3162492719860222, + "grad_norm": 9.104934692382812, + "learning_rate": 7.59611650485437e-06, + "logits/chosen": -4.273480415344238, + "logits/rejected": -4.279662132263184, + "logps/chosen": -702.955322265625, + "logps/rejected": -746.53564453125, + "loss": 0.7745, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8513115048408508, + "rewards/margins": 0.5166622400283813, + "rewards/rejected": -1.3679735660552979, + "step": 5650 + }, + { + "epoch": 1.3185789167152009, + "grad_norm": 9.956319808959961, + "learning_rate": 7.570226537216829e-06, + "logits/chosen": -4.362860202789307, + "logits/rejected": -4.4000349044799805, + "logps/chosen": -705.8887939453125, + "logps/rejected": -815.6693725585938, + "loss": 0.7268, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6892780065536499, + "rewards/margins": 0.49222391843795776, + "rewards/rejected": -1.181501865386963, + "step": 5660 + }, + { + "epoch": 1.3209085614443796, + "grad_norm": 10.5054292678833, + "learning_rate": 7.544336569579289e-06, + "logits/chosen": -4.3153886795043945, + "logits/rejected": -4.259528160095215, + "logps/chosen": -739.3519287109375, + "logps/rejected": -677.4149780273438, + "loss": 0.7945, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6772125959396362, + "rewards/margins": 0.42796260118484497, + "rewards/rejected": -1.105175256729126, + "step": 5670 + }, + { + "epoch": 1.3232382061735586, + "grad_norm": 5.91664457321167, + "learning_rate": 7.518446601941748e-06, + "logits/chosen": -4.281999588012695, + "logits/rejected": -4.293154716491699, + "logps/chosen": -718.5802612304688, + "logps/rejected": -746.5780029296875, + "loss": 0.7888, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9188039898872375, + "rewards/margins": 0.3310539722442627, + "rewards/rejected": -1.2498581409454346, + "step": 5680 + }, + { + "epoch": 1.3255678509027373, + "grad_norm": 10.995230674743652, + "learning_rate": 7.492556634304208e-06, + "logits/chosen": -4.330352783203125, + "logits/rejected": -4.3117570877075195, + "logps/chosen": -708.2078857421875, + "logps/rejected": -716.5367431640625, + "loss": 0.7332, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5375461578369141, + "rewards/margins": 0.5031279921531677, + "rewards/rejected": -1.0406742095947266, + "step": 5690 + }, + { + "epoch": 1.327897495631916, + "grad_norm": 9.291178703308105, + "learning_rate": 7.4666666666666675e-06, + "logits/chosen": -4.3973517417907715, + "logits/rejected": -4.330522537231445, + "logps/chosen": -761.6275024414062, + "logps/rejected": -713.5189208984375, + "loss": 0.6934, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5683543682098389, + "rewards/margins": 0.4567783772945404, + "rewards/rejected": -1.025132656097412, + "step": 5700 + }, + { + "epoch": 1.327897495631916, + "eval_logits/chosen": -4.26210355758667, + "eval_logits/rejected": -4.248154640197754, + "eval_logps/chosen": -699.094482421875, + "eval_logps/rejected": -719.2811279296875, + "eval_loss": 0.6337792277336121, + "eval_rewards/accuracies": 0.6406206488609314, + "eval_rewards/chosen": -0.826622724533081, + "eval_rewards/margins": 0.42162150144577026, + "eval_rewards/rejected": -1.248244285583496, + "eval_runtime": 394.2579, + "eval_samples_per_second": 18.145, + "eval_steps_per_second": 9.073, + "step": 5700 + }, + { + "epoch": 1.330227140361095, + "grad_norm": 7.562618255615234, + "learning_rate": 7.440776699029126e-06, + "logits/chosen": -4.334902763366699, + "logits/rejected": -4.1793293952941895, + "logps/chosen": -751.4991455078125, + "logps/rejected": -686.8331909179688, + "loss": 0.846, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.8237320184707642, + "rewards/margins": 0.20931318402290344, + "rewards/rejected": -1.0330451726913452, + "step": 5710 + }, + { + "epoch": 1.3325567850902738, + "grad_norm": 10.239123344421387, + "learning_rate": 7.414886731391586e-06, + "logits/chosen": -4.351064682006836, + "logits/rejected": -4.283308982849121, + "logps/chosen": -688.5595703125, + "logps/rejected": -710.2664794921875, + "loss": 0.7362, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8389534950256348, + "rewards/margins": 0.5938393473625183, + "rewards/rejected": -1.4327926635742188, + "step": 5720 + }, + { + "epoch": 1.3348864298194525, + "grad_norm": 5.591737270355225, + "learning_rate": 7.388996763754046e-06, + "logits/chosen": -4.312826156616211, + "logits/rejected": -4.2012529373168945, + "logps/chosen": -684.7025756835938, + "logps/rejected": -589.3084716796875, + "loss": 0.7283, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7459465265274048, + "rewards/margins": 0.4310365319252014, + "rewards/rejected": -1.176983118057251, + "step": 5730 + }, + { + "epoch": 1.3372160745486315, + "grad_norm": 7.605983257293701, + "learning_rate": 7.3631067961165055e-06, + "logits/chosen": -4.290006160736084, + "logits/rejected": -4.3164496421813965, + "logps/chosen": -702.5801391601562, + "logps/rejected": -752.56005859375, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5781384706497192, + "rewards/margins": 0.9951320886611938, + "rewards/rejected": -1.5732704401016235, + "step": 5740 + }, + { + "epoch": 1.3395457192778102, + "grad_norm": 8.090303421020508, + "learning_rate": 7.337216828478964e-06, + "logits/chosen": -4.243030071258545, + "logits/rejected": -4.308330535888672, + "logps/chosen": -722.51953125, + "logps/rejected": -750.7996826171875, + "loss": 0.7339, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6192070841789246, + "rewards/margins": 0.5955747365951538, + "rewards/rejected": -1.2147818803787231, + "step": 5750 + }, + { + "epoch": 1.341875364006989, + "grad_norm": 7.423137187957764, + "learning_rate": 7.311326860841424e-06, + "logits/chosen": -4.275225639343262, + "logits/rejected": -4.218506813049316, + "logps/chosen": -705.7161254882812, + "logps/rejected": -707.9550170898438, + "loss": 0.7737, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9287475347518921, + "rewards/margins": 0.3816317915916443, + "rewards/rejected": -1.3103792667388916, + "step": 5760 + }, + { + "epoch": 1.3442050087361677, + "grad_norm": 4.06026554107666, + "learning_rate": 7.285436893203885e-06, + "logits/chosen": -4.278772830963135, + "logits/rejected": -4.319589614868164, + "logps/chosen": -694.3591918945312, + "logps/rejected": -732.3553466796875, + "loss": 0.7029, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7285884022712708, + "rewards/margins": 0.5107731819152832, + "rewards/rejected": -1.2393615245819092, + "step": 5770 + }, + { + "epoch": 1.3465346534653464, + "grad_norm": 8.069743156433105, + "learning_rate": 7.259546925566343e-06, + "logits/chosen": -4.411580562591553, + "logits/rejected": -4.367345809936523, + "logps/chosen": -779.0450439453125, + "logps/rejected": -742.70947265625, + "loss": 0.7816, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8539965748786926, + "rewards/margins": 0.46470707654953003, + "rewards/rejected": -1.3187037706375122, + "step": 5780 + }, + { + "epoch": 1.3488642981945254, + "grad_norm": 7.119553565979004, + "learning_rate": 7.233656957928803e-06, + "logits/chosen": -4.2892279624938965, + "logits/rejected": -4.25607967376709, + "logps/chosen": -751.4293823242188, + "logps/rejected": -743.5025634765625, + "loss": 0.6099, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.47523608803749084, + "rewards/margins": 0.7295746803283691, + "rewards/rejected": -1.2048107385635376, + "step": 5790 + }, + { + "epoch": 1.351193942923704, + "grad_norm": 8.696918487548828, + "learning_rate": 7.207766990291263e-06, + "logits/chosen": -4.378389358520508, + "logits/rejected": -4.295948505401611, + "logps/chosen": -724.874755859375, + "logps/rejected": -703.1808471679688, + "loss": 0.6694, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7297112941741943, + "rewards/margins": 0.5419555902481079, + "rewards/rejected": -1.2716668844223022, + "step": 5800 + }, + { + "epoch": 1.351193942923704, + "eval_logits/chosen": -4.2629899978637695, + "eval_logits/rejected": -4.249340057373047, + "eval_logps/chosen": -698.8829956054688, + "eval_logps/rejected": -719.0765991210938, + "eval_loss": 0.6321995854377747, + "eval_rewards/accuracies": 0.6397819519042969, + "eval_rewards/chosen": -0.8054772019386292, + "eval_rewards/margins": 0.42231690883636475, + "eval_rewards/rejected": -1.2277940511703491, + "eval_runtime": 394.4648, + "eval_samples_per_second": 18.136, + "eval_steps_per_second": 9.068, + "step": 5800 + }, + { + "epoch": 1.3535235876528828, + "grad_norm": 9.419877052307129, + "learning_rate": 7.181877022653723e-06, + "logits/chosen": -4.269660949707031, + "logits/rejected": -4.29043436050415, + "logps/chosen": -714.8873291015625, + "logps/rejected": -801.2305908203125, + "loss": 0.7412, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7010995149612427, + "rewards/margins": 0.4792519509792328, + "rewards/rejected": -1.1803514957427979, + "step": 5810 + }, + { + "epoch": 1.3558532323820618, + "grad_norm": 6.639642715454102, + "learning_rate": 7.155987055016182e-06, + "logits/chosen": -4.356001853942871, + "logits/rejected": -4.3863115310668945, + "logps/chosen": -656.8035888671875, + "logps/rejected": -730.4910278320312, + "loss": 0.7787, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9009748697280884, + "rewards/margins": 0.3519083857536316, + "rewards/rejected": -1.2528831958770752, + "step": 5820 + }, + { + "epoch": 1.3581828771112405, + "grad_norm": 7.337728500366211, + "learning_rate": 7.130097087378641e-06, + "logits/chosen": -4.27675724029541, + "logits/rejected": -4.264283180236816, + "logps/chosen": -730.9502563476562, + "logps/rejected": -788.376953125, + "loss": 0.7888, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.816659152507782, + "rewards/margins": 0.521469235420227, + "rewards/rejected": -1.3381284475326538, + "step": 5830 + }, + { + "epoch": 1.3605125218404193, + "grad_norm": 9.205945014953613, + "learning_rate": 7.104207119741101e-06, + "logits/chosen": -4.330324649810791, + "logits/rejected": -4.350374221801758, + "logps/chosen": -783.76220703125, + "logps/rejected": -805.0286865234375, + "loss": 0.6592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6576730012893677, + "rewards/margins": 0.6231470704078674, + "rewards/rejected": -1.2808201313018799, + "step": 5840 + }, + { + "epoch": 1.3628421665695982, + "grad_norm": 10.915939331054688, + "learning_rate": 7.078317152103561e-06, + "logits/chosen": -4.300547122955322, + "logits/rejected": -4.347357749938965, + "logps/chosen": -756.61083984375, + "logps/rejected": -746.4322509765625, + "loss": 0.9299, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.0734272003173828, + "rewards/margins": 0.1357240527868271, + "rewards/rejected": -1.209151268005371, + "step": 5850 + }, + { + "epoch": 1.365171811298777, + "grad_norm": 8.515040397644043, + "learning_rate": 7.05242718446602e-06, + "logits/chosen": -4.35914945602417, + "logits/rejected": -4.41499137878418, + "logps/chosen": -685.0623779296875, + "logps/rejected": -724.2054443359375, + "loss": 0.8295, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.790489673614502, + "rewards/margins": 0.21192386746406555, + "rewards/rejected": -1.0024135112762451, + "step": 5860 + }, + { + "epoch": 1.3675014560279557, + "grad_norm": 6.500744819641113, + "learning_rate": 7.026537216828479e-06, + "logits/chosen": -4.287649154663086, + "logits/rejected": -4.258206844329834, + "logps/chosen": -717.4063720703125, + "logps/rejected": -725.9107666015625, + "loss": 0.7797, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6877506375312805, + "rewards/margins": 0.3528876304626465, + "rewards/rejected": -1.0406382083892822, + "step": 5870 + }, + { + "epoch": 1.3698311007571347, + "grad_norm": 8.362839698791504, + "learning_rate": 7.000647249190939e-06, + "logits/chosen": -4.286078929901123, + "logits/rejected": -4.331356048583984, + "logps/chosen": -662.5283813476562, + "logps/rejected": -703.88818359375, + "loss": 0.8605, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7610023617744446, + "rewards/margins": 0.5419550538063049, + "rewards/rejected": -1.3029574155807495, + "step": 5880 + }, + { + "epoch": 1.3721607454863134, + "grad_norm": 7.359226226806641, + "learning_rate": 6.974757281553398e-06, + "logits/chosen": -4.292226314544678, + "logits/rejected": -4.382073402404785, + "logps/chosen": -713.1748046875, + "logps/rejected": -787.0335693359375, + "loss": 0.7733, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7213521003723145, + "rewards/margins": 0.5341546535491943, + "rewards/rejected": -1.2555067539215088, + "step": 5890 + }, + { + "epoch": 1.3744903902154921, + "grad_norm": 8.315750122070312, + "learning_rate": 6.948867313915858e-06, + "logits/chosen": -4.223654747009277, + "logits/rejected": -4.363278865814209, + "logps/chosen": -695.8765869140625, + "logps/rejected": -790.5074462890625, + "loss": 0.6437, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.715376079082489, + "rewards/margins": 0.5789678692817688, + "rewards/rejected": -1.2943440675735474, + "step": 5900 + }, + { + "epoch": 1.3744903902154921, + "eval_logits/chosen": -4.265625, + "eval_logits/rejected": -4.251952648162842, + "eval_logps/chosen": -698.7142333984375, + "eval_logps/rejected": -718.8829345703125, + "eval_loss": 0.6310829520225525, + "eval_rewards/accuracies": 0.6404808759689331, + "eval_rewards/chosen": -0.7885909676551819, + "eval_rewards/margins": 0.4198411703109741, + "eval_rewards/rejected": -1.2084320783615112, + "eval_runtime": 394.5375, + "eval_samples_per_second": 18.133, + "eval_steps_per_second": 9.066, + "step": 5900 + }, + { + "epoch": 1.3768200349446709, + "grad_norm": 10.515283584594727, + "learning_rate": 6.9229773462783175e-06, + "logits/chosen": -4.387662887573242, + "logits/rejected": -4.373385429382324, + "logps/chosen": -744.9942626953125, + "logps/rejected": -810.0918579101562, + "loss": 0.8659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8038939237594604, + "rewards/margins": 0.3602578639984131, + "rewards/rejected": -1.164151906967163, + "step": 5910 + }, + { + "epoch": 1.3791496796738496, + "grad_norm": 10.210684776306152, + "learning_rate": 6.897087378640778e-06, + "logits/chosen": -4.323285102844238, + "logits/rejected": -4.2652082443237305, + "logps/chosen": -754.8870849609375, + "logps/rejected": -789.4259033203125, + "loss": 0.8231, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7273879051208496, + "rewards/margins": 0.2793354392051697, + "rewards/rejected": -1.0067232847213745, + "step": 5920 + }, + { + "epoch": 1.3814793244030286, + "grad_norm": 10.7938232421875, + "learning_rate": 6.871197411003236e-06, + "logits/chosen": -4.402263641357422, + "logits/rejected": -4.343568801879883, + "logps/chosen": -713.602783203125, + "logps/rejected": -714.8239135742188, + "loss": 0.788, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9156931638717651, + "rewards/margins": 0.34777992963790894, + "rewards/rejected": -1.2634732723236084, + "step": 5930 + }, + { + "epoch": 1.3838089691322073, + "grad_norm": 7.41132116317749, + "learning_rate": 6.845307443365697e-06, + "logits/chosen": -4.270591735839844, + "logits/rejected": -4.345918655395508, + "logps/chosen": -676.3931884765625, + "logps/rejected": -731.2847900390625, + "loss": 0.9121, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9126005172729492, + "rewards/margins": 0.3571823239326477, + "rewards/rejected": -1.2697827816009521, + "step": 5940 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 8.748828887939453, + "learning_rate": 6.819417475728156e-06, + "logits/chosen": -4.402093410491943, + "logits/rejected": -4.365542888641357, + "logps/chosen": -743.6368408203125, + "logps/rejected": -806.1112670898438, + "loss": 0.8711, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.981920599937439, + "rewards/margins": 0.3139951229095459, + "rewards/rejected": -1.2959158420562744, + "step": 5950 + }, + { + "epoch": 1.388468258590565, + "grad_norm": 4.679051399230957, + "learning_rate": 6.793527508090615e-06, + "logits/chosen": -4.309473991394043, + "logits/rejected": -4.248940467834473, + "logps/chosen": -681.0128173828125, + "logps/rejected": -692.7999877929688, + "loss": 0.7612, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6057000160217285, + "rewards/margins": 0.34382814168930054, + "rewards/rejected": -0.9495280981063843, + "step": 5960 + }, + { + "epoch": 1.3907979033197437, + "grad_norm": 8.569405555725098, + "learning_rate": 6.767637540453075e-06, + "logits/chosen": -4.276402473449707, + "logits/rejected": -4.360108375549316, + "logps/chosen": -722.626953125, + "logps/rejected": -727.410888671875, + "loss": 0.8777, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8245531320571899, + "rewards/margins": 0.1962832510471344, + "rewards/rejected": -1.0208362340927124, + "step": 5970 + }, + { + "epoch": 1.3931275480489225, + "grad_norm": 7.404006481170654, + "learning_rate": 6.741747572815535e-06, + "logits/chosen": -4.379885196685791, + "logits/rejected": -4.29095983505249, + "logps/chosen": -767.14208984375, + "logps/rejected": -748.4708251953125, + "loss": 0.7369, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.7033445239067078, + "rewards/margins": 0.39107757806777954, + "rewards/rejected": -1.0944219827651978, + "step": 5980 + }, + { + "epoch": 1.3954571927781014, + "grad_norm": 7.095986366271973, + "learning_rate": 6.7158576051779944e-06, + "logits/chosen": -4.271486282348633, + "logits/rejected": -4.316716194152832, + "logps/chosen": -722.4144897460938, + "logps/rejected": -744.0294799804688, + "loss": 0.6972, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7354863286018372, + "rewards/margins": 0.49441951513290405, + "rewards/rejected": -1.2299058437347412, + "step": 5990 + }, + { + "epoch": 1.3977868375072802, + "grad_norm": 9.544381141662598, + "learning_rate": 6.689967637540453e-06, + "logits/chosen": -4.179322719573975, + "logits/rejected": -4.25494384765625, + "logps/chosen": -675.0315551757812, + "logps/rejected": -745.0963134765625, + "loss": 0.665, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.546692967414856, + "rewards/margins": 0.6451650857925415, + "rewards/rejected": -1.191857933998108, + "step": 6000 + }, + { + "epoch": 1.3977868375072802, + "eval_logits/chosen": -4.270597457885742, + "eval_logits/rejected": -4.256988525390625, + "eval_logps/chosen": -698.1524047851562, + "eval_logps/rejected": -718.1962280273438, + "eval_loss": 0.6282357573509216, + "eval_rewards/accuracies": 0.6407604217529297, + "eval_rewards/chosen": -0.7324159145355225, + "eval_rewards/margins": 0.4073488712310791, + "eval_rewards/rejected": -1.1397647857666016, + "eval_runtime": 395.4993, + "eval_samples_per_second": 18.089, + "eval_steps_per_second": 9.044, + "step": 6000 + }, + { + "epoch": 1.400116482236459, + "grad_norm": 7.5317206382751465, + "learning_rate": 6.664077669902913e-06, + "logits/chosen": -4.356226444244385, + "logits/rejected": -4.356008529663086, + "logps/chosen": -749.1924438476562, + "logps/rejected": -852.3751831054688, + "loss": 0.6264, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.42755842208862305, + "rewards/margins": 0.8259701728820801, + "rewards/rejected": -1.253528356552124, + "step": 6010 + }, + { + "epoch": 1.4024461269656379, + "grad_norm": 9.323957443237305, + "learning_rate": 6.638187702265373e-06, + "logits/chosen": -4.306419849395752, + "logits/rejected": -4.26028299331665, + "logps/chosen": -713.90380859375, + "logps/rejected": -701.1036376953125, + "loss": 0.7646, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.72160804271698, + "rewards/margins": 0.3279089629650116, + "rewards/rejected": -1.049517035484314, + "step": 6020 + }, + { + "epoch": 1.4047757716948166, + "grad_norm": 6.586495876312256, + "learning_rate": 6.6122977346278325e-06, + "logits/chosen": -4.380081653594971, + "logits/rejected": -4.304697036743164, + "logps/chosen": -827.091796875, + "logps/rejected": -757.8292846679688, + "loss": 0.8853, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.7758504748344421, + "rewards/margins": 0.2744961678981781, + "rewards/rejected": -1.0503467321395874, + "step": 6030 + }, + { + "epoch": 1.4071054164239953, + "grad_norm": 7.249460697174072, + "learning_rate": 6.586407766990291e-06, + "logits/chosen": -4.236702919006348, + "logits/rejected": -4.3020524978637695, + "logps/chosen": -725.0355224609375, + "logps/rejected": -752.8726806640625, + "loss": 0.7608, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5313434600830078, + "rewards/margins": 0.6068240404129028, + "rewards/rejected": -1.1381676197052002, + "step": 6040 + }, + { + "epoch": 1.409435061153174, + "grad_norm": 5.396103858947754, + "learning_rate": 6.560517799352751e-06, + "logits/chosen": -4.281092166900635, + "logits/rejected": -4.332211494445801, + "logps/chosen": -697.1739501953125, + "logps/rejected": -740.8766479492188, + "loss": 0.6829, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4345017373561859, + "rewards/margins": 0.5479524731636047, + "rewards/rejected": -0.9824541807174683, + "step": 6050 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 8.456120491027832, + "learning_rate": 6.534627831715211e-06, + "logits/chosen": -4.316834449768066, + "logits/rejected": -4.33341646194458, + "logps/chosen": -708.6388549804688, + "logps/rejected": -770.0853271484375, + "loss": 0.7611, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7554001212120056, + "rewards/margins": 0.42426061630249023, + "rewards/rejected": -1.1796607971191406, + "step": 6060 + }, + { + "epoch": 1.4140943506115318, + "grad_norm": 9.169841766357422, + "learning_rate": 6.50873786407767e-06, + "logits/chosen": -4.311273097991943, + "logits/rejected": -4.331109523773193, + "logps/chosen": -690.289306640625, + "logps/rejected": -726.1717529296875, + "loss": 0.6198, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5135329365730286, + "rewards/margins": 0.7094392776489258, + "rewards/rejected": -1.2229722738265991, + "step": 6070 + }, + { + "epoch": 1.4164239953407105, + "grad_norm": 7.389662742614746, + "learning_rate": 6.4828478964401294e-06, + "logits/chosen": -4.348455905914307, + "logits/rejected": -4.332047939300537, + "logps/chosen": -701.18701171875, + "logps/rejected": -718.71484375, + "loss": 0.6092, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5786556005477905, + "rewards/margins": 0.6604295372962952, + "rewards/rejected": -1.239085078239441, + "step": 6080 + }, + { + "epoch": 1.4187536400698892, + "grad_norm": 9.116060256958008, + "learning_rate": 6.45695792880259e-06, + "logits/chosen": -4.322030067443848, + "logits/rejected": -4.3480024337768555, + "logps/chosen": -744.7877197265625, + "logps/rejected": -769.6700439453125, + "loss": 0.8433, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6332119107246399, + "rewards/margins": 0.38585028052330017, + "rewards/rejected": -1.0190622806549072, + "step": 6090 + }, + { + "epoch": 1.4210832847990682, + "grad_norm": 9.876167297363281, + "learning_rate": 6.43106796116505e-06, + "logits/chosen": -4.301876544952393, + "logits/rejected": -4.181447982788086, + "logps/chosen": -748.7747802734375, + "logps/rejected": -744.25, + "loss": 0.8143, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5354674458503723, + "rewards/margins": 0.28102821111679077, + "rewards/rejected": -0.8164957165718079, + "step": 6100 + }, + { + "epoch": 1.4210832847990682, + "eval_logits/chosen": -4.270403861999512, + "eval_logits/rejected": -4.256753444671631, + "eval_logps/chosen": -698.1387939453125, + "eval_logps/rejected": -718.2033081054688, + "eval_loss": 0.6276611685752869, + "eval_rewards/accuracies": 0.6421582102775574, + "eval_rewards/chosen": -0.7310547828674316, + "eval_rewards/margins": 0.4094196856021881, + "eval_rewards/rejected": -1.1404744386672974, + "eval_runtime": 395.9901, + "eval_samples_per_second": 18.066, + "eval_steps_per_second": 9.033, + "step": 6100 + }, + { + "epoch": 1.423412929528247, + "grad_norm": 13.78065299987793, + "learning_rate": 6.405177993527509e-06, + "logits/chosen": -4.367238521575928, + "logits/rejected": -4.370728969573975, + "logps/chosen": -732.5328979492188, + "logps/rejected": -744.9369506835938, + "loss": 0.9071, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.9486724734306335, + "rewards/margins": 0.18062777817249298, + "rewards/rejected": -1.1293003559112549, + "step": 6110 + }, + { + "epoch": 1.4257425742574257, + "grad_norm": 8.261577606201172, + "learning_rate": 6.379288025889968e-06, + "logits/chosen": -4.402305603027344, + "logits/rejected": -4.272169589996338, + "logps/chosen": -794.4027709960938, + "logps/rejected": -672.9693603515625, + "loss": 0.9172, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.830672562122345, + "rewards/margins": 0.0038339763414114714, + "rewards/rejected": -0.834506630897522, + "step": 6120 + }, + { + "epoch": 1.4280722189866046, + "grad_norm": 6.93900203704834, + "learning_rate": 6.353398058252428e-06, + "logits/chosen": -4.3805317878723145, + "logits/rejected": -4.36024284362793, + "logps/chosen": -707.8915405273438, + "logps/rejected": -755.4413452148438, + "loss": 0.6694, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6830347776412964, + "rewards/margins": 0.5677529573440552, + "rewards/rejected": -1.2507877349853516, + "step": 6130 + }, + { + "epoch": 1.4304018637157834, + "grad_norm": 6.082003593444824, + "learning_rate": 6.327508090614888e-06, + "logits/chosen": -4.284184455871582, + "logits/rejected": -4.340609073638916, + "logps/chosen": -723.261962890625, + "logps/rejected": -769.7706298828125, + "loss": 0.6174, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5278186798095703, + "rewards/margins": 0.6645825505256653, + "rewards/rejected": -1.1924011707305908, + "step": 6140 + }, + { + "epoch": 1.432731508444962, + "grad_norm": 9.62651252746582, + "learning_rate": 6.301618122977347e-06, + "logits/chosen": -4.3521809577941895, + "logits/rejected": -4.298386096954346, + "logps/chosen": -732.197509765625, + "logps/rejected": -733.2960205078125, + "loss": 0.6925, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7362813353538513, + "rewards/margins": 0.5624809265136719, + "rewards/rejected": -1.298762321472168, + "step": 6150 + }, + { + "epoch": 1.435061153174141, + "grad_norm": 3.412282705307007, + "learning_rate": 6.275728155339806e-06, + "logits/chosen": -4.32267951965332, + "logits/rejected": -4.326018810272217, + "logps/chosen": -714.3280029296875, + "logps/rejected": -781.4513549804688, + "loss": 0.7834, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6571785807609558, + "rewards/margins": 0.46814459562301636, + "rewards/rejected": -1.1253231763839722, + "step": 6160 + }, + { + "epoch": 1.4373907979033198, + "grad_norm": 6.920522212982178, + "learning_rate": 6.249838187702266e-06, + "logits/chosen": -4.282754898071289, + "logits/rejected": -4.342594623565674, + "logps/chosen": -721.2886962890625, + "logps/rejected": -721.8851928710938, + "loss": 1.0046, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9614690542221069, + "rewards/margins": 0.011788728646934032, + "rewards/rejected": -0.9732577204704285, + "step": 6170 + }, + { + "epoch": 1.4397204426324985, + "grad_norm": 6.783796787261963, + "learning_rate": 6.223948220064725e-06, + "logits/chosen": -4.263086795806885, + "logits/rejected": -4.364320755004883, + "logps/chosen": -711.2506103515625, + "logps/rejected": -793.2518920898438, + "loss": 0.7856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8233615756034851, + "rewards/margins": 0.4337071478366852, + "rewards/rejected": -1.2570687532424927, + "step": 6180 + }, + { + "epoch": 1.4420500873616773, + "grad_norm": 7.462255954742432, + "learning_rate": 6.198058252427185e-06, + "logits/chosen": -4.287019729614258, + "logits/rejected": -4.253533840179443, + "logps/chosen": -726.5230712890625, + "logps/rejected": -747.6558837890625, + "loss": 0.8425, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6028618812561035, + "rewards/margins": 0.3824983239173889, + "rewards/rejected": -0.9853602647781372, + "step": 6190 + }, + { + "epoch": 1.444379732090856, + "grad_norm": 9.85159683227539, + "learning_rate": 6.1721682847896445e-06, + "logits/chosen": -4.258937835693359, + "logits/rejected": -4.259571075439453, + "logps/chosen": -709.08642578125, + "logps/rejected": -794.2938232421875, + "loss": 0.6821, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5764713287353516, + "rewards/margins": 0.6002413034439087, + "rewards/rejected": -1.1767126321792603, + "step": 6200 + }, + { + "epoch": 1.444379732090856, + "eval_logits/chosen": -4.2764997482299805, + "eval_logits/rejected": -4.263204097747803, + "eval_logps/chosen": -697.7610473632812, + "eval_logps/rejected": -717.7634887695312, + "eval_loss": 0.6264663338661194, + "eval_rewards/accuracies": 0.642577588558197, + "eval_rewards/chosen": -0.693281888961792, + "eval_rewards/margins": 0.4031990170478821, + "eval_rewards/rejected": -1.0964809656143188, + "eval_runtime": 395.6778, + "eval_samples_per_second": 18.08, + "eval_steps_per_second": 9.04, + "step": 6200 + }, + { + "epoch": 1.446709376820035, + "grad_norm": 6.2666707038879395, + "learning_rate": 6.146278317152104e-06, + "logits/chosen": -4.378452301025391, + "logits/rejected": -4.316521644592285, + "logps/chosen": -799.1228637695312, + "logps/rejected": -763.740478515625, + "loss": 0.7032, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6369025707244873, + "rewards/margins": 0.4719843864440918, + "rewards/rejected": -1.108886957168579, + "step": 6210 + }, + { + "epoch": 1.4490390215492137, + "grad_norm": 4.337667942047119, + "learning_rate": 6.120388349514563e-06, + "logits/chosen": -4.4072771072387695, + "logits/rejected": -4.218862056732178, + "logps/chosen": -709.3562622070312, + "logps/rejected": -662.3941040039062, + "loss": 0.7151, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6898874640464783, + "rewards/margins": 0.465634822845459, + "rewards/rejected": -1.155522346496582, + "step": 6220 + }, + { + "epoch": 1.4513686662783924, + "grad_norm": 8.891453742980957, + "learning_rate": 6.094498381877023e-06, + "logits/chosen": -4.323766231536865, + "logits/rejected": -4.273622512817383, + "logps/chosen": -713.9008178710938, + "logps/rejected": -749.7387084960938, + "loss": 0.7017, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6876183152198792, + "rewards/margins": 0.8101609349250793, + "rewards/rejected": -1.4977792501449585, + "step": 6230 + }, + { + "epoch": 1.4536983110075714, + "grad_norm": 11.587925910949707, + "learning_rate": 6.068608414239483e-06, + "logits/chosen": -4.3129777908325195, + "logits/rejected": -4.2286272048950195, + "logps/chosen": -740.3646240234375, + "logps/rejected": -652.7010498046875, + "loss": 0.8272, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6182198524475098, + "rewards/margins": 0.3408670425415039, + "rewards/rejected": -0.9590870141983032, + "step": 6240 + }, + { + "epoch": 1.4560279557367501, + "grad_norm": 10.418200492858887, + "learning_rate": 6.042718446601941e-06, + "logits/chosen": -4.306921482086182, + "logits/rejected": -4.259629726409912, + "logps/chosen": -726.8382568359375, + "logps/rejected": -741.6177978515625, + "loss": 0.7956, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.702867329120636, + "rewards/margins": 0.33800870180130005, + "rewards/rejected": -1.040876030921936, + "step": 6250 + }, + { + "epoch": 1.4583576004659289, + "grad_norm": 9.083958625793457, + "learning_rate": 6.016828478964402e-06, + "logits/chosen": -4.267938137054443, + "logits/rejected": -4.296270847320557, + "logps/chosen": -719.9595947265625, + "logps/rejected": -721.6478271484375, + "loss": 0.8335, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.725295901298523, + "rewards/margins": 0.25456786155700684, + "rewards/rejected": -0.9798637628555298, + "step": 6260 + }, + { + "epoch": 1.4606872451951078, + "grad_norm": 10.350333213806152, + "learning_rate": 5.990938511326862e-06, + "logits/chosen": -4.272916793823242, + "logits/rejected": -4.3289313316345215, + "logps/chosen": -728.450927734375, + "logps/rejected": -767.2869262695312, + "loss": 0.9885, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.7057057619094849, + "rewards/margins": -0.07528909295797348, + "rewards/rejected": -0.6304166913032532, + "step": 6270 + }, + { + "epoch": 1.4630168899242866, + "grad_norm": 9.632805824279785, + "learning_rate": 5.965048543689321e-06, + "logits/chosen": -4.298973083496094, + "logits/rejected": -4.322752475738525, + "logps/chosen": -763.8212280273438, + "logps/rejected": -827.498046875, + "loss": 0.9871, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.9077277183532715, + "rewards/margins": -0.20082445442676544, + "rewards/rejected": -0.7069032788276672, + "step": 6280 + }, + { + "epoch": 1.4653465346534653, + "grad_norm": 8.367243766784668, + "learning_rate": 5.93915857605178e-06, + "logits/chosen": -4.3294997215271, + "logits/rejected": -4.337015628814697, + "logps/chosen": -674.9423217773438, + "logps/rejected": -705.767333984375, + "loss": 0.8136, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.8321495056152344, + "rewards/margins": 0.2989842891693115, + "rewards/rejected": -1.1311339139938354, + "step": 6290 + }, + { + "epoch": 1.4676761793826443, + "grad_norm": 6.319589614868164, + "learning_rate": 5.91326860841424e-06, + "logits/chosen": -4.261368751525879, + "logits/rejected": -4.256316184997559, + "logps/chosen": -703.7994384765625, + "logps/rejected": -739.6809692382812, + "loss": 0.8507, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8882040977478027, + "rewards/margins": 0.23814082145690918, + "rewards/rejected": -1.1263447999954224, + "step": 6300 + }, + { + "epoch": 1.4676761793826443, + "eval_logits/chosen": -4.281351089477539, + "eval_logits/rejected": -4.268238544464111, + "eval_logps/chosen": -698.1597290039062, + "eval_logps/rejected": -718.27978515625, + "eval_loss": 0.6276564002037048, + "eval_rewards/accuracies": 0.6410399675369263, + "eval_rewards/chosen": -0.7331462502479553, + "eval_rewards/margins": 0.41497042775154114, + "eval_rewards/rejected": -1.1481167078018188, + "eval_runtime": 395.5859, + "eval_samples_per_second": 18.085, + "eval_steps_per_second": 9.042, + "step": 6300 + }, + { + "epoch": 1.470005824111823, + "grad_norm": 6.561951160430908, + "learning_rate": 5.8873786407767e-06, + "logits/chosen": -4.3774094581604, + "logits/rejected": -4.387298107147217, + "logps/chosen": -729.2420043945312, + "logps/rejected": -758.8972778320312, + "loss": 0.9366, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.7099128365516663, + "rewards/margins": 0.16957201063632965, + "rewards/rejected": -0.8794847726821899, + "step": 6310 + }, + { + "epoch": 1.4723354688410017, + "grad_norm": 7.208823204040527, + "learning_rate": 5.8614886731391595e-06, + "logits/chosen": -4.253106117248535, + "logits/rejected": -4.3132171630859375, + "logps/chosen": -681.6314697265625, + "logps/rejected": -762.1727294921875, + "loss": 0.62, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.39332813024520874, + "rewards/margins": 0.6966632008552551, + "rewards/rejected": -1.0899913311004639, + "step": 6320 + }, + { + "epoch": 1.4746651135701805, + "grad_norm": 5.884200096130371, + "learning_rate": 5.835598705501618e-06, + "logits/chosen": -4.318593502044678, + "logits/rejected": -4.255011081695557, + "logps/chosen": -756.2564697265625, + "logps/rejected": -711.3599853515625, + "loss": 0.607, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.40394967794418335, + "rewards/margins": 0.7803922891616821, + "rewards/rejected": -1.1843420267105103, + "step": 6330 + }, + { + "epoch": 1.4769947582993592, + "grad_norm": 10.386765480041504, + "learning_rate": 5.809708737864078e-06, + "logits/chosen": -4.3260297775268555, + "logits/rejected": -4.328006267547607, + "logps/chosen": -685.3865356445312, + "logps/rejected": -731.4774169921875, + "loss": 0.7761, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8475303649902344, + "rewards/margins": 0.36684754490852356, + "rewards/rejected": -1.2143778800964355, + "step": 6340 + }, + { + "epoch": 1.4793244030285382, + "grad_norm": 10.14816951751709, + "learning_rate": 5.783818770226538e-06, + "logits/chosen": -4.235125541687012, + "logits/rejected": -4.296066761016846, + "logps/chosen": -704.0057373046875, + "logps/rejected": -765.9306640625, + "loss": 0.8021, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7169125080108643, + "rewards/margins": 0.28062504529953003, + "rewards/rejected": -0.9975376129150391, + "step": 6350 + }, + { + "epoch": 1.481654047757717, + "grad_norm": 7.591325759887695, + "learning_rate": 5.757928802588997e-06, + "logits/chosen": -4.2524213790893555, + "logits/rejected": -4.324526309967041, + "logps/chosen": -773.524169921875, + "logps/rejected": -758.4846801757812, + "loss": 0.814, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8144239187240601, + "rewards/margins": 0.3293677866458893, + "rewards/rejected": -1.143791913986206, + "step": 6360 + }, + { + "epoch": 1.4839836924868957, + "grad_norm": 6.67929220199585, + "learning_rate": 5.732038834951456e-06, + "logits/chosen": -4.279242515563965, + "logits/rejected": -4.314077854156494, + "logps/chosen": -698.8849487304688, + "logps/rejected": -761.8951416015625, + "loss": 0.6029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5412794947624207, + "rewards/margins": 0.7758609056472778, + "rewards/rejected": -1.3171404600143433, + "step": 6370 + }, + { + "epoch": 1.4863133372160746, + "grad_norm": 7.730616092681885, + "learning_rate": 5.706148867313916e-06, + "logits/chosen": -4.363424301147461, + "logits/rejected": -4.256382942199707, + "logps/chosen": -703.0626220703125, + "logps/rejected": -616.8387451171875, + "loss": 0.7749, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7838987708091736, + "rewards/margins": 0.3033984303474426, + "rewards/rejected": -1.0872972011566162, + "step": 6380 + }, + { + "epoch": 1.4886429819452534, + "grad_norm": 10.048879623413086, + "learning_rate": 5.680258899676377e-06, + "logits/chosen": -4.266480922698975, + "logits/rejected": -4.252657890319824, + "logps/chosen": -756.8456420898438, + "logps/rejected": -724.2012939453125, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7446259260177612, + "rewards/margins": 0.6785385012626648, + "rewards/rejected": -1.4231643676757812, + "step": 6390 + }, + { + "epoch": 1.490972626674432, + "grad_norm": 9.971052169799805, + "learning_rate": 5.654368932038835e-06, + "logits/chosen": -4.291266441345215, + "logits/rejected": -4.3873162269592285, + "logps/chosen": -739.7542724609375, + "logps/rejected": -808.8006591796875, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4845125079154968, + "rewards/margins": 0.6399686932563782, + "rewards/rejected": -1.1244813203811646, + "step": 6400 + }, + { + "epoch": 1.490972626674432, + "eval_logits/chosen": -4.279409408569336, + "eval_logits/rejected": -4.266495704650879, + "eval_logps/chosen": -698.281005859375, + "eval_logps/rejected": -718.4324951171875, + "eval_loss": 0.6276723742485046, + "eval_rewards/accuracies": 0.6434162855148315, + "eval_rewards/chosen": -0.7452816367149353, + "eval_rewards/margins": 0.4181005358695984, + "eval_rewards/rejected": -1.1633821725845337, + "eval_runtime": 395.6592, + "eval_samples_per_second": 18.081, + "eval_steps_per_second": 9.041, + "step": 6400 + }, + { + "epoch": 1.493302271403611, + "grad_norm": 7.82735538482666, + "learning_rate": 5.628478964401295e-06, + "logits/chosen": -4.288760185241699, + "logits/rejected": -4.4220356941223145, + "logps/chosen": -679.4021606445312, + "logps/rejected": -795.2906494140625, + "loss": 0.8033, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6727933883666992, + "rewards/margins": 0.33598798513412476, + "rewards/rejected": -1.0087814331054688, + "step": 6410 + }, + { + "epoch": 1.4956319161327898, + "grad_norm": 4.939048767089844, + "learning_rate": 5.602588996763755e-06, + "logits/chosen": -4.317221164703369, + "logits/rejected": -4.359116077423096, + "logps/chosen": -721.6626586914062, + "logps/rejected": -742.2684936523438, + "loss": 0.8091, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8711726069450378, + "rewards/margins": 0.3611958622932434, + "rewards/rejected": -1.2323685884475708, + "step": 6420 + }, + { + "epoch": 1.4979615608619685, + "grad_norm": 9.913616180419922, + "learning_rate": 5.576699029126214e-06, + "logits/chosen": -4.355402946472168, + "logits/rejected": -4.360577583312988, + "logps/chosen": -752.7567138671875, + "logps/rejected": -770.4042358398438, + "loss": 0.7849, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0307178497314453, + "rewards/margins": 0.4963906705379486, + "rewards/rejected": -1.5271085500717163, + "step": 6430 + }, + { + "epoch": 1.5002912055911475, + "grad_norm": 6.584352970123291, + "learning_rate": 5.550809061488674e-06, + "logits/chosen": -4.346511363983154, + "logits/rejected": -4.295920372009277, + "logps/chosen": -728.0184326171875, + "logps/rejected": -761.0966796875, + "loss": 0.6471, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5676674842834473, + "rewards/margins": 0.6876822113990784, + "rewards/rejected": -1.2553496360778809, + "step": 6440 + }, + { + "epoch": 1.502620850320326, + "grad_norm": 9.470415115356445, + "learning_rate": 5.524919093851133e-06, + "logits/chosen": -4.299017906188965, + "logits/rejected": -4.294720649719238, + "logps/chosen": -711.3177490234375, + "logps/rejected": -738.9423217773438, + "loss": 0.6542, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5640817284584045, + "rewards/margins": 0.7242376804351807, + "rewards/rejected": -1.2883192300796509, + "step": 6450 + }, + { + "epoch": 1.504950495049505, + "grad_norm": 11.724821090698242, + "learning_rate": 5.499029126213593e-06, + "logits/chosen": -4.337367057800293, + "logits/rejected": -4.3308539390563965, + "logps/chosen": -726.2224731445312, + "logps/rejected": -696.56787109375, + "loss": 0.7747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6660787463188171, + "rewards/margins": 0.44035547971725464, + "rewards/rejected": -1.1064343452453613, + "step": 6460 + }, + { + "epoch": 1.507280139778684, + "grad_norm": 8.660778045654297, + "learning_rate": 5.473139158576052e-06, + "logits/chosen": -4.300487041473389, + "logits/rejected": -4.254570960998535, + "logps/chosen": -701.4354858398438, + "logps/rejected": -739.0933227539062, + "loss": 0.8465, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8152783513069153, + "rewards/margins": 0.32741162180900574, + "rewards/rejected": -1.1426900625228882, + "step": 6470 + }, + { + "epoch": 1.5096097845078624, + "grad_norm": 4.783875942230225, + "learning_rate": 5.447249190938512e-06, + "logits/chosen": -4.258687973022461, + "logits/rejected": -4.330063819885254, + "logps/chosen": -658.1441650390625, + "logps/rejected": -762.2044677734375, + "loss": 0.6749, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6189396381378174, + "rewards/margins": 0.665094256401062, + "rewards/rejected": -1.284034013748169, + "step": 6480 + }, + { + "epoch": 1.5119394292370414, + "grad_norm": 6.743084907531738, + "learning_rate": 5.4213592233009714e-06, + "logits/chosen": -4.353699684143066, + "logits/rejected": -4.270865440368652, + "logps/chosen": -750.50341796875, + "logps/rejected": -700.8162841796875, + "loss": 0.7392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6873252391815186, + "rewards/margins": 0.5150385499000549, + "rewards/rejected": -1.2023637294769287, + "step": 6490 + }, + { + "epoch": 1.5142690739662201, + "grad_norm": 6.350234508514404, + "learning_rate": 5.395469255663431e-06, + "logits/chosen": -4.325373649597168, + "logits/rejected": -4.258090972900391, + "logps/chosen": -683.0284423828125, + "logps/rejected": -685.321533203125, + "loss": 0.8345, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6604098081588745, + "rewards/margins": 0.30990126729011536, + "rewards/rejected": -0.9703109860420227, + "step": 6500 + }, + { + "epoch": 1.5142690739662201, + "eval_logits/chosen": -4.2847981452941895, + "eval_logits/rejected": -4.271369934082031, + "eval_logps/chosen": -697.9261474609375, + "eval_logps/rejected": -717.9816284179688, + "eval_loss": 0.6257911920547485, + "eval_rewards/accuracies": 0.6415991187095642, + "eval_rewards/chosen": -0.7097821831703186, + "eval_rewards/margins": 0.4085138142108917, + "eval_rewards/rejected": -1.1182959079742432, + "eval_runtime": 396.0331, + "eval_samples_per_second": 18.064, + "eval_steps_per_second": 9.032, + "step": 6500 + }, + { + "epoch": 1.5165987186953989, + "grad_norm": 9.322096824645996, + "learning_rate": 5.36957928802589e-06, + "logits/chosen": -4.336639404296875, + "logits/rejected": -4.434247016906738, + "logps/chosen": -701.051513671875, + "logps/rejected": -772.03662109375, + "loss": 0.7471, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7475873231887817, + "rewards/margins": 0.5629361867904663, + "rewards/rejected": -1.3105233907699585, + "step": 6510 + }, + { + "epoch": 1.5189283634245778, + "grad_norm": 8.181373596191406, + "learning_rate": 5.34368932038835e-06, + "logits/chosen": -4.339288234710693, + "logits/rejected": -4.350064754486084, + "logps/chosen": -761.74169921875, + "logps/rejected": -785.0966186523438, + "loss": 0.7449, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7028181552886963, + "rewards/margins": 0.42330655455589294, + "rewards/rejected": -1.126124620437622, + "step": 6520 + }, + { + "epoch": 1.5212580081537566, + "grad_norm": 10.596341133117676, + "learning_rate": 5.3177993527508095e-06, + "logits/chosen": -4.388852119445801, + "logits/rejected": -4.307581424713135, + "logps/chosen": -725.6011352539062, + "logps/rejected": -678.3147583007812, + "loss": 0.7793, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6951478123664856, + "rewards/margins": 0.36321547627449036, + "rewards/rejected": -1.0583633184432983, + "step": 6530 + }, + { + "epoch": 1.5235876528829353, + "grad_norm": 6.610289573669434, + "learning_rate": 5.291909385113268e-06, + "logits/chosen": -4.336356163024902, + "logits/rejected": -4.272176742553711, + "logps/chosen": -709.7391357421875, + "logps/rejected": -714.23876953125, + "loss": 0.6984, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.705763578414917, + "rewards/margins": 0.55080246925354, + "rewards/rejected": -1.2565661668777466, + "step": 6540 + }, + { + "epoch": 1.5259172976121143, + "grad_norm": 9.951604843139648, + "learning_rate": 5.266019417475728e-06, + "logits/chosen": -4.248246669769287, + "logits/rejected": -4.297746658325195, + "logps/chosen": -597.0340576171875, + "logps/rejected": -707.5311889648438, + "loss": 0.6395, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5449532270431519, + "rewards/margins": 0.538214921951294, + "rewards/rejected": -1.0831680297851562, + "step": 6550 + }, + { + "epoch": 1.528246942341293, + "grad_norm": 8.162659645080566, + "learning_rate": 5.240129449838189e-06, + "logits/chosen": -4.318647861480713, + "logits/rejected": -4.250157356262207, + "logps/chosen": -699.1119995117188, + "logps/rejected": -662.0938720703125, + "loss": 0.6704, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.45620036125183105, + "rewards/margins": 0.6418908834457397, + "rewards/rejected": -1.0980913639068604, + "step": 6560 + }, + { + "epoch": 1.5305765870704717, + "grad_norm": 11.98337173461914, + "learning_rate": 5.214239482200648e-06, + "logits/chosen": -4.308518409729004, + "logits/rejected": -4.269722938537598, + "logps/chosen": -735.4429321289062, + "logps/rejected": -759.6636962890625, + "loss": 0.7414, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5557039976119995, + "rewards/margins": 0.6347615122795105, + "rewards/rejected": -1.1904654502868652, + "step": 6570 + }, + { + "epoch": 1.5329062317996507, + "grad_norm": 9.075456619262695, + "learning_rate": 5.188349514563107e-06, + "logits/chosen": -4.317292213439941, + "logits/rejected": -4.296011924743652, + "logps/chosen": -758.3450927734375, + "logps/rejected": -791.48974609375, + "loss": 0.8578, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6727965474128723, + "rewards/margins": 0.38544994592666626, + "rewards/rejected": -1.058246374130249, + "step": 6580 + }, + { + "epoch": 1.5352358765288292, + "grad_norm": 8.431157112121582, + "learning_rate": 5.162459546925567e-06, + "logits/chosen": -4.2513861656188965, + "logits/rejected": -4.324321746826172, + "logps/chosen": -662.2391967773438, + "logps/rejected": -759.1661987304688, + "loss": 0.8565, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6914823651313782, + "rewards/margins": 0.35625919699668884, + "rewards/rejected": -1.0477415323257446, + "step": 6590 + }, + { + "epoch": 1.5375655212580082, + "grad_norm": 5.4589948654174805, + "learning_rate": 5.136569579288027e-06, + "logits/chosen": -4.429289817810059, + "logits/rejected": -4.362177848815918, + "logps/chosen": -710.226806640625, + "logps/rejected": -702.2576293945312, + "loss": 0.7486, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.664710283279419, + "rewards/margins": 0.35124653577804565, + "rewards/rejected": -1.0159568786621094, + "step": 6600 + }, + { + "epoch": 1.5375655212580082, + "eval_logits/chosen": -4.283307075500488, + "eval_logits/rejected": -4.269985675811768, + "eval_logps/chosen": -697.8657836914062, + "eval_logps/rejected": -717.916015625, + "eval_loss": 0.6255219578742981, + "eval_rewards/accuracies": 0.6459323167800903, + "eval_rewards/chosen": -0.703760027885437, + "eval_rewards/margins": 0.4079740047454834, + "eval_rewards/rejected": -1.1117339134216309, + "eval_runtime": 396.0706, + "eval_samples_per_second": 18.062, + "eval_steps_per_second": 9.031, + "step": 6600 + }, + { + "epoch": 1.5398951659871871, + "grad_norm": 8.257590293884277, + "learning_rate": 5.110679611650486e-06, + "logits/chosen": -4.323059558868408, + "logits/rejected": -4.37637996673584, + "logps/chosen": -688.7907104492188, + "logps/rejected": -743.0214233398438, + "loss": 0.7344, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6487873792648315, + "rewards/margins": 0.36815351247787476, + "rewards/rejected": -1.0169408321380615, + "step": 6610 + }, + { + "epoch": 1.5422248107163656, + "grad_norm": 7.590610980987549, + "learning_rate": 5.084789644012945e-06, + "logits/chosen": -4.344366073608398, + "logits/rejected": -4.286723613739014, + "logps/chosen": -717.1707763671875, + "logps/rejected": -748.5653076171875, + "loss": 0.7652, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5456770658493042, + "rewards/margins": 0.4762745797634125, + "rewards/rejected": -1.021951675415039, + "step": 6620 + }, + { + "epoch": 1.5445544554455446, + "grad_norm": 9.014822006225586, + "learning_rate": 5.058899676375405e-06, + "logits/chosen": -4.227101802825928, + "logits/rejected": -4.350574493408203, + "logps/chosen": -655.3963012695312, + "logps/rejected": -781.3607788085938, + "loss": 0.7472, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8425167798995972, + "rewards/margins": 0.4325069785118103, + "rewards/rejected": -1.2750238180160522, + "step": 6630 + }, + { + "epoch": 1.5468841001747233, + "grad_norm": 7.267883777618408, + "learning_rate": 5.033009708737865e-06, + "logits/chosen": -4.215926170349121, + "logits/rejected": -4.334687232971191, + "logps/chosen": -692.3230590820312, + "logps/rejected": -734.2679443359375, + "loss": 0.7137, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5044487714767456, + "rewards/margins": 0.4881749749183655, + "rewards/rejected": -0.9926236867904663, + "step": 6640 + }, + { + "epoch": 1.549213744903902, + "grad_norm": 7.1380462646484375, + "learning_rate": 5.007119741100324e-06, + "logits/chosen": -4.351950168609619, + "logits/rejected": -4.302297592163086, + "logps/chosen": -639.7730712890625, + "logps/rejected": -675.5047607421875, + "loss": 0.7411, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6441572308540344, + "rewards/margins": 0.49536651372909546, + "rewards/rejected": -1.1395237445831299, + "step": 6650 + }, + { + "epoch": 1.551543389633081, + "grad_norm": 9.367436408996582, + "learning_rate": 4.981229773462783e-06, + "logits/chosen": -4.354855537414551, + "logits/rejected": -4.330399513244629, + "logps/chosen": -732.99462890625, + "logps/rejected": -765.6302490234375, + "loss": 0.8679, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.8751713633537292, + "rewards/margins": 0.20318977534770966, + "rewards/rejected": -1.0783611536026, + "step": 6660 + }, + { + "epoch": 1.5538730343622598, + "grad_norm": 8.631954193115234, + "learning_rate": 4.955339805825243e-06, + "logits/chosen": -4.407146453857422, + "logits/rejected": -4.191717624664307, + "logps/chosen": -781.7088623046875, + "logps/rejected": -701.8729248046875, + "loss": 0.7618, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.723842442035675, + "rewards/margins": 0.3932204246520996, + "rewards/rejected": -1.1170628070831299, + "step": 6670 + }, + { + "epoch": 1.5562026790914385, + "grad_norm": 7.060195446014404, + "learning_rate": 4.929449838187703e-06, + "logits/chosen": -4.288878440856934, + "logits/rejected": -4.307755470275879, + "logps/chosen": -694.4677734375, + "logps/rejected": -699.7354736328125, + "loss": 0.672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5265144109725952, + "rewards/margins": 0.6340707540512085, + "rewards/rejected": -1.1605851650238037, + "step": 6680 + }, + { + "epoch": 1.5585323238206175, + "grad_norm": 9.471017837524414, + "learning_rate": 4.9035598705501626e-06, + "logits/chosen": -4.301576137542725, + "logits/rejected": -4.327295303344727, + "logps/chosen": -667.992431640625, + "logps/rejected": -734.1383056640625, + "loss": 0.7076, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.743894636631012, + "rewards/margins": 0.48800763487815857, + "rewards/rejected": -1.2319023609161377, + "step": 6690 + }, + { + "epoch": 1.5608619685497962, + "grad_norm": 8.635560989379883, + "learning_rate": 4.8776699029126215e-06, + "logits/chosen": -4.34967565536499, + "logits/rejected": -4.292224407196045, + "logps/chosen": -729.954833984375, + "logps/rejected": -719.730224609375, + "loss": 0.761, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.784434974193573, + "rewards/margins": 0.4822344183921814, + "rewards/rejected": -1.2666693925857544, + "step": 6700 + }, + { + "epoch": 1.5608619685497962, + "eval_logits/chosen": -4.2828216552734375, + "eval_logits/rejected": -4.269512176513672, + "eval_logps/chosen": -698.0747680664062, + "eval_logps/rejected": -718.1797485351562, + "eval_loss": 0.6258891224861145, + "eval_rewards/accuracies": 0.6420184373855591, + "eval_rewards/chosen": -0.7246599793434143, + "eval_rewards/margins": 0.41345661878585815, + "eval_rewards/rejected": -1.1381165981292725, + "eval_runtime": 396.3176, + "eval_samples_per_second": 18.051, + "eval_steps_per_second": 9.026, + "step": 6700 + }, + { + "epoch": 1.563191613278975, + "grad_norm": 6.1233696937561035, + "learning_rate": 4.851779935275081e-06, + "logits/chosen": -4.277688503265381, + "logits/rejected": -4.283129692077637, + "logps/chosen": -703.0496215820312, + "logps/rejected": -707.1187133789062, + "loss": 0.7647, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5167033076286316, + "rewards/margins": 0.5566205978393555, + "rewards/rejected": -1.0733239650726318, + "step": 6710 + }, + { + "epoch": 1.565521258008154, + "grad_norm": 8.156778335571289, + "learning_rate": 4.825889967637541e-06, + "logits/chosen": -4.336854934692383, + "logits/rejected": -4.288191795349121, + "logps/chosen": -755.5333251953125, + "logps/rejected": -736.971435546875, + "loss": 0.7263, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9055421948432922, + "rewards/margins": 0.5150974988937378, + "rewards/rejected": -1.4206396341323853, + "step": 6720 + }, + { + "epoch": 1.5678509027373324, + "grad_norm": 6.012303352355957, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -4.293099880218506, + "logits/rejected": -4.312102317810059, + "logps/chosen": -667.4832763671875, + "logps/rejected": -746.6400756835938, + "loss": 0.8205, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8755755424499512, + "rewards/margins": 0.29335370659828186, + "rewards/rejected": -1.1689293384552002, + "step": 6730 + }, + { + "epoch": 1.5701805474665114, + "grad_norm": 9.780920028686523, + "learning_rate": 4.77411003236246e-06, + "logits/chosen": -4.192702770233154, + "logits/rejected": -4.291769504547119, + "logps/chosen": -641.2814331054688, + "logps/rejected": -755.7752075195312, + "loss": 0.7455, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.675727367401123, + "rewards/margins": 0.45784205198287964, + "rewards/rejected": -1.133569359779358, + "step": 6740 + }, + { + "epoch": 1.57251019219569, + "grad_norm": 10.279903411865234, + "learning_rate": 4.748220064724919e-06, + "logits/chosen": -4.267851829528809, + "logits/rejected": -4.342855930328369, + "logps/chosen": -712.3040161132812, + "logps/rejected": -765.2335205078125, + "loss": 0.8372, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7413617372512817, + "rewards/margins": 0.4191419184207916, + "rewards/rejected": -1.160503625869751, + "step": 6750 + }, + { + "epoch": 1.5748398369248688, + "grad_norm": 8.927513122558594, + "learning_rate": 4.722330097087379e-06, + "logits/chosen": -4.281728267669678, + "logits/rejected": -4.379133701324463, + "logps/chosen": -722.7548828125, + "logps/rejected": -774.9669189453125, + "loss": 0.9477, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.046169400215149, + "rewards/margins": 0.12018339335918427, + "rewards/rejected": -1.1663528680801392, + "step": 6760 + }, + { + "epoch": 1.5771694816540478, + "grad_norm": 8.109047889709473, + "learning_rate": 4.696440129449839e-06, + "logits/chosen": -4.362876892089844, + "logits/rejected": -4.314282417297363, + "logps/chosen": -742.7166137695312, + "logps/rejected": -780.0384521484375, + "loss": 0.651, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6960076093673706, + "rewards/margins": 0.5484358072280884, + "rewards/rejected": -1.244443416595459, + "step": 6770 + }, + { + "epoch": 1.5794991263832265, + "grad_norm": 8.884431838989258, + "learning_rate": 4.670550161812298e-06, + "logits/chosen": -4.268649578094482, + "logits/rejected": -4.254140377044678, + "logps/chosen": -707.0760498046875, + "logps/rejected": -735.7363891601562, + "loss": 0.8273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8003918528556824, + "rewards/margins": 0.3524617552757263, + "rewards/rejected": -1.1528536081314087, + "step": 6780 + }, + { + "epoch": 1.5818287711124053, + "grad_norm": 10.445850372314453, + "learning_rate": 4.644660194174757e-06, + "logits/chosen": -4.2838969230651855, + "logits/rejected": -4.417786121368408, + "logps/chosen": -738.2225341796875, + "logps/rejected": -828.3897705078125, + "loss": 0.7626, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6615802049636841, + "rewards/margins": 0.6080848574638367, + "rewards/rejected": -1.2696651220321655, + "step": 6790 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 12.222297668457031, + "learning_rate": 4.618770226537217e-06, + "logits/chosen": -4.329405784606934, + "logits/rejected": -4.318453788757324, + "logps/chosen": -767.93994140625, + "logps/rejected": -750.4871215820312, + "loss": 0.7829, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7184725999832153, + "rewards/margins": 0.2945941388607025, + "rewards/rejected": -1.0130667686462402, + "step": 6800 + }, + { + "epoch": 1.5841584158415842, + "eval_logits/chosen": -4.285313606262207, + "eval_logits/rejected": -4.272164821624756, + "eval_logps/chosen": -698.1854248046875, + "eval_logps/rejected": -718.3281860351562, + "eval_loss": 0.6260075569152832, + "eval_rewards/accuracies": 0.6421582102775574, + "eval_rewards/chosen": -0.7357184886932373, + "eval_rewards/margins": 0.41724127531051636, + "eval_rewards/rejected": -1.1529598236083984, + "eval_runtime": 396.851, + "eval_samples_per_second": 18.027, + "eval_steps_per_second": 9.013, + "step": 6800 + }, + { + "epoch": 1.586488060570763, + "grad_norm": 10.387283325195312, + "learning_rate": 4.592880258899677e-06, + "logits/chosen": -4.342694282531738, + "logits/rejected": -4.3356614112854, + "logps/chosen": -735.6070556640625, + "logps/rejected": -787.474609375, + "loss": 0.7371, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5960084795951843, + "rewards/margins": 0.5805218815803528, + "rewards/rejected": -1.176530361175537, + "step": 6810 + }, + { + "epoch": 1.5888177052999417, + "grad_norm": 6.978819370269775, + "learning_rate": 4.5669902912621365e-06, + "logits/chosen": -4.330822944641113, + "logits/rejected": -4.264742851257324, + "logps/chosen": -742.8681640625, + "logps/rejected": -699.9263305664062, + "loss": 0.7347, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5250498056411743, + "rewards/margins": 0.4952210783958435, + "rewards/rejected": -1.0202710628509521, + "step": 6820 + }, + { + "epoch": 1.5911473500291207, + "grad_norm": 4.977511405944824, + "learning_rate": 4.541100323624596e-06, + "logits/chosen": -4.301724433898926, + "logits/rejected": -4.31328821182251, + "logps/chosen": -704.4278564453125, + "logps/rejected": -805.1658935546875, + "loss": 0.5991, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6408155560493469, + "rewards/margins": 0.7181216478347778, + "rewards/rejected": -1.3589370250701904, + "step": 6830 + }, + { + "epoch": 1.5934769947582994, + "grad_norm": 7.5867180824279785, + "learning_rate": 4.515210355987055e-06, + "logits/chosen": -4.225648880004883, + "logits/rejected": -4.192324161529541, + "logps/chosen": -701.4697265625, + "logps/rejected": -730.7188720703125, + "loss": 0.7539, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7625960111618042, + "rewards/margins": 0.5121796131134033, + "rewards/rejected": -1.274775743484497, + "step": 6840 + }, + { + "epoch": 1.5958066394874781, + "grad_norm": 8.60496997833252, + "learning_rate": 4.489320388349515e-06, + "logits/chosen": -4.283195495605469, + "logits/rejected": -4.2724738121032715, + "logps/chosen": -681.2093505859375, + "logps/rejected": -659.5936279296875, + "loss": 0.7666, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7397498488426208, + "rewards/margins": 0.3033349812030792, + "rewards/rejected": -1.0430848598480225, + "step": 6850 + }, + { + "epoch": 1.598136284216657, + "grad_norm": 8.380082130432129, + "learning_rate": 4.4634304207119745e-06, + "logits/chosen": -4.3215131759643555, + "logits/rejected": -4.331038475036621, + "logps/chosen": -709.4156494140625, + "logps/rejected": -726.0703125, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5304920673370361, + "rewards/margins": 0.7836783528327942, + "rewards/rejected": -1.3141703605651855, + "step": 6860 + }, + { + "epoch": 1.6004659289458356, + "grad_norm": 7.335846424102783, + "learning_rate": 4.437540453074434e-06, + "logits/chosen": -4.3939008712768555, + "logits/rejected": -4.39631986618042, + "logps/chosen": -753.7726440429688, + "logps/rejected": -834.2058715820312, + "loss": 0.7744, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8983514904975891, + "rewards/margins": 0.40852493047714233, + "rewards/rejected": -1.306876540184021, + "step": 6870 + }, + { + "epoch": 1.6027955736750146, + "grad_norm": 9.488310813903809, + "learning_rate": 4.411650485436894e-06, + "logits/chosen": -4.334609031677246, + "logits/rejected": -4.30482816696167, + "logps/chosen": -828.6798706054688, + "logps/rejected": -817.4603271484375, + "loss": 0.8288, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7392042875289917, + "rewards/margins": 0.388349711894989, + "rewards/rejected": -1.1275540590286255, + "step": 6880 + }, + { + "epoch": 1.6051252184041933, + "grad_norm": 7.569668769836426, + "learning_rate": 4.385760517799353e-06, + "logits/chosen": -4.326897621154785, + "logits/rejected": -4.352515697479248, + "logps/chosen": -705.760498046875, + "logps/rejected": -750.1553955078125, + "loss": 0.86, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9272357225418091, + "rewards/margins": 0.24375076591968536, + "rewards/rejected": -1.170986294746399, + "step": 6890 + }, + { + "epoch": 1.607454863133372, + "grad_norm": 6.373630046844482, + "learning_rate": 4.359870550161813e-06, + "logits/chosen": -4.288449764251709, + "logits/rejected": -4.232588768005371, + "logps/chosen": -704.5266723632812, + "logps/rejected": -696.7781982421875, + "loss": 0.7887, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7161403298377991, + "rewards/margins": 0.3344431519508362, + "rewards/rejected": -1.0505833625793457, + "step": 6900 + }, + { + "epoch": 1.607454863133372, + "eval_logits/chosen": -4.281885623931885, + "eval_logits/rejected": -4.268438816070557, + "eval_logps/chosen": -698.2005004882812, + "eval_logps/rejected": -718.3382568359375, + "eval_loss": 0.6262277960777283, + "eval_rewards/accuracies": 0.6422980427742004, + "eval_rewards/chosen": -0.7372271418571472, + "eval_rewards/margins": 0.4167364835739136, + "eval_rewards/rejected": -1.1539636850357056, + "eval_runtime": 397.1792, + "eval_samples_per_second": 18.012, + "eval_steps_per_second": 9.006, + "step": 6900 + }, + { + "epoch": 1.609784507862551, + "grad_norm": 4.869718074798584, + "learning_rate": 4.333980582524272e-06, + "logits/chosen": -4.281427383422852, + "logits/rejected": -4.302354335784912, + "logps/chosen": -655.8043823242188, + "logps/rejected": -685.4343872070312, + "loss": 0.8487, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.6849377155303955, + "rewards/margins": 0.14450839161872864, + "rewards/rejected": -0.8294461369514465, + "step": 6910 + }, + { + "epoch": 1.6121141525917297, + "grad_norm": 10.79990291595459, + "learning_rate": 4.308090614886732e-06, + "logits/chosen": -4.325753688812256, + "logits/rejected": -4.275383472442627, + "logps/chosen": -808.7792358398438, + "logps/rejected": -778.7926025390625, + "loss": 0.6496, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6239713430404663, + "rewards/margins": 0.6055768132209778, + "rewards/rejected": -1.2295480966567993, + "step": 6920 + }, + { + "epoch": 1.6144437973209085, + "grad_norm": 12.391186714172363, + "learning_rate": 4.282200647249191e-06, + "logits/chosen": -4.316169738769531, + "logits/rejected": -4.227480888366699, + "logps/chosen": -763.8555297851562, + "logps/rejected": -705.6220703125, + "loss": 0.7934, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7102519273757935, + "rewards/margins": 0.34832531213760376, + "rewards/rejected": -1.058577299118042, + "step": 6930 + }, + { + "epoch": 1.6167734420500874, + "grad_norm": 11.898706436157227, + "learning_rate": 4.256310679611651e-06, + "logits/chosen": -4.290870189666748, + "logits/rejected": -4.266106605529785, + "logps/chosen": -713.439208984375, + "logps/rejected": -743.7091064453125, + "loss": 0.8143, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.699730396270752, + "rewards/margins": 0.231022909283638, + "rewards/rejected": -0.9307533502578735, + "step": 6940 + }, + { + "epoch": 1.6191030867792662, + "grad_norm": 6.399176597595215, + "learning_rate": 4.23042071197411e-06, + "logits/chosen": -4.361333847045898, + "logits/rejected": -4.301661491394043, + "logps/chosen": -744.58544921875, + "logps/rejected": -732.2003784179688, + "loss": 0.771, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7607227563858032, + "rewards/margins": 0.3533969521522522, + "rewards/rejected": -1.1141196489334106, + "step": 6950 + }, + { + "epoch": 1.621432731508445, + "grad_norm": 9.512768745422363, + "learning_rate": 4.20453074433657e-06, + "logits/chosen": -4.269883632659912, + "logits/rejected": -4.253101825714111, + "logps/chosen": -680.302734375, + "logps/rejected": -736.2039794921875, + "loss": 0.7223, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5698976516723633, + "rewards/margins": 0.6293027997016907, + "rewards/rejected": -1.1992003917694092, + "step": 6960 + }, + { + "epoch": 1.6237623762376239, + "grad_norm": 9.72691822052002, + "learning_rate": 4.17864077669903e-06, + "logits/chosen": -4.406013488769531, + "logits/rejected": -4.377242565155029, + "logps/chosen": -785.4564208984375, + "logps/rejected": -820.7072143554688, + "loss": 0.8597, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.7639799118041992, + "rewards/margins": 0.27192139625549316, + "rewards/rejected": -1.0359013080596924, + "step": 6970 + }, + { + "epoch": 1.6260920209668026, + "grad_norm": 6.968569755554199, + "learning_rate": 4.152750809061489e-06, + "logits/chosen": -4.263640403747559, + "logits/rejected": -4.292579650878906, + "logps/chosen": -717.577880859375, + "logps/rejected": -740.34375, + "loss": 0.6937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5662204623222351, + "rewards/margins": 0.4708368182182312, + "rewards/rejected": -1.0370572805404663, + "step": 6980 + }, + { + "epoch": 1.6284216656959813, + "grad_norm": 4.735540866851807, + "learning_rate": 4.1268608414239484e-06, + "logits/chosen": -4.283355236053467, + "logits/rejected": -4.369661331176758, + "logps/chosen": -646.6312866210938, + "logps/rejected": -711.9313354492188, + "loss": 0.5925, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5355406999588013, + "rewards/margins": 0.7102679014205933, + "rewards/rejected": -1.245808720588684, + "step": 6990 + }, + { + "epoch": 1.6307513104251603, + "grad_norm": 9.391236305236816, + "learning_rate": 4.100970873786408e-06, + "logits/chosen": -4.427331447601318, + "logits/rejected": -4.339338779449463, + "logps/chosen": -737.71728515625, + "logps/rejected": -694.397705078125, + "loss": 0.8044, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6445012092590332, + "rewards/margins": 0.38664406538009644, + "rewards/rejected": -1.0311453342437744, + "step": 7000 + }, + { + "epoch": 1.6307513104251603, + "eval_logits/chosen": -4.281728744506836, + "eval_logits/rejected": -4.268459320068359, + "eval_logps/chosen": -698.0704956054688, + "eval_logps/rejected": -718.185546875, + "eval_loss": 0.6258257031440735, + "eval_rewards/accuracies": 0.6424378156661987, + "eval_rewards/chosen": -0.7242283225059509, + "eval_rewards/margins": 0.4144580364227295, + "eval_rewards/rejected": -1.1386864185333252, + "eval_runtime": 398.3664, + "eval_samples_per_second": 17.958, + "eval_steps_per_second": 8.979, + "step": 7000 + }, + { + "epoch": 1.6330809551543388, + "grad_norm": 6.797269344329834, + "learning_rate": 4.075080906148868e-06, + "logits/chosen": -4.291042327880859, + "logits/rejected": -4.331503391265869, + "logps/chosen": -685.22900390625, + "logps/rejected": -719.4610595703125, + "loss": 0.8001, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6552774310112, + "rewards/margins": 0.33681371808052063, + "rewards/rejected": -0.9920912981033325, + "step": 7010 + }, + { + "epoch": 1.6354105998835178, + "grad_norm": 7.896794319152832, + "learning_rate": 4.049190938511327e-06, + "logits/chosen": -4.361865043640137, + "logits/rejected": -4.35894775390625, + "logps/chosen": -734.9173583984375, + "logps/rejected": -729.639892578125, + "loss": 0.7177, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.46472835540771484, + "rewards/margins": 0.5228708982467651, + "rewards/rejected": -0.9875991940498352, + "step": 7020 + }, + { + "epoch": 1.6377402446126965, + "grad_norm": 9.912988662719727, + "learning_rate": 4.0233009708737865e-06, + "logits/chosen": -4.3807172775268555, + "logits/rejected": -4.355521202087402, + "logps/chosen": -731.7893676757812, + "logps/rejected": -743.5462646484375, + "loss": 0.769, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4980197846889496, + "rewards/margins": 0.5245866775512695, + "rewards/rejected": -1.0226064920425415, + "step": 7030 + }, + { + "epoch": 1.6400698893418753, + "grad_norm": 8.826336860656738, + "learning_rate": 3.997411003236246e-06, + "logits/chosen": -4.4199957847595215, + "logits/rejected": -4.296575546264648, + "logps/chosen": -777.4635009765625, + "logps/rejected": -732.2686767578125, + "loss": 0.7529, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5620446801185608, + "rewards/margins": 0.4427622854709625, + "rewards/rejected": -1.0048068761825562, + "step": 7040 + }, + { + "epoch": 1.6423995340710542, + "grad_norm": 4.805436611175537, + "learning_rate": 3.971521035598706e-06, + "logits/chosen": -4.415337562561035, + "logits/rejected": -4.307534217834473, + "logps/chosen": -772.3611450195312, + "logps/rejected": -743.7156982421875, + "loss": 0.8022, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6573207378387451, + "rewards/margins": 0.4766790270805359, + "rewards/rejected": -1.1339998245239258, + "step": 7050 + }, + { + "epoch": 1.644729178800233, + "grad_norm": 8.641572952270508, + "learning_rate": 3.945631067961166e-06, + "logits/chosen": -4.261378288269043, + "logits/rejected": -4.3650617599487305, + "logps/chosen": -692.762451171875, + "logps/rejected": -797.03271484375, + "loss": 0.7324, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6822121143341064, + "rewards/margins": 0.6177128553390503, + "rewards/rejected": -1.2999250888824463, + "step": 7060 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 7.119178295135498, + "learning_rate": 3.9197411003236245e-06, + "logits/chosen": -4.267707824707031, + "logits/rejected": -4.342516899108887, + "logps/chosen": -768.018310546875, + "logps/rejected": -775.420166015625, + "loss": 0.6835, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6481823325157166, + "rewards/margins": 0.6533300280570984, + "rewards/rejected": -1.3015124797821045, + "step": 7070 + }, + { + "epoch": 1.6493884682585906, + "grad_norm": 5.9756035804748535, + "learning_rate": 3.893851132686084e-06, + "logits/chosen": -4.280825614929199, + "logits/rejected": -4.224474906921387, + "logps/chosen": -766.645263671875, + "logps/rejected": -708.4898681640625, + "loss": 0.7056, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6119120717048645, + "rewards/margins": 0.5710783004760742, + "rewards/rejected": -1.182990312576294, + "step": 7080 + }, + { + "epoch": 1.6517181129877694, + "grad_norm": 7.022614479064941, + "learning_rate": 3.867961165048544e-06, + "logits/chosen": -4.209417819976807, + "logits/rejected": -4.280752182006836, + "logps/chosen": -692.9827880859375, + "logps/rejected": -725.9755859375, + "loss": 0.9008, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.8311317563056946, + "rewards/margins": 0.08656143397092819, + "rewards/rejected": -0.9176931381225586, + "step": 7090 + }, + { + "epoch": 1.6540477577169481, + "grad_norm": 10.223764419555664, + "learning_rate": 3.842071197411004e-06, + "logits/chosen": -4.297172546386719, + "logits/rejected": -4.260180950164795, + "logps/chosen": -718.2964477539062, + "logps/rejected": -706.2061767578125, + "loss": 0.7227, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6614014506340027, + "rewards/margins": 0.4238288998603821, + "rewards/rejected": -1.0852304697036743, + "step": 7100 + }, + { + "epoch": 1.6540477577169481, + "eval_logits/chosen": -4.282748699188232, + "eval_logits/rejected": -4.269398212432861, + "eval_logps/chosen": -698.1327514648438, + "eval_logps/rejected": -718.2734985351562, + "eval_loss": 0.6262640357017517, + "eval_rewards/accuracies": 0.6439753770828247, + "eval_rewards/chosen": -0.7304497957229614, + "eval_rewards/margins": 0.4170399308204651, + "eval_rewards/rejected": -1.1474899053573608, + "eval_runtime": 397.9053, + "eval_samples_per_second": 17.979, + "eval_steps_per_second": 8.99, + "step": 7100 + }, + { + "epoch": 1.656377402446127, + "grad_norm": 9.430929183959961, + "learning_rate": 3.816181229773463e-06, + "logits/chosen": -4.280282974243164, + "logits/rejected": -4.2974395751953125, + "logps/chosen": -671.2548828125, + "logps/rejected": -727.61474609375, + "loss": 0.8933, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.820625901222229, + "rewards/margins": 0.22736160457134247, + "rewards/rejected": -1.0479875802993774, + "step": 7110 + }, + { + "epoch": 1.6587070471753058, + "grad_norm": 8.540874481201172, + "learning_rate": 3.7902912621359228e-06, + "logits/chosen": -4.318397045135498, + "logits/rejected": -4.397231578826904, + "logps/chosen": -662.1047973632812, + "logps/rejected": -736.2491455078125, + "loss": 0.589, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.46827468276023865, + "rewards/margins": 0.6185547113418579, + "rewards/rejected": -1.0868293046951294, + "step": 7120 + }, + { + "epoch": 1.6610366919044846, + "grad_norm": 5.626376628875732, + "learning_rate": 3.764401294498382e-06, + "logits/chosen": -4.4004225730896, + "logits/rejected": -4.287680625915527, + "logps/chosen": -790.2825317382812, + "logps/rejected": -747.4461669921875, + "loss": 0.739, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6833652853965759, + "rewards/margins": 0.49871358275413513, + "rewards/rejected": -1.1820788383483887, + "step": 7130 + }, + { + "epoch": 1.6633663366336635, + "grad_norm": 9.346349716186523, + "learning_rate": 3.7385113268608418e-06, + "logits/chosen": -4.3877363204956055, + "logits/rejected": -4.267838478088379, + "logps/chosen": -756.852783203125, + "logps/rejected": -692.7822875976562, + "loss": 0.8934, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7731833457946777, + "rewards/margins": -0.02572050131857395, + "rewards/rejected": -0.7474628686904907, + "step": 7140 + }, + { + "epoch": 1.665695981362842, + "grad_norm": 6.2495856285095215, + "learning_rate": 3.712621359223301e-06, + "logits/chosen": -4.388154029846191, + "logits/rejected": -4.3741984367370605, + "logps/chosen": -692.0726318359375, + "logps/rejected": -711.5775146484375, + "loss": 0.6488, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.49499768018722534, + "rewards/margins": 0.555439829826355, + "rewards/rejected": -1.050437569618225, + "step": 7150 + }, + { + "epoch": 1.668025626092021, + "grad_norm": 9.950132369995117, + "learning_rate": 3.6867313915857604e-06, + "logits/chosen": -4.390973091125488, + "logits/rejected": -4.378559112548828, + "logps/chosen": -736.49951171875, + "logps/rejected": -717.4456176757812, + "loss": 0.9051, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7832115292549133, + "rewards/margins": -0.003316342830657959, + "rewards/rejected": -0.7798951864242554, + "step": 7160 + }, + { + "epoch": 1.6703552708211997, + "grad_norm": 8.273674964904785, + "learning_rate": 3.6608414239482205e-06, + "logits/chosen": -4.308285713195801, + "logits/rejected": -4.307774543762207, + "logps/chosen": -732.8416137695312, + "logps/rejected": -751.5208740234375, + "loss": 0.8783, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.9038097262382507, + "rewards/margins": 0.08503206819295883, + "rewards/rejected": -0.9888418316841125, + "step": 7170 + }, + { + "epoch": 1.6726849155503785, + "grad_norm": 6.736724376678467, + "learning_rate": 3.63495145631068e-06, + "logits/chosen": -4.277791976928711, + "logits/rejected": -4.248794078826904, + "logps/chosen": -731.4321899414062, + "logps/rejected": -688.3527221679688, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6009598970413208, + "rewards/margins": 0.6401046514511108, + "rewards/rejected": -1.241064429283142, + "step": 7180 + }, + { + "epoch": 1.6750145602795574, + "grad_norm": 6.992065906524658, + "learning_rate": 3.6090614886731396e-06, + "logits/chosen": -4.259005069732666, + "logits/rejected": -4.273360729217529, + "logps/chosen": -694.7288818359375, + "logps/rejected": -755.7852783203125, + "loss": 0.7376, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6417701244354248, + "rewards/margins": 0.5308027267456055, + "rewards/rejected": -1.1725728511810303, + "step": 7190 + }, + { + "epoch": 1.6773442050087362, + "grad_norm": 10.378262519836426, + "learning_rate": 3.583171521035599e-06, + "logits/chosen": -4.214404106140137, + "logits/rejected": -4.259570121765137, + "logps/chosen": -692.5252685546875, + "logps/rejected": -770.9401245117188, + "loss": 0.6745, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.46339020133018494, + "rewards/margins": 0.5105647444725037, + "rewards/rejected": -0.9739548563957214, + "step": 7200 + }, + { + "epoch": 1.6773442050087362, + "eval_logits/chosen": -4.2833170890808105, + "eval_logits/rejected": -4.27020788192749, + "eval_logps/chosen": -698.0100708007812, + "eval_logps/rejected": -718.1314697265625, + "eval_loss": 0.6253830194473267, + "eval_rewards/accuracies": 0.6422980427742004, + "eval_rewards/chosen": -0.7181804180145264, + "eval_rewards/margins": 0.41510212421417236, + "eval_rewards/rejected": -1.1332825422286987, + "eval_runtime": 397.8645, + "eval_samples_per_second": 17.981, + "eval_steps_per_second": 8.99, + "step": 7200 + }, + { + "epoch": 1.679673849737915, + "grad_norm": 10.800102233886719, + "learning_rate": 3.5572815533980586e-06, + "logits/chosen": -4.311060905456543, + "logits/rejected": -4.251940727233887, + "logps/chosen": -732.6629638671875, + "logps/rejected": -724.3796997070312, + "loss": 0.7655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6294369101524353, + "rewards/margins": 0.5156153440475464, + "rewards/rejected": -1.1450523138046265, + "step": 7210 + }, + { + "epoch": 1.6820034944670939, + "grad_norm": 10.474735260009766, + "learning_rate": 3.531391585760518e-06, + "logits/chosen": -4.366720676422119, + "logits/rejected": -4.3180999755859375, + "logps/chosen": -747.0120239257812, + "logps/rejected": -736.7986450195312, + "loss": 0.8329, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5397108793258667, + "rewards/margins": 0.27118149399757385, + "rewards/rejected": -0.8108924627304077, + "step": 7220 + }, + { + "epoch": 1.6843331391962726, + "grad_norm": 9.822484016418457, + "learning_rate": 3.505501618122978e-06, + "logits/chosen": -4.2457780838012695, + "logits/rejected": -4.331841945648193, + "logps/chosen": -668.4440307617188, + "logps/rejected": -744.3336181640625, + "loss": 0.6862, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6030920147895813, + "rewards/margins": 0.6354323625564575, + "rewards/rejected": -1.238524317741394, + "step": 7230 + }, + { + "epoch": 1.6866627839254513, + "grad_norm": 9.211170196533203, + "learning_rate": 3.4796116504854374e-06, + "logits/chosen": -4.304722785949707, + "logits/rejected": -4.387798309326172, + "logps/chosen": -735.7977294921875, + "logps/rejected": -770.9636840820312, + "loss": 0.8774, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.776853084564209, + "rewards/margins": 0.18464542925357819, + "rewards/rejected": -0.9614984393119812, + "step": 7240 + }, + { + "epoch": 1.6889924286546303, + "grad_norm": 11.63446044921875, + "learning_rate": 3.453721682847897e-06, + "logits/chosen": -4.30993127822876, + "logits/rejected": -4.366557598114014, + "logps/chosen": -655.0383911132812, + "logps/rejected": -833.9552612304688, + "loss": 0.7086, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5186705589294434, + "rewards/margins": 0.5775880813598633, + "rewards/rejected": -1.0962587594985962, + "step": 7250 + }, + { + "epoch": 1.691322073383809, + "grad_norm": 10.69422721862793, + "learning_rate": 3.4278317152103564e-06, + "logits/chosen": -4.38188362121582, + "logits/rejected": -4.391608238220215, + "logps/chosen": -738.8275756835938, + "logps/rejected": -759.1577758789062, + "loss": 0.9575, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.0043985843658447, + "rewards/margins": 0.1158156618475914, + "rewards/rejected": -1.1202142238616943, + "step": 7260 + }, + { + "epoch": 1.6936517181129878, + "grad_norm": 7.140145778656006, + "learning_rate": 3.4019417475728157e-06, + "logits/chosen": -4.351529598236084, + "logits/rejected": -4.38330602645874, + "logps/chosen": -733.5669555664062, + "logps/rejected": -773.0108642578125, + "loss": 0.6557, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6187713146209717, + "rewards/margins": 0.7960120439529419, + "rewards/rejected": -1.4147834777832031, + "step": 7270 + }, + { + "epoch": 1.6959813628421667, + "grad_norm": 8.086371421813965, + "learning_rate": 3.3760517799352754e-06, + "logits/chosen": -4.3373541831970215, + "logits/rejected": -4.3250932693481445, + "logps/chosen": -676.114013671875, + "logps/rejected": -698.8336181640625, + "loss": 0.7978, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7031859159469604, + "rewards/margins": 0.3312601149082184, + "rewards/rejected": -1.034446120262146, + "step": 7280 + }, + { + "epoch": 1.6983110075713452, + "grad_norm": 4.94987154006958, + "learning_rate": 3.3501618122977347e-06, + "logits/chosen": -4.358597755432129, + "logits/rejected": -4.308444023132324, + "logps/chosen": -703.7459716796875, + "logps/rejected": -711.6162109375, + "loss": 0.7399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.640599250793457, + "rewards/margins": 0.48218145966529846, + "rewards/rejected": -1.122780680656433, + "step": 7290 + }, + { + "epoch": 1.7006406523005242, + "grad_norm": 8.307604789733887, + "learning_rate": 3.3242718446601944e-06, + "logits/chosen": -4.2530412673950195, + "logits/rejected": -4.3351922035217285, + "logps/chosen": -679.9979858398438, + "logps/rejected": -757.0491943359375, + "loss": 0.7378, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6946650743484497, + "rewards/margins": 0.634982705116272, + "rewards/rejected": -1.3296477794647217, + "step": 7300 + }, + { + "epoch": 1.7006406523005242, + "eval_logits/chosen": -4.2809062004089355, + "eval_logits/rejected": -4.267802715301514, + "eval_logps/chosen": -698.1642456054688, + "eval_logps/rejected": -718.3338012695312, + "eval_loss": 0.6260935664176941, + "eval_rewards/accuracies": 0.6396421790122986, + "eval_rewards/chosen": -0.7335991263389587, + "eval_rewards/margins": 0.41992077231407166, + "eval_rewards/rejected": -1.153519868850708, + "eval_runtime": 398.5397, + "eval_samples_per_second": 17.951, + "eval_steps_per_second": 8.975, + "step": 7300 + }, + { + "epoch": 1.702970297029703, + "grad_norm": 5.429368495941162, + "learning_rate": 3.2983818770226537e-06, + "logits/chosen": -4.3033037185668945, + "logits/rejected": -4.257104396820068, + "logps/chosen": -705.1695556640625, + "logps/rejected": -694.6905517578125, + "loss": 0.6325, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6281101703643799, + "rewards/margins": 0.6659101247787476, + "rewards/rejected": -1.294020414352417, + "step": 7310 + }, + { + "epoch": 1.7052999417588817, + "grad_norm": 10.893732070922852, + "learning_rate": 3.272491909385114e-06, + "logits/chosen": -4.365532875061035, + "logits/rejected": -4.285029411315918, + "logps/chosen": -743.0709228515625, + "logps/rejected": -727.3438720703125, + "loss": 0.7859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7658766508102417, + "rewards/margins": 0.39953237771987915, + "rewards/rejected": -1.165408968925476, + "step": 7320 + }, + { + "epoch": 1.7076295864880606, + "grad_norm": 7.524363994598389, + "learning_rate": 3.246601941747573e-06, + "logits/chosen": -4.205938339233398, + "logits/rejected": -4.287212371826172, + "logps/chosen": -620.0014038085938, + "logps/rejected": -696.42529296875, + "loss": 0.6766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5650202631950378, + "rewards/margins": 0.5094878077507019, + "rewards/rejected": -1.0745080709457397, + "step": 7330 + }, + { + "epoch": 1.7099592312172394, + "grad_norm": 7.964355945587158, + "learning_rate": 3.220711974110033e-06, + "logits/chosen": -4.312127590179443, + "logits/rejected": -4.2499098777771, + "logps/chosen": -713.2406005859375, + "logps/rejected": -692.626953125, + "loss": 0.825, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8016546964645386, + "rewards/margins": 0.2324962317943573, + "rewards/rejected": -1.0341508388519287, + "step": 7340 + }, + { + "epoch": 1.712288875946418, + "grad_norm": 10.5994291305542, + "learning_rate": 3.1948220064724922e-06, + "logits/chosen": -4.278526306152344, + "logits/rejected": -4.249955654144287, + "logps/chosen": -762.7586059570312, + "logps/rejected": -740.9987182617188, + "loss": 0.8548, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.647429883480072, + "rewards/margins": 0.18259665369987488, + "rewards/rejected": -0.8300265073776245, + "step": 7350 + }, + { + "epoch": 1.714618520675597, + "grad_norm": 9.083148956298828, + "learning_rate": 3.1689320388349515e-06, + "logits/chosen": -4.3152174949646, + "logits/rejected": -4.333291053771973, + "logps/chosen": -673.6456298828125, + "logps/rejected": -751.5562133789062, + "loss": 0.7587, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7572881579399109, + "rewards/margins": 0.3388776183128357, + "rewards/rejected": -1.0961657762527466, + "step": 7360 + }, + { + "epoch": 1.7169481654047758, + "grad_norm": 8.386931419372559, + "learning_rate": 3.1430420711974113e-06, + "logits/chosen": -4.2550249099731445, + "logits/rejected": -4.318543434143066, + "logps/chosen": -631.4505615234375, + "logps/rejected": -726.1788940429688, + "loss": 0.6494, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7165141105651855, + "rewards/margins": 0.6534533500671387, + "rewards/rejected": -1.3699674606323242, + "step": 7370 + }, + { + "epoch": 1.7192778101339545, + "grad_norm": 7.15736722946167, + "learning_rate": 3.1171521035598706e-06, + "logits/chosen": -4.390912055969238, + "logits/rejected": -4.2898688316345215, + "logps/chosen": -764.761474609375, + "logps/rejected": -714.9124145507812, + "loss": 0.6744, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.49627989530563354, + "rewards/margins": 0.760360836982727, + "rewards/rejected": -1.256640911102295, + "step": 7380 + }, + { + "epoch": 1.7216074548631335, + "grad_norm": 8.566767692565918, + "learning_rate": 3.0912621359223303e-06, + "logits/chosen": -4.31394100189209, + "logits/rejected": -4.317699432373047, + "logps/chosen": -742.6536865234375, + "logps/rejected": -742.4452514648438, + "loss": 0.8337, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9110280871391296, + "rewards/margins": 0.31222930550575256, + "rewards/rejected": -1.2232574224472046, + "step": 7390 + }, + { + "epoch": 1.7239370995923122, + "grad_norm": 10.37869644165039, + "learning_rate": 3.0653721682847896e-06, + "logits/chosen": -4.3188958168029785, + "logits/rejected": -4.317343711853027, + "logps/chosen": -710.0857543945312, + "logps/rejected": -701.8760986328125, + "loss": 0.6513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6056315302848816, + "rewards/margins": 0.6209887862205505, + "rewards/rejected": -1.2266203165054321, + "step": 7400 + }, + { + "epoch": 1.7239370995923122, + "eval_logits/chosen": -4.277156352996826, + "eval_logits/rejected": -4.264052867889404, + "eval_logps/chosen": -698.2481079101562, + "eval_logps/rejected": -718.45166015625, + "eval_loss": 0.62615567445755, + "eval_rewards/accuracies": 0.6424378156661987, + "eval_rewards/chosen": -0.7419866919517517, + "eval_rewards/margins": 0.42331522703170776, + "eval_rewards/rejected": -1.1653019189834595, + "eval_runtime": 398.4685, + "eval_samples_per_second": 17.954, + "eval_steps_per_second": 8.977, + "step": 7400 + }, + { + "epoch": 1.726266744321491, + "grad_norm": 7.4377665519714355, + "learning_rate": 3.0394822006472497e-06, + "logits/chosen": -4.296648979187012, + "logits/rejected": -4.405775547027588, + "logps/chosen": -651.9999389648438, + "logps/rejected": -759.31640625, + "loss": 0.7337, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7278409004211426, + "rewards/margins": 0.5400255918502808, + "rewards/rejected": -1.267866611480713, + "step": 7410 + }, + { + "epoch": 1.72859638905067, + "grad_norm": 7.177879810333252, + "learning_rate": 3.013592233009709e-06, + "logits/chosen": -4.278170585632324, + "logits/rejected": -4.323129653930664, + "logps/chosen": -729.5637817382812, + "logps/rejected": -774.7483520507812, + "loss": 0.7374, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7349106669425964, + "rewards/margins": 0.5853937864303589, + "rewards/rejected": -1.3203043937683105, + "step": 7420 + }, + { + "epoch": 1.7309260337798484, + "grad_norm": 7.8414154052734375, + "learning_rate": 2.9877022653721688e-06, + "logits/chosen": -4.2496867179870605, + "logits/rejected": -4.125625133514404, + "logps/chosen": -684.25439453125, + "logps/rejected": -632.718017578125, + "loss": 0.9269, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7515495419502258, + "rewards/margins": 0.010830598883330822, + "rewards/rejected": -0.7623801231384277, + "step": 7430 + }, + { + "epoch": 1.7332556785090274, + "grad_norm": 6.723340034484863, + "learning_rate": 2.961812297734628e-06, + "logits/chosen": -4.267904281616211, + "logits/rejected": -4.327345848083496, + "logps/chosen": -729.6060180664062, + "logps/rejected": -797.0587768554688, + "loss": 0.706, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7860048413276672, + "rewards/margins": 0.5039297342300415, + "rewards/rejected": -1.289934515953064, + "step": 7440 + }, + { + "epoch": 1.7355853232382061, + "grad_norm": 9.484472274780273, + "learning_rate": 2.9359223300970874e-06, + "logits/chosen": -4.306872844696045, + "logits/rejected": -4.2969560623168945, + "logps/chosen": -731.2567138671875, + "logps/rejected": -699.7662963867188, + "loss": 0.7022, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4795847535133362, + "rewards/margins": 0.6933883428573608, + "rewards/rejected": -1.1729731559753418, + "step": 7450 + }, + { + "epoch": 1.7379149679673849, + "grad_norm": 8.210362434387207, + "learning_rate": 2.910032362459547e-06, + "logits/chosen": -4.308070659637451, + "logits/rejected": -4.306254863739014, + "logps/chosen": -738.3827514648438, + "logps/rejected": -738.0606689453125, + "loss": 0.8305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6364607810974121, + "rewards/margins": 0.3409319221973419, + "rewards/rejected": -0.9773927927017212, + "step": 7460 + }, + { + "epoch": 1.7402446126965638, + "grad_norm": 6.760463237762451, + "learning_rate": 2.8841423948220064e-06, + "logits/chosen": -4.3071770668029785, + "logits/rejected": -4.340117454528809, + "logps/chosen": -691.4385375976562, + "logps/rejected": -691.5602416992188, + "loss": 0.7247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44640105962753296, + "rewards/margins": 0.4229421615600586, + "rewards/rejected": -0.8693434000015259, + "step": 7470 + }, + { + "epoch": 1.7425742574257426, + "grad_norm": 8.015266418457031, + "learning_rate": 2.8582524271844665e-06, + "logits/chosen": -4.395244598388672, + "logits/rejected": -4.3816423416137695, + "logps/chosen": -722.7725830078125, + "logps/rejected": -766.9971923828125, + "loss": 0.6531, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49877268075942993, + "rewards/margins": 0.8005030751228333, + "rewards/rejected": -1.2992757558822632, + "step": 7480 + }, + { + "epoch": 1.7449039021549213, + "grad_norm": 5.885158538818359, + "learning_rate": 2.832362459546926e-06, + "logits/chosen": -4.314398765563965, + "logits/rejected": -4.305683135986328, + "logps/chosen": -717.0361328125, + "logps/rejected": -767.6456909179688, + "loss": 0.7443, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6509785652160645, + "rewards/margins": 0.6104000806808472, + "rewards/rejected": -1.2613786458969116, + "step": 7490 + }, + { + "epoch": 1.7472335468841003, + "grad_norm": 9.181596755981445, + "learning_rate": 2.8064724919093856e-06, + "logits/chosen": -4.307205677032471, + "logits/rejected": -4.181800842285156, + "logps/chosen": -753.4910888671875, + "logps/rejected": -698.2742309570312, + "loss": 0.8344, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.74730384349823, + "rewards/margins": 0.28025683760643005, + "rewards/rejected": -1.0275605916976929, + "step": 7500 + }, + { + "epoch": 1.7472335468841003, + "eval_logits/chosen": -4.275453567504883, + "eval_logits/rejected": -4.262080669403076, + "eval_logps/chosen": -698.1603393554688, + "eval_logps/rejected": -718.3576049804688, + "eval_loss": 0.6255109310150146, + "eval_rewards/accuracies": 0.6420184373855591, + "eval_rewards/chosen": -0.7332110404968262, + "eval_rewards/margins": 0.4226882755756378, + "eval_rewards/rejected": -1.1558992862701416, + "eval_runtime": 398.5378, + "eval_samples_per_second": 17.951, + "eval_steps_per_second": 8.975, + "step": 7500 + }, + { + "epoch": 1.749563191613279, + "grad_norm": 9.900083541870117, + "learning_rate": 2.780582524271845e-06, + "logits/chosen": -4.25106143951416, + "logits/rejected": -4.330036640167236, + "logps/chosen": -697.1131591796875, + "logps/rejected": -813.1942138671875, + "loss": 0.8211, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.755881130695343, + "rewards/margins": 0.3740493059158325, + "rewards/rejected": -1.1299302577972412, + "step": 7510 + }, + { + "epoch": 1.7518928363424577, + "grad_norm": 5.772104263305664, + "learning_rate": 2.7546925566343046e-06, + "logits/chosen": -4.359757423400879, + "logits/rejected": -4.24202299118042, + "logps/chosen": -744.2601318359375, + "logps/rejected": -661.6866455078125, + "loss": 0.9188, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.7953876852989197, + "rewards/margins": 0.12192598730325699, + "rewards/rejected": -0.9173136949539185, + "step": 7520 + }, + { + "epoch": 1.7542224810716367, + "grad_norm": 7.5138258934021, + "learning_rate": 2.728802588996764e-06, + "logits/chosen": -4.272562503814697, + "logits/rejected": -4.372793674468994, + "logps/chosen": -690.07763671875, + "logps/rejected": -707.4510498046875, + "loss": 0.6859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6146534085273743, + "rewards/margins": 0.5785967111587524, + "rewards/rejected": -1.1932501792907715, + "step": 7530 + }, + { + "epoch": 1.7565521258008152, + "grad_norm": 9.83190631866455, + "learning_rate": 2.702912621359223e-06, + "logits/chosen": -4.3367462158203125, + "logits/rejected": -4.351749897003174, + "logps/chosen": -745.0218505859375, + "logps/rejected": -784.8512573242188, + "loss": 0.7967, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7240052819252014, + "rewards/margins": 0.3044678866863251, + "rewards/rejected": -1.028473138809204, + "step": 7540 + }, + { + "epoch": 1.7588817705299942, + "grad_norm": 5.653636455535889, + "learning_rate": 2.677022653721683e-06, + "logits/chosen": -4.269984245300293, + "logits/rejected": -4.253081321716309, + "logps/chosen": -725.6463623046875, + "logps/rejected": -775.1932373046875, + "loss": 0.745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7236585021018982, + "rewards/margins": 0.5556066036224365, + "rewards/rejected": -1.2792651653289795, + "step": 7550 + }, + { + "epoch": 1.7612114152591731, + "grad_norm": 8.759044647216797, + "learning_rate": 2.6511326860841422e-06, + "logits/chosen": -4.290041923522949, + "logits/rejected": -4.351320266723633, + "logps/chosen": -702.0689697265625, + "logps/rejected": -783.780517578125, + "loss": 0.6961, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7005491256713867, + "rewards/margins": 0.575498104095459, + "rewards/rejected": -1.2760472297668457, + "step": 7560 + }, + { + "epoch": 1.7635410599883516, + "grad_norm": 9.059598922729492, + "learning_rate": 2.6252427184466024e-06, + "logits/chosen": -4.330138683319092, + "logits/rejected": -4.341468811035156, + "logps/chosen": -710.1514892578125, + "logps/rejected": -788.5646362304688, + "loss": 0.7336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6530278921127319, + "rewards/margins": 0.44766393303871155, + "rewards/rejected": -1.1006916761398315, + "step": 7570 + }, + { + "epoch": 1.7658707047175306, + "grad_norm": 8.274618148803711, + "learning_rate": 2.5993527508090617e-06, + "logits/chosen": -4.33445930480957, + "logits/rejected": -4.335292816162109, + "logps/chosen": -665.8831176757812, + "logps/rejected": -717.4337158203125, + "loss": 0.7093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7828375101089478, + "rewards/margins": 0.5649530291557312, + "rewards/rejected": -1.3477904796600342, + "step": 7580 + }, + { + "epoch": 1.7682003494467093, + "grad_norm": 9.58355712890625, + "learning_rate": 2.5734627831715214e-06, + "logits/chosen": -4.344472408294678, + "logits/rejected": -4.394615650177002, + "logps/chosen": -649.8656616210938, + "logps/rejected": -706.46435546875, + "loss": 0.7247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6893410086631775, + "rewards/margins": 0.4330008029937744, + "rewards/rejected": -1.1223418712615967, + "step": 7590 + }, + { + "epoch": 1.770529994175888, + "grad_norm": 9.934365272521973, + "learning_rate": 2.5475728155339807e-06, + "logits/chosen": -4.332821846008301, + "logits/rejected": -4.36389684677124, + "logps/chosen": -700.098876953125, + "logps/rejected": -759.6007690429688, + "loss": 0.7871, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6798760890960693, + "rewards/margins": 0.39228060841560364, + "rewards/rejected": -1.0721566677093506, + "step": 7600 + }, + { + "epoch": 1.770529994175888, + "eval_logits/chosen": -4.275769233703613, + "eval_logits/rejected": -4.262385368347168, + "eval_logps/chosen": -698.19287109375, + "eval_logps/rejected": -718.3925170898438, + "eval_loss": 0.6256328225135803, + "eval_rewards/accuracies": 0.642577588558197, + "eval_rewards/chosen": -0.7364568114280701, + "eval_rewards/margins": 0.4229297339916229, + "eval_rewards/rejected": -1.1593865156173706, + "eval_runtime": 399.4902, + "eval_samples_per_second": 17.908, + "eval_steps_per_second": 8.954, + "step": 7600 + }, + { + "epoch": 1.772859638905067, + "grad_norm": 3.8265748023986816, + "learning_rate": 2.5216828478964404e-06, + "logits/chosen": -4.340038299560547, + "logits/rejected": -4.323204040527344, + "logps/chosen": -684.2075805664062, + "logps/rejected": -782.0001220703125, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5509809255599976, + "rewards/margins": 0.7414697408676147, + "rewards/rejected": -1.2924506664276123, + "step": 7610 + }, + { + "epoch": 1.7751892836342458, + "grad_norm": 8.738218307495117, + "learning_rate": 2.4957928802588998e-06, + "logits/chosen": -4.31033992767334, + "logits/rejected": -4.3488922119140625, + "logps/chosen": -703.8746948242188, + "logps/rejected": -761.0021362304688, + "loss": 0.7514, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7934512495994568, + "rewards/margins": 0.521425724029541, + "rewards/rejected": -1.314876914024353, + "step": 7620 + }, + { + "epoch": 1.7775189283634245, + "grad_norm": 6.381500720977783, + "learning_rate": 2.4699029126213595e-06, + "logits/chosen": -4.314557075500488, + "logits/rejected": -4.293426513671875, + "logps/chosen": -685.4746704101562, + "logps/rejected": -710.4097900390625, + "loss": 0.7379, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.48360586166381836, + "rewards/margins": 0.5202947854995728, + "rewards/rejected": -1.0039006471633911, + "step": 7630 + }, + { + "epoch": 1.7798485730926035, + "grad_norm": 5.309506416320801, + "learning_rate": 2.444012944983819e-06, + "logits/chosen": -4.322248935699463, + "logits/rejected": -4.2495527267456055, + "logps/chosen": -675.5975952148438, + "logps/rejected": -633.2434692382812, + "loss": 0.9243, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7094519138336182, + "rewards/margins": 0.12580251693725586, + "rewards/rejected": -0.8352544903755188, + "step": 7640 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 6.463253021240234, + "learning_rate": 2.4181229773462785e-06, + "logits/chosen": -4.193962097167969, + "logits/rejected": -4.322009086608887, + "logps/chosen": -679.2346801757812, + "logps/rejected": -781.1619873046875, + "loss": 0.7621, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6658368706703186, + "rewards/margins": 0.421161025762558, + "rewards/rejected": -1.0869977474212646, + "step": 7650 + }, + { + "epoch": 1.784507862550961, + "grad_norm": 7.450245380401611, + "learning_rate": 2.3922330097087382e-06, + "logits/chosen": -4.363302707672119, + "logits/rejected": -4.293172836303711, + "logps/chosen": -732.4378051757812, + "logps/rejected": -791.06884765625, + "loss": 0.8789, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.0643038749694824, + "rewards/margins": 0.19379642605781555, + "rewards/rejected": -1.2581002712249756, + "step": 7660 + }, + { + "epoch": 1.78683750728014, + "grad_norm": 9.34884262084961, + "learning_rate": 2.3663430420711975e-06, + "logits/chosen": -4.283385276794434, + "logits/rejected": -4.333686351776123, + "logps/chosen": -695.9212646484375, + "logps/rejected": -753.2310791015625, + "loss": 0.7264, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.694754958152771, + "rewards/margins": 0.5363430380821228, + "rewards/rejected": -1.231097936630249, + "step": 7670 + }, + { + "epoch": 1.7891671520093184, + "grad_norm": 8.882437705993652, + "learning_rate": 2.340453074433657e-06, + "logits/chosen": -4.361789226531982, + "logits/rejected": -4.341314792633057, + "logps/chosen": -699.6898803710938, + "logps/rejected": -758.737548828125, + "loss": 0.7417, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7322936058044434, + "rewards/margins": 0.5006028413772583, + "rewards/rejected": -1.2328965663909912, + "step": 7680 + }, + { + "epoch": 1.7914967967384974, + "grad_norm": 5.45563268661499, + "learning_rate": 2.3145631067961166e-06, + "logits/chosen": -4.283277988433838, + "logits/rejected": -4.236169815063477, + "logps/chosen": -698.2086181640625, + "logps/rejected": -688.638671875, + "loss": 0.6836, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6562092304229736, + "rewards/margins": 0.5385984778404236, + "rewards/rejected": -1.194807767868042, + "step": 7690 + }, + { + "epoch": 1.7938264414676763, + "grad_norm": 10.295047760009766, + "learning_rate": 2.2886731391585763e-06, + "logits/chosen": -4.275271415710449, + "logits/rejected": -4.330410003662109, + "logps/chosen": -690.9629516601562, + "logps/rejected": -742.60791015625, + "loss": 0.775, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6221812963485718, + "rewards/margins": 0.4292815327644348, + "rewards/rejected": -1.0514627695083618, + "step": 7700 + }, + { + "epoch": 1.7938264414676763, + "eval_logits/chosen": -4.2768354415893555, + "eval_logits/rejected": -4.263593673706055, + "eval_logps/chosen": -698.334228515625, + "eval_logps/rejected": -718.576416015625, + "eval_loss": 0.626298725605011, + "eval_rewards/accuracies": 0.6427173614501953, + "eval_rewards/chosen": -0.7505965828895569, + "eval_rewards/margins": 0.4271797239780426, + "eval_rewards/rejected": -1.1777764558792114, + "eval_runtime": 398.6638, + "eval_samples_per_second": 17.945, + "eval_steps_per_second": 8.972, + "step": 7700 + }, + { + "epoch": 1.7961560861968549, + "grad_norm": 7.378889560699463, + "learning_rate": 2.2627831715210356e-06, + "logits/chosen": -4.222140312194824, + "logits/rejected": -4.275264263153076, + "logps/chosen": -627.9237060546875, + "logps/rejected": -636.3966064453125, + "loss": 0.6343, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.20627155900001526, + "rewards/margins": 0.7409781217575073, + "rewards/rejected": -0.9472497701644897, + "step": 7710 + }, + { + "epoch": 1.7984857309260338, + "grad_norm": 9.264955520629883, + "learning_rate": 2.2368932038834953e-06, + "logits/chosen": -4.308340072631836, + "logits/rejected": -4.277257442474365, + "logps/chosen": -746.6083374023438, + "logps/rejected": -733.2451782226562, + "loss": 0.8311, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.899739146232605, + "rewards/margins": 0.07021939754486084, + "rewards/rejected": -0.969958484172821, + "step": 7720 + }, + { + "epoch": 1.8008153756552125, + "grad_norm": 5.251680374145508, + "learning_rate": 2.211003236245955e-06, + "logits/chosen": -4.405594825744629, + "logits/rejected": -4.321906089782715, + "logps/chosen": -751.5158081054688, + "logps/rejected": -801.6177368164062, + "loss": 0.8219, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8107985258102417, + "rewards/margins": 0.6600787043571472, + "rewards/rejected": -1.4708774089813232, + "step": 7730 + }, + { + "epoch": 1.8031450203843913, + "grad_norm": 8.310518264770508, + "learning_rate": 2.1851132686084143e-06, + "logits/chosen": -4.286992073059082, + "logits/rejected": -4.3545308113098145, + "logps/chosen": -703.4472045898438, + "logps/rejected": -783.1981811523438, + "loss": 0.8341, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8503850102424622, + "rewards/margins": 0.3780274987220764, + "rewards/rejected": -1.228412389755249, + "step": 7740 + }, + { + "epoch": 1.8054746651135702, + "grad_norm": 8.352068901062012, + "learning_rate": 2.159223300970874e-06, + "logits/chosen": -4.327363014221191, + "logits/rejected": -4.168322563171387, + "logps/chosen": -769.9176025390625, + "logps/rejected": -680.9280395507812, + "loss": 0.8691, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8926218152046204, + "rewards/margins": 0.17855612933635712, + "rewards/rejected": -1.0711780786514282, + "step": 7750 + }, + { + "epoch": 1.807804309842749, + "grad_norm": 8.405198097229004, + "learning_rate": 2.133333333333334e-06, + "logits/chosen": -4.351681709289551, + "logits/rejected": -4.3126020431518555, + "logps/chosen": -745.2198486328125, + "logps/rejected": -738.9443969726562, + "loss": 0.6288, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6772146821022034, + "rewards/margins": 0.5970760583877563, + "rewards/rejected": -1.274290680885315, + "step": 7760 + }, + { + "epoch": 1.8101339545719277, + "grad_norm": 7.872401714324951, + "learning_rate": 2.107443365695793e-06, + "logits/chosen": -4.302443027496338, + "logits/rejected": -4.289053440093994, + "logps/chosen": -640.4171752929688, + "logps/rejected": -674.5543212890625, + "loss": 0.7753, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8442346453666687, + "rewards/margins": 0.3581005036830902, + "rewards/rejected": -1.2023351192474365, + "step": 7770 + }, + { + "epoch": 1.8124635993011067, + "grad_norm": 4.250477313995361, + "learning_rate": 2.0815533980582524e-06, + "logits/chosen": -4.271813869476318, + "logits/rejected": -4.248841285705566, + "logps/chosen": -711.6376342773438, + "logps/rejected": -629.3045043945312, + "loss": 0.8603, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8051861524581909, + "rewards/margins": 0.3091093599796295, + "rewards/rejected": -1.114295482635498, + "step": 7780 + }, + { + "epoch": 1.8147932440302854, + "grad_norm": 8.479049682617188, + "learning_rate": 2.055663430420712e-06, + "logits/chosen": -4.2787370681762695, + "logits/rejected": -4.315961837768555, + "logps/chosen": -670.7811279296875, + "logps/rejected": -719.6918334960938, + "loss": 0.756, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5828672647476196, + "rewards/margins": 0.5051405429840088, + "rewards/rejected": -1.0880076885223389, + "step": 7790 + }, + { + "epoch": 1.8171228887594641, + "grad_norm": 13.810235977172852, + "learning_rate": 2.029773462783172e-06, + "logits/chosen": -4.362678527832031, + "logits/rejected": -4.296935081481934, + "logps/chosen": -753.6568603515625, + "logps/rejected": -747.1669921875, + "loss": 0.9258, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.816769003868103, + "rewards/margins": 0.12891487777233124, + "rewards/rejected": -0.9456839561462402, + "step": 7800 + }, + { + "epoch": 1.8171228887594641, + "eval_logits/chosen": -4.273819923400879, + "eval_logits/rejected": -4.260204315185547, + "eval_logps/chosen": -698.2659301757812, + "eval_logps/rejected": -718.5016479492188, + "eval_loss": 0.6260212063789368, + "eval_rewards/accuracies": 0.6434162855148315, + "eval_rewards/chosen": -0.7437689304351807, + "eval_rewards/margins": 0.42653757333755493, + "eval_rewards/rejected": -1.1703065633773804, + "eval_runtime": 400.821, + "eval_samples_per_second": 17.848, + "eval_steps_per_second": 8.924, + "step": 7800 + }, + { + "epoch": 1.819452533488643, + "grad_norm": 9.626912117004395, + "learning_rate": 2.003883495145631e-06, + "logits/chosen": -4.315255165100098, + "logits/rejected": -4.34834098815918, + "logps/chosen": -734.5718383789062, + "logps/rejected": -787.1271362304688, + "loss": 0.7108, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7061156034469604, + "rewards/margins": 0.692793071269989, + "rewards/rejected": -1.3989086151123047, + "step": 7810 + }, + { + "epoch": 1.8217821782178216, + "grad_norm": 8.975574493408203, + "learning_rate": 1.977993527508091e-06, + "logits/chosen": -4.334473133087158, + "logits/rejected": -4.307866096496582, + "logps/chosen": -764.4278564453125, + "logps/rejected": -787.1414794921875, + "loss": 0.7731, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.5991033315658569, + "rewards/margins": 0.35862669348716736, + "rewards/rejected": -0.9577299952507019, + "step": 7820 + }, + { + "epoch": 1.8241118229470006, + "grad_norm": 4.2930908203125, + "learning_rate": 1.95210355987055e-06, + "logits/chosen": -4.372093200683594, + "logits/rejected": -4.383025169372559, + "logps/chosen": -773.4154663085938, + "logps/rejected": -833.3216552734375, + "loss": 0.8166, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8959034085273743, + "rewards/margins": 0.4735845923423767, + "rewards/rejected": -1.369488000869751, + "step": 7830 + }, + { + "epoch": 1.8264414676761795, + "grad_norm": 7.499445915222168, + "learning_rate": 1.92621359223301e-06, + "logits/chosen": -4.287027359008789, + "logits/rejected": -4.379525184631348, + "logps/chosen": -655.5816650390625, + "logps/rejected": -765.8964233398438, + "loss": 0.7044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6393209099769592, + "rewards/margins": 0.46456795930862427, + "rewards/rejected": -1.103888750076294, + "step": 7840 + }, + { + "epoch": 1.828771112405358, + "grad_norm": 5.650694847106934, + "learning_rate": 1.9003236245954696e-06, + "logits/chosen": -4.295722007751465, + "logits/rejected": -4.299536228179932, + "logps/chosen": -736.7391357421875, + "logps/rejected": -712.6022338867188, + "loss": 0.7138, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5302426218986511, + "rewards/margins": 0.5727535486221313, + "rewards/rejected": -1.1029961109161377, + "step": 7850 + }, + { + "epoch": 1.831100757134537, + "grad_norm": 11.053319931030273, + "learning_rate": 1.8744336569579287e-06, + "logits/chosen": -4.285447120666504, + "logits/rejected": -4.212759494781494, + "logps/chosen": -772.167236328125, + "logps/rejected": -714.1157836914062, + "loss": 0.7928, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6176115870475769, + "rewards/margins": 0.4766506552696228, + "rewards/rejected": -1.0942623615264893, + "step": 7860 + }, + { + "epoch": 1.8334304018637158, + "grad_norm": 8.503183364868164, + "learning_rate": 1.8485436893203885e-06, + "logits/chosen": -4.3330230712890625, + "logits/rejected": -4.239395618438721, + "logps/chosen": -706.237060546875, + "logps/rejected": -652.3427124023438, + "loss": 0.7318, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6463378667831421, + "rewards/margins": 0.46101585030555725, + "rewards/rejected": -1.107353687286377, + "step": 7870 + }, + { + "epoch": 1.8357600465928945, + "grad_norm": 8.738791465759277, + "learning_rate": 1.822653721682848e-06, + "logits/chosen": -4.2857489585876465, + "logits/rejected": -4.343883991241455, + "logps/chosen": -678.5247802734375, + "logps/rejected": -713.4631958007812, + "loss": 0.7368, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6008560657501221, + "rewards/margins": 0.5160638093948364, + "rewards/rejected": -1.116919755935669, + "step": 7880 + }, + { + "epoch": 1.8380896913220734, + "grad_norm": 5.170600414276123, + "learning_rate": 1.7967637540453075e-06, + "logits/chosen": -4.332296371459961, + "logits/rejected": -4.3902058601379395, + "logps/chosen": -717.0233764648438, + "logps/rejected": -801.6724243164062, + "loss": 0.8154, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7391607165336609, + "rewards/margins": 0.354645311832428, + "rewards/rejected": -1.0938060283660889, + "step": 7890 + }, + { + "epoch": 1.8404193360512522, + "grad_norm": 7.153270244598389, + "learning_rate": 1.7708737864077672e-06, + "logits/chosen": -4.206341743469238, + "logits/rejected": -4.272002220153809, + "logps/chosen": -659.7720947265625, + "logps/rejected": -721.71044921875, + "loss": 0.7981, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6344069242477417, + "rewards/margins": 0.3891531825065613, + "rewards/rejected": -1.0235600471496582, + "step": 7900 + }, + { + "epoch": 1.8404193360512522, + "eval_logits/chosen": -4.273324489593506, + "eval_logits/rejected": -4.259591579437256, + "eval_logps/chosen": -698.2616577148438, + "eval_logps/rejected": -718.4935913085938, + "eval_loss": 0.6259632110595703, + "eval_rewards/accuracies": 0.6406206488609314, + "eval_rewards/chosen": -0.7433328032493591, + "eval_rewards/margins": 0.42616304755210876, + "eval_rewards/rejected": -1.1694958209991455, + "eval_runtime": 399.2234, + "eval_samples_per_second": 17.92, + "eval_steps_per_second": 8.96, + "step": 7900 + }, + { + "epoch": 1.842748980780431, + "grad_norm": 5.014840602874756, + "learning_rate": 1.7449838187702267e-06, + "logits/chosen": -4.308610916137695, + "logits/rejected": -4.290936470031738, + "logps/chosen": -711.2816162109375, + "logps/rejected": -699.1672973632812, + "loss": 0.633, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5748124718666077, + "rewards/margins": 0.48621124029159546, + "rewards/rejected": -1.0610238313674927, + "step": 7910 + }, + { + "epoch": 1.8450786255096099, + "grad_norm": 2.524541139602661, + "learning_rate": 1.7190938511326862e-06, + "logits/chosen": -4.281649589538574, + "logits/rejected": -4.29948091506958, + "logps/chosen": -682.9008178710938, + "logps/rejected": -737.8621215820312, + "loss": 0.6611, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.45142731070518494, + "rewards/margins": 0.7023296356201172, + "rewards/rejected": -1.153756856918335, + "step": 7920 + }, + { + "epoch": 1.8474082702387886, + "grad_norm": 7.655969619750977, + "learning_rate": 1.6932038834951458e-06, + "logits/chosen": -4.2819390296936035, + "logits/rejected": -4.311500549316406, + "logps/chosen": -703.700439453125, + "logps/rejected": -756.3419189453125, + "loss": 0.7514, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7004653811454773, + "rewards/margins": 0.49804216623306274, + "rewards/rejected": -1.1985076665878296, + "step": 7930 + }, + { + "epoch": 1.8497379149679674, + "grad_norm": 10.318922996520996, + "learning_rate": 1.6673139158576055e-06, + "logits/chosen": -4.371182441711426, + "logits/rejected": -4.295583248138428, + "logps/chosen": -759.6697998046875, + "logps/rejected": -713.5474853515625, + "loss": 0.7842, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7593971490859985, + "rewards/margins": 0.34532198309898376, + "rewards/rejected": -1.1047192811965942, + "step": 7940 + }, + { + "epoch": 1.8520675596971463, + "grad_norm": 7.683895587921143, + "learning_rate": 1.6414239482200648e-06, + "logits/chosen": -4.274389266967773, + "logits/rejected": -4.26663064956665, + "logps/chosen": -701.9673461914062, + "logps/rejected": -762.0696411132812, + "loss": 0.6948, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5486912131309509, + "rewards/margins": 0.6160353422164917, + "rewards/rejected": -1.1647266149520874, + "step": 7950 + }, + { + "epoch": 1.8543972044263248, + "grad_norm": 5.8917694091796875, + "learning_rate": 1.6155339805825243e-06, + "logits/chosen": -4.227823257446289, + "logits/rejected": -4.251143455505371, + "logps/chosen": -699.05908203125, + "logps/rejected": -751.8037719726562, + "loss": 0.7264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6425224542617798, + "rewards/margins": 0.3867589831352234, + "rewards/rejected": -1.0292813777923584, + "step": 7960 + }, + { + "epoch": 1.8567268491555038, + "grad_norm": 8.299991607666016, + "learning_rate": 1.5896440129449838e-06, + "logits/chosen": -4.296449184417725, + "logits/rejected": -4.244095802307129, + "logps/chosen": -728.1547241210938, + "logps/rejected": -708.0887451171875, + "loss": 0.7491, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6678886413574219, + "rewards/margins": 0.49384164810180664, + "rewards/rejected": -1.1617302894592285, + "step": 7970 + }, + { + "epoch": 1.8590564938846827, + "grad_norm": 8.590520858764648, + "learning_rate": 1.5637540453074435e-06, + "logits/chosen": -4.296567440032959, + "logits/rejected": -4.204328536987305, + "logps/chosen": -734.1271362304688, + "logps/rejected": -685.5897827148438, + "loss": 0.6546, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6129536628723145, + "rewards/margins": 0.6994892358779907, + "rewards/rejected": -1.3124430179595947, + "step": 7980 + }, + { + "epoch": 1.8613861386138613, + "grad_norm": 8.054755210876465, + "learning_rate": 1.537864077669903e-06, + "logits/chosen": -4.2982258796691895, + "logits/rejected": -4.374433994293213, + "logps/chosen": -710.0291748046875, + "logps/rejected": -802.5664672851562, + "loss": 0.7795, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6659218668937683, + "rewards/margins": 0.44334450364112854, + "rewards/rejected": -1.1092662811279297, + "step": 7990 + }, + { + "epoch": 1.8637157833430402, + "grad_norm": 8.272866249084473, + "learning_rate": 1.5119741100323626e-06, + "logits/chosen": -4.2993011474609375, + "logits/rejected": -4.279488563537598, + "logps/chosen": -712.0296630859375, + "logps/rejected": -736.62060546875, + "loss": 0.7925, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5873527526855469, + "rewards/margins": 0.4799245297908783, + "rewards/rejected": -1.0672773122787476, + "step": 8000 + }, + { + "epoch": 1.8637157833430402, + "eval_logits/chosen": -4.2714433670043945, + "eval_logits/rejected": -4.257770538330078, + "eval_logps/chosen": -698.35888671875, + "eval_logps/rejected": -718.630615234375, + "eval_loss": 0.62626051902771, + "eval_rewards/accuracies": 0.6418786644935608, + "eval_rewards/chosen": -0.7530632615089417, + "eval_rewards/margins": 0.4301401674747467, + "eval_rewards/rejected": -1.1832033395767212, + "eval_runtime": 400.3459, + "eval_samples_per_second": 17.87, + "eval_steps_per_second": 8.935, + "step": 8000 + }, + { + "epoch": 1.866045428072219, + "grad_norm": 8.103987693786621, + "learning_rate": 1.486084142394822e-06, + "logits/chosen": -4.286310195922852, + "logits/rejected": -4.278182506561279, + "logps/chosen": -654.7042846679688, + "logps/rejected": -674.2049560546875, + "loss": 0.6231, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4982284903526306, + "rewards/margins": 0.7050498127937317, + "rewards/rejected": -1.2032783031463623, + "step": 8010 + }, + { + "epoch": 1.8683750728013977, + "grad_norm": 10.897774696350098, + "learning_rate": 1.4601941747572818e-06, + "logits/chosen": -4.338923454284668, + "logits/rejected": -4.356582164764404, + "logps/chosen": -692.2127685546875, + "logps/rejected": -728.3756103515625, + "loss": 0.7361, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.737460196018219, + "rewards/margins": 0.5897017121315002, + "rewards/rejected": -1.3271619081497192, + "step": 8020 + }, + { + "epoch": 1.8707047175305767, + "grad_norm": 6.542692184448242, + "learning_rate": 1.4343042071197413e-06, + "logits/chosen": -4.281493663787842, + "logits/rejected": -4.266264915466309, + "logps/chosen": -690.42529296875, + "logps/rejected": -724.5419921875, + "loss": 0.7727, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7195274233818054, + "rewards/margins": 0.527152419090271, + "rewards/rejected": -1.2466800212860107, + "step": 8030 + }, + { + "epoch": 1.8730343622597554, + "grad_norm": 5.282635688781738, + "learning_rate": 1.4084142394822006e-06, + "logits/chosen": -4.363732814788818, + "logits/rejected": -4.371499538421631, + "logps/chosen": -707.1060791015625, + "logps/rejected": -674.9572143554688, + "loss": 0.7736, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6483897566795349, + "rewards/margins": 0.3561496436595917, + "rewards/rejected": -1.0045394897460938, + "step": 8040 + }, + { + "epoch": 1.8753640069889341, + "grad_norm": 7.903660774230957, + "learning_rate": 1.3825242718446601e-06, + "logits/chosen": -4.357583522796631, + "logits/rejected": -4.340155124664307, + "logps/chosen": -715.6651611328125, + "logps/rejected": -782.0966796875, + "loss": 0.5883, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6292458772659302, + "rewards/margins": 0.7687798738479614, + "rewards/rejected": -1.3980258703231812, + "step": 8050 + }, + { + "epoch": 1.877693651718113, + "grad_norm": 7.870277404785156, + "learning_rate": 1.3566343042071199e-06, + "logits/chosen": -4.245657444000244, + "logits/rejected": -4.232403755187988, + "logps/chosen": -734.6783447265625, + "logps/rejected": -739.2367553710938, + "loss": 0.8023, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.762460470199585, + "rewards/margins": 0.41493305563926697, + "rewards/rejected": -1.1773935556411743, + "step": 8060 + }, + { + "epoch": 1.8800232964472918, + "grad_norm": 4.446013927459717, + "learning_rate": 1.3307443365695794e-06, + "logits/chosen": -4.216975212097168, + "logits/rejected": -4.27420711517334, + "logps/chosen": -696.5819091796875, + "logps/rejected": -726.9581909179688, + "loss": 0.7313, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6386895775794983, + "rewards/margins": 0.49886369705200195, + "rewards/rejected": -1.137553334236145, + "step": 8070 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 8.197561264038086, + "learning_rate": 1.304854368932039e-06, + "logits/chosen": -4.304692268371582, + "logits/rejected": -4.309518337249756, + "logps/chosen": -668.96728515625, + "logps/rejected": -743.6773681640625, + "loss": 0.8273, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6109591722488403, + "rewards/margins": 0.3413018584251404, + "rewards/rejected": -0.9522610902786255, + "step": 8080 + }, + { + "epoch": 1.8846825859056495, + "grad_norm": 6.251431941986084, + "learning_rate": 1.2789644012944984e-06, + "logits/chosen": -4.228099346160889, + "logits/rejected": -4.20468807220459, + "logps/chosen": -669.6293334960938, + "logps/rejected": -689.2266845703125, + "loss": 0.633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6801440119743347, + "rewards/margins": 0.6941016316413879, + "rewards/rejected": -1.3742456436157227, + "step": 8090 + }, + { + "epoch": 1.887012230634828, + "grad_norm": 11.473604202270508, + "learning_rate": 1.2530744336569581e-06, + "logits/chosen": -4.333575248718262, + "logits/rejected": -4.392006874084473, + "logps/chosen": -756.8983154296875, + "logps/rejected": -833.8203125, + "loss": 0.7757, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7302811741828918, + "rewards/margins": 0.4153829514980316, + "rewards/rejected": -1.1456642150878906, + "step": 8100 + }, + { + "epoch": 1.887012230634828, + "eval_logits/chosen": -4.270134925842285, + "eval_logits/rejected": -4.256483554840088, + "eval_logps/chosen": -698.4719848632812, + "eval_logps/rejected": -718.76123046875, + "eval_loss": 0.6265602707862854, + "eval_rewards/accuracies": 0.6420184373855591, + "eval_rewards/chosen": -0.7643771767616272, + "eval_rewards/margins": 0.43188703060150146, + "eval_rewards/rejected": -1.1962642669677734, + "eval_runtime": 401.3482, + "eval_samples_per_second": 17.825, + "eval_steps_per_second": 8.912, + "step": 8100 + }, + { + "epoch": 1.889341875364007, + "grad_norm": 6.380511283874512, + "learning_rate": 1.2271844660194174e-06, + "logits/chosen": -4.341069221496582, + "logits/rejected": -4.404160499572754, + "logps/chosen": -696.8298950195312, + "logps/rejected": -806.1851806640625, + "loss": 0.5914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5037180781364441, + "rewards/margins": 0.8781551122665405, + "rewards/rejected": -1.3818731307983398, + "step": 8110 + }, + { + "epoch": 1.891671520093186, + "grad_norm": 7.608849048614502, + "learning_rate": 1.2012944983818772e-06, + "logits/chosen": -4.330622673034668, + "logits/rejected": -4.2414679527282715, + "logps/chosen": -784.8829956054688, + "logps/rejected": -726.9249877929688, + "loss": 0.7297, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6424816250801086, + "rewards/margins": 0.39398622512817383, + "rewards/rejected": -1.0364679098129272, + "step": 8120 + }, + { + "epoch": 1.8940011648223645, + "grad_norm": 9.878482818603516, + "learning_rate": 1.1754045307443367e-06, + "logits/chosen": -4.255199432373047, + "logits/rejected": -4.249835014343262, + "logps/chosen": -713.4352416992188, + "logps/rejected": -753.8508911132812, + "loss": 0.6942, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8424083590507507, + "rewards/margins": 0.5438202619552612, + "rewards/rejected": -1.3862287998199463, + "step": 8130 + }, + { + "epoch": 1.8963308095515434, + "grad_norm": 9.539684295654297, + "learning_rate": 1.1495145631067962e-06, + "logits/chosen": -4.222280025482178, + "logits/rejected": -4.286808013916016, + "logps/chosen": -668.6334838867188, + "logps/rejected": -658.0465087890625, + "loss": 0.8308, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.8567003011703491, + "rewards/margins": 0.35510557889938354, + "rewards/rejected": -1.2118059396743774, + "step": 8140 + }, + { + "epoch": 1.8986604542807222, + "grad_norm": 6.965002059936523, + "learning_rate": 1.1236245954692557e-06, + "logits/chosen": -4.31527853012085, + "logits/rejected": -4.246081352233887, + "logps/chosen": -706.160400390625, + "logps/rejected": -683.8353271484375, + "loss": 0.795, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6172300577163696, + "rewards/margins": 0.39958369731903076, + "rewards/rejected": -1.0168136358261108, + "step": 8150 + }, + { + "epoch": 1.900990099009901, + "grad_norm": 6.807827472686768, + "learning_rate": 1.0977346278317152e-06, + "logits/chosen": -4.384181022644043, + "logits/rejected": -4.358831882476807, + "logps/chosen": -772.3015747070312, + "logps/rejected": -818.3670043945312, + "loss": 0.8398, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7376090288162231, + "rewards/margins": 0.4424554705619812, + "rewards/rejected": -1.1800644397735596, + "step": 8160 + }, + { + "epoch": 1.9033197437390799, + "grad_norm": 8.301105499267578, + "learning_rate": 1.0718446601941747e-06, + "logits/chosen": -4.291867256164551, + "logits/rejected": -4.323934078216553, + "logps/chosen": -726.5836791992188, + "logps/rejected": -758.00048828125, + "loss": 0.7862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.764946699142456, + "rewards/margins": 0.43642458319664, + "rewards/rejected": -1.2013713121414185, + "step": 8170 + }, + { + "epoch": 1.9056493884682586, + "grad_norm": 7.553655624389648, + "learning_rate": 1.0459546925566345e-06, + "logits/chosen": -4.2980146408081055, + "logits/rejected": -4.247636795043945, + "logps/chosen": -691.2302856445312, + "logps/rejected": -720.0042114257812, + "loss": 0.784, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8119754791259766, + "rewards/margins": 0.34611016511917114, + "rewards/rejected": -1.1580857038497925, + "step": 8180 + }, + { + "epoch": 1.9079790331974373, + "grad_norm": 5.689741611480713, + "learning_rate": 1.020064724919094e-06, + "logits/chosen": -4.381438732147217, + "logits/rejected": -4.324999809265137, + "logps/chosen": -739.9146728515625, + "logps/rejected": -710.13623046875, + "loss": 0.7337, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.577957272529602, + "rewards/margins": 0.40901726484298706, + "rewards/rejected": -0.9869745373725891, + "step": 8190 + }, + { + "epoch": 1.9103086779266163, + "grad_norm": 11.527932167053223, + "learning_rate": 9.941747572815535e-07, + "logits/chosen": -4.309118747711182, + "logits/rejected": -4.265748977661133, + "logps/chosen": -731.2278442382812, + "logps/rejected": -681.3240966796875, + "loss": 0.8045, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.7814499139785767, + "rewards/margins": 0.3624773621559143, + "rewards/rejected": -1.1439272165298462, + "step": 8200 + }, + { + "epoch": 1.9103086779266163, + "eval_logits/chosen": -4.268999099731445, + "eval_logits/rejected": -4.255213737487793, + "eval_logps/chosen": -698.4553833007812, + "eval_logps/rejected": -718.7557373046875, + "eval_loss": 0.6263387203216553, + "eval_rewards/accuracies": 0.6413195133209229, + "eval_rewards/chosen": -0.762718677520752, + "eval_rewards/margins": 0.4329899847507477, + "eval_rewards/rejected": -1.1957087516784668, + "eval_runtime": 401.6446, + "eval_samples_per_second": 17.812, + "eval_steps_per_second": 8.906, + "step": 8200 + }, + { + "epoch": 1.912638322655795, + "grad_norm": 9.736942291259766, + "learning_rate": 9.68284789644013e-07, + "logits/chosen": -4.324416160583496, + "logits/rejected": -4.226147174835205, + "logps/chosen": -716.7040405273438, + "logps/rejected": -628.514404296875, + "loss": 0.67, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5403128862380981, + "rewards/margins": 0.6297000646591187, + "rewards/rejected": -1.1700130701065063, + "step": 8210 + }, + { + "epoch": 1.9149679673849738, + "grad_norm": 7.084342956542969, + "learning_rate": 9.423948220064725e-07, + "logits/chosen": -4.212306022644043, + "logits/rejected": -4.274476051330566, + "logps/chosen": -632.3684692382812, + "logps/rejected": -746.298828125, + "loss": 0.7419, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.607592761516571, + "rewards/margins": 0.5596817135810852, + "rewards/rejected": -1.1672742366790771, + "step": 8220 + }, + { + "epoch": 1.9172976121141527, + "grad_norm": 7.207754135131836, + "learning_rate": 9.165048543689321e-07, + "logits/chosen": -4.234009265899658, + "logits/rejected": -4.276724338531494, + "logps/chosen": -658.24609375, + "logps/rejected": -690.4041748046875, + "loss": 0.7392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6036561727523804, + "rewards/margins": 0.697953462600708, + "rewards/rejected": -1.301609754562378, + "step": 8230 + }, + { + "epoch": 1.9196272568433312, + "grad_norm": 9.125251770019531, + "learning_rate": 8.906148867313917e-07, + "logits/chosen": -4.313952922821045, + "logits/rejected": -4.3028974533081055, + "logps/chosen": -757.9784545898438, + "logps/rejected": -726.7279052734375, + "loss": 0.7373, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6254912614822388, + "rewards/margins": 0.4764394164085388, + "rewards/rejected": -1.1019306182861328, + "step": 8240 + }, + { + "epoch": 1.9219569015725102, + "grad_norm": 6.093334197998047, + "learning_rate": 8.647249190938512e-07, + "logits/chosen": -4.276091575622559, + "logits/rejected": -4.3135552406311035, + "logps/chosen": -667.789306640625, + "logps/rejected": -731.2029418945312, + "loss": 0.7606, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8004811406135559, + "rewards/margins": 0.5769751667976379, + "rewards/rejected": -1.3774563074111938, + "step": 8250 + }, + { + "epoch": 1.924286546301689, + "grad_norm": 7.825791358947754, + "learning_rate": 8.388349514563107e-07, + "logits/chosen": -4.3161139488220215, + "logits/rejected": -4.360238075256348, + "logps/chosen": -693.4849853515625, + "logps/rejected": -744.6207275390625, + "loss": 0.7171, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5785298347473145, + "rewards/margins": 0.4514383375644684, + "rewards/rejected": -1.0299681425094604, + "step": 8260 + }, + { + "epoch": 1.9266161910308677, + "grad_norm": 9.916013717651367, + "learning_rate": 8.129449838187703e-07, + "logits/chosen": -4.298556327819824, + "logits/rejected": -4.30155611038208, + "logps/chosen": -658.8529663085938, + "logps/rejected": -694.6532592773438, + "loss": 0.8381, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0241538286209106, + "rewards/margins": 0.14310523867607117, + "rewards/rejected": -1.1672589778900146, + "step": 8270 + }, + { + "epoch": 1.9289458357600466, + "grad_norm": 8.170594215393066, + "learning_rate": 7.870550161812298e-07, + "logits/chosen": -4.3301568031311035, + "logits/rejected": -4.340358734130859, + "logps/chosen": -723.5537109375, + "logps/rejected": -732.6986083984375, + "loss": 0.7501, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.59930020570755, + "rewards/margins": 0.5064767599105835, + "rewards/rejected": -1.1057769060134888, + "step": 8280 + }, + { + "epoch": 1.9312754804892254, + "grad_norm": 8.464248657226562, + "learning_rate": 7.611650485436894e-07, + "logits/chosen": -4.264374732971191, + "logits/rejected": -4.301041603088379, + "logps/chosen": -712.2825927734375, + "logps/rejected": -738.6800537109375, + "loss": 0.7616, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6949116587638855, + "rewards/margins": 0.40924954414367676, + "rewards/rejected": -1.1041611433029175, + "step": 8290 + }, + { + "epoch": 1.933605125218404, + "grad_norm": 5.5404486656188965, + "learning_rate": 7.352750809061489e-07, + "logits/chosen": -4.312140464782715, + "logits/rejected": -4.395899772644043, + "logps/chosen": -734.2998657226562, + "logps/rejected": -804.1611328125, + "loss": 0.7502, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4405832290649414, + "rewards/margins": 0.5064100027084351, + "rewards/rejected": -0.9469932317733765, + "step": 8300 + }, + { + "epoch": 1.933605125218404, + "eval_logits/chosen": -4.269779205322266, + "eval_logits/rejected": -4.255950450897217, + "eval_logps/chosen": -698.4453735351562, + "eval_logps/rejected": -718.7404174804688, + "eval_loss": 0.6262587308883667, + "eval_rewards/accuracies": 0.6404808759689331, + "eval_rewards/chosen": -0.7617153525352478, + "eval_rewards/margins": 0.43246886134147644, + "eval_rewards/rejected": -1.1941843032836914, + "eval_runtime": 402.0528, + "eval_samples_per_second": 17.794, + "eval_steps_per_second": 8.897, + "step": 8300 + }, + { + "epoch": 1.935934769947583, + "grad_norm": 10.476713180541992, + "learning_rate": 7.093851132686085e-07, + "logits/chosen": -4.345536231994629, + "logits/rejected": -4.35883092880249, + "logps/chosen": -726.04248046875, + "logps/rejected": -762.2106323242188, + "loss": 0.6994, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6188130378723145, + "rewards/margins": 0.6291608214378357, + "rewards/rejected": -1.2479736804962158, + "step": 8310 + }, + { + "epoch": 1.9382644146767618, + "grad_norm": 6.145667552947998, + "learning_rate": 6.83495145631068e-07, + "logits/chosen": -4.3391337394714355, + "logits/rejected": -4.24570894241333, + "logps/chosen": -774.0887451171875, + "logps/rejected": -720.0848388671875, + "loss": 0.7267, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5717106461524963, + "rewards/margins": 0.6787561774253845, + "rewards/rejected": -1.2504669427871704, + "step": 8320 + }, + { + "epoch": 1.9405940594059405, + "grad_norm": 8.16446304321289, + "learning_rate": 6.576051779935276e-07, + "logits/chosen": -4.334807395935059, + "logits/rejected": -4.253960132598877, + "logps/chosen": -725.342041015625, + "logps/rejected": -719.07568359375, + "loss": 0.8266, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.633865475654602, + "rewards/margins": 0.2745392322540283, + "rewards/rejected": -0.9084047079086304, + "step": 8330 + }, + { + "epoch": 1.9429237041351195, + "grad_norm": 10.712969779968262, + "learning_rate": 6.31715210355987e-07, + "logits/chosen": -4.292527675628662, + "logits/rejected": -4.2442946434021, + "logps/chosen": -761.6686401367188, + "logps/rejected": -714.60009765625, + "loss": 0.8968, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9049726724624634, + "rewards/margins": 0.27385425567626953, + "rewards/rejected": -1.1788270473480225, + "step": 8340 + }, + { + "epoch": 1.9452533488642982, + "grad_norm": 6.336966514587402, + "learning_rate": 6.058252427184466e-07, + "logits/chosen": -4.317821025848389, + "logits/rejected": -4.242893218994141, + "logps/chosen": -717.8641967773438, + "logps/rejected": -699.2586059570312, + "loss": 0.8501, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7709203958511353, + "rewards/margins": 0.3294094204902649, + "rewards/rejected": -1.1003297567367554, + "step": 8350 + }, + { + "epoch": 1.947582993593477, + "grad_norm": 10.864871978759766, + "learning_rate": 5.799352750809062e-07, + "logits/chosen": -4.350275993347168, + "logits/rejected": -4.370471000671387, + "logps/chosen": -731.3067626953125, + "logps/rejected": -769.8187255859375, + "loss": 0.6432, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.558472752571106, + "rewards/margins": 0.7509238123893738, + "rewards/rejected": -1.309396505355835, + "step": 8360 + }, + { + "epoch": 1.949912638322656, + "grad_norm": 8.846085548400879, + "learning_rate": 5.540453074433658e-07, + "logits/chosen": -4.353498458862305, + "logits/rejected": -4.3908796310424805, + "logps/chosen": -741.0128173828125, + "logps/rejected": -692.1842651367188, + "loss": 0.8107, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7740066647529602, + "rewards/margins": 0.366254597902298, + "rewards/rejected": -1.1402612924575806, + "step": 8370 + }, + { + "epoch": 1.9522422830518344, + "grad_norm": 7.637683868408203, + "learning_rate": 5.281553398058253e-07, + "logits/chosen": -4.264584541320801, + "logits/rejected": -4.2607526779174805, + "logps/chosen": -735.846923828125, + "logps/rejected": -754.1130981445312, + "loss": 0.7077, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7124431729316711, + "rewards/margins": 0.6570056676864624, + "rewards/rejected": -1.3694486618041992, + "step": 8380 + }, + { + "epoch": 1.9545719277810134, + "grad_norm": 4.9761223793029785, + "learning_rate": 5.022653721682848e-07, + "logits/chosen": -4.241456985473633, + "logits/rejected": -4.229941368103027, + "logps/chosen": -664.3502197265625, + "logps/rejected": -715.2335205078125, + "loss": 0.8032, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.8136642575263977, + "rewards/margins": 0.39051347970962524, + "rewards/rejected": -1.2041776180267334, + "step": 8390 + }, + { + "epoch": 1.9569015725101921, + "grad_norm": 10.152214050292969, + "learning_rate": 4.7637540453074437e-07, + "logits/chosen": -4.321287631988525, + "logits/rejected": -4.322064399719238, + "logps/chosen": -711.2864990234375, + "logps/rejected": -713.7066650390625, + "loss": 0.8314, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9399154782295227, + "rewards/margins": 0.31622153520584106, + "rewards/rejected": -1.2561370134353638, + "step": 8400 + }, + { + "epoch": 1.9569015725101921, + "eval_logits/chosen": -4.2698798179626465, + "eval_logits/rejected": -4.2562713623046875, + "eval_logps/chosen": -698.441162109375, + "eval_logps/rejected": -718.7234497070312, + "eval_loss": 0.6265471577644348, + "eval_rewards/accuracies": 0.6420184373855591, + "eval_rewards/chosen": -0.7612878084182739, + "eval_rewards/margins": 0.4311898946762085, + "eval_rewards/rejected": -1.1924777030944824, + "eval_runtime": 402.4783, + "eval_samples_per_second": 17.775, + "eval_steps_per_second": 8.887, + "step": 8400 + }, + { + "epoch": 1.9592312172393709, + "grad_norm": 8.497055053710938, + "learning_rate": 4.5048543689320394e-07, + "logits/chosen": -4.284560203552246, + "logits/rejected": -4.310437202453613, + "logps/chosen": -723.8456420898438, + "logps/rejected": -715.3678588867188, + "loss": 0.7216, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8485382795333862, + "rewards/margins": 0.5425747036933899, + "rewards/rejected": -1.3911129236221313, + "step": 8410 + }, + { + "epoch": 1.9615608619685498, + "grad_norm": 6.9082255363464355, + "learning_rate": 4.2459546925566345e-07, + "logits/chosen": -4.283592700958252, + "logits/rejected": -4.316073417663574, + "logps/chosen": -669.37109375, + "logps/rejected": -663.8726806640625, + "loss": 0.8655, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.8164242506027222, + "rewards/margins": 0.10506206750869751, + "rewards/rejected": -0.9214862585067749, + "step": 8420 + }, + { + "epoch": 1.9638905066977286, + "grad_norm": 8.472549438476562, + "learning_rate": 3.98705501618123e-07, + "logits/chosen": -4.299191474914551, + "logits/rejected": -4.256920337677002, + "logps/chosen": -747.3541259765625, + "logps/rejected": -740.9989013671875, + "loss": 0.8372, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6805824041366577, + "rewards/margins": 0.365847647190094, + "rewards/rejected": -1.046429991722107, + "step": 8430 + }, + { + "epoch": 1.9662201514269073, + "grad_norm": 7.3164167404174805, + "learning_rate": 3.728155339805826e-07, + "logits/chosen": -4.24942684173584, + "logits/rejected": -4.248818397521973, + "logps/chosen": -681.0838623046875, + "logps/rejected": -723.9698486328125, + "loss": 0.8116, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8195317983627319, + "rewards/margins": 0.21901898086071014, + "rewards/rejected": -1.0385507345199585, + "step": 8440 + }, + { + "epoch": 1.9685497961560863, + "grad_norm": 7.188282489776611, + "learning_rate": 3.469255663430421e-07, + "logits/chosen": -4.256657600402832, + "logits/rejected": -4.36815881729126, + "logps/chosen": -641.541748046875, + "logps/rejected": -759.9830322265625, + "loss": 0.6642, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5071582794189453, + "rewards/margins": 0.6715329885482788, + "rewards/rejected": -1.1786913871765137, + "step": 8450 + }, + { + "epoch": 1.970879440885265, + "grad_norm": 10.558123588562012, + "learning_rate": 3.2103559870550167e-07, + "logits/chosen": -4.252476692199707, + "logits/rejected": -4.316245079040527, + "logps/chosen": -730.2252197265625, + "logps/rejected": -751.7780151367188, + "loss": 0.7471, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7278391718864441, + "rewards/margins": 0.45042625069618225, + "rewards/rejected": -1.1782654523849487, + "step": 8460 + }, + { + "epoch": 1.9732090856144437, + "grad_norm": 5.863772869110107, + "learning_rate": 2.951456310679612e-07, + "logits/chosen": -4.232786178588867, + "logits/rejected": -4.32352876663208, + "logps/chosen": -692.9447021484375, + "logps/rejected": -767.4256591796875, + "loss": 0.6891, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6030572652816772, + "rewards/margins": 0.5774694085121155, + "rewards/rejected": -1.1805267333984375, + "step": 8470 + }, + { + "epoch": 1.9755387303436227, + "grad_norm": 3.489633083343506, + "learning_rate": 2.6925566343042075e-07, + "logits/chosen": -4.237580299377441, + "logits/rejected": -4.371821403503418, + "logps/chosen": -693.5853881835938, + "logps/rejected": -830.3731689453125, + "loss": 0.7209, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7704498767852783, + "rewards/margins": 0.7803723216056824, + "rewards/rejected": -1.550822138786316, + "step": 8480 + }, + { + "epoch": 1.9778683750728014, + "grad_norm": 8.350820541381836, + "learning_rate": 2.4336569579288027e-07, + "logits/chosen": -4.262223243713379, + "logits/rejected": -4.329285621643066, + "logps/chosen": -667.3316040039062, + "logps/rejected": -720.9274291992188, + "loss": 0.8041, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7020472288131714, + "rewards/margins": 0.3439479470252991, + "rewards/rejected": -1.0459951162338257, + "step": 8490 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 7.101047992706299, + "learning_rate": 2.1747572815533983e-07, + "logits/chosen": -4.292538166046143, + "logits/rejected": -4.3668742179870605, + "logps/chosen": -746.7476806640625, + "logps/rejected": -765.1817626953125, + "loss": 0.694, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6788529753684998, + "rewards/margins": 0.801043689250946, + "rewards/rejected": -1.4798967838287354, + "step": 8500 + }, + { + "epoch": 1.9801980198019802, + "eval_logits/chosen": -4.269947528839111, + "eval_logits/rejected": -4.256167888641357, + "eval_logps/chosen": -698.446044921875, + "eval_logps/rejected": -718.7388916015625, + "eval_loss": 0.6263306140899658, + "eval_rewards/accuracies": 0.6432765126228333, + "eval_rewards/chosen": -0.7617831826210022, + "eval_rewards/margins": 0.4322440028190613, + "eval_rewards/rejected": -1.1940271854400635, + "eval_runtime": 401.1668, + "eval_samples_per_second": 17.833, + "eval_steps_per_second": 8.916, + "step": 8500 + } + ], + "logging_steps": 10, + "max_steps": 8584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}