{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999197753710389, "eval_steps": 10000, "global_step": 6232, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001604492579221821, "grad_norm": 13.0625, "learning_rate": 8.012820512820513e-09, "logits/chosen": -2.801064968109131, "logits/rejected": -2.7777090072631836, "logps/chosen": -452.79461669921875, "logps/rejected": -287.9381103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001604492579221821, "grad_norm": 15.0, "learning_rate": 8.012820512820512e-08, "logits/chosen": -2.8678274154663086, "logits/rejected": -2.808558225631714, "logps/chosen": -467.5328369140625, "logps/rejected": -260.4118347167969, "loss": 0.6957, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": -0.0027104958426207304, "rewards/margins": -0.004716634750366211, "rewards/rejected": 0.002006138674914837, "step": 10 }, { "epoch": 0.003208985158443642, "grad_norm": 12.9375, "learning_rate": 1.6025641025641025e-07, "logits/chosen": -2.847327709197998, "logits/rejected": -2.801842212677002, "logps/chosen": -376.6588134765625, "logps/rejected": -235.9285430908203, "loss": 0.6913, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.004946783185005188, "rewards/margins": 0.004215491469949484, "rewards/rejected": 0.0007312919478863478, "step": 20 }, { "epoch": 0.0048134777376654635, "grad_norm": 11.8125, "learning_rate": 2.403846153846154e-07, "logits/chosen": -2.861180543899536, "logits/rejected": -2.795544385910034, "logps/chosen": -430.3834533691406, "logps/rejected": -277.0617370605469, "loss": 0.69, "rewards/accuracies": 0.53125, "rewards/chosen": 0.005568523891270161, "rewards/margins": 0.006780300289392471, "rewards/rejected": -0.0012117767473682761, "step": 30 }, { "epoch": 0.006417970316887284, "grad_norm": 13.8125, "learning_rate": 3.205128205128205e-07, "logits/chosen": -2.8326148986816406, "logits/rejected": -2.7890467643737793, "logps/chosen": -387.25201416015625, "logps/rejected": -265.2815856933594, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.0039651584811508656, "rewards/margins": -0.0007998358341865242, "rewards/rejected": 0.004764994140714407, "step": 40 }, { "epoch": 0.008022462896109106, "grad_norm": 15.625, "learning_rate": 4.006410256410257e-07, "logits/chosen": -2.8603596687316895, "logits/rejected": -2.7873940467834473, "logps/chosen": -416.3079528808594, "logps/rejected": -220.4951934814453, "loss": 0.6905, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.011940542608499527, "rewards/margins": 0.005786549765616655, "rewards/rejected": 0.0061539942398667336, "step": 50 }, { "epoch": 0.009626955475330927, "grad_norm": 12.375, "learning_rate": 4.807692307692308e-07, "logits/chosen": -2.838106155395508, "logits/rejected": -2.769294261932373, "logps/chosen": -440.0091857910156, "logps/rejected": -220.72445678710938, "loss": 0.6873, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.023594962432980537, "rewards/margins": 0.012441210448741913, "rewards/rejected": 0.011153748258948326, "step": 60 }, { "epoch": 0.011231448054552748, "grad_norm": 11.875, "learning_rate": 5.60897435897436e-07, "logits/chosen": -2.8301548957824707, "logits/rejected": -2.784411907196045, "logps/chosen": -407.0857849121094, "logps/rejected": -258.1935119628906, "loss": 0.6903, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0322578027844429, "rewards/margins": 0.006352896336466074, "rewards/rejected": 0.025904908776283264, "step": 70 }, { "epoch": 0.012835940633774568, "grad_norm": 12.875, "learning_rate": 6.41025641025641e-07, "logits/chosen": -2.8414039611816406, "logits/rejected": -2.7981231212615967, "logps/chosen": -370.0433044433594, "logps/rejected": -248.3733673095703, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05469526723027229, "rewards/margins": 0.017828941345214844, "rewards/rejected": 0.036866314709186554, "step": 80 }, { "epoch": 0.01444043321299639, "grad_norm": 11.25, "learning_rate": 7.211538461538461e-07, "logits/chosen": -2.8511440753936768, "logits/rejected": -2.7955079078674316, "logps/chosen": -418.014404296875, "logps/rejected": -241.5692138671875, "loss": 0.6806, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.07352234423160553, "rewards/margins": 0.026746606454253197, "rewards/rejected": 0.04677574336528778, "step": 90 }, { "epoch": 0.016044925792218213, "grad_norm": 10.75, "learning_rate": 8.012820512820515e-07, "logits/chosen": -2.8341434001922607, "logits/rejected": -2.7957639694213867, "logps/chosen": -362.33349609375, "logps/rejected": -247.6381378173828, "loss": 0.6827, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.08949927240610123, "rewards/margins": 0.02323673665523529, "rewards/rejected": 0.06626254320144653, "step": 100 }, { "epoch": 0.01764941837144003, "grad_norm": 10.8125, "learning_rate": 8.814102564102566e-07, "logits/chosen": -2.8410420417785645, "logits/rejected": -2.788717269897461, "logps/chosen": -411.6890563964844, "logps/rejected": -239.9008331298828, "loss": 0.6708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12321096658706665, "rewards/margins": 0.047582268714904785, "rewards/rejected": 0.07562869042158127, "step": 110 }, { "epoch": 0.019253910950661854, "grad_norm": 11.0, "learning_rate": 9.615384615384617e-07, "logits/chosen": -2.835981845855713, "logits/rejected": -2.7958662509918213, "logps/chosen": -404.7931213378906, "logps/rejected": -265.7095642089844, "loss": 0.6732, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15688064694404602, "rewards/margins": 0.044150885194540024, "rewards/rejected": 0.11272978782653809, "step": 120 }, { "epoch": 0.020858403529883673, "grad_norm": 14.3125, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.846203327178955, "logits/rejected": -2.7959675788879395, "logps/chosen": -436.6390075683594, "logps/rejected": -278.2392578125, "loss": 0.6645, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.20976293087005615, "rewards/margins": 0.0639864057302475, "rewards/rejected": 0.14577652513980865, "step": 130 }, { "epoch": 0.022462896109105495, "grad_norm": 11.125, "learning_rate": 1.121794871794872e-06, "logits/chosen": -2.871081829071045, "logits/rejected": -2.8369741439819336, "logps/chosen": -401.9211730957031, "logps/rejected": -270.09759521484375, "loss": 0.6653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2322733849287033, "rewards/margins": 0.06529739499092102, "rewards/rejected": 0.16697600483894348, "step": 140 }, { "epoch": 0.024067388688327317, "grad_norm": 12.4375, "learning_rate": 1.201923076923077e-06, "logits/chosen": -2.840069532394409, "logits/rejected": -2.7874045372009277, "logps/chosen": -389.24371337890625, "logps/rejected": -237.10720825195312, "loss": 0.6511, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2733134627342224, "rewards/margins": 0.09585341811180115, "rewards/rejected": 0.17746008932590485, "step": 150 }, { "epoch": 0.025671881267549136, "grad_norm": 11.6875, "learning_rate": 1.282051282051282e-06, "logits/chosen": -2.8439865112304688, "logits/rejected": -2.7847373485565186, "logps/chosen": -433.8013610839844, "logps/rejected": -261.60552978515625, "loss": 0.6419, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.34866684675216675, "rewards/margins": 0.12493026256561279, "rewards/rejected": 0.22373656928539276, "step": 160 }, { "epoch": 0.02727637384677096, "grad_norm": 9.625, "learning_rate": 1.3621794871794872e-06, "logits/chosen": -2.826463222503662, "logits/rejected": -2.7833850383758545, "logps/chosen": -390.516845703125, "logps/rejected": -249.82632446289062, "loss": 0.639, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.4098316729068756, "rewards/margins": 0.13579455018043518, "rewards/rejected": 0.27403712272644043, "step": 170 }, { "epoch": 0.02888086642599278, "grad_norm": 10.5625, "learning_rate": 1.4423076923076922e-06, "logits/chosen": -2.841709613800049, "logits/rejected": -2.799297571182251, "logps/chosen": -376.3614196777344, "logps/rejected": -273.1657409667969, "loss": 0.6513, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4502575397491455, "rewards/margins": 0.11466304212808609, "rewards/rejected": 0.33559450507164, "step": 180 }, { "epoch": 0.0304853590052146, "grad_norm": 11.1875, "learning_rate": 1.5224358974358975e-06, "logits/chosen": -2.849318027496338, "logits/rejected": -2.799050807952881, "logps/chosen": -420.9671936035156, "logps/rejected": -242.8567657470703, "loss": 0.6371, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.49345773458480835, "rewards/margins": 0.15174873173236847, "rewards/rejected": 0.3417089581489563, "step": 190 }, { "epoch": 0.032089851584436425, "grad_norm": 13.4375, "learning_rate": 1.602564102564103e-06, "logits/chosen": -2.844722270965576, "logits/rejected": -2.782104015350342, "logps/chosen": -442.8193359375, "logps/rejected": -248.2726287841797, "loss": 0.6063, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6106610298156738, "rewards/margins": 0.24606601893901825, "rewards/rejected": 0.3645949959754944, "step": 200 }, { "epoch": 0.03369434416365824, "grad_norm": 12.9375, "learning_rate": 1.682692307692308e-06, "logits/chosen": -2.8549864292144775, "logits/rejected": -2.789827823638916, "logps/chosen": -492.47271728515625, "logps/rejected": -267.96307373046875, "loss": 0.5865, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.7030670046806335, "rewards/margins": 0.2864145338535309, "rewards/rejected": 0.41665250062942505, "step": 210 }, { "epoch": 0.03529883674288006, "grad_norm": 9.75, "learning_rate": 1.7628205128205131e-06, "logits/chosen": -2.8427743911743164, "logits/rejected": -2.775764226913452, "logps/chosen": -409.92022705078125, "logps/rejected": -237.02920532226562, "loss": 0.5781, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.703518271446228, "rewards/margins": 0.3402438759803772, "rewards/rejected": 0.36327439546585083, "step": 220 }, { "epoch": 0.036903329322101885, "grad_norm": 11.6875, "learning_rate": 1.8429487179487182e-06, "logits/chosen": -2.8348164558410645, "logits/rejected": -2.7822184562683105, "logps/chosen": -372.57769775390625, "logps/rejected": -249.77828979492188, "loss": 0.627, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6330143809318542, "rewards/margins": 0.22580763697624207, "rewards/rejected": 0.4072067141532898, "step": 230 }, { "epoch": 0.03850782190132371, "grad_norm": 9.4375, "learning_rate": 1.9230769230769234e-06, "logits/chosen": -2.836017608642578, "logits/rejected": -2.7949111461639404, "logps/chosen": -392.5030212402344, "logps/rejected": -272.5045471191406, "loss": 0.6203, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6903320550918579, "rewards/margins": 0.27093595266342163, "rewards/rejected": 0.41939616203308105, "step": 240 }, { "epoch": 0.04011231448054553, "grad_norm": 10.8125, "learning_rate": 2.0032051282051286e-06, "logits/chosen": -2.8418538570404053, "logits/rejected": -2.796204090118408, "logps/chosen": -418.55584716796875, "logps/rejected": -263.2822265625, "loss": 0.6467, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.6684531569480896, "rewards/margins": 0.20640552043914795, "rewards/rejected": 0.46204763650894165, "step": 250 }, { "epoch": 0.041716807059767345, "grad_norm": 12.0, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.8437702655792236, "logits/rejected": -2.797869920730591, "logps/chosen": -397.18377685546875, "logps/rejected": -267.39312744140625, "loss": 0.6204, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6440244913101196, "rewards/margins": 0.25003182888031006, "rewards/rejected": 0.3939926028251648, "step": 260 }, { "epoch": 0.04332129963898917, "grad_norm": 9.4375, "learning_rate": 2.1634615384615387e-06, "logits/chosen": -2.866079568862915, "logits/rejected": -2.8208019733428955, "logps/chosen": -364.3704528808594, "logps/rejected": -234.03955078125, "loss": 0.6128, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5425211787223816, "rewards/margins": 0.2726251780986786, "rewards/rejected": 0.2698960304260254, "step": 270 }, { "epoch": 0.04492579221821099, "grad_norm": 7.4375, "learning_rate": 2.243589743589744e-06, "logits/chosen": -2.827198028564453, "logits/rejected": -2.787705421447754, "logps/chosen": -414.30194091796875, "logps/rejected": -265.24383544921875, "loss": 0.5743, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6018232703208923, "rewards/margins": 0.3598710894584656, "rewards/rejected": 0.24195227026939392, "step": 280 }, { "epoch": 0.04653028479743281, "grad_norm": 9.0, "learning_rate": 2.323717948717949e-06, "logits/chosen": -2.844902753829956, "logits/rejected": -2.7955081462860107, "logps/chosen": -380.4109802246094, "logps/rejected": -238.82467651367188, "loss": 0.5781, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5810123682022095, "rewards/margins": 0.37828773260116577, "rewards/rejected": 0.2027246505022049, "step": 290 }, { "epoch": 0.048134777376654635, "grad_norm": 9.625, "learning_rate": 2.403846153846154e-06, "logits/chosen": -2.833622455596924, "logits/rejected": -2.790318250656128, "logps/chosen": -367.81622314453125, "logps/rejected": -238.77816772460938, "loss": 0.5724, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6187222599983215, "rewards/margins": 0.4182785451412201, "rewards/rejected": 0.20044367015361786, "step": 300 }, { "epoch": 0.04973926995587646, "grad_norm": 13.375, "learning_rate": 2.483974358974359e-06, "logits/chosen": -2.8505499362945557, "logits/rejected": -2.797499179840088, "logps/chosen": -415.4098205566406, "logps/rejected": -254.8622283935547, "loss": 0.5852, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.6546602249145508, "rewards/margins": 0.37283462285995483, "rewards/rejected": 0.28182560205459595, "step": 310 }, { "epoch": 0.05134376253509827, "grad_norm": 10.125, "learning_rate": 2.564102564102564e-06, "logits/chosen": -2.862910747528076, "logits/rejected": -2.7968838214874268, "logps/chosen": -437.25128173828125, "logps/rejected": -237.92538452148438, "loss": 0.5432, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.7486186027526855, "rewards/margins": 0.5206918120384216, "rewards/rejected": 0.2279268503189087, "step": 320 }, { "epoch": 0.052948255114320095, "grad_norm": 10.0, "learning_rate": 2.6442307692307696e-06, "logits/chosen": -2.821664333343506, "logits/rejected": -2.7885138988494873, "logps/chosen": -378.65472412109375, "logps/rejected": -271.7118225097656, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": 0.4578235149383545, "rewards/margins": 0.27163413166999817, "rewards/rejected": 0.18618936836719513, "step": 330 }, { "epoch": 0.05455274769354192, "grad_norm": 7.125, "learning_rate": 2.7243589743589744e-06, "logits/chosen": -2.8503165245056152, "logits/rejected": -2.7894930839538574, "logps/chosen": -387.0467224121094, "logps/rejected": -224.3261260986328, "loss": 0.559, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.4251370429992676, "rewards/margins": 0.4986073970794678, "rewards/rejected": -0.07347030937671661, "step": 340 }, { "epoch": 0.05615724027276374, "grad_norm": 11.3125, "learning_rate": 2.8044871794871797e-06, "logits/chosen": -2.843735933303833, "logits/rejected": -2.7957053184509277, "logps/chosen": -407.9142150878906, "logps/rejected": -277.27484130859375, "loss": 0.5984, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3845111131668091, "rewards/margins": 0.40977153182029724, "rewards/rejected": -0.025260334834456444, "step": 350 }, { "epoch": 0.05776173285198556, "grad_norm": 14.1875, "learning_rate": 2.8846153846153845e-06, "logits/chosen": -2.8279991149902344, "logits/rejected": -2.7839889526367188, "logps/chosen": -376.7598571777344, "logps/rejected": -270.28118896484375, "loss": 0.6018, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.2761105000972748, "rewards/margins": 0.4085955023765564, "rewards/rejected": -0.13248497247695923, "step": 360 }, { "epoch": 0.059366225431207384, "grad_norm": 8.9375, "learning_rate": 2.96474358974359e-06, "logits/chosen": -2.8521111011505127, "logits/rejected": -2.7879891395568848, "logps/chosen": -438.30706787109375, "logps/rejected": -258.4615783691406, "loss": 0.5302, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.40747037529945374, "rewards/margins": 0.5611386895179749, "rewards/rejected": -0.1536683440208435, "step": 370 }, { "epoch": 0.0609707180104292, "grad_norm": 10.1875, "learning_rate": 3.044871794871795e-06, "logits/chosen": -2.8454055786132812, "logits/rejected": -2.800569534301758, "logps/chosen": -377.9129943847656, "logps/rejected": -248.82284545898438, "loss": 0.5788, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.35149574279785156, "rewards/margins": 0.4671395421028137, "rewards/rejected": -0.11564375460147858, "step": 380 }, { "epoch": 0.06257521058965103, "grad_norm": 8.75, "learning_rate": 3.125e-06, "logits/chosen": -2.862487316131592, "logits/rejected": -2.8140060901641846, "logps/chosen": -388.1512451171875, "logps/rejected": -239.6217803955078, "loss": 0.5614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4201714098453522, "rewards/margins": 0.4846549928188324, "rewards/rejected": -0.06448356807231903, "step": 390 }, { "epoch": 0.06417970316887285, "grad_norm": 13.5, "learning_rate": 3.205128205128206e-06, "logits/chosen": -2.8609440326690674, "logits/rejected": -2.809755802154541, "logps/chosen": -400.55230712890625, "logps/rejected": -252.0806427001953, "loss": 0.5862, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3960089683532715, "rewards/margins": 0.4152015745639801, "rewards/rejected": -0.019192615523934364, "step": 400 }, { "epoch": 0.06578419574809466, "grad_norm": 10.1875, "learning_rate": 3.2852564102564106e-06, "logits/chosen": -2.8571176528930664, "logits/rejected": -2.810673952102661, "logps/chosen": -370.0520324707031, "logps/rejected": -241.5593719482422, "loss": 0.5769, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3274959921836853, "rewards/margins": 0.4942966103553772, "rewards/rejected": -0.1668006330728531, "step": 410 }, { "epoch": 0.06738868832731648, "grad_norm": 10.25, "learning_rate": 3.365384615384616e-06, "logits/chosen": -2.8532814979553223, "logits/rejected": -2.8018105030059814, "logps/chosen": -423.3705139160156, "logps/rejected": -269.21759033203125, "loss": 0.5407, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3692367970943451, "rewards/margins": 0.5465822815895081, "rewards/rejected": -0.17734552919864655, "step": 420 }, { "epoch": 0.0689931809065383, "grad_norm": 10.0, "learning_rate": 3.4455128205128206e-06, "logits/chosen": -2.845258951187134, "logits/rejected": -2.8067426681518555, "logps/chosen": -415.3434143066406, "logps/rejected": -263.89385986328125, "loss": 0.5885, "rewards/accuracies": 0.6875, "rewards/chosen": 0.31811749935150146, "rewards/margins": 0.4732894003391266, "rewards/rejected": -0.15517184138298035, "step": 430 }, { "epoch": 0.07059767348576013, "grad_norm": 9.125, "learning_rate": 3.5256410256410263e-06, "logits/chosen": -2.859617233276367, "logits/rejected": -2.795375108718872, "logps/chosen": -428.1005859375, "logps/rejected": -248.1711883544922, "loss": 0.5328, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4356008470058441, "rewards/margins": 0.609188973903656, "rewards/rejected": -0.1735881268978119, "step": 440 }, { "epoch": 0.07220216606498195, "grad_norm": 12.125, "learning_rate": 3.605769230769231e-06, "logits/chosen": -2.8604073524475098, "logits/rejected": -2.7885172367095947, "logps/chosen": -433.9112243652344, "logps/rejected": -230.7516632080078, "loss": 0.5132, "rewards/accuracies": 0.75, "rewards/chosen": 0.3045286238193512, "rewards/margins": 0.6573038697242737, "rewards/rejected": -0.35277533531188965, "step": 450 }, { "epoch": 0.07380665864420377, "grad_norm": 9.75, "learning_rate": 3.6858974358974363e-06, "logits/chosen": -2.8528621196746826, "logits/rejected": -2.79054594039917, "logps/chosen": -449.8255920410156, "logps/rejected": -244.44070434570312, "loss": 0.4864, "rewards/accuracies": 0.8125, "rewards/chosen": 0.32995155453681946, "rewards/margins": 0.7411577105522156, "rewards/rejected": -0.4112061858177185, "step": 460 }, { "epoch": 0.0754111512234256, "grad_norm": 9.9375, "learning_rate": 3.766025641025641e-06, "logits/chosen": -2.8395819664001465, "logits/rejected": -2.7896876335144043, "logps/chosen": -400.5814514160156, "logps/rejected": -245.1863555908203, "loss": 0.5782, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.1910112351179123, "rewards/margins": 0.5769097208976746, "rewards/rejected": -0.38589853048324585, "step": 470 }, { "epoch": 0.07701564380264742, "grad_norm": 11.125, "learning_rate": 3.846153846153847e-06, "logits/chosen": -2.8585433959960938, "logits/rejected": -2.803093671798706, "logps/chosen": -433.22265625, "logps/rejected": -259.0569763183594, "loss": 0.542, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.24159100651741028, "rewards/margins": 0.6452925801277161, "rewards/rejected": -0.4037016034126282, "step": 480 }, { "epoch": 0.07862013638186924, "grad_norm": 9.75, "learning_rate": 3.926282051282051e-06, "logits/chosen": -2.852581739425659, "logits/rejected": -2.809305429458618, "logps/chosen": -387.5144958496094, "logps/rejected": -269.47540283203125, "loss": 0.5967, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.1256721317768097, "rewards/margins": 0.5540415048599243, "rewards/rejected": -0.4283693730831146, "step": 490 }, { "epoch": 0.08022462896109106, "grad_norm": 12.5625, "learning_rate": 4.006410256410257e-06, "logits/chosen": -2.844813108444214, "logits/rejected": -2.8015778064727783, "logps/chosen": -356.5226135253906, "logps/rejected": -227.7875213623047, "loss": 0.5476, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.17828026413917542, "rewards/margins": 0.5639787912368774, "rewards/rejected": -0.385698527097702, "step": 500 }, { "epoch": 0.08182912154031288, "grad_norm": 9.4375, "learning_rate": 4.086538461538462e-06, "logits/chosen": -2.862905502319336, "logits/rejected": -2.8066134452819824, "logps/chosen": -419.22052001953125, "logps/rejected": -263.11529541015625, "loss": 0.5625, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.2933647036552429, "rewards/margins": 0.5651786923408508, "rewards/rejected": -0.27181392908096313, "step": 510 }, { "epoch": 0.08343361411953469, "grad_norm": 11.5, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.86434006690979, "logits/rejected": -2.8119564056396484, "logps/chosen": -368.76422119140625, "logps/rejected": -228.81649780273438, "loss": 0.5156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09425476938486099, "rewards/margins": 0.6207835674285889, "rewards/rejected": -0.5265287756919861, "step": 520 }, { "epoch": 0.08503810669875651, "grad_norm": 10.1875, "learning_rate": 4.246794871794872e-06, "logits/chosen": -2.8573191165924072, "logits/rejected": -2.806361675262451, "logps/chosen": -431.3561096191406, "logps/rejected": -302.09466552734375, "loss": 0.5593, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.12172748148441315, "rewards/margins": 0.6556259393692017, "rewards/rejected": -0.5338984727859497, "step": 530 }, { "epoch": 0.08664259927797834, "grad_norm": 12.1875, "learning_rate": 4.326923076923077e-06, "logits/chosen": -2.8340907096862793, "logits/rejected": -2.7975094318389893, "logps/chosen": -384.72674560546875, "logps/rejected": -264.4954528808594, "loss": 0.5348, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14650240540504456, "rewards/margins": 0.6401864290237427, "rewards/rejected": -0.7866889238357544, "step": 540 }, { "epoch": 0.08824709185720016, "grad_norm": 8.75, "learning_rate": 4.4070512820512826e-06, "logits/chosen": -2.8491458892822266, "logits/rejected": -2.795612096786499, "logps/chosen": -410.06884765625, "logps/rejected": -244.5002899169922, "loss": 0.5309, "rewards/accuracies": 0.75, "rewards/chosen": -0.025567490607500076, "rewards/margins": 0.6756025552749634, "rewards/rejected": -0.7011700868606567, "step": 550 }, { "epoch": 0.08985158443642198, "grad_norm": 10.875, "learning_rate": 4.487179487179488e-06, "logits/chosen": -2.867560863494873, "logits/rejected": -2.816922664642334, "logps/chosen": -478.77001953125, "logps/rejected": -282.06488037109375, "loss": 0.5592, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.1806914061307907, "rewards/margins": 0.6196719408035278, "rewards/rejected": -0.4389805197715759, "step": 560 }, { "epoch": 0.0914560770156438, "grad_norm": 16.375, "learning_rate": 4.567307692307692e-06, "logits/chosen": -2.846088409423828, "logits/rejected": -2.7940449714660645, "logps/chosen": -429.5796813964844, "logps/rejected": -270.264404296875, "loss": 0.535, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.09899018704891205, "rewards/margins": 0.631646454334259, "rewards/rejected": -0.7306365966796875, "step": 570 }, { "epoch": 0.09306056959486562, "grad_norm": 14.125, "learning_rate": 4.647435897435898e-06, "logits/chosen": -2.833425521850586, "logits/rejected": -2.7957961559295654, "logps/chosen": -401.82244873046875, "logps/rejected": -260.91717529296875, "loss": 0.5443, "rewards/accuracies": 0.75, "rewards/chosen": -0.3537163734436035, "rewards/margins": 0.5703007578849792, "rewards/rejected": -0.9240171313285828, "step": 580 }, { "epoch": 0.09466506217408745, "grad_norm": 16.625, "learning_rate": 4.727564102564103e-06, "logits/chosen": -2.833768606185913, "logits/rejected": -2.7937381267547607, "logps/chosen": -386.2181091308594, "logps/rejected": -261.5721740722656, "loss": 0.5995, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3206387162208557, "rewards/margins": 0.5892382264137268, "rewards/rejected": -0.9098770022392273, "step": 590 }, { "epoch": 0.09626955475330927, "grad_norm": 12.8125, "learning_rate": 4.807692307692308e-06, "logits/chosen": -2.8591580390930176, "logits/rejected": -2.8130130767822266, "logps/chosen": -408.80303955078125, "logps/rejected": -281.59698486328125, "loss": 0.5646, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4771391749382019, "rewards/margins": 0.5393397808074951, "rewards/rejected": -1.0164788961410522, "step": 600 }, { "epoch": 0.09787404733253109, "grad_norm": 11.8125, "learning_rate": 4.887820512820513e-06, "logits/chosen": -2.8518896102905273, "logits/rejected": -2.8146440982818604, "logps/chosen": -405.8784484863281, "logps/rejected": -277.8583068847656, "loss": 0.5194, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15921525657176971, "rewards/margins": 0.655091404914856, "rewards/rejected": -0.8143066167831421, "step": 610 }, { "epoch": 0.09947853991175291, "grad_norm": 13.1875, "learning_rate": 4.967948717948718e-06, "logits/chosen": -2.852956771850586, "logits/rejected": -2.7987475395202637, "logps/chosen": -426.99169921875, "logps/rejected": -253.60910034179688, "loss": 0.5252, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1902974545955658, "rewards/margins": 0.7495798468589783, "rewards/rejected": -0.5592823028564453, "step": 620 }, { "epoch": 0.10108303249097472, "grad_norm": 10.0625, "learning_rate": 4.999985878011927e-06, "logits/chosen": -2.847449779510498, "logits/rejected": -2.8099355697631836, "logps/chosen": -382.493408203125, "logps/rejected": -264.1233215332031, "loss": 0.5445, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.11230488121509552, "rewards/margins": 0.6818257570266724, "rewards/rejected": -0.7941306829452515, "step": 630 }, { "epoch": 0.10268752507019654, "grad_norm": 9.0, "learning_rate": 4.999899577551476e-06, "logits/chosen": -2.854527473449707, "logits/rejected": -2.8013367652893066, "logps/chosen": -431.7940979003906, "logps/rejected": -259.3672180175781, "loss": 0.4619, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.22716331481933594, "rewards/margins": 0.8081411123275757, "rewards/rejected": -1.0353046655654907, "step": 640 }, { "epoch": 0.10429201764941837, "grad_norm": 8.3125, "learning_rate": 4.999734824884512e-06, "logits/chosen": -2.8519949913024902, "logits/rejected": -2.789717674255371, "logps/chosen": -375.45941162109375, "logps/rejected": -214.428466796875, "loss": 0.4887, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11137155443429947, "rewards/margins": 0.8049596548080444, "rewards/rejected": -0.9163312911987305, "step": 650 }, { "epoch": 0.10589651022864019, "grad_norm": 11.9375, "learning_rate": 4.99949162518133e-06, "logits/chosen": -2.8302054405212402, "logits/rejected": -2.7920379638671875, "logps/chosen": -368.4721984863281, "logps/rejected": -245.62875366210938, "loss": 0.4666, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5015857219696045, "rewards/margins": 0.9629722833633423, "rewards/rejected": -0.4613865315914154, "step": 660 }, { "epoch": 0.10750100280786201, "grad_norm": 10.875, "learning_rate": 4.999169986074056e-06, "logits/chosen": -2.8450841903686523, "logits/rejected": -2.8031482696533203, "logps/chosen": -369.4075927734375, "logps/rejected": -246.04324340820312, "loss": 0.5751, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5235503315925598, "rewards/margins": 0.7644535303115845, "rewards/rejected": -0.24090318381786346, "step": 670 }, { "epoch": 0.10910549538708383, "grad_norm": 7.625, "learning_rate": 4.9987699176564145e-06, "logits/chosen": -2.87187123298645, "logits/rejected": -2.814629077911377, "logps/chosen": -430.436279296875, "logps/rejected": -270.9133605957031, "loss": 0.4719, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.6268840432167053, "rewards/margins": 0.8945601582527161, "rewards/rejected": -0.2676761746406555, "step": 680 }, { "epoch": 0.11070998796630566, "grad_norm": 13.1875, "learning_rate": 4.99829143248341e-06, "logits/chosen": -2.8547470569610596, "logits/rejected": -2.7994818687438965, "logps/chosen": -392.24725341796875, "logps/rejected": -245.3411102294922, "loss": 0.5205, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.12994572520256042, "rewards/margins": 0.8544895052909851, "rewards/rejected": -0.7245438098907471, "step": 690 }, { "epoch": 0.11231448054552748, "grad_norm": 11.375, "learning_rate": 4.997734545570932e-06, "logits/chosen": -2.851435899734497, "logits/rejected": -2.7995338439941406, "logps/chosen": -431.2757263183594, "logps/rejected": -276.44757080078125, "loss": 0.4347, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.13558213412761688, "rewards/margins": 1.0834672451019287, "rewards/rejected": -0.9478851556777954, "step": 700 }, { "epoch": 0.1139189731247493, "grad_norm": 10.8125, "learning_rate": 4.997099274395288e-06, "logits/chosen": -2.852932929992676, "logits/rejected": -2.7906641960144043, "logps/chosen": -472.8045959472656, "logps/rejected": -255.48794555664062, "loss": 0.4551, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2564769685268402, "rewards/margins": 1.0417909622192383, "rewards/rejected": -0.7853139638900757, "step": 710 }, { "epoch": 0.11552346570397112, "grad_norm": 15.75, "learning_rate": 4.9963856388926464e-06, "logits/chosen": -2.849851608276367, "logits/rejected": -2.814002513885498, "logps/chosen": -358.0218505859375, "logps/rejected": -269.11151123046875, "loss": 0.5099, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.12306994199752808, "rewards/margins": 0.8975353240966797, "rewards/rejected": -0.7744653820991516, "step": 720 }, { "epoch": 0.11712795828319295, "grad_norm": 7.84375, "learning_rate": 4.995593661458419e-06, "logits/chosen": -2.878582000732422, "logits/rejected": -2.8161563873291016, "logps/chosen": -454.62725830078125, "logps/rejected": -232.20849609375, "loss": 0.5009, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19656017422676086, "rewards/margins": 0.9850205183029175, "rewards/rejected": -1.181580662727356, "step": 730 }, { "epoch": 0.11873245086241477, "grad_norm": 11.0625, "learning_rate": 4.994723366946557e-06, "logits/chosen": -2.862365245819092, "logits/rejected": -2.800396203994751, "logps/chosen": -411.9161071777344, "logps/rejected": -252.68533325195312, "loss": 0.4914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5986613035202026, "rewards/margins": 0.9145353436470032, "rewards/rejected": -1.513196587562561, "step": 740 }, { "epoch": 0.12033694344163658, "grad_norm": 9.625, "learning_rate": 4.9937747826687644e-06, "logits/chosen": -2.849714756011963, "logits/rejected": -2.7966842651367188, "logps/chosen": -479.533203125, "logps/rejected": -283.9917297363281, "loss": 0.4249, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3555232584476471, "rewards/margins": 1.1186997890472412, "rewards/rejected": -1.474223017692566, "step": 750 }, { "epoch": 0.1219414360208584, "grad_norm": 9.5625, "learning_rate": 4.9927479383936515e-06, "logits/chosen": -2.856879711151123, "logits/rejected": -2.8196873664855957, "logps/chosen": -433.6773376464844, "logps/rejected": -283.730224609375, "loss": 0.4933, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.39898625016212463, "rewards/margins": 1.1021121740341187, "rewards/rejected": -0.7031258344650269, "step": 760 }, { "epoch": 0.12354592860008022, "grad_norm": 8.375, "learning_rate": 4.99164286634579e-06, "logits/chosen": -2.857846975326538, "logits/rejected": -2.8020546436309814, "logps/chosen": -386.4705810546875, "logps/rejected": -247.07412719726562, "loss": 0.507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5760042667388916, "rewards/margins": 0.9880954027175903, "rewards/rejected": -0.41209107637405396, "step": 770 }, { "epoch": 0.12515042117930206, "grad_norm": 13.4375, "learning_rate": 4.990459601204712e-06, "logits/chosen": -2.8580970764160156, "logits/rejected": -2.8166897296905518, "logps/chosen": -405.2987365722656, "logps/rejected": -273.1578063964844, "loss": 0.5007, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.13129062950611115, "rewards/margins": 0.90264493227005, "rewards/rejected": -0.7713543176651001, "step": 780 }, { "epoch": 0.12675491375852388, "grad_norm": 14.0, "learning_rate": 4.989198180103812e-06, "logits/chosen": -2.8549675941467285, "logits/rejected": -2.794792890548706, "logps/chosen": -436.3470764160156, "logps/rejected": -258.2693786621094, "loss": 0.4931, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.05619237571954727, "rewards/margins": 0.943490207195282, "rewards/rejected": -0.999682605266571, "step": 790 }, { "epoch": 0.1283594063377457, "grad_norm": 7.5625, "learning_rate": 4.987858642629186e-06, "logits/chosen": -2.863595962524414, "logits/rejected": -2.813183307647705, "logps/chosen": -408.1491394042969, "logps/rejected": -262.6934814453125, "loss": 0.4276, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09297367185354233, "rewards/margins": 1.0539405345916748, "rewards/rejected": -0.960966944694519, "step": 800 }, { "epoch": 0.1299638989169675, "grad_norm": 12.375, "learning_rate": 4.986441030818394e-06, "logits/chosen": -2.863105535507202, "logits/rejected": -2.8114943504333496, "logps/chosen": -440.7064514160156, "logps/rejected": -273.7287902832031, "loss": 0.4992, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.02235526405274868, "rewards/margins": 0.9203702807426453, "rewards/rejected": -0.9427255392074585, "step": 810 }, { "epoch": 0.13156839149618932, "grad_norm": 8.875, "learning_rate": 4.984945389159131e-06, "logits/chosen": -2.8692402839660645, "logits/rejected": -2.8338398933410645, "logps/chosen": -411.2261657714844, "logps/rejected": -300.99090576171875, "loss": 0.532, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1685343086719513, "rewards/margins": 0.8615954518318176, "rewards/rejected": -1.0301297903060913, "step": 820 }, { "epoch": 0.13317288407541114, "grad_norm": 11.3125, "learning_rate": 4.9833717645878376e-06, "logits/chosen": -2.8571853637695312, "logits/rejected": -2.8165042400360107, "logps/chosen": -386.77191162109375, "logps/rejected": -267.6039123535156, "loss": 0.4565, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24943840503692627, "rewards/margins": 0.9196028709411621, "rewards/rejected": -1.1690412759780884, "step": 830 }, { "epoch": 0.13477737665463296, "grad_norm": 11.1875, "learning_rate": 4.981720206488226e-06, "logits/chosen": -2.869558095932007, "logits/rejected": -2.8199820518493652, "logps/chosen": -426.79107666015625, "logps/rejected": -272.1717224121094, "loss": 0.4679, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.03959231078624725, "rewards/margins": 0.9898979067802429, "rewards/rejected": -1.0294902324676514, "step": 840 }, { "epoch": 0.13638186923385479, "grad_norm": 14.375, "learning_rate": 4.97999076668973e-06, "logits/chosen": -2.855313777923584, "logits/rejected": -2.816606283187866, "logps/chosen": -429.4808044433594, "logps/rejected": -290.6871643066406, "loss": 0.459, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.12090059369802475, "rewards/margins": 1.1042053699493408, "rewards/rejected": -1.2251060009002686, "step": 850 }, { "epoch": 0.1379863618130766, "grad_norm": 12.625, "learning_rate": 4.978183499465874e-06, "logits/chosen": -2.869777202606201, "logits/rejected": -2.831350803375244, "logps/chosen": -398.8718566894531, "logps/rejected": -267.1139831542969, "loss": 0.4888, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.049588270485401154, "rewards/margins": 0.9202731251716614, "rewards/rejected": -0.870684802532196, "step": 860 }, { "epoch": 0.13959085439229843, "grad_norm": 12.1875, "learning_rate": 4.97629846153258e-06, "logits/chosen": -2.857947826385498, "logits/rejected": -2.8109123706817627, "logps/chosen": -399.7394104003906, "logps/rejected": -256.19866943359375, "loss": 0.4656, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3647017776966095, "rewards/margins": 1.0171430110931396, "rewards/rejected": -0.6524412035942078, "step": 870 }, { "epoch": 0.14119534697152025, "grad_norm": 14.6875, "learning_rate": 4.974335712046377e-06, "logits/chosen": -2.8517537117004395, "logits/rejected": -2.8100411891937256, "logps/chosen": -406.5589599609375, "logps/rejected": -280.5801696777344, "loss": 0.5495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02481737732887268, "rewards/margins": 0.9063884019851685, "rewards/rejected": -0.9312057495117188, "step": 880 }, { "epoch": 0.14279983955074207, "grad_norm": 8.5, "learning_rate": 4.972295312602549e-06, "logits/chosen": -2.860966205596924, "logits/rejected": -2.826178789138794, "logps/chosen": -387.4122009277344, "logps/rejected": -279.26861572265625, "loss": 0.5074, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3756955862045288, "rewards/margins": 0.8499695658683777, "rewards/rejected": -1.2256652116775513, "step": 890 }, { "epoch": 0.1444043321299639, "grad_norm": 16.25, "learning_rate": 4.970177327233207e-06, "logits/chosen": -2.846632957458496, "logits/rejected": -2.8146209716796875, "logps/chosen": -408.5803527832031, "logps/rejected": -315.1888732910156, "loss": 0.5584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5998748540878296, "rewards/margins": 0.8381489515304565, "rewards/rejected": -1.4380238056182861, "step": 900 }, { "epoch": 0.14600882470918572, "grad_norm": 13.375, "learning_rate": 4.967981822405267e-06, "logits/chosen": -2.874896287918091, "logits/rejected": -2.8323984146118164, "logps/chosen": -375.1218566894531, "logps/rejected": -258.2112121582031, "loss": 0.4358, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3986411690711975, "rewards/margins": 1.0664697885513306, "rewards/rejected": -1.4651110172271729, "step": 910 }, { "epoch": 0.14761331728840754, "grad_norm": 13.125, "learning_rate": 4.96570886701838e-06, "logits/chosen": -2.8632893562316895, "logits/rejected": -2.8257575035095215, "logps/chosen": -385.93505859375, "logps/rejected": -276.40252685546875, "loss": 0.456, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04311869293451309, "rewards/margins": 1.1386544704437256, "rewards/rejected": -1.181773066520691, "step": 920 }, { "epoch": 0.14921780986762936, "grad_norm": 13.9375, "learning_rate": 4.963358532402754e-06, "logits/chosen": -2.865166187286377, "logits/rejected": -2.8243813514709473, "logps/chosen": -406.18511962890625, "logps/rejected": -298.96923828125, "loss": 0.4994, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.12896624207496643, "rewards/margins": 1.0513274669647217, "rewards/rejected": -0.9223612546920776, "step": 930 }, { "epoch": 0.1508223024468512, "grad_norm": 11.125, "learning_rate": 4.960930892316928e-06, "logits/chosen": -2.842050552368164, "logits/rejected": -2.8052966594696045, "logps/chosen": -371.76544189453125, "logps/rejected": -271.7075500488281, "loss": 0.5334, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.12452211230993271, "rewards/margins": 0.8370375633239746, "rewards/rejected": -0.9615595936775208, "step": 940 }, { "epoch": 0.152426795026073, "grad_norm": 11.0625, "learning_rate": 4.958426022945451e-06, "logits/chosen": -2.850590944290161, "logits/rejected": -2.7905306816101074, "logps/chosen": -418.02008056640625, "logps/rejected": -247.71969604492188, "loss": 0.456, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.09821374714374542, "rewards/margins": 1.034183144569397, "rewards/rejected": -0.935969352722168, "step": 950 }, { "epoch": 0.15403128760529483, "grad_norm": 8.1875, "learning_rate": 4.955844002896491e-06, "logits/chosen": -2.8453829288482666, "logits/rejected": -2.794001340866089, "logps/chosen": -399.96881103515625, "logps/rejected": -224.82278442382812, "loss": 0.3753, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.14108920097351074, "rewards/margins": 1.3780138492584229, "rewards/rejected": -1.2369245290756226, "step": 960 }, { "epoch": 0.15563578018451665, "grad_norm": 14.375, "learning_rate": 4.953184913199374e-06, "logits/chosen": -2.8354249000549316, "logits/rejected": -2.7770633697509766, "logps/chosen": -427.29425048828125, "logps/rejected": -238.22476196289062, "loss": 0.4496, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4021914005279541, "rewards/margins": 1.2614705562591553, "rewards/rejected": -1.6636619567871094, "step": 970 }, { "epoch": 0.15724027276373848, "grad_norm": 11.9375, "learning_rate": 4.95044883730203e-06, "logits/chosen": -2.860459566116333, "logits/rejected": -2.8065123558044434, "logps/chosen": -445.52142333984375, "logps/rejected": -279.6673583984375, "loss": 0.4867, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8680437803268433, "rewards/margins": 1.191143274307251, "rewards/rejected": -2.059187173843384, "step": 980 }, { "epoch": 0.1588447653429603, "grad_norm": 13.4375, "learning_rate": 4.947635861068386e-06, "logits/chosen": -2.8469669818878174, "logits/rejected": -2.801302194595337, "logps/chosen": -449.8240661621094, "logps/rejected": -283.5860290527344, "loss": 0.5071, "rewards/accuracies": 0.75, "rewards/chosen": -1.2054916620254517, "rewards/margins": 1.0238996744155884, "rewards/rejected": -2.22939133644104, "step": 990 }, { "epoch": 0.16044925792218212, "grad_norm": 7.4375, "learning_rate": 4.944746072775665e-06, "logits/chosen": -2.8220298290252686, "logits/rejected": -2.78554368019104, "logps/chosen": -408.4573974609375, "logps/rejected": -301.96954345703125, "loss": 0.5083, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0626147985458374, "rewards/margins": 0.9443966150283813, "rewards/rejected": -2.0070114135742188, "step": 1000 }, { "epoch": 0.16205375050140394, "grad_norm": 12.4375, "learning_rate": 4.941779563111618e-06, "logits/chosen": -2.832078456878662, "logits/rejected": -2.7752461433410645, "logps/chosen": -463.5667419433594, "logps/rejected": -298.6123352050781, "loss": 0.4117, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.49622875452041626, "rewards/margins": 1.234946608543396, "rewards/rejected": -1.731175422668457, "step": 1010 }, { "epoch": 0.16365824308062576, "grad_norm": 9.4375, "learning_rate": 4.938736425171674e-06, "logits/chosen": -2.8524792194366455, "logits/rejected": -2.800553321838379, "logps/chosen": -447.437255859375, "logps/rejected": -275.7645568847656, "loss": 0.495, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.24053123593330383, "rewards/margins": 1.1447057723999023, "rewards/rejected": -1.3852369785308838, "step": 1020 }, { "epoch": 0.16526273565984756, "grad_norm": 7.28125, "learning_rate": 4.935616754456025e-06, "logits/chosen": -2.8419628143310547, "logits/rejected": -2.796003580093384, "logps/chosen": -424.24786376953125, "logps/rejected": -261.08819580078125, "loss": 0.4618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1646566092967987, "rewards/margins": 1.0846534967422485, "rewards/rejected": -1.2493102550506592, "step": 1030 }, { "epoch": 0.16686722823906938, "grad_norm": 9.75, "learning_rate": 4.932420648866625e-06, "logits/chosen": -2.8227810859680176, "logits/rejected": -2.7894959449768066, "logps/chosen": -365.9049377441406, "logps/rejected": -245.19790649414062, "loss": 0.5173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08955149352550507, "rewards/margins": 0.9930402040481567, "rewards/rejected": -1.0825916528701782, "step": 1040 }, { "epoch": 0.1684717208182912, "grad_norm": 5.4375, "learning_rate": 4.929148208704115e-06, "logits/chosen": -2.8581783771514893, "logits/rejected": -2.7928261756896973, "logps/chosen": -447.5279235839844, "logps/rejected": -260.6807556152344, "loss": 0.4387, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3931046426296234, "rewards/margins": 1.2747266292572021, "rewards/rejected": -0.8816219568252563, "step": 1050 }, { "epoch": 0.17007621339751303, "grad_norm": 10.875, "learning_rate": 4.9257995366646815e-06, "logits/chosen": -2.8498356342315674, "logits/rejected": -2.80733323097229, "logps/chosen": -397.88824462890625, "logps/rejected": -269.755615234375, "loss": 0.5273, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.051018256694078445, "rewards/margins": 1.0080373287200928, "rewards/rejected": -1.0590555667877197, "step": 1060 }, { "epoch": 0.17168070597673485, "grad_norm": 9.25, "learning_rate": 4.922374737836831e-06, "logits/chosen": -2.8488006591796875, "logits/rejected": -2.802272081375122, "logps/chosen": -421.5355529785156, "logps/rejected": -272.3642883300781, "loss": 0.4554, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.04262730851769447, "rewards/margins": 1.1909922361373901, "rewards/rejected": -1.2336194515228271, "step": 1070 }, { "epoch": 0.17328519855595667, "grad_norm": 9.625, "learning_rate": 4.91887391969809e-06, "logits/chosen": -2.8226168155670166, "logits/rejected": -2.782015800476074, "logps/chosen": -420.8662109375, "logps/rejected": -293.19842529296875, "loss": 0.4742, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.38794204592704773, "rewards/margins": 1.0689277648925781, "rewards/rejected": -1.4568698406219482, "step": 1080 }, { "epoch": 0.1748896911351785, "grad_norm": 10.625, "learning_rate": 4.915297192111634e-06, "logits/chosen": -2.8481686115264893, "logits/rejected": -2.7858822345733643, "logps/chosen": -379.24029541015625, "logps/rejected": -233.7908935546875, "loss": 0.4592, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6695745587348938, "rewards/margins": 1.0812642574310303, "rewards/rejected": -1.7508388757705688, "step": 1090 }, { "epoch": 0.17649418371440032, "grad_norm": 14.75, "learning_rate": 4.911644667322842e-06, "logits/chosen": -2.8497910499572754, "logits/rejected": -2.7979178428649902, "logps/chosen": -389.6773376464844, "logps/rejected": -259.4122009277344, "loss": 0.3949, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5893046855926514, "rewards/margins": 1.2521405220031738, "rewards/rejected": -1.8414452075958252, "step": 1100 }, { "epoch": 0.17809867629362214, "grad_norm": 12.4375, "learning_rate": 4.907916459955768e-06, "logits/chosen": -2.8499035835266113, "logits/rejected": -2.7966737747192383, "logps/chosen": -429.894287109375, "logps/rejected": -270.0157165527344, "loss": 0.4608, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.050095342099666595, "rewards/margins": 1.27973210811615, "rewards/rejected": -1.3298275470733643, "step": 1110 }, { "epoch": 0.17970316887284396, "grad_norm": 6.8125, "learning_rate": 4.904112687009551e-06, "logits/chosen": -2.827000141143799, "logits/rejected": -2.768686056137085, "logps/chosen": -393.4642639160156, "logps/rejected": -244.27841186523438, "loss": 0.4818, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.33323028683662415, "rewards/margins": 1.263914942741394, "rewards/rejected": -1.5971451997756958, "step": 1120 }, { "epoch": 0.18130766145206578, "grad_norm": 11.3125, "learning_rate": 4.900233467854737e-06, "logits/chosen": -2.825713872909546, "logits/rejected": -2.7785568237304688, "logps/chosen": -412.3636779785156, "logps/rejected": -260.38043212890625, "loss": 0.4012, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10967252403497696, "rewards/margins": 1.4954288005828857, "rewards/rejected": -1.6051013469696045, "step": 1130 }, { "epoch": 0.1829121540312876, "grad_norm": 12.5, "learning_rate": 4.8962789242295385e-06, "logits/chosen": -2.8395121097564697, "logits/rejected": -2.785365104675293, "logps/chosen": -408.33599853515625, "logps/rejected": -230.55618286132812, "loss": 0.4252, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011464995332062244, "rewards/margins": 1.5289808511734009, "rewards/rejected": -1.5404456853866577, "step": 1140 }, { "epoch": 0.18451664661050943, "grad_norm": 8.8125, "learning_rate": 4.892249180236008e-06, "logits/chosen": -2.812471389770508, "logits/rejected": -2.7771260738372803, "logps/chosen": -407.0672302246094, "logps/rejected": -285.8708801269531, "loss": 0.4825, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17156875133514404, "rewards/margins": 1.0909749269485474, "rewards/rejected": -0.9194060564041138, "step": 1150 }, { "epoch": 0.18612113918973125, "grad_norm": 11.0, "learning_rate": 4.88814436233615e-06, "logits/chosen": -2.835604667663574, "logits/rejected": -2.7880663871765137, "logps/chosen": -393.1040954589844, "logps/rejected": -274.363525390625, "loss": 0.5086, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1455533802509308, "rewards/margins": 0.991690456867218, "rewards/rejected": -0.8461370468139648, "step": 1160 }, { "epoch": 0.18772563176895307, "grad_norm": 14.5, "learning_rate": 4.883964599347947e-06, "logits/chosen": -2.865811586380005, "logits/rejected": -2.791569232940674, "logps/chosen": -455.54266357421875, "logps/rejected": -236.8774871826172, "loss": 0.3972, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.46389633417129517, "rewards/margins": 1.3061515092849731, "rewards/rejected": -0.8422551155090332, "step": 1170 }, { "epoch": 0.1893301243481749, "grad_norm": 10.75, "learning_rate": 4.879710022441319e-06, "logits/chosen": -2.8505630493164062, "logits/rejected": -2.794327735900879, "logps/chosen": -417.4291076660156, "logps/rejected": -276.7917785644531, "loss": 0.433, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.0002804175019264221, "rewards/margins": 1.269378662109375, "rewards/rejected": -1.2690984010696411, "step": 1180 }, { "epoch": 0.19093461692739672, "grad_norm": 11.375, "learning_rate": 4.8753807651340094e-06, "logits/chosen": -2.8082587718963623, "logits/rejected": -2.7676730155944824, "logps/chosen": -389.95208740234375, "logps/rejected": -269.0458984375, "loss": 0.4204, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.21035516262054443, "rewards/margins": 1.308610200881958, "rewards/rejected": -1.5189653635025024, "step": 1190 }, { "epoch": 0.19253910950661854, "grad_norm": 12.3125, "learning_rate": 4.870976963287389e-06, "logits/chosen": -2.854358434677124, "logits/rejected": -2.81211256980896, "logps/chosen": -442.133056640625, "logps/rejected": -296.097900390625, "loss": 0.4333, "rewards/accuracies": 0.84375, "rewards/chosen": -0.180402934551239, "rewards/margins": 1.4225049018859863, "rewards/rejected": -1.6029078960418701, "step": 1200 }, { "epoch": 0.19414360208584036, "grad_norm": 11.0625, "learning_rate": 4.866498755102199e-06, "logits/chosen": -2.815762996673584, "logits/rejected": -2.7701659202575684, "logps/chosen": -404.69659423828125, "logps/rejected": -255.0467987060547, "loss": 0.5278, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.017588134855031967, "rewards/margins": 1.0438967943191528, "rewards/rejected": -1.061484932899475, "step": 1210 }, { "epoch": 0.19574809466506218, "grad_norm": 10.75, "learning_rate": 4.86194628111421e-06, "logits/chosen": -2.8358616828918457, "logits/rejected": -2.7959179878234863, "logps/chosen": -409.2869567871094, "logps/rejected": -280.0111083984375, "loss": 0.4087, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0479540154337883, "rewards/margins": 1.389870047569275, "rewards/rejected": -1.3419160842895508, "step": 1220 }, { "epoch": 0.197352587244284, "grad_norm": 12.25, "learning_rate": 4.8573196841898115e-06, "logits/chosen": -2.8417351245880127, "logits/rejected": -2.786050319671631, "logps/chosen": -428.37225341796875, "logps/rejected": -272.734619140625, "loss": 0.4414, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3356698155403137, "rewards/margins": 1.3983747959136963, "rewards/rejected": -1.7340446710586548, "step": 1230 }, { "epoch": 0.19895707982350583, "grad_norm": 7.09375, "learning_rate": 4.852619109521533e-06, "logits/chosen": -2.826575517654419, "logits/rejected": -2.7821571826934814, "logps/chosen": -411.47576904296875, "logps/rejected": -274.7923278808594, "loss": 0.4743, "rewards/accuracies": 0.75, "rewards/chosen": -0.8697759509086609, "rewards/margins": 1.1408138275146484, "rewards/rejected": -2.010589838027954, "step": 1240 }, { "epoch": 0.20056157240272765, "grad_norm": 10.25, "learning_rate": 4.84784470462348e-06, "logits/chosen": -2.823415756225586, "logits/rejected": -2.765533685684204, "logps/chosen": -425.9173889160156, "logps/rejected": -256.94482421875, "loss": 0.439, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9649397730827332, "rewards/margins": 1.2031081914901733, "rewards/rejected": -2.1680479049682617, "step": 1250 }, { "epoch": 0.20216606498194944, "grad_norm": 9.875, "learning_rate": 4.8429966193267105e-06, "logits/chosen": -2.8484039306640625, "logits/rejected": -2.8079075813293457, "logps/chosen": -405.5970764160156, "logps/rejected": -288.0689697265625, "loss": 0.4665, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6731967926025391, "rewards/margins": 1.1682379245758057, "rewards/rejected": -1.8414348363876343, "step": 1260 }, { "epoch": 0.20377055756117127, "grad_norm": 8.1875, "learning_rate": 4.838075005774532e-06, "logits/chosen": -2.8509840965270996, "logits/rejected": -2.7832255363464355, "logps/chosen": -451.97418212890625, "logps/rejected": -266.8094177246094, "loss": 0.4219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4400562644004822, "rewards/margins": 1.3925987482070923, "rewards/rejected": -1.8326551914215088, "step": 1270 }, { "epoch": 0.2053750501403931, "grad_norm": 14.375, "learning_rate": 4.833080018417726e-06, "logits/chosen": -2.8175435066223145, "logits/rejected": -2.777705430984497, "logps/chosen": -408.74139404296875, "logps/rejected": -273.4599304199219, "loss": 0.4336, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.35707685351371765, "rewards/margins": 1.3179552555084229, "rewards/rejected": -1.6750320196151733, "step": 1280 }, { "epoch": 0.2069795427196149, "grad_norm": 11.625, "learning_rate": 4.828011814009701e-06, "logits/chosen": -2.8355205059051514, "logits/rejected": -2.787078857421875, "logps/chosen": -345.0771179199219, "logps/rejected": -229.5488739013672, "loss": 0.4149, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5090301632881165, "rewards/margins": 1.1647326946258545, "rewards/rejected": -1.6737626791000366, "step": 1290 }, { "epoch": 0.20858403529883673, "grad_norm": 11.5, "learning_rate": 4.8228705516015726e-06, "logits/chosen": -2.857856512069702, "logits/rejected": -2.8049046993255615, "logps/chosen": -410.0738830566406, "logps/rejected": -256.43267822265625, "loss": 0.4822, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.24196282029151917, "rewards/margins": 1.2176387310028076, "rewards/rejected": -1.4596015214920044, "step": 1300 }, { "epoch": 0.21018852787805856, "grad_norm": 6.6875, "learning_rate": 4.8176563925371754e-06, "logits/chosen": -2.82441782951355, "logits/rejected": -2.7806971073150635, "logps/chosen": -403.340576171875, "logps/rejected": -252.67520141601562, "loss": 0.4386, "rewards/accuracies": 0.78125, "rewards/chosen": -0.029354382306337357, "rewards/margins": 1.3465321063995361, "rewards/rejected": -1.37588632106781, "step": 1310 }, { "epoch": 0.21179302045728038, "grad_norm": 9.75, "learning_rate": 4.812369500447995e-06, "logits/chosen": -2.834958553314209, "logits/rejected": -2.7789368629455566, "logps/chosen": -439.12884521484375, "logps/rejected": -263.0425720214844, "loss": 0.4197, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12956687808036804, "rewards/margins": 1.3475834131240845, "rewards/rejected": -1.477150321006775, "step": 1320 }, { "epoch": 0.2133975130365022, "grad_norm": 8.5, "learning_rate": 4.807010041248034e-06, "logits/chosen": -2.8462655544281006, "logits/rejected": -2.789226531982422, "logps/chosen": -432.11859130859375, "logps/rejected": -265.541015625, "loss": 0.459, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5119088888168335, "rewards/margins": 1.2584164142608643, "rewards/rejected": -1.7703250646591187, "step": 1330 }, { "epoch": 0.21500200561572402, "grad_norm": 10.0625, "learning_rate": 4.801578183128612e-06, "logits/chosen": -2.821584939956665, "logits/rejected": -2.784703016281128, "logps/chosen": -376.1842956542969, "logps/rejected": -260.5780029296875, "loss": 0.4674, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7184211611747742, "rewards/margins": 1.1745176315307617, "rewards/rejected": -1.8929389715194702, "step": 1340 }, { "epoch": 0.21660649819494585, "grad_norm": 11.4375, "learning_rate": 4.796074096553076e-06, "logits/chosen": -2.8425607681274414, "logits/rejected": -2.806462049484253, "logps/chosen": -405.66046142578125, "logps/rejected": -288.26763916015625, "loss": 0.4602, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5505048632621765, "rewards/margins": 1.1473301649093628, "rewards/rejected": -1.6978349685668945, "step": 1350 }, { "epoch": 0.21821099077416767, "grad_norm": 8.875, "learning_rate": 4.79049795425146e-06, "logits/chosen": -2.8352413177490234, "logits/rejected": -2.7683029174804688, "logps/chosen": -430.3526916503906, "logps/rejected": -249.4574432373047, "loss": 0.3596, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.16624969244003296, "rewards/margins": 1.4309508800506592, "rewards/rejected": -1.597200632095337, "step": 1360 }, { "epoch": 0.2198154833533895, "grad_norm": 8.0625, "learning_rate": 4.7848499312150594e-06, "logits/chosen": -2.828045129776001, "logits/rejected": -2.7975258827209473, "logps/chosen": -373.49481201171875, "logps/rejected": -280.9105529785156, "loss": 0.4308, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.19651548564434052, "rewards/margins": 1.321374773979187, "rewards/rejected": -1.517890214920044, "step": 1370 }, { "epoch": 0.2214199759326113, "grad_norm": 11.875, "learning_rate": 4.779130204690943e-06, "logits/chosen": -2.8563780784606934, "logits/rejected": -2.8081536293029785, "logps/chosen": -436.86688232421875, "logps/rejected": -273.16558837890625, "loss": 0.4634, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2028815746307373, "rewards/margins": 1.32326340675354, "rewards/rejected": -1.5261448621749878, "step": 1380 }, { "epoch": 0.22302446851183313, "grad_norm": 7.375, "learning_rate": 4.773338954176387e-06, "logits/chosen": -2.817887783050537, "logits/rejected": -2.782639741897583, "logps/chosen": -378.1986083984375, "logps/rejected": -279.36846923828125, "loss": 0.5194, "rewards/accuracies": 0.75, "rewards/chosen": -0.42984476685523987, "rewards/margins": 1.157174825668335, "rewards/rejected": -1.587019681930542, "step": 1390 }, { "epoch": 0.22462896109105496, "grad_norm": 13.3125, "learning_rate": 4.7674763614132434e-06, "logits/chosen": -2.8435416221618652, "logits/rejected": -2.790074586868286, "logps/chosen": -429.6957092285156, "logps/rejected": -264.2143859863281, "loss": 0.4331, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.608059287071228, "rewards/margins": 1.1564500331878662, "rewards/rejected": -1.7645094394683838, "step": 1400 }, { "epoch": 0.22623345367027678, "grad_norm": 6.8125, "learning_rate": 4.761542610382239e-06, "logits/chosen": -2.8392746448516846, "logits/rejected": -2.7809157371520996, "logps/chosen": -425.05316162109375, "logps/rejected": -261.99847412109375, "loss": 0.4502, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5174814462661743, "rewards/margins": 1.2994234561920166, "rewards/rejected": -1.8169047832489014, "step": 1410 }, { "epoch": 0.2278379462494986, "grad_norm": 9.875, "learning_rate": 4.755537887297199e-06, "logits/chosen": -2.8379852771759033, "logits/rejected": -2.799140453338623, "logps/chosen": -389.2047424316406, "logps/rejected": -268.64117431640625, "loss": 0.4417, "rewards/accuracies": 0.78125, "rewards/chosen": -0.41011518239974976, "rewards/margins": 1.1836442947387695, "rewards/rejected": -1.5937594175338745, "step": 1420 }, { "epoch": 0.22944243882872042, "grad_norm": 8.9375, "learning_rate": 4.749462380599202e-06, "logits/chosen": -2.8324601650238037, "logits/rejected": -2.7815661430358887, "logps/chosen": -420.51287841796875, "logps/rejected": -267.8291015625, "loss": 0.4968, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6424036026000977, "rewards/margins": 1.261707067489624, "rewards/rejected": -1.9041106700897217, "step": 1430 }, { "epoch": 0.23104693140794225, "grad_norm": 8.4375, "learning_rate": 4.743316280950667e-06, "logits/chosen": -2.820141315460205, "logits/rejected": -2.7683265209198, "logps/chosen": -444.98553466796875, "logps/rejected": -248.98373413085938, "loss": 0.4244, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9500995874404907, "rewards/margins": 1.3280335664749146, "rewards/rejected": -2.2781333923339844, "step": 1440 }, { "epoch": 0.23265142398716407, "grad_norm": 10.0625, "learning_rate": 4.737099781229378e-06, "logits/chosen": -2.8534438610076904, "logits/rejected": -2.799450635910034, "logps/chosen": -422.92620849609375, "logps/rejected": -272.5727844238281, "loss": 0.4609, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0444400310516357, "rewards/margins": 1.201486349105835, "rewards/rejected": -2.2459261417388916, "step": 1450 }, { "epoch": 0.2342559165663859, "grad_norm": 9.0625, "learning_rate": 4.730813076522416e-06, "logits/chosen": -2.8369858264923096, "logits/rejected": -2.7790279388427734, "logps/chosen": -413.2454528808594, "logps/rejected": -265.76177978515625, "loss": 0.4273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0562739372253418, "rewards/margins": 1.2740825414657593, "rewards/rejected": -2.3303565979003906, "step": 1460 }, { "epoch": 0.2358604091456077, "grad_norm": 15.1875, "learning_rate": 4.724456364120049e-06, "logits/chosen": -2.8288745880126953, "logits/rejected": -2.786720037460327, "logps/chosen": -396.2320861816406, "logps/rejected": -257.0665283203125, "loss": 0.4642, "rewards/accuracies": 0.75, "rewards/chosen": -1.1088237762451172, "rewards/margins": 1.2594928741455078, "rewards/rejected": -2.368316650390625, "step": 1470 }, { "epoch": 0.23746490172482954, "grad_norm": 12.0, "learning_rate": 4.718029843509536e-06, "logits/chosen": -2.821851968765259, "logits/rejected": -2.794757127761841, "logps/chosen": -359.7787170410156, "logps/rejected": -287.4159851074219, "loss": 0.4796, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9977776408195496, "rewards/margins": 1.1746795177459717, "rewards/rejected": -2.172457218170166, "step": 1480 }, { "epoch": 0.23906939430405133, "grad_norm": 11.4375, "learning_rate": 4.711533716368868e-06, "logits/chosen": -2.8450379371643066, "logits/rejected": -2.7930729389190674, "logps/chosen": -417.41424560546875, "logps/rejected": -271.30255126953125, "loss": 0.4276, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9935910105705261, "rewards/margins": 1.4305678606033325, "rewards/rejected": -2.424158811569214, "step": 1490 }, { "epoch": 0.24067388688327315, "grad_norm": 8.6875, "learning_rate": 4.704968186560435e-06, "logits/chosen": -2.838139772415161, "logits/rejected": -2.7687313556671143, "logps/chosen": -422.4140625, "logps/rejected": -246.0552978515625, "loss": 0.4304, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1783483028411865, "rewards/margins": 1.331295371055603, "rewards/rejected": -2.5096435546875, "step": 1500 }, { "epoch": 0.24227837946249497, "grad_norm": 13.0, "learning_rate": 4.698333460124635e-06, "logits/chosen": -2.822394371032715, "logits/rejected": -2.7874555587768555, "logps/chosen": -420.3224182128906, "logps/rejected": -329.09368896484375, "loss": 0.5389, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8192963600158691, "rewards/margins": 1.1562691926956177, "rewards/rejected": -1.9755655527114868, "step": 1510 }, { "epoch": 0.2438828720417168, "grad_norm": 7.75, "learning_rate": 4.6916297452734035e-06, "logits/chosen": -2.8320469856262207, "logits/rejected": -2.790133237838745, "logps/chosen": -399.43499755859375, "logps/rejected": -266.1390075683594, "loss": 0.3834, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.47780370712280273, "rewards/margins": 1.447096347808838, "rewards/rejected": -1.9249000549316406, "step": 1520 }, { "epoch": 0.24548736462093862, "grad_norm": 10.875, "learning_rate": 4.68485725238368e-06, "logits/chosen": -2.83125901222229, "logits/rejected": -2.7863268852233887, "logps/chosen": -364.90167236328125, "logps/rejected": -254.18734741210938, "loss": 0.4265, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4623478353023529, "rewards/margins": 1.215044379234314, "rewards/rejected": -1.6773923635482788, "step": 1530 }, { "epoch": 0.24709185720016044, "grad_norm": 9.6875, "learning_rate": 4.678016193990804e-06, "logits/chosen": -2.843860387802124, "logits/rejected": -2.79443097114563, "logps/chosen": -431.3357849121094, "logps/rejected": -274.87298583984375, "loss": 0.4505, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4723086357116699, "rewards/margins": 1.3923436403274536, "rewards/rejected": -1.8646522760391235, "step": 1540 }, { "epoch": 0.24869634977938226, "grad_norm": 8.0, "learning_rate": 4.671106784781852e-06, "logits/chosen": -2.8497323989868164, "logits/rejected": -2.7909748554229736, "logps/chosen": -394.0882263183594, "logps/rejected": -261.38116455078125, "loss": 0.4565, "rewards/accuracies": 0.75, "rewards/chosen": -0.7600976228713989, "rewards/margins": 1.2734730243682861, "rewards/rejected": -2.0335705280303955, "step": 1550 }, { "epoch": 0.2503008423586041, "grad_norm": 11.625, "learning_rate": 4.664129241588892e-06, "logits/chosen": -2.8478569984436035, "logits/rejected": -2.8017325401306152, "logps/chosen": -346.27850341796875, "logps/rejected": -269.350830078125, "loss": 0.3808, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9132752418518066, "rewards/margins": 1.5875321626663208, "rewards/rejected": -2.500807523727417, "step": 1560 }, { "epoch": 0.2519053349378259, "grad_norm": 7.3125, "learning_rate": 4.657083783382183e-06, "logits/chosen": -2.8549914360046387, "logits/rejected": -2.788637161254883, "logps/chosen": -451.19354248046875, "logps/rejected": -271.6217041015625, "loss": 0.3843, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5077577829360962, "rewards/margins": 1.5544869899749756, "rewards/rejected": -2.0622448921203613, "step": 1570 }, { "epoch": 0.25350982751704776, "grad_norm": 9.8125, "learning_rate": 4.649970631263305e-06, "logits/chosen": -2.84084415435791, "logits/rejected": -2.7851650714874268, "logps/chosen": -439.1463317871094, "logps/rejected": -260.8518371582031, "loss": 0.4195, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.36325803399086, "rewards/margins": 1.4407813549041748, "rewards/rejected": -1.8040390014648438, "step": 1580 }, { "epoch": 0.25511432009626955, "grad_norm": 9.875, "learning_rate": 4.642790008458215e-06, "logits/chosen": -2.8434596061706543, "logits/rejected": -2.807659149169922, "logps/chosen": -373.6240539550781, "logps/rejected": -261.77874755859375, "loss": 0.4556, "rewards/accuracies": 0.78125, "rewards/chosen": -0.43002891540527344, "rewards/margins": 1.2107504606246948, "rewards/rejected": -1.6407792568206787, "step": 1590 }, { "epoch": 0.2567188126754914, "grad_norm": 9.875, "learning_rate": 4.635542140310246e-06, "logits/chosen": -2.852417469024658, "logits/rejected": -2.7987499237060547, "logps/chosen": -361.0308532714844, "logps/rejected": -230.9735870361328, "loss": 0.4234, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5390613079071045, "rewards/margins": 1.4163873195648193, "rewards/rejected": -1.9554487466812134, "step": 1600 }, { "epoch": 0.2583233052547132, "grad_norm": 11.0625, "learning_rate": 4.628227254273035e-06, "logits/chosen": -2.8414974212646484, "logits/rejected": -2.7943129539489746, "logps/chosen": -411.3245544433594, "logps/rejected": -294.3775939941406, "loss": 0.4239, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.030005883425474167, "rewards/margins": 1.4693728685379028, "rewards/rejected": -1.4993788003921509, "step": 1610 }, { "epoch": 0.259927797833935, "grad_norm": 5.65625, "learning_rate": 4.620845579903384e-06, "logits/chosen": -2.8417601585388184, "logits/rejected": -2.804967164993286, "logps/chosen": -406.329833984375, "logps/rejected": -285.1833190917969, "loss": 0.5088, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.09530623257160187, "rewards/margins": 1.1653735637664795, "rewards/rejected": -1.0700671672821045, "step": 1620 }, { "epoch": 0.26153229041315684, "grad_norm": 8.875, "learning_rate": 4.613397348854052e-06, "logits/chosen": -2.840447425842285, "logits/rejected": -2.776920795440674, "logps/chosen": -429.1918029785156, "logps/rejected": -250.17703247070312, "loss": 0.4002, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.34828829765319824, "rewards/margins": 1.4636526107788086, "rewards/rejected": -1.8119409084320068, "step": 1630 }, { "epoch": 0.26313678299237864, "grad_norm": 15.5, "learning_rate": 4.6058827948664955e-06, "logits/chosen": -2.8132128715515137, "logits/rejected": -2.7712903022766113, "logps/chosen": -436.04571533203125, "logps/rejected": -297.0361022949219, "loss": 0.5216, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7419519424438477, "rewards/margins": 1.3626269102096558, "rewards/rejected": -2.104578733444214, "step": 1640 }, { "epoch": 0.2647412755716005, "grad_norm": 14.6875, "learning_rate": 4.598302153763521e-06, "logits/chosen": -2.8330299854278564, "logits/rejected": -2.779334545135498, "logps/chosen": -447.2433166503906, "logps/rejected": -306.1699523925781, "loss": 0.4278, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.736446738243103, "rewards/margins": 1.4597265720367432, "rewards/rejected": -2.1961731910705566, "step": 1650 }, { "epoch": 0.2663457681508223, "grad_norm": 11.0625, "learning_rate": 4.590655663441895e-06, "logits/chosen": -2.82206392288208, "logits/rejected": -2.7750067710876465, "logps/chosen": -376.8445129394531, "logps/rejected": -263.13800048828125, "loss": 0.4451, "rewards/accuracies": 0.78125, "rewards/chosen": -1.030211091041565, "rewards/margins": 1.1909860372543335, "rewards/rejected": -2.2211971282958984, "step": 1660 }, { "epoch": 0.26795026073004413, "grad_norm": 10.25, "learning_rate": 4.582943563864871e-06, "logits/chosen": -2.8382768630981445, "logits/rejected": -2.789553165435791, "logps/chosen": -421.49798583984375, "logps/rejected": -263.3898010253906, "loss": 0.4338, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.6687736511230469, "rewards/margins": 1.4316935539245605, "rewards/rejected": -2.1004672050476074, "step": 1670 }, { "epoch": 0.2695547533092659, "grad_norm": 9.25, "learning_rate": 4.575166097054662e-06, "logits/chosen": -2.8354883193969727, "logits/rejected": -2.8013734817504883, "logps/chosen": -340.6007995605469, "logps/rejected": -255.2383575439453, "loss": 0.5074, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7958927154541016, "rewards/margins": 1.0950578451156616, "rewards/rejected": -1.8909505605697632, "step": 1680 }, { "epoch": 0.2711592458884878, "grad_norm": 11.0625, "learning_rate": 4.5673235070848466e-06, "logits/chosen": -2.829465389251709, "logits/rejected": -2.776041030883789, "logps/chosen": -455.27313232421875, "logps/rejected": -294.5025329589844, "loss": 0.4468, "rewards/accuracies": 0.78125, "rewards/chosen": -0.49423256516456604, "rewards/margins": 1.312168836593628, "rewards/rejected": -1.806401252746582, "step": 1690 }, { "epoch": 0.27276373846770957, "grad_norm": 12.375, "learning_rate": 4.559416040072704e-06, "logits/chosen": -2.8363401889801025, "logits/rejected": -2.7936367988586426, "logps/chosen": -435.79095458984375, "logps/rejected": -316.5937805175781, "loss": 0.4535, "rewards/accuracies": 0.75, "rewards/chosen": -0.5846672058105469, "rewards/margins": 1.275101900100708, "rewards/rejected": -1.8597691059112549, "step": 1700 }, { "epoch": 0.2743682310469314, "grad_norm": 14.5625, "learning_rate": 4.551443944171498e-06, "logits/chosen": -2.831512212753296, "logits/rejected": -2.772264003753662, "logps/chosen": -452.65643310546875, "logps/rejected": -275.2079162597656, "loss": 0.4382, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6748954653739929, "rewards/margins": 1.2464044094085693, "rewards/rejected": -1.921299695968628, "step": 1710 }, { "epoch": 0.2759727236261532, "grad_norm": 10.9375, "learning_rate": 4.543407469562683e-06, "logits/chosen": -2.8399877548217773, "logits/rejected": -2.7791390419006348, "logps/chosen": -441.6180114746094, "logps/rejected": -276.15911865234375, "loss": 0.3855, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.6309624910354614, "rewards/margins": 1.5666242837905884, "rewards/rejected": -2.19758677482605, "step": 1720 }, { "epoch": 0.27757721620537507, "grad_norm": 12.25, "learning_rate": 4.535306868448056e-06, "logits/chosen": -2.8203558921813965, "logits/rejected": -2.766663074493408, "logps/chosen": -441.9458923339844, "logps/rejected": -273.5175476074219, "loss": 0.3641, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6861680150032043, "rewards/margins": 1.4781992435455322, "rewards/rejected": -2.164367437362671, "step": 1730 }, { "epoch": 0.27918170878459686, "grad_norm": 9.75, "learning_rate": 4.527142395041842e-06, "logits/chosen": -2.8248648643493652, "logits/rejected": -2.7880301475524902, "logps/chosen": -368.2842712402344, "logps/rejected": -283.63177490234375, "loss": 0.5307, "rewards/accuracies": 0.75, "rewards/chosen": -1.2060248851776123, "rewards/margins": 1.0623140335083008, "rewards/rejected": -2.268339157104492, "step": 1740 }, { "epoch": 0.2807862013638187, "grad_norm": 9.1875, "learning_rate": 4.5189143055627125e-06, "logits/chosen": -2.8148093223571777, "logits/rejected": -2.754791736602783, "logps/chosen": -418.1742248535156, "logps/rejected": -237.22714233398438, "loss": 0.4155, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9080227017402649, "rewards/margins": 1.5029445886611938, "rewards/rejected": -2.4109673500061035, "step": 1750 }, { "epoch": 0.2823906939430405, "grad_norm": 14.25, "learning_rate": 4.510622858225751e-06, "logits/chosen": -2.826892137527466, "logits/rejected": -2.773489475250244, "logps/chosen": -426.64013671875, "logps/rejected": -265.1723937988281, "loss": 0.4393, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7476126551628113, "rewards/margins": 1.3893746137619019, "rewards/rejected": -2.1369872093200684, "step": 1760 }, { "epoch": 0.28399518652226236, "grad_norm": 11.1875, "learning_rate": 4.502268313234346e-06, "logits/chosen": -2.8378753662109375, "logits/rejected": -2.804105281829834, "logps/chosen": -395.4019470214844, "logps/rejected": -299.331298828125, "loss": 0.4605, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7183247804641724, "rewards/margins": 1.1963268518447876, "rewards/rejected": -1.914651870727539, "step": 1770 }, { "epoch": 0.28559967910148415, "grad_norm": 13.0, "learning_rate": 4.493850932772023e-06, "logits/chosen": -2.835758686065674, "logits/rejected": -2.777498483657837, "logps/chosen": -450.83001708984375, "logps/rejected": -296.15155029296875, "loss": 0.4322, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5921076536178589, "rewards/margins": 1.3440371751785278, "rewards/rejected": -1.9361445903778076, "step": 1780 }, { "epoch": 0.287204171680706, "grad_norm": 9.75, "learning_rate": 4.485370980994222e-06, "logits/chosen": -2.811516284942627, "logits/rejected": -2.7632298469543457, "logps/chosen": -445.705078125, "logps/rejected": -288.3770446777344, "loss": 0.361, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9876825213432312, "rewards/margins": 1.6245466470718384, "rewards/rejected": -2.612229108810425, "step": 1790 }, { "epoch": 0.2888086642599278, "grad_norm": 8.0625, "learning_rate": 4.476828724020004e-06, "logits/chosen": -2.81585693359375, "logits/rejected": -2.7703843116760254, "logps/chosen": -440.3304748535156, "logps/rejected": -305.70623779296875, "loss": 0.4022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7863627076148987, "rewards/margins": 1.5422474145889282, "rewards/rejected": -2.3286099433898926, "step": 1800 }, { "epoch": 0.29041315683914964, "grad_norm": 10.5, "learning_rate": 4.4682244299237e-06, "logits/chosen": -2.833800792694092, "logits/rejected": -2.766378879547119, "logps/chosen": -419.5318298339844, "logps/rejected": -244.69845581054688, "loss": 0.382, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8790322542190552, "rewards/margins": 1.5583170652389526, "rewards/rejected": -2.437349319458008, "step": 1810 }, { "epoch": 0.29201764941837144, "grad_norm": 7.0, "learning_rate": 4.4595583687264995e-06, "logits/chosen": -2.8306946754455566, "logits/rejected": -2.7733991146087646, "logps/chosen": -417.86407470703125, "logps/rejected": -281.1275329589844, "loss": 0.3496, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4836646616458893, "rewards/margins": 1.7228864431381226, "rewards/rejected": -2.2065510749816895, "step": 1820 }, { "epoch": 0.29362214199759323, "grad_norm": 11.9375, "learning_rate": 4.450830812387977e-06, "logits/chosen": -2.8085079193115234, "logits/rejected": -2.754930019378662, "logps/chosen": -395.44842529296875, "logps/rejected": -260.34173583984375, "loss": 0.3796, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4923313558101654, "rewards/margins": 1.7218822240829468, "rewards/rejected": -2.2142133712768555, "step": 1830 }, { "epoch": 0.2952266345768151, "grad_norm": 14.125, "learning_rate": 4.442042034797553e-06, "logits/chosen": -2.8227367401123047, "logits/rejected": -2.7747013568878174, "logps/chosen": -403.69647216796875, "logps/rejected": -277.3199462890625, "loss": 0.4172, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.33268576860427856, "rewards/margins": 1.791925072669983, "rewards/rejected": -2.124610424041748, "step": 1840 }, { "epoch": 0.2968311271560369, "grad_norm": 16.875, "learning_rate": 4.4331923117659074e-06, "logits/chosen": -2.8039228916168213, "logits/rejected": -2.750880718231201, "logps/chosen": -394.9085998535156, "logps/rejected": -254.77847290039062, "loss": 0.456, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4723745882511139, "rewards/margins": 1.4253343343734741, "rewards/rejected": -1.8977091312408447, "step": 1850 }, { "epoch": 0.29843561973525873, "grad_norm": 12.1875, "learning_rate": 4.424281921016313e-06, "logits/chosen": -2.819580078125, "logits/rejected": -2.7685558795928955, "logps/chosen": -413.388916015625, "logps/rejected": -266.0133972167969, "loss": 0.4021, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.025130027905106544, "rewards/margins": 1.64925217628479, "rewards/rejected": -1.6743818521499634, "step": 1860 }, { "epoch": 0.3000401123144805, "grad_norm": 7.8125, "learning_rate": 4.4153111421759295e-06, "logits/chosen": -2.8199543952941895, "logits/rejected": -2.7723941802978516, "logps/chosen": -436.6410217285156, "logps/rejected": -281.31097412109375, "loss": 0.4174, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07161926478147507, "rewards/margins": 1.3388832807540894, "rewards/rejected": -1.2672641277313232, "step": 1870 }, { "epoch": 0.3016446048937024, "grad_norm": 11.75, "learning_rate": 4.406280256767022e-06, "logits/chosen": -2.7875537872314453, "logits/rejected": -2.7518553733825684, "logps/chosen": -347.3486633300781, "logps/rejected": -241.81088256835938, "loss": 0.4995, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.30371221899986267, "rewards/margins": 1.167853832244873, "rewards/rejected": -1.4715659618377686, "step": 1880 }, { "epoch": 0.30324909747292417, "grad_norm": 9.6875, "learning_rate": 4.397189548198131e-06, "logits/chosen": -2.808062791824341, "logits/rejected": -2.7635011672973633, "logps/chosen": -350.69622802734375, "logps/rejected": -259.8271484375, "loss": 0.3661, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11702452600002289, "rewards/margins": 1.5752732753753662, "rewards/rejected": -1.6922976970672607, "step": 1890 }, { "epoch": 0.304853590052146, "grad_norm": 11.875, "learning_rate": 4.3880393017551726e-06, "logits/chosen": -2.804323673248291, "logits/rejected": -2.7637481689453125, "logps/chosen": -404.2551574707031, "logps/rejected": -308.30615234375, "loss": 0.473, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.26971399784088135, "rewards/margins": 1.2568085193634033, "rewards/rejected": -1.5265226364135742, "step": 1900 }, { "epoch": 0.3064580826313678, "grad_norm": 7.21875, "learning_rate": 4.378829804592492e-06, "logits/chosen": -2.7919106483459473, "logits/rejected": -2.745802164077759, "logps/chosen": -382.80267333984375, "logps/rejected": -243.62216186523438, "loss": 0.4132, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.5920853614807129, "rewards/margins": 1.4570844173431396, "rewards/rejected": -2.0491700172424316, "step": 1910 }, { "epoch": 0.30806257521058966, "grad_norm": 9.0, "learning_rate": 4.369561345723849e-06, "logits/chosen": -2.792496919631958, "logits/rejected": -2.757033348083496, "logps/chosen": -364.6177062988281, "logps/rejected": -265.30108642578125, "loss": 0.4128, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7346659898757935, "rewards/margins": 1.4026756286621094, "rewards/rejected": -2.1373417377471924, "step": 1920 }, { "epoch": 0.30966706778981146, "grad_norm": 18.375, "learning_rate": 4.3602342160133465e-06, "logits/chosen": -2.7862532138824463, "logits/rejected": -2.746852159500122, "logps/chosen": -360.64923095703125, "logps/rejected": -280.3394775390625, "loss": 0.5027, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.250262975692749, "rewards/margins": 1.1754741668701172, "rewards/rejected": -2.425737142562866, "step": 1930 }, { "epoch": 0.3112715603690333, "grad_norm": 9.25, "learning_rate": 4.350848708166303e-06, "logits/chosen": -2.797743558883667, "logits/rejected": -2.7491729259490967, "logps/chosen": -443.4658203125, "logps/rejected": -277.5204162597656, "loss": 0.488, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5107940435409546, "rewards/margins": 1.3312230110168457, "rewards/rejected": -1.8420171737670898, "step": 1940 }, { "epoch": 0.3128760529482551, "grad_norm": 11.0, "learning_rate": 4.341405116720071e-06, "logits/chosen": -2.8033390045166016, "logits/rejected": -2.7506601810455322, "logps/chosen": -414.3819274902344, "logps/rejected": -274.1234436035156, "loss": 0.4838, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6175442337989807, "rewards/margins": 1.1533035039901733, "rewards/rejected": -1.7708479166030884, "step": 1950 }, { "epoch": 0.31448054552747695, "grad_norm": 9.1875, "learning_rate": 4.331903738034789e-06, "logits/chosen": -2.810904026031494, "logits/rejected": -2.770843267440796, "logps/chosen": -422.90606689453125, "logps/rejected": -284.57940673828125, "loss": 0.5256, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7260259389877319, "rewards/margins": 1.0210621356964111, "rewards/rejected": -1.747088074684143, "step": 1960 }, { "epoch": 0.31608503810669875, "grad_norm": 12.5625, "learning_rate": 4.322344870284085e-06, "logits/chosen": -2.8122241497039795, "logits/rejected": -2.7609055042266846, "logps/chosen": -395.7962341308594, "logps/rejected": -271.0941467285156, "loss": 0.42, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6745200753211975, "rewards/margins": 1.4384453296661377, "rewards/rejected": -2.1129653453826904, "step": 1970 }, { "epoch": 0.3176895306859206, "grad_norm": 9.9375, "learning_rate": 4.3127288134457134e-06, "logits/chosen": -2.7955710887908936, "logits/rejected": -2.7677693367004395, "logps/chosen": -353.9372253417969, "logps/rejected": -273.2738952636719, "loss": 0.459, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8430483937263489, "rewards/margins": 1.1921310424804688, "rewards/rejected": -2.035179615020752, "step": 1980 }, { "epoch": 0.3192940232651424, "grad_norm": 8.0, "learning_rate": 4.303055869292148e-06, "logits/chosen": -2.824467182159424, "logits/rejected": -2.7715914249420166, "logps/chosen": -398.9433898925781, "logps/rejected": -255.2515411376953, "loss": 0.4277, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7953391075134277, "rewards/margins": 1.4688130617141724, "rewards/rejected": -2.2641520500183105, "step": 1990 }, { "epoch": 0.32089851584436424, "grad_norm": 8.0, "learning_rate": 4.293326341381107e-06, "logits/chosen": -2.797856569290161, "logits/rejected": -2.7529053688049316, "logps/chosen": -399.34112548828125, "logps/rejected": -278.1756286621094, "loss": 0.3842, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8905127644538879, "rewards/margins": 1.5137399435043335, "rewards/rejected": -2.404252767562866, "step": 2000 }, { "epoch": 0.32250300842358604, "grad_norm": 10.125, "learning_rate": 4.283540535046028e-06, "logits/chosen": -2.817237377166748, "logits/rejected": -2.7562451362609863, "logps/chosen": -437.9877014160156, "logps/rejected": -268.0690002441406, "loss": 0.3691, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5093081593513489, "rewards/margins": 1.6034595966339111, "rewards/rejected": -2.1127676963806152, "step": 2010 }, { "epoch": 0.3241075010028079, "grad_norm": 10.4375, "learning_rate": 4.273698757386488e-06, "logits/chosen": -2.802812099456787, "logits/rejected": -2.7696757316589355, "logps/chosen": -364.01788330078125, "logps/rejected": -270.11322021484375, "loss": 0.4031, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.649468183517456, "rewards/margins": 1.449625015258789, "rewards/rejected": -2.099092960357666, "step": 2020 }, { "epoch": 0.3257119935820297, "grad_norm": 10.625, "learning_rate": 4.263801317258561e-06, "logits/chosen": -2.8311779499053955, "logits/rejected": -2.7837471961975098, "logps/chosen": -434.9518127441406, "logps/rejected": -283.3076171875, "loss": 0.4052, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.31916719675064087, "rewards/margins": 1.8119815587997437, "rewards/rejected": -2.1311488151550293, "step": 2030 }, { "epoch": 0.32731648616125153, "grad_norm": 13.25, "learning_rate": 4.253848525265133e-06, "logits/chosen": -2.810074806213379, "logits/rejected": -2.74710750579834, "logps/chosen": -447.9266052246094, "logps/rejected": -257.04217529296875, "loss": 0.3461, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.6491631269454956, "rewards/margins": 1.897173285484314, "rewards/rejected": -2.5463361740112305, "step": 2040 }, { "epoch": 0.3289209787404733, "grad_norm": 9.25, "learning_rate": 4.2438406937461454e-06, "logits/chosen": -2.8261799812316895, "logits/rejected": -2.770085573196411, "logps/chosen": -403.342529296875, "logps/rejected": -283.41583251953125, "loss": 0.4482, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9808372259140015, "rewards/margins": 1.575923204421997, "rewards/rejected": -2.556760549545288, "step": 2050 }, { "epoch": 0.3305254713196951, "grad_norm": 7.78125, "learning_rate": 4.2337781367688005e-06, "logits/chosen": -2.804826021194458, "logits/rejected": -2.7547125816345215, "logps/chosen": -434.2330017089844, "logps/rejected": -265.1722106933594, "loss": 0.4707, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.0464017391204834, "rewards/margins": 1.363721489906311, "rewards/rejected": -2.410123348236084, "step": 2060 }, { "epoch": 0.33212996389891697, "grad_norm": 11.375, "learning_rate": 4.2236611701177014e-06, "logits/chosen": -2.8363442420959473, "logits/rejected": -2.7976276874542236, "logps/chosen": -391.71063232421875, "logps/rejected": -288.6444091796875, "loss": 0.3832, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.1158915758132935, "rewards/margins": 1.5520169734954834, "rewards/rejected": -2.6679084300994873, "step": 2070 }, { "epoch": 0.33373445647813876, "grad_norm": 9.8125, "learning_rate": 4.213490111284945e-06, "logits/chosen": -2.8100860118865967, "logits/rejected": -2.756178617477417, "logps/chosen": -426.25994873046875, "logps/rejected": -272.02874755859375, "loss": 0.4081, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7943537831306458, "rewards/margins": 1.5453530550003052, "rewards/rejected": -2.339707136154175, "step": 2080 }, { "epoch": 0.3353389490573606, "grad_norm": 13.0, "learning_rate": 4.2032652794601566e-06, "logits/chosen": -2.7990894317626953, "logits/rejected": -2.7656126022338867, "logps/chosen": -383.18670654296875, "logps/rejected": -259.5707092285156, "loss": 0.4467, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8756702542304993, "rewards/margins": 1.3891487121582031, "rewards/rejected": -2.2648189067840576, "step": 2090 }, { "epoch": 0.3369434416365824, "grad_norm": 11.625, "learning_rate": 4.192986995520472e-06, "logits/chosen": -2.8053946495056152, "logits/rejected": -2.7492048740386963, "logps/chosen": -431.46649169921875, "logps/rejected": -269.7936706542969, "loss": 0.4301, "rewards/accuracies": 0.8125, "rewards/chosen": -0.45706477761268616, "rewards/margins": 1.676553726196289, "rewards/rejected": -2.1336185932159424, "step": 2100 }, { "epoch": 0.33854793421580426, "grad_norm": 8.8125, "learning_rate": 4.18265558202047e-06, "logits/chosen": -2.8314008712768555, "logits/rejected": -2.775664806365967, "logps/chosen": -408.23651123046875, "logps/rejected": -257.9898376464844, "loss": 0.423, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5516425967216492, "rewards/margins": 1.5292810201644897, "rewards/rejected": -2.080923557281494, "step": 2110 }, { "epoch": 0.34015242679502605, "grad_norm": 8.5, "learning_rate": 4.172271363182048e-06, "logits/chosen": -2.8232319355010986, "logits/rejected": -2.779074192047119, "logps/chosen": -394.4797668457031, "logps/rejected": -279.1906433105469, "loss": 0.4632, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8158979415893555, "rewards/margins": 1.4413425922393799, "rewards/rejected": -2.2572405338287354, "step": 2120 }, { "epoch": 0.3417569193742479, "grad_norm": 10.625, "learning_rate": 4.161834664884249e-06, "logits/chosen": -2.810708999633789, "logits/rejected": -2.760195255279541, "logps/chosen": -391.6730041503906, "logps/rejected": -252.3364715576172, "loss": 0.4198, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8611714243888855, "rewards/margins": 1.4506516456604004, "rewards/rejected": -2.3118226528167725, "step": 2130 }, { "epoch": 0.3433614119534697, "grad_norm": 10.125, "learning_rate": 4.151345814653032e-06, "logits/chosen": -2.812229871749878, "logits/rejected": -2.760028839111328, "logps/chosen": -405.5102844238281, "logps/rejected": -268.9241638183594, "loss": 0.4008, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4481165409088135, "rewards/margins": 1.5251991748809814, "rewards/rejected": -1.9733158349990845, "step": 2140 }, { "epoch": 0.34496590453269155, "grad_norm": 12.875, "learning_rate": 4.140805141650998e-06, "logits/chosen": -2.8150129318237305, "logits/rejected": -2.759650707244873, "logps/chosen": -442.197265625, "logps/rejected": -273.97222900390625, "loss": 0.4121, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4230071008205414, "rewards/margins": 1.5626881122589111, "rewards/rejected": -1.985695481300354, "step": 2150 }, { "epoch": 0.34657039711191334, "grad_norm": 9.1875, "learning_rate": 4.130212976667058e-06, "logits/chosen": -2.8240721225738525, "logits/rejected": -2.7625181674957275, "logps/chosen": -458.7286071777344, "logps/rejected": -280.1552734375, "loss": 0.4462, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5845053791999817, "rewards/margins": 1.5388051271438599, "rewards/rejected": -2.1233105659484863, "step": 2160 }, { "epoch": 0.3481748896911352, "grad_norm": 9.75, "learning_rate": 4.119569652106048e-06, "logits/chosen": -2.824092388153076, "logits/rejected": -2.7669942378997803, "logps/chosen": -426.59466552734375, "logps/rejected": -264.8882141113281, "loss": 0.3381, "rewards/accuracies": 0.875, "rewards/chosen": -0.9379093050956726, "rewards/margins": 1.6747223138809204, "rewards/rejected": -2.6126315593719482, "step": 2170 }, { "epoch": 0.349779382270357, "grad_norm": 15.125, "learning_rate": 4.108875501978304e-06, "logits/chosen": -2.814786672592163, "logits/rejected": -2.781355381011963, "logps/chosen": -387.7843322753906, "logps/rejected": -287.6878967285156, "loss": 0.5382, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9285928606987, "rewards/margins": 1.2608126401901245, "rewards/rejected": -2.1894054412841797, "step": 2180 }, { "epoch": 0.35138387484957884, "grad_norm": 16.375, "learning_rate": 4.098130861889178e-06, "logits/chosen": -2.807741641998291, "logits/rejected": -2.787757396697998, "logps/chosen": -340.2018737792969, "logps/rejected": -281.7198791503906, "loss": 0.5677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0224530696868896, "rewards/margins": 1.136397123336792, "rewards/rejected": -2.1588501930236816, "step": 2190 }, { "epoch": 0.35298836742880063, "grad_norm": 9.1875, "learning_rate": 4.087336069028501e-06, "logits/chosen": -2.8195130825042725, "logits/rejected": -2.770843029022217, "logps/chosen": -387.366943359375, "logps/rejected": -267.26116943359375, "loss": 0.403, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.834459662437439, "rewards/margins": 1.3918986320495605, "rewards/rejected": -2.226358413696289, "step": 2200 }, { "epoch": 0.3545928600080225, "grad_norm": 8.1875, "learning_rate": 4.076491462160011e-06, "logits/chosen": -2.81046724319458, "logits/rejected": -2.7738451957702637, "logps/chosen": -389.29522705078125, "logps/rejected": -272.861083984375, "loss": 0.4251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8123197555541992, "rewards/margins": 1.3873460292816162, "rewards/rejected": -2.1996657848358154, "step": 2210 }, { "epoch": 0.3561973525872443, "grad_norm": 11.9375, "learning_rate": 4.065597381610712e-06, "logits/chosen": -2.819195032119751, "logits/rejected": -2.7711880207061768, "logps/chosen": -403.8146057128906, "logps/rejected": -266.08270263671875, "loss": 0.4372, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7162784934043884, "rewards/margins": 1.3817870616912842, "rewards/rejected": -2.0980656147003174, "step": 2220 }, { "epoch": 0.3578018451664661, "grad_norm": 10.4375, "learning_rate": 4.0546541692602005e-06, "logits/chosen": -2.8412981033325195, "logits/rejected": -2.7708215713500977, "logps/chosen": -456.4703063964844, "logps/rejected": -265.5205383300781, "loss": 0.4353, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.45415082573890686, "rewards/margins": 1.364924669265747, "rewards/rejected": -1.819075584411621, "step": 2230 }, { "epoch": 0.3594063377456879, "grad_norm": 11.3125, "learning_rate": 4.043662168529933e-06, "logits/chosen": -2.8241047859191895, "logits/rejected": -2.767178535461426, "logps/chosen": -395.66058349609375, "logps/rejected": -268.1952819824219, "loss": 0.4661, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7429437637329102, "rewards/margins": 1.180983543395996, "rewards/rejected": -1.9239270687103271, "step": 2240 }, { "epoch": 0.36101083032490977, "grad_norm": 7.34375, "learning_rate": 4.032621724372452e-06, "logits/chosen": -2.8291306495666504, "logits/rejected": -2.7688534259796143, "logps/chosen": -405.8734436035156, "logps/rejected": -262.1336669921875, "loss": 0.4472, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.4065615236759186, "rewards/margins": 1.3807846307754517, "rewards/rejected": -1.7873461246490479, "step": 2250 }, { "epoch": 0.36261532290413157, "grad_norm": 6.71875, "learning_rate": 4.021533183260555e-06, "logits/chosen": -2.800424814224243, "logits/rejected": -2.7467758655548096, "logps/chosen": -419.69256591796875, "logps/rejected": -253.67221069335938, "loss": 0.4394, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.38030701875686646, "rewards/margins": 1.4713172912597656, "rewards/rejected": -1.8516244888305664, "step": 2260 }, { "epoch": 0.3642198154833534, "grad_norm": 7.46875, "learning_rate": 4.010396893176427e-06, "logits/chosen": -2.8103132247924805, "logits/rejected": -2.757340908050537, "logps/chosen": -408.7400817871094, "logps/rejected": -255.8497772216797, "loss": 0.438, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6360756158828735, "rewards/margins": 1.4360620975494385, "rewards/rejected": -2.0721375942230225, "step": 2270 }, { "epoch": 0.3658243080625752, "grad_norm": 11.6875, "learning_rate": 3.9992132036007195e-06, "logits/chosen": -2.826112985610962, "logits/rejected": -2.775026559829712, "logps/chosen": -421.4810485839844, "logps/rejected": -263.1106262207031, "loss": 0.4174, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5669287443161011, "rewards/margins": 1.465318202972412, "rewards/rejected": -2.0322468280792236, "step": 2280 }, { "epoch": 0.367428800641797, "grad_norm": 10.5, "learning_rate": 3.987982465501579e-06, "logits/chosen": -2.8291943073272705, "logits/rejected": -2.769111394882202, "logps/chosen": -402.0927429199219, "logps/rejected": -250.5889434814453, "loss": 0.4605, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8140931129455566, "rewards/margins": 1.2127196788787842, "rewards/rejected": -2.026812791824341, "step": 2290 }, { "epoch": 0.36903329322101885, "grad_norm": 7.5, "learning_rate": 3.9767050313236374e-06, "logits/chosen": -2.8169147968292236, "logits/rejected": -2.767400026321411, "logps/chosen": -435.84918212890625, "logps/rejected": -273.3145446777344, "loss": 0.426, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6654006242752075, "rewards/margins": 1.403143286705017, "rewards/rejected": -2.0685439109802246, "step": 2300 }, { "epoch": 0.37063778580024065, "grad_norm": 11.125, "learning_rate": 3.9653812549769485e-06, "logits/chosen": -2.8127574920654297, "logits/rejected": -2.771003246307373, "logps/chosen": -375.5522155761719, "logps/rejected": -262.3276672363281, "loss": 0.4196, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1329165697097778, "rewards/margins": 1.416504144668579, "rewards/rejected": -2.5494205951690674, "step": 2310 }, { "epoch": 0.3722422783794625, "grad_norm": 9.6875, "learning_rate": 3.954011491825883e-06, "logits/chosen": -2.8120346069335938, "logits/rejected": -2.7585608959198, "logps/chosen": -433.7591247558594, "logps/rejected": -285.0611877441406, "loss": 0.4277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0102908611297607, "rewards/margins": 1.3766281604766846, "rewards/rejected": -2.3869190216064453, "step": 2320 }, { "epoch": 0.3738467709586843, "grad_norm": 12.25, "learning_rate": 3.942596098677976e-06, "logits/chosen": -2.840128183364868, "logits/rejected": -2.801668643951416, "logps/chosen": -419.1048889160156, "logps/rejected": -294.9473876953125, "loss": 0.4003, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0013692378997803, "rewards/margins": 1.4748889207839966, "rewards/rejected": -2.4762582778930664, "step": 2330 }, { "epoch": 0.37545126353790614, "grad_norm": 14.5, "learning_rate": 3.931135433772732e-06, "logits/chosen": -2.829989433288574, "logits/rejected": -2.782634735107422, "logps/chosen": -407.46612548828125, "logps/rejected": -289.56036376953125, "loss": 0.5036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.205578327178955, "rewards/margins": 1.238547444343567, "rewards/rejected": -2.4441256523132324, "step": 2340 }, { "epoch": 0.37705575611712794, "grad_norm": 12.5, "learning_rate": 3.919629856770375e-06, "logits/chosen": -2.82781720161438, "logits/rejected": -2.776423454284668, "logps/chosen": -403.5875244140625, "logps/rejected": -271.0021057128906, "loss": 0.4562, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9876360893249512, "rewards/margins": 1.4429900646209717, "rewards/rejected": -2.4306259155273438, "step": 2350 }, { "epoch": 0.3786602486963498, "grad_norm": 11.0625, "learning_rate": 3.908079728740571e-06, "logits/chosen": -2.813687801361084, "logits/rejected": -2.771852493286133, "logps/chosen": -393.7330322265625, "logps/rejected": -289.1087951660156, "loss": 0.3794, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8774979710578918, "rewards/margins": 1.6197984218597412, "rewards/rejected": -2.497296094894409, "step": 2360 }, { "epoch": 0.3802647412755716, "grad_norm": 6.15625, "learning_rate": 3.896485412151094e-06, "logits/chosen": -2.8208537101745605, "logits/rejected": -2.784996509552002, "logps/chosen": -401.4300842285156, "logps/rejected": -310.7096862792969, "loss": 0.4321, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.263930082321167, "rewards/margins": 1.3450746536254883, "rewards/rejected": -2.6090049743652344, "step": 2370 }, { "epoch": 0.38186923385479343, "grad_norm": 10.125, "learning_rate": 3.8848472708564445e-06, "logits/chosen": -2.8269906044006348, "logits/rejected": -2.7886741161346436, "logps/chosen": -388.1075134277344, "logps/rejected": -289.2283935546875, "loss": 0.4744, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7732826471328735, "rewards/margins": 1.1338844299316406, "rewards/rejected": -2.907167434692383, "step": 2380 }, { "epoch": 0.3834737264340152, "grad_norm": 15.4375, "learning_rate": 3.87316567008644e-06, "logits/chosen": -2.834463596343994, "logits/rejected": -2.782074451446533, "logps/chosen": -435.50244140625, "logps/rejected": -286.7928771972656, "loss": 0.4374, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.746323823928833, "rewards/margins": 1.4414706230163574, "rewards/rejected": -3.1877944469451904, "step": 2390 }, { "epoch": 0.3850782190132371, "grad_norm": 13.375, "learning_rate": 3.861440976434749e-06, "logits/chosen": -2.819500684738159, "logits/rejected": -2.7791168689727783, "logps/chosen": -432.6981506347656, "logps/rejected": -296.9704895019531, "loss": 0.4513, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6540443897247314, "rewards/margins": 1.417845368385315, "rewards/rejected": -3.071889877319336, "step": 2400 }, { "epoch": 0.38668271159245887, "grad_norm": 7.84375, "learning_rate": 3.8496735578473864e-06, "logits/chosen": -2.822260618209839, "logits/rejected": -2.7694497108459473, "logps/chosen": -419.53631591796875, "logps/rejected": -278.3862609863281, "loss": 0.4452, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.71856689453125, "rewards/margins": 1.280632734298706, "rewards/rejected": -2.999199628829956, "step": 2410 }, { "epoch": 0.3882872041716807, "grad_norm": 14.6875, "learning_rate": 3.837863783611168e-06, "logits/chosen": -2.816465139389038, "logits/rejected": -2.763068675994873, "logps/chosen": -400.8653869628906, "logps/rejected": -263.584228515625, "loss": 0.3993, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.452508807182312, "rewards/margins": 1.5745631456375122, "rewards/rejected": -3.027071952819824, "step": 2420 }, { "epoch": 0.3898916967509025, "grad_norm": 9.125, "learning_rate": 3.826012024342118e-06, "logits/chosen": -2.8215198516845703, "logits/rejected": -2.7807555198669434, "logps/chosen": -418.01953125, "logps/rejected": -288.7490234375, "loss": 0.4143, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2191088199615479, "rewards/margins": 1.4244413375854492, "rewards/rejected": -2.643550157546997, "step": 2430 }, { "epoch": 0.39149618933012437, "grad_norm": 9.0, "learning_rate": 3.8141186519738453e-06, "logits/chosen": -2.842118978500366, "logits/rejected": -2.7792158126831055, "logps/chosen": -464.254150390625, "logps/rejected": -278.1700134277344, "loss": 0.3999, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0178345441818237, "rewards/margins": 1.515088438987732, "rewards/rejected": -2.5329232215881348, "step": 2440 }, { "epoch": 0.39310068190934616, "grad_norm": 7.125, "learning_rate": 3.8021840397458633e-06, "logits/chosen": -2.8276729583740234, "logits/rejected": -2.781210422515869, "logps/chosen": -390.99884033203125, "logps/rejected": -252.1514892578125, "loss": 0.3977, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9298402070999146, "rewards/margins": 1.5225456953048706, "rewards/rejected": -2.452385663986206, "step": 2450 }, { "epoch": 0.394705174488568, "grad_norm": 9.75, "learning_rate": 3.7902085621918814e-06, "logits/chosen": -2.8419032096862793, "logits/rejected": -2.774501323699951, "logps/chosen": -496.63671875, "logps/rejected": -298.4337158203125, "loss": 0.4271, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.4720006585121155, "rewards/margins": 1.6973215341567993, "rewards/rejected": -2.1693222522735596, "step": 2460 }, { "epoch": 0.3963096670677898, "grad_norm": 8.625, "learning_rate": 3.778192595128052e-06, "logits/chosen": -2.80600905418396, "logits/rejected": -2.7639694213867188, "logps/chosen": -425.97509765625, "logps/rejected": -268.548583984375, "loss": 0.439, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8670732378959656, "rewards/margins": 1.4058377742767334, "rewards/rejected": -2.2729110717773438, "step": 2470 }, { "epoch": 0.39791415964701166, "grad_norm": 13.0, "learning_rate": 3.7661365156411737e-06, "logits/chosen": -2.8124642372131348, "logits/rejected": -2.766831159591675, "logps/chosen": -426.078369140625, "logps/rejected": -257.0282287597656, "loss": 0.4325, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.113468885421753, "rewards/margins": 1.406341791152954, "rewards/rejected": -2.519810914993286, "step": 2480 }, { "epoch": 0.39951865222623345, "grad_norm": 10.0, "learning_rate": 3.75404070207686e-06, "logits/chosen": -2.825017213821411, "logits/rejected": -2.7758235931396484, "logps/chosen": -391.9830017089844, "logps/rejected": -250.1360321044922, "loss": 0.4103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0305068492889404, "rewards/margins": 1.485849380493164, "rewards/rejected": -2.5163562297821045, "step": 2490 }, { "epoch": 0.4011231448054553, "grad_norm": 12.8125, "learning_rate": 3.741905534027662e-06, "logits/chosen": -2.8506264686584473, "logits/rejected": -2.7808947563171387, "logps/chosen": -489.24493408203125, "logps/rejected": -304.2207946777344, "loss": 0.3811, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7031130790710449, "rewards/margins": 1.6806329488754272, "rewards/rejected": -2.3837459087371826, "step": 2500 }, { "epoch": 0.4027276373846771, "grad_norm": 12.8125, "learning_rate": 3.7297313923211644e-06, "logits/chosen": -2.8180794715881348, "logits/rejected": -2.7730562686920166, "logps/chosen": -404.1630859375, "logps/rejected": -272.183837890625, "loss": 0.4551, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7340630292892456, "rewards/margins": 1.4273908138275146, "rewards/rejected": -2.1614537239074707, "step": 2510 }, { "epoch": 0.4043321299638989, "grad_norm": 13.375, "learning_rate": 3.717518659008023e-06, "logits/chosen": -2.8430328369140625, "logits/rejected": -2.7841670513153076, "logps/chosen": -446.97137451171875, "logps/rejected": -296.09161376953125, "loss": 0.3646, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.6095987558364868, "rewards/margins": 1.5820740461349487, "rewards/rejected": -2.1916728019714355, "step": 2520 }, { "epoch": 0.40593662254312074, "grad_norm": 7.0, "learning_rate": 3.705267717349984e-06, "logits/chosen": -2.816943645477295, "logits/rejected": -2.780935049057007, "logps/chosen": -381.6192321777344, "logps/rejected": -287.7428283691406, "loss": 0.4803, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7459432482719421, "rewards/margins": 1.3833239078521729, "rewards/rejected": -2.1292672157287598, "step": 2530 }, { "epoch": 0.40754111512234253, "grad_norm": 12.9375, "learning_rate": 3.6929789518078535e-06, "logits/chosen": -2.846510410308838, "logits/rejected": -2.7946457862854004, "logps/chosen": -436.550048828125, "logps/rejected": -289.9068298339844, "loss": 0.4671, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7877927422523499, "rewards/margins": 1.3494470119476318, "rewards/rejected": -2.137239933013916, "step": 2540 }, { "epoch": 0.4091456077015644, "grad_norm": 10.5, "learning_rate": 3.680652748029431e-06, "logits/chosen": -2.8241729736328125, "logits/rejected": -2.779114246368408, "logps/chosen": -389.94464111328125, "logps/rejected": -274.4510192871094, "loss": 0.466, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2651816606521606, "rewards/margins": 1.3450332880020142, "rewards/rejected": -2.6102147102355957, "step": 2550 }, { "epoch": 0.4107501002807862, "grad_norm": 7.875, "learning_rate": 3.6682894928374074e-06, "logits/chosen": -2.822953462600708, "logits/rejected": -2.773242473602295, "logps/chosen": -453.013671875, "logps/rejected": -272.04840087890625, "loss": 0.3733, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.254067063331604, "rewards/margins": 1.6483091115951538, "rewards/rejected": -2.9023759365081787, "step": 2560 }, { "epoch": 0.41235459286000803, "grad_norm": 13.4375, "learning_rate": 3.655889574217229e-06, "logits/chosen": -2.8311386108398438, "logits/rejected": -2.777768611907959, "logps/chosen": -433.63238525390625, "logps/rejected": -287.40277099609375, "loss": 0.4388, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0037591457366943, "rewards/margins": 1.4758713245391846, "rewards/rejected": -2.479630470275879, "step": 2570 }, { "epoch": 0.4139590854392298, "grad_norm": 8.9375, "learning_rate": 3.6434533813049167e-06, "logits/chosen": -2.8311331272125244, "logits/rejected": -2.7710201740264893, "logps/chosen": -435.89312744140625, "logps/rejected": -264.0892028808594, "loss": 0.3996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9602543711662292, "rewards/margins": 1.4719023704528809, "rewards/rejected": -2.432156801223755, "step": 2580 }, { "epoch": 0.4155635780184517, "grad_norm": 13.125, "learning_rate": 3.630981304374858e-06, "logits/chosen": -2.837360382080078, "logits/rejected": -2.7852842807769775, "logps/chosen": -452.78228759765625, "logps/rejected": -302.85662841796875, "loss": 0.4398, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.6609991788864136, "rewards/margins": 1.5464762449264526, "rewards/rejected": -2.207475423812866, "step": 2590 }, { "epoch": 0.41716807059767347, "grad_norm": 10.9375, "learning_rate": 3.6184737348275562e-06, "logits/chosen": -2.8294615745544434, "logits/rejected": -2.7756845951080322, "logps/chosen": -418.3937072753906, "logps/rejected": -259.9859313964844, "loss": 0.4409, "rewards/accuracies": 0.78125, "rewards/chosen": -1.071225881576538, "rewards/margins": 1.3552013635635376, "rewards/rejected": -2.4264273643493652, "step": 2600 }, { "epoch": 0.4187725631768953, "grad_norm": 13.25, "learning_rate": 3.6059310651773505e-06, "logits/chosen": -2.827119827270508, "logits/rejected": -2.7706587314605713, "logps/chosen": -414.55047607421875, "logps/rejected": -269.3507995605469, "loss": 0.4799, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1380531787872314, "rewards/margins": 1.2171127796173096, "rewards/rejected": -2.35516619682312, "step": 2610 }, { "epoch": 0.4203770557561171, "grad_norm": 11.6875, "learning_rate": 3.5933536890400945e-06, "logits/chosen": -2.828174114227295, "logits/rejected": -2.773561954498291, "logps/chosen": -398.8441467285156, "logps/rejected": -278.879638671875, "loss": 0.4032, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1211647987365723, "rewards/margins": 1.3674887418746948, "rewards/rejected": -2.4886534214019775, "step": 2620 }, { "epoch": 0.42198154833533896, "grad_norm": 9.25, "learning_rate": 3.5807420011208094e-06, "logits/chosen": -2.8267405033111572, "logits/rejected": -2.7764289379119873, "logps/chosen": -404.2403869628906, "logps/rejected": -256.2148742675781, "loss": 0.425, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2520352602005005, "rewards/margins": 1.3581024408340454, "rewards/rejected": -2.610137939453125, "step": 2630 }, { "epoch": 0.42358604091456076, "grad_norm": 7.1875, "learning_rate": 3.5680963972012895e-06, "logits/chosen": -2.8059284687042236, "logits/rejected": -2.7661609649658203, "logps/chosen": -437.0152282714844, "logps/rejected": -284.64642333984375, "loss": 0.4249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0187032222747803, "rewards/margins": 1.4981029033660889, "rewards/rejected": -2.516806125640869, "step": 2640 }, { "epoch": 0.4251905334937826, "grad_norm": 11.125, "learning_rate": 3.555417274127688e-06, "logits/chosen": -2.806041717529297, "logits/rejected": -2.7345752716064453, "logps/chosen": -437.42449951171875, "logps/rejected": -243.7925262451172, "loss": 0.4043, "rewards/accuracies": 0.8125, "rewards/chosen": -1.131103277206421, "rewards/margins": 1.5449262857437134, "rewards/rejected": -2.676029682159424, "step": 2650 }, { "epoch": 0.4267950260730044, "grad_norm": 11.75, "learning_rate": 3.5427050297980625e-06, "logits/chosen": -2.833854913711548, "logits/rejected": -2.789044141769409, "logps/chosen": -424.49395751953125, "logps/rejected": -306.00250244140625, "loss": 0.478, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2134528160095215, "rewards/margins": 1.2729146480560303, "rewards/rejected": -2.486367702484131, "step": 2660 }, { "epoch": 0.42839951865222625, "grad_norm": 13.3125, "learning_rate": 3.529960063149883e-06, "logits/chosen": -2.8049068450927734, "logits/rejected": -2.763725757598877, "logps/chosen": -402.43316650390625, "logps/rejected": -282.8229675292969, "loss": 0.3966, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1925939321517944, "rewards/margins": 1.5080509185791016, "rewards/rejected": -2.7006449699401855, "step": 2670 }, { "epoch": 0.43000401123144805, "grad_norm": 13.0, "learning_rate": 3.5171827741475184e-06, "logits/chosen": -2.820702075958252, "logits/rejected": -2.7615792751312256, "logps/chosen": -463.08734130859375, "logps/rejected": -291.80682373046875, "loss": 0.389, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9163376092910767, "rewards/margins": 1.6298586130142212, "rewards/rejected": -2.546196460723877, "step": 2680 }, { "epoch": 0.4316085038106699, "grad_norm": 8.5625, "learning_rate": 3.5043735637696817e-06, "logits/chosen": -2.8125245571136475, "logits/rejected": -2.7627992630004883, "logps/chosen": -416.66790771484375, "logps/rejected": -284.27008056640625, "loss": 0.425, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9711006283760071, "rewards/margins": 1.4484375715255737, "rewards/rejected": -2.4195382595062256, "step": 2690 }, { "epoch": 0.4332129963898917, "grad_norm": 8.75, "learning_rate": 3.491532833996848e-06, "logits/chosen": -2.8026819229125977, "logits/rejected": -2.767908811569214, "logps/chosen": -393.42193603515625, "logps/rejected": -294.35113525390625, "loss": 0.3752, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.0100655555725098, "rewards/margins": 1.4970743656158447, "rewards/rejected": -2.5071399211883545, "step": 2700 }, { "epoch": 0.43481748896911354, "grad_norm": 11.125, "learning_rate": 3.4786609877986355e-06, "logits/chosen": -2.8021390438079834, "logits/rejected": -2.7565255165100098, "logps/chosen": -435.7565002441406, "logps/rejected": -317.2334899902344, "loss": 0.4193, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8513836860656738, "rewards/margins": 1.651188611984253, "rewards/rejected": -2.502572536468506, "step": 2710 }, { "epoch": 0.43642198154833534, "grad_norm": 11.8125, "learning_rate": 3.465758429121168e-06, "logits/chosen": -2.8115901947021484, "logits/rejected": -2.7734529972076416, "logps/chosen": -382.9571228027344, "logps/rejected": -293.75384521484375, "loss": 0.4403, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.962203860282898, "rewards/margins": 1.3298451900482178, "rewards/rejected": -2.292048931121826, "step": 2720 }, { "epoch": 0.4380264741275572, "grad_norm": 10.625, "learning_rate": 3.452825562874386e-06, "logits/chosen": -2.836378574371338, "logits/rejected": -2.779000997543335, "logps/chosen": -408.4696350097656, "logps/rejected": -282.02618408203125, "loss": 0.4295, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.843408465385437, "rewards/margins": 1.5484886169433594, "rewards/rejected": -2.391896963119507, "step": 2730 }, { "epoch": 0.439630966706779, "grad_norm": 9.3125, "learning_rate": 3.439862794919353e-06, "logits/chosen": -2.816718578338623, "logits/rejected": -2.7659106254577637, "logps/chosen": -420.3966369628906, "logps/rejected": -282.55401611328125, "loss": 0.4141, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7940025925636292, "rewards/margins": 1.4998505115509033, "rewards/rejected": -2.293853282928467, "step": 2740 }, { "epoch": 0.4412354592860008, "grad_norm": 15.1875, "learning_rate": 3.4268705320555073e-06, "logits/chosen": -2.817455768585205, "logits/rejected": -2.7806692123413086, "logps/chosen": -410.55218505859375, "logps/rejected": -307.0101623535156, "loss": 0.4978, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7928449511528015, "rewards/margins": 1.2935985326766968, "rewards/rejected": -2.0864434242248535, "step": 2750 }, { "epoch": 0.4428399518652226, "grad_norm": 7.90625, "learning_rate": 3.4138491820079034e-06, "logits/chosen": -2.8248612880706787, "logits/rejected": -2.765878915786743, "logps/chosen": -422.9700622558594, "logps/rejected": -274.6608581542969, "loss": 0.3983, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5680141448974609, "rewards/margins": 1.628121018409729, "rewards/rejected": -2.1961350440979004, "step": 2760 }, { "epoch": 0.4444444444444444, "grad_norm": 9.8125, "learning_rate": 3.4007991534144125e-06, "logits/chosen": -2.8207664489746094, "logits/rejected": -2.783773899078369, "logps/chosen": -358.66204833984375, "logps/rejected": -265.75396728515625, "loss": 0.4918, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7189741730690002, "rewards/margins": 1.3029149770736694, "rewards/rejected": -2.0218892097473145, "step": 2770 }, { "epoch": 0.44604893702366627, "grad_norm": 10.875, "learning_rate": 3.387720855812901e-06, "logits/chosen": -2.8234477043151855, "logits/rejected": -2.7518372535705566, "logps/chosen": -451.5909118652344, "logps/rejected": -248.64041137695312, "loss": 0.3437, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4180319905281067, "rewards/margins": 1.6234352588653564, "rewards/rejected": -2.0414671897888184, "step": 2780 }, { "epoch": 0.44765342960288806, "grad_norm": 8.5625, "learning_rate": 3.374614699628377e-06, "logits/chosen": -2.8360514640808105, "logits/rejected": -2.771167278289795, "logps/chosen": -426.1410217285156, "logps/rejected": -260.17852783203125, "loss": 0.4054, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6536850333213806, "rewards/margins": 1.5942373275756836, "rewards/rejected": -2.247922420501709, "step": 2790 }, { "epoch": 0.4492579221821099, "grad_norm": 11.625, "learning_rate": 3.361481096160109e-06, "logits/chosen": -2.8202052116394043, "logits/rejected": -2.7625045776367188, "logps/chosen": -414.6222229003906, "logps/rejected": -271.0558166503906, "loss": 0.4737, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0394846200942993, "rewards/margins": 1.3224780559539795, "rewards/rejected": -2.3619627952575684, "step": 2800 }, { "epoch": 0.4508624147613317, "grad_norm": 6.21875, "learning_rate": 3.3483204575687244e-06, "logits/chosen": -2.8286826610565186, "logits/rejected": -2.7769789695739746, "logps/chosen": -393.79913330078125, "logps/rejected": -282.994873046875, "loss": 0.4601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9660390615463257, "rewards/margins": 1.30779230594635, "rewards/rejected": -2.273831367492676, "step": 2810 }, { "epoch": 0.45246690734055356, "grad_norm": 6.8125, "learning_rate": 3.335133196863266e-06, "logits/chosen": -2.822765827178955, "logits/rejected": -2.7666163444519043, "logps/chosen": -424.19854736328125, "logps/rejected": -261.652587890625, "loss": 0.4459, "rewards/accuracies": 0.78125, "rewards/chosen": -1.089927315711975, "rewards/margins": 1.3931143283843994, "rewards/rejected": -2.483041524887085, "step": 2820 }, { "epoch": 0.45407139991977535, "grad_norm": 9.6875, "learning_rate": 3.3219197278882386e-06, "logits/chosen": -2.8291375637054443, "logits/rejected": -2.773285388946533, "logps/chosen": -425.01995849609375, "logps/rejected": -286.07220458984375, "loss": 0.4259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9031774401664734, "rewards/margins": 1.37078058719635, "rewards/rejected": -2.2739577293395996, "step": 2830 }, { "epoch": 0.4556758924989972, "grad_norm": 8.0, "learning_rate": 3.3086804653106175e-06, "logits/chosen": -2.80998158454895, "logits/rejected": -2.7614188194274902, "logps/chosen": -396.6272888183594, "logps/rejected": -280.6564636230469, "loss": 0.3834, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7941573858261108, "rewards/margins": 1.6057764291763306, "rewards/rejected": -2.3999335765838623, "step": 2840 }, { "epoch": 0.457280385078219, "grad_norm": 10.4375, "learning_rate": 3.2954158246068356e-06, "logits/chosen": -2.8146986961364746, "logits/rejected": -2.7502431869506836, "logps/chosen": -404.8876037597656, "logps/rejected": -264.81707763671875, "loss": 0.3371, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8326250910758972, "rewards/margins": 1.725288987159729, "rewards/rejected": -2.5579140186309814, "step": 2850 }, { "epoch": 0.45888487765744085, "grad_norm": 11.6875, "learning_rate": 3.282126222049748e-06, "logits/chosen": -2.81593656539917, "logits/rejected": -2.76568603515625, "logps/chosen": -421.78155517578125, "logps/rejected": -273.9609375, "loss": 0.4281, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9653350710868835, "rewards/margins": 1.3745200634002686, "rewards/rejected": -2.339855194091797, "step": 2860 }, { "epoch": 0.46048937023666264, "grad_norm": 14.1875, "learning_rate": 3.2688120746955656e-06, "logits/chosen": -2.7972731590270996, "logits/rejected": -2.7385306358337402, "logps/chosen": -436.412353515625, "logps/rejected": -261.4578552246094, "loss": 0.3874, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.035569667816162, "rewards/margins": 1.5121324062347412, "rewards/rejected": -2.5477023124694824, "step": 2870 }, { "epoch": 0.4620938628158845, "grad_norm": 12.625, "learning_rate": 3.255473800370765e-06, "logits/chosen": -2.8207039833068848, "logits/rejected": -2.7749929428100586, "logps/chosen": -390.1311340332031, "logps/rejected": -284.9284973144531, "loss": 0.4756, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9819747805595398, "rewards/margins": 1.3222700357437134, "rewards/rejected": -2.3042449951171875, "step": 2880 }, { "epoch": 0.4636983553951063, "grad_norm": 11.875, "learning_rate": 3.242111817658984e-06, "logits/chosen": -2.827316999435425, "logits/rejected": -2.775053024291992, "logps/chosen": -411.3734436035156, "logps/rejected": -271.496337890625, "loss": 0.453, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9504737854003906, "rewards/margins": 1.460353970527649, "rewards/rejected": -2.410827875137329, "step": 2890 }, { "epoch": 0.46530284797432814, "grad_norm": 10.25, "learning_rate": 3.228726545887875e-06, "logits/chosen": -2.809264898300171, "logits/rejected": -2.7536513805389404, "logps/chosen": -453.7724609375, "logps/rejected": -288.0251770019531, "loss": 0.3991, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7339199781417847, "rewards/margins": 1.5921478271484375, "rewards/rejected": -2.3260676860809326, "step": 2900 }, { "epoch": 0.46690734055354993, "grad_norm": 9.3125, "learning_rate": 3.2153184051159534e-06, "logits/chosen": -2.797029733657837, "logits/rejected": -2.74292254447937, "logps/chosen": -417.32666015625, "logps/rejected": -270.3124084472656, "loss": 0.3553, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0259898900985718, "rewards/margins": 1.5857590436935425, "rewards/rejected": -2.6117489337921143, "step": 2910 }, { "epoch": 0.4685118331327718, "grad_norm": 10.9375, "learning_rate": 3.2018878161194156e-06, "logits/chosen": -2.827279567718506, "logits/rejected": -2.7764148712158203, "logps/chosen": -410.844970703125, "logps/rejected": -287.60699462890625, "loss": 0.4449, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9705850481987, "rewards/margins": 1.306443452835083, "rewards/rejected": -2.2770285606384277, "step": 2920 }, { "epoch": 0.4701163257119936, "grad_norm": 12.1875, "learning_rate": 3.1884352003789253e-06, "logits/chosen": -2.8161652088165283, "logits/rejected": -2.763558864593506, "logps/chosen": -401.2320861816406, "logps/rejected": -271.4428405761719, "loss": 0.3952, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6931312680244446, "rewards/margins": 1.5306313037872314, "rewards/rejected": -2.2237625122070312, "step": 2930 }, { "epoch": 0.4717208182912154, "grad_norm": 8.1875, "learning_rate": 3.1749609800663985e-06, "logits/chosen": -2.824585437774658, "logits/rejected": -2.757786273956299, "logps/chosen": -436.14337158203125, "logps/rejected": -245.1741180419922, "loss": 0.4077, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7851163744926453, "rewards/margins": 1.5094777345657349, "rewards/rejected": -2.2945942878723145, "step": 2940 }, { "epoch": 0.4733253108704372, "grad_norm": 7.625, "learning_rate": 3.161465578031745e-06, "logits/chosen": -2.8240790367126465, "logits/rejected": -2.7777669429779053, "logps/chosen": -385.8100891113281, "logps/rejected": -294.3188171386719, "loss": 0.4614, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.937174916267395, "rewards/margins": 1.3125665187835693, "rewards/rejected": -2.249741792678833, "step": 2950 }, { "epoch": 0.47492980344965907, "grad_norm": 10.6875, "learning_rate": 3.147949417789604e-06, "logits/chosen": -2.8083853721618652, "logits/rejected": -2.7507805824279785, "logps/chosen": -437.103515625, "logps/rejected": -248.70059204101562, "loss": 0.4152, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7807377576828003, "rewards/margins": 1.509389877319336, "rewards/rejected": -2.2901272773742676, "step": 2960 }, { "epoch": 0.47653429602888087, "grad_norm": 18.75, "learning_rate": 3.1344129235060518e-06, "logits/chosen": -2.810344696044922, "logits/rejected": -2.7640693187713623, "logps/chosen": -394.677490234375, "logps/rejected": -280.7336120605469, "loss": 0.4265, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.943194568157196, "rewards/margins": 1.4846444129943848, "rewards/rejected": -2.4278390407562256, "step": 2970 }, { "epoch": 0.47813878860810266, "grad_norm": 10.8125, "learning_rate": 3.1208565199852907e-06, "logits/chosen": -2.812633991241455, "logits/rejected": -2.770557403564453, "logps/chosen": -386.4386291503906, "logps/rejected": -287.1010437011719, "loss": 0.4122, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9378561973571777, "rewards/margins": 1.4616590738296509, "rewards/rejected": -2.399515390396118, "step": 2980 }, { "epoch": 0.4797432811873245, "grad_norm": 11.375, "learning_rate": 3.1072806326563175e-06, "logits/chosen": -2.8195152282714844, "logits/rejected": -2.7562193870544434, "logps/chosen": -485.4278259277344, "logps/rejected": -302.16583251953125, "loss": 0.3809, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7290199995040894, "rewards/margins": 1.6673882007598877, "rewards/rejected": -2.3964080810546875, "step": 2990 }, { "epoch": 0.4813477737665463, "grad_norm": 10.0, "learning_rate": 3.093685687559571e-06, "logits/chosen": -2.811738967895508, "logits/rejected": -2.773380994796753, "logps/chosen": -361.74725341796875, "logps/rejected": -262.7242736816406, "loss": 0.414, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8318725824356079, "rewards/margins": 1.4891269207000732, "rewards/rejected": -2.3209993839263916, "step": 3000 }, { "epoch": 0.48295226634576816, "grad_norm": 9.6875, "learning_rate": 3.0800721113335663e-06, "logits/chosen": -2.8122129440307617, "logits/rejected": -2.762244701385498, "logps/chosen": -383.7951354980469, "logps/rejected": -244.3408966064453, "loss": 0.3966, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8150434494018555, "rewards/margins": 1.517504096031189, "rewards/rejected": -2.332547664642334, "step": 3010 }, { "epoch": 0.48455675892498995, "grad_norm": 13.6875, "learning_rate": 3.066440331201502e-06, "logits/chosen": -2.828385591506958, "logits/rejected": -2.7672581672668457, "logps/chosen": -425.8971252441406, "logps/rejected": -273.9447326660156, "loss": 0.361, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6210823655128479, "rewards/margins": 1.7288341522216797, "rewards/rejected": -2.349916934967041, "step": 3020 }, { "epoch": 0.4861612515042118, "grad_norm": 10.0, "learning_rate": 3.0527907749578515e-06, "logits/chosen": -2.8156967163085938, "logits/rejected": -2.754695177078247, "logps/chosen": -421.4173278808594, "logps/rejected": -254.8525390625, "loss": 0.3706, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.725676417350769, "rewards/margins": 1.5676281452178955, "rewards/rejected": -2.293304443359375, "step": 3030 }, { "epoch": 0.4877657440834336, "grad_norm": 13.125, "learning_rate": 3.039123870954947e-06, "logits/chosen": -2.812260150909424, "logits/rejected": -2.779130697250366, "logps/chosen": -366.0400695800781, "logps/rejected": -283.5957946777344, "loss": 0.4715, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9428842663764954, "rewards/margins": 1.2980037927627563, "rewards/rejected": -2.2408878803253174, "step": 3040 }, { "epoch": 0.48937023666265544, "grad_norm": 9.6875, "learning_rate": 3.0254400480895245e-06, "logits/chosen": -2.8062095642089844, "logits/rejected": -2.754417896270752, "logps/chosen": -393.327880859375, "logps/rejected": -268.80194091796875, "loss": 0.4952, "rewards/accuracies": 0.75, "rewards/chosen": -0.9522675275802612, "rewards/margins": 1.216313123703003, "rewards/rejected": -2.1685807704925537, "step": 3050 }, { "epoch": 0.49097472924187724, "grad_norm": 9.8125, "learning_rate": 3.0117397357892735e-06, "logits/chosen": -2.8453474044799805, "logits/rejected": -2.789247989654541, "logps/chosen": -406.6672058105469, "logps/rejected": -274.49566650390625, "loss": 0.4196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9159935712814331, "rewards/margins": 1.4512434005737305, "rewards/rejected": -2.367236852645874, "step": 3060 }, { "epoch": 0.4925792218210991, "grad_norm": 8.875, "learning_rate": 2.9980233639993567e-06, "logits/chosen": -2.8256187438964844, "logits/rejected": -2.780043601989746, "logps/chosen": -413.3309631347656, "logps/rejected": -303.7715759277344, "loss": 0.4734, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8849212527275085, "rewards/margins": 1.2870713472366333, "rewards/rejected": -2.171992540359497, "step": 3070 }, { "epoch": 0.4941837144003209, "grad_norm": 9.875, "learning_rate": 2.984291363168918e-06, "logits/chosen": -2.8247365951538086, "logits/rejected": -2.779397487640381, "logps/chosen": -426.2349548339844, "logps/rejected": -296.6715393066406, "loss": 0.4346, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0822045803070068, "rewards/margins": 1.2939157485961914, "rewards/rejected": -2.3761203289031982, "step": 3080 }, { "epoch": 0.49578820697954273, "grad_norm": 11.4375, "learning_rate": 2.9705441642375714e-06, "logits/chosen": -2.81083607673645, "logits/rejected": -2.76041841506958, "logps/chosen": -447.42266845703125, "logps/rejected": -292.23956298828125, "loss": 0.4591, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0524767637252808, "rewards/margins": 1.1792973279953003, "rewards/rejected": -2.231773853302002, "step": 3090 }, { "epoch": 0.49739269955876453, "grad_norm": 11.6875, "learning_rate": 2.9567821986218846e-06, "logits/chosen": -2.816746950149536, "logits/rejected": -2.783461570739746, "logps/chosen": -395.02935791015625, "logps/rejected": -310.9845275878906, "loss": 0.3801, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0094465017318726, "rewards/margins": 1.5552771091461182, "rewards/rejected": -2.564723491668701, "step": 3100 }, { "epoch": 0.4989971921379864, "grad_norm": 10.75, "learning_rate": 2.943005898201833e-06, "logits/chosen": -2.8044450283050537, "logits/rejected": -2.7512025833129883, "logps/chosen": -447.44329833984375, "logps/rejected": -268.3612976074219, "loss": 0.4376, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9771315455436707, "rewards/margins": 1.3316794633865356, "rewards/rejected": -2.3088107109069824, "step": 3110 }, { "epoch": 0.5006016847172082, "grad_norm": 13.625, "learning_rate": 2.929215695307248e-06, "logits/chosen": -2.827730178833008, "logits/rejected": -2.7869603633880615, "logps/chosen": -392.8257141113281, "logps/rejected": -294.04119873046875, "loss": 0.4731, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9070205688476562, "rewards/margins": 1.2561603784561157, "rewards/rejected": -2.1631808280944824, "step": 3120 }, { "epoch": 0.50220617729643, "grad_norm": 7.28125, "learning_rate": 2.9154120227042527e-06, "logits/chosen": -2.837663173675537, "logits/rejected": -2.785902500152588, "logps/chosen": -454.11883544921875, "logps/rejected": -282.0298767089844, "loss": 0.4353, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6419077515602112, "rewards/margins": 1.3099151849746704, "rewards/rejected": -1.9518228769302368, "step": 3130 }, { "epoch": 0.5038106698756518, "grad_norm": 7.28125, "learning_rate": 2.9015953135816767e-06, "logits/chosen": -2.8190338611602783, "logits/rejected": -2.7599315643310547, "logps/chosen": -426.4024353027344, "logps/rejected": -278.43853759765625, "loss": 0.4343, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6205136775970459, "rewards/margins": 1.388504147529602, "rewards/rejected": -2.0090179443359375, "step": 3140 }, { "epoch": 0.5054151624548736, "grad_norm": 6.15625, "learning_rate": 2.8877660015374635e-06, "logits/chosen": -2.837505340576172, "logits/rejected": -2.782353162765503, "logps/chosen": -404.3583068847656, "logps/rejected": -273.02630615234375, "loss": 0.3559, "rewards/accuracies": 0.875, "rewards/chosen": -0.5392373204231262, "rewards/margins": 1.5700709819793701, "rewards/rejected": -2.1093080043792725, "step": 3150 }, { "epoch": 0.5070196550340955, "grad_norm": 14.0625, "learning_rate": 2.873924520565065e-06, "logits/chosen": -2.8454363346099854, "logits/rejected": -2.783719539642334, "logps/chosen": -444.43560791015625, "logps/rejected": -278.44036865234375, "loss": 0.4428, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6443065404891968, "rewards/margins": 1.460407018661499, "rewards/rejected": -2.1047134399414062, "step": 3160 }, { "epoch": 0.5086241476133173, "grad_norm": 9.625, "learning_rate": 2.860071305039819e-06, "logits/chosen": -2.8145885467529297, "logits/rejected": -2.7574973106384277, "logps/chosen": -429.93841552734375, "logps/rejected": -281.12188720703125, "loss": 0.3343, "rewards/accuracies": 0.84375, "rewards/chosen": -0.46948665380477905, "rewards/margins": 1.6490061283111572, "rewards/rejected": -2.118492603302002, "step": 3170 }, { "epoch": 0.5102286401925391, "grad_norm": 10.5, "learning_rate": 2.8462067897053175e-06, "logits/chosen": -2.818878650665283, "logits/rejected": -2.7522521018981934, "logps/chosen": -433.22589111328125, "logps/rejected": -241.54733276367188, "loss": 0.3659, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.61244797706604, "rewards/margins": 1.6380103826522827, "rewards/rejected": -2.250458240509033, "step": 3180 }, { "epoch": 0.5118331327717609, "grad_norm": 7.375, "learning_rate": 2.832331409659768e-06, "logits/chosen": -2.8395230770111084, "logits/rejected": -2.780604839324951, "logps/chosen": -447.9345703125, "logps/rejected": -281.2388610839844, "loss": 0.4196, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6304101943969727, "rewards/margins": 1.6316190958023071, "rewards/rejected": -2.2620291709899902, "step": 3190 }, { "epoch": 0.5134376253509828, "grad_norm": 11.375, "learning_rate": 2.818445600342332e-06, "logits/chosen": -2.8145315647125244, "logits/rejected": -2.7563281059265137, "logps/chosen": -410.9217224121094, "logps/rejected": -267.31170654296875, "loss": 0.4161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8047757148742676, "rewards/margins": 1.4726535081863403, "rewards/rejected": -2.2774291038513184, "step": 3200 }, { "epoch": 0.5150421179302046, "grad_norm": 8.375, "learning_rate": 2.8045497975194668e-06, "logits/chosen": -2.798147201538086, "logits/rejected": -2.764585018157959, "logps/chosen": -389.26776123046875, "logps/rejected": -296.3063659667969, "loss": 0.3999, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7095081806182861, "rewards/margins": 1.4624547958374023, "rewards/rejected": -2.1719629764556885, "step": 3210 }, { "epoch": 0.5166466105094264, "grad_norm": 9.8125, "learning_rate": 2.7906444372712476e-06, "logits/chosen": -2.8277535438537598, "logits/rejected": -2.768761157989502, "logps/chosen": -423.33905029296875, "logps/rejected": -260.6512145996094, "loss": 0.4177, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.47891801595687866, "rewards/margins": 1.6241782903671265, "rewards/rejected": -2.1030964851379395, "step": 3220 }, { "epoch": 0.5182511030886482, "grad_norm": 8.625, "learning_rate": 2.7767299559776784e-06, "logits/chosen": -2.813702344894409, "logits/rejected": -2.762385129928589, "logps/chosen": -433.2394104003906, "logps/rejected": -286.14996337890625, "loss": 0.35, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3821702003479004, "rewards/margins": 1.7594045400619507, "rewards/rejected": -2.1415746212005615, "step": 3230 }, { "epoch": 0.51985559566787, "grad_norm": 8.375, "learning_rate": 2.7628067903050055e-06, "logits/chosen": -2.799839496612549, "logits/rejected": -2.7407896518707275, "logps/chosen": -430.390625, "logps/rejected": -265.56768798828125, "loss": 0.4306, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8192785382270813, "rewards/margins": 1.385452389717102, "rewards/rejected": -2.204730749130249, "step": 3240 }, { "epoch": 0.5214600882470919, "grad_norm": 11.875, "learning_rate": 2.7488753771920067e-06, "logits/chosen": -2.7897450923919678, "logits/rejected": -2.7512030601501465, "logps/chosen": -417.1802673339844, "logps/rejected": -281.0238342285156, "loss": 0.3636, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7027153372764587, "rewards/margins": 1.622704267501831, "rewards/rejected": -2.3254199028015137, "step": 3250 }, { "epoch": 0.5230645808263137, "grad_norm": 7.84375, "learning_rate": 2.7349361538362845e-06, "logits/chosen": -2.8008475303649902, "logits/rejected": -2.7487616539001465, "logps/chosen": -448.62689208984375, "logps/rejected": -278.1871337890625, "loss": 0.3608, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.7277678847312927, "rewards/margins": 1.7045724391937256, "rewards/rejected": -2.432340145111084, "step": 3260 }, { "epoch": 0.5246690734055355, "grad_norm": 9.625, "learning_rate": 2.72098955768054e-06, "logits/chosen": -2.81931734085083, "logits/rejected": -2.7655539512634277, "logps/chosen": -419.9658203125, "logps/rejected": -286.56915283203125, "loss": 0.4239, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6744049191474915, "rewards/margins": 1.6448739767074585, "rewards/rejected": -2.3192787170410156, "step": 3270 }, { "epoch": 0.5262735659847573, "grad_norm": 9.625, "learning_rate": 2.7070360263988527e-06, "logits/chosen": -2.8149046897888184, "logits/rejected": -2.7474820613861084, "logps/chosen": -433.845947265625, "logps/rejected": -246.2403106689453, "loss": 0.419, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.625068187713623, "rewards/margins": 1.5727230310440063, "rewards/rejected": -2.197791337966919, "step": 3280 }, { "epoch": 0.5278780585639792, "grad_norm": 9.25, "learning_rate": 2.693075997882939e-06, "logits/chosen": -2.8122918605804443, "logits/rejected": -2.751154899597168, "logps/chosen": -441.769287109375, "logps/rejected": -278.37786865234375, "loss": 0.4307, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7035094499588013, "rewards/margins": 1.5479826927185059, "rewards/rejected": -2.2514920234680176, "step": 3290 }, { "epoch": 0.529482551143201, "grad_norm": 6.90625, "learning_rate": 2.679109910228413e-06, "logits/chosen": -2.805504560470581, "logits/rejected": -2.7424871921539307, "logps/chosen": -416.5133361816406, "logps/rejected": -251.0725860595703, "loss": 0.3577, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.4549214243888855, "rewards/margins": 1.8048673868179321, "rewards/rejected": -2.259788751602173, "step": 3300 }, { "epoch": 0.5310870437224228, "grad_norm": 10.5625, "learning_rate": 2.6651382017210365e-06, "logits/chosen": -2.7940673828125, "logits/rejected": -2.748976230621338, "logps/chosen": -390.5081481933594, "logps/rejected": -260.8641357421875, "loss": 0.4676, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.591526210308075, "rewards/margins": 1.326886773109436, "rewards/rejected": -1.9184131622314453, "step": 3310 }, { "epoch": 0.5326915363016446, "grad_norm": 9.8125, "learning_rate": 2.651161310822966e-06, "logits/chosen": -2.804448127746582, "logits/rejected": -2.757857322692871, "logps/chosen": -431.2451171875, "logps/rejected": -309.4925842285156, "loss": 0.4097, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.24188832938671112, "rewards/margins": 1.564399003982544, "rewards/rejected": -1.806287169456482, "step": 3320 }, { "epoch": 0.5342960288808665, "grad_norm": 11.5, "learning_rate": 2.637179676158995e-06, "logits/chosen": -2.815986156463623, "logits/rejected": -2.753085136413574, "logps/chosen": -408.653076171875, "logps/rejected": -268.625732421875, "loss": 0.4096, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.325435072183609, "rewards/margins": 1.5999637842178345, "rewards/rejected": -1.925398588180542, "step": 3330 }, { "epoch": 0.5359005214600883, "grad_norm": 11.8125, "learning_rate": 2.623193736502784e-06, "logits/chosen": -2.798496961593628, "logits/rejected": -2.740133285522461, "logps/chosen": -464.0697326660156, "logps/rejected": -277.10003662109375, "loss": 0.4656, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6347956657409668, "rewards/margins": 1.406773567199707, "rewards/rejected": -2.041569471359253, "step": 3340 }, { "epoch": 0.5375050140393101, "grad_norm": 9.9375, "learning_rate": 2.6092039307630924e-06, "logits/chosen": -2.8149635791778564, "logits/rejected": -2.7576045989990234, "logps/chosen": -470.7455139160156, "logps/rejected": -279.44512939453125, "loss": 0.3998, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5465697050094604, "rewards/margins": 1.5956320762634277, "rewards/rejected": -2.1422016620635986, "step": 3350 }, { "epoch": 0.5391095066185319, "grad_norm": 10.0625, "learning_rate": 2.5952106979700093e-06, "logits/chosen": -2.8070387840270996, "logits/rejected": -2.757483959197998, "logps/chosen": -407.852783203125, "logps/rejected": -289.1630859375, "loss": 0.4125, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6453671455383301, "rewards/margins": 1.5706332921981812, "rewards/rejected": -2.2160003185272217, "step": 3360 }, { "epoch": 0.5407139991977538, "grad_norm": 9.8125, "learning_rate": 2.5812144772611685e-06, "logits/chosen": -2.813453197479248, "logits/rejected": -2.758901596069336, "logps/chosen": -436.7747497558594, "logps/rejected": -270.12335205078125, "loss": 0.4802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.032455563545227, "rewards/margins": 1.4240622520446777, "rewards/rejected": -2.4565176963806152, "step": 3370 }, { "epoch": 0.5423184917769756, "grad_norm": 7.75, "learning_rate": 2.567215707867972e-06, "logits/chosen": -2.804898738861084, "logits/rejected": -2.7586092948913574, "logps/chosen": -483.58074951171875, "logps/rejected": -310.3348693847656, "loss": 0.4246, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9283766746520996, "rewards/margins": 1.429927110671997, "rewards/rejected": -2.3583037853240967, "step": 3380 }, { "epoch": 0.5439229843561973, "grad_norm": 9.0, "learning_rate": 2.5532148291018077e-06, "logits/chosen": -2.8150339126586914, "logits/rejected": -2.7635552883148193, "logps/chosen": -411.359130859375, "logps/rejected": -279.1216735839844, "loss": 0.3837, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.1406351327896118, "rewards/margins": 1.5715888738632202, "rewards/rejected": -2.712223768234253, "step": 3390 }, { "epoch": 0.5455274769354191, "grad_norm": 8.25, "learning_rate": 2.5392122803402564e-06, "logits/chosen": -2.8244986534118652, "logits/rejected": -2.7671003341674805, "logps/chosen": -431.70587158203125, "logps/rejected": -287.25909423828125, "loss": 0.4128, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0212467908859253, "rewards/margins": 1.48100745677948, "rewards/rejected": -2.5022542476654053, "step": 3400 }, { "epoch": 0.547131969514641, "grad_norm": 10.0, "learning_rate": 2.5252085010133075e-06, "logits/chosen": -2.7937278747558594, "logits/rejected": -2.7452750205993652, "logps/chosen": -397.11199951171875, "logps/rejected": -269.13079833984375, "loss": 0.3749, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8743575811386108, "rewards/margins": 1.6731517314910889, "rewards/rejected": -2.5475096702575684, "step": 3410 }, { "epoch": 0.5487364620938628, "grad_norm": 9.9375, "learning_rate": 2.511203930589571e-06, "logits/chosen": -2.8265206813812256, "logits/rejected": -2.76664137840271, "logps/chosen": -428.5648498535156, "logps/rejected": -281.16259765625, "loss": 0.4064, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.07552170753479, "rewards/margins": 1.5419305562973022, "rewards/rejected": -2.6174521446228027, "step": 3420 }, { "epoch": 0.5503409546730846, "grad_norm": 13.25, "learning_rate": 2.4971990085624813e-06, "logits/chosen": -2.804974317550659, "logits/rejected": -2.7478086948394775, "logps/chosen": -430.12933349609375, "logps/rejected": -262.39959716796875, "loss": 0.4599, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0942763090133667, "rewards/margins": 1.3672016859054565, "rewards/rejected": -2.461477756500244, "step": 3430 }, { "epoch": 0.5519454472523064, "grad_norm": 10.1875, "learning_rate": 2.4831941744365045e-06, "logits/chosen": -2.799697160720825, "logits/rejected": -2.750645875930786, "logps/chosen": -389.0050048828125, "logps/rejected": -255.73141479492188, "loss": 0.3938, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.0806915760040283, "rewards/margins": 1.5109665393829346, "rewards/rejected": -2.591658115386963, "step": 3440 }, { "epoch": 0.5535499398315282, "grad_norm": 7.21875, "learning_rate": 2.4691898677133523e-06, "logits/chosen": -2.80962872505188, "logits/rejected": -2.7543718814849854, "logps/chosen": -439.8152770996094, "logps/rejected": -285.1009826660156, "loss": 0.3815, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7586385011672974, "rewards/margins": 1.5565236806869507, "rewards/rejected": -2.315162181854248, "step": 3450 }, { "epoch": 0.5551544324107501, "grad_norm": 13.1875, "learning_rate": 2.4551865278781836e-06, "logits/chosen": -2.830653429031372, "logits/rejected": -2.780566930770874, "logps/chosen": -394.16204833984375, "logps/rejected": -289.81707763671875, "loss": 0.4831, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.014175534248352, "rewards/margins": 1.2701776027679443, "rewards/rejected": -2.284353256225586, "step": 3460 }, { "epoch": 0.5567589249899719, "grad_norm": 14.75, "learning_rate": 2.4411845943858136e-06, "logits/chosen": -2.7798845767974854, "logits/rejected": -2.7445790767669678, "logps/chosen": -404.05560302734375, "logps/rejected": -289.27154541015625, "loss": 0.3989, "rewards/accuracies": 0.84375, "rewards/chosen": -0.699254035949707, "rewards/margins": 1.5684045553207397, "rewards/rejected": -2.2676587104797363, "step": 3470 }, { "epoch": 0.5583634175691937, "grad_norm": 12.125, "learning_rate": 2.427184506646925e-06, "logits/chosen": -2.8047478199005127, "logits/rejected": -2.7511937618255615, "logps/chosen": -454.34356689453125, "logps/rejected": -302.7257995605469, "loss": 0.3706, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.44825559854507446, "rewards/margins": 1.7281758785247803, "rewards/rejected": -2.176431655883789, "step": 3480 }, { "epoch": 0.5599679101484155, "grad_norm": 8.875, "learning_rate": 2.4131867040142756e-06, "logits/chosen": -2.797234535217285, "logits/rejected": -2.761536121368408, "logps/chosen": -389.79827880859375, "logps/rejected": -302.9991149902344, "loss": 0.4315, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5677042007446289, "rewards/margins": 1.4912917613983154, "rewards/rejected": -2.0589959621429443, "step": 3490 }, { "epoch": 0.5615724027276374, "grad_norm": 11.5625, "learning_rate": 2.399191625768911e-06, "logits/chosen": -2.8137288093566895, "logits/rejected": -2.7709288597106934, "logps/chosen": -461.034912109375, "logps/rejected": -320.4874267578125, "loss": 0.401, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.49945053458213806, "rewards/margins": 1.5343934297561646, "rewards/rejected": -2.033843994140625, "step": 3500 }, { "epoch": 0.5631768953068592, "grad_norm": 9.1875, "learning_rate": 2.385199711106382e-06, "logits/chosen": -2.8061740398406982, "logits/rejected": -2.74739408493042, "logps/chosen": -433.6083984375, "logps/rejected": -265.6051025390625, "loss": 0.3955, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6609107255935669, "rewards/margins": 1.5862716436386108, "rewards/rejected": -2.247182607650757, "step": 3510 }, { "epoch": 0.564781387886081, "grad_norm": 7.46875, "learning_rate": 2.371211399122958e-06, "logits/chosen": -2.811275005340576, "logits/rejected": -2.757606029510498, "logps/chosen": -394.35198974609375, "logps/rejected": -277.55633544921875, "loss": 0.3996, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6262287497520447, "rewards/margins": 1.4663324356079102, "rewards/rejected": -2.0925612449645996, "step": 3520 }, { "epoch": 0.5663858804653028, "grad_norm": 10.625, "learning_rate": 2.357227128801849e-06, "logits/chosen": -2.82269549369812, "logits/rejected": -2.7526392936706543, "logps/chosen": -465.2603454589844, "logps/rejected": -285.5815124511719, "loss": 0.3609, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.5568746328353882, "rewards/margins": 1.7886512279510498, "rewards/rejected": -2.3455257415771484, "step": 3530 }, { "epoch": 0.5679903730445247, "grad_norm": 8.5, "learning_rate": 2.3432473389994276e-06, "logits/chosen": -2.810063600540161, "logits/rejected": -2.748582601547241, "logps/chosen": -388.13671875, "logps/rejected": -237.1779022216797, "loss": 0.4008, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.865714430809021, "rewards/margins": 1.471392273902893, "rewards/rejected": -2.337106466293335, "step": 3540 }, { "epoch": 0.5695948656237465, "grad_norm": 11.125, "learning_rate": 2.3292724684314595e-06, "logits/chosen": -2.8103973865509033, "logits/rejected": -2.758643627166748, "logps/chosen": -397.8321228027344, "logps/rejected": -265.73736572265625, "loss": 0.3808, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.7724410891532898, "rewards/margins": 1.6716773509979248, "rewards/rejected": -2.4441182613372803, "step": 3550 }, { "epoch": 0.5711993582029683, "grad_norm": 13.8125, "learning_rate": 2.3153029556593333e-06, "logits/chosen": -2.807677745819092, "logits/rejected": -2.7411513328552246, "logps/chosen": -486.91558837890625, "logps/rejected": -290.1862487792969, "loss": 0.3517, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.43620580434799194, "rewards/margins": 1.9805030822753906, "rewards/rejected": -2.4167087078094482, "step": 3560 }, { "epoch": 0.5728038507821901, "grad_norm": 9.25, "learning_rate": 2.3013392390763008e-06, "logits/chosen": -2.8137283325195312, "logits/rejected": -2.7650885581970215, "logps/chosen": -450.74053955078125, "logps/rejected": -301.20611572265625, "loss": 0.4275, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5778528451919556, "rewards/margins": 1.4564993381500244, "rewards/rejected": -2.0343523025512695, "step": 3570 }, { "epoch": 0.574408343361412, "grad_norm": 9.5, "learning_rate": 2.2873817568937144e-06, "logits/chosen": -2.7993502616882324, "logits/rejected": -2.7518744468688965, "logps/chosen": -423.250732421875, "logps/rejected": -288.00103759765625, "loss": 0.4407, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5895527601242065, "rewards/margins": 1.6051137447357178, "rewards/rejected": -2.194666624069214, "step": 3580 }, { "epoch": 0.5760128359406338, "grad_norm": 15.0, "learning_rate": 2.2734309471272784e-06, "logits/chosen": -2.8063387870788574, "logits/rejected": -2.7565455436706543, "logps/chosen": -417.3709411621094, "logps/rejected": -294.87579345703125, "loss": 0.453, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.544521152973175, "rewards/margins": 1.4459476470947266, "rewards/rejected": -1.9904687404632568, "step": 3590 }, { "epoch": 0.5776173285198556, "grad_norm": 9.6875, "learning_rate": 2.259487247583303e-06, "logits/chosen": -2.8046133518218994, "logits/rejected": -2.7487637996673584, "logps/chosen": -424.52325439453125, "logps/rejected": -306.76397705078125, "loss": 0.3921, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22226199507713318, "rewards/margins": 1.6710447072982788, "rewards/rejected": -1.8933067321777344, "step": 3600 }, { "epoch": 0.5792218210990774, "grad_norm": 10.5625, "learning_rate": 2.245551095844964e-06, "logits/chosen": -2.8068976402282715, "logits/rejected": -2.767521381378174, "logps/chosen": -381.14898681640625, "logps/rejected": -285.68414306640625, "loss": 0.4716, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.679606020450592, "rewards/margins": 1.5227012634277344, "rewards/rejected": -2.2023074626922607, "step": 3610 }, { "epoch": 0.5808263136782993, "grad_norm": 7.4375, "learning_rate": 2.23162292925857e-06, "logits/chosen": -2.8096930980682373, "logits/rejected": -2.75054669380188, "logps/chosen": -400.10443115234375, "logps/rejected": -265.8934631347656, "loss": 0.3532, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.5253150463104248, "rewards/margins": 1.6780914068222046, "rewards/rejected": -2.203406572341919, "step": 3620 }, { "epoch": 0.5824308062575211, "grad_norm": 13.1875, "learning_rate": 2.2177031849198397e-06, "logits/chosen": -2.8091230392456055, "logits/rejected": -2.760089635848999, "logps/chosen": -377.43359375, "logps/rejected": -261.92449951171875, "loss": 0.431, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7985199689865112, "rewards/margins": 1.551023006439209, "rewards/rejected": -2.3495430946350098, "step": 3630 }, { "epoch": 0.5840352988367429, "grad_norm": 10.6875, "learning_rate": 2.203792299660184e-06, "logits/chosen": -2.807889461517334, "logits/rejected": -2.7618396282196045, "logps/chosen": -430.28240966796875, "logps/rejected": -314.0732727050781, "loss": 0.3836, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.643751323223114, "rewards/margins": 1.571907639503479, "rewards/rejected": -2.215658664703369, "step": 3640 }, { "epoch": 0.5856397914159647, "grad_norm": 13.75, "learning_rate": 2.1898907100329944e-06, "logits/chosen": -2.786302089691162, "logits/rejected": -2.7537412643432617, "logps/chosen": -418.32470703125, "logps/rejected": -327.54327392578125, "loss": 0.4917, "rewards/accuracies": 0.75, "rewards/chosen": -0.873579204082489, "rewards/margins": 1.2870389223098755, "rewards/rejected": -2.1606178283691406, "step": 3650 }, { "epoch": 0.5872442839951865, "grad_norm": 12.375, "learning_rate": 2.175998852299949e-06, "logits/chosen": -2.8101484775543213, "logits/rejected": -2.767287492752075, "logps/chosen": -387.3631591796875, "logps/rejected": -258.545654296875, "loss": 0.4218, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9899168014526367, "rewards/margins": 1.5624148845672607, "rewards/rejected": -2.5523319244384766, "step": 3660 }, { "epoch": 0.5888487765744084, "grad_norm": 5.59375, "learning_rate": 2.1621171624173156e-06, "logits/chosen": -2.816471815109253, "logits/rejected": -2.770481586456299, "logps/chosen": -410.04449462890625, "logps/rejected": -296.2618408203125, "loss": 0.4097, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8739595413208008, "rewards/margins": 1.5006382465362549, "rewards/rejected": -2.3745975494384766, "step": 3670 }, { "epoch": 0.5904532691536302, "grad_norm": 11.5, "learning_rate": 2.1482460760222735e-06, "logits/chosen": -2.801726818084717, "logits/rejected": -2.763115406036377, "logps/chosen": -411.90484619140625, "logps/rejected": -297.09808349609375, "loss": 0.4058, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9806419610977173, "rewards/margins": 1.4209672212600708, "rewards/rejected": -2.401609182357788, "step": 3680 }, { "epoch": 0.592057761732852, "grad_norm": 10.5, "learning_rate": 2.1343860284192423e-06, "logits/chosen": -2.802231788635254, "logits/rejected": -2.7310638427734375, "logps/chosen": -442.01220703125, "logps/rejected": -250.5236358642578, "loss": 0.3811, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.86461341381073, "rewards/margins": 1.7754936218261719, "rewards/rejected": -2.6401069164276123, "step": 3690 }, { "epoch": 0.5936622543120738, "grad_norm": 9.1875, "learning_rate": 2.1205374545662203e-06, "logits/chosen": -2.8051986694335938, "logits/rejected": -2.751840829849243, "logps/chosen": -415.57989501953125, "logps/rejected": -271.5701904296875, "loss": 0.4119, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8999338150024414, "rewards/margins": 1.624054193496704, "rewards/rejected": -2.5239877700805664, "step": 3700 }, { "epoch": 0.5952667468912957, "grad_norm": 12.5625, "learning_rate": 2.1067007890611358e-06, "logits/chosen": -2.7913641929626465, "logits/rejected": -2.754225969314575, "logps/chosen": -384.2332763671875, "logps/rejected": -287.58099365234375, "loss": 0.4116, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8017387390136719, "rewards/margins": 1.48720121383667, "rewards/rejected": -2.288939952850342, "step": 3710 }, { "epoch": 0.5968712394705175, "grad_norm": 10.1875, "learning_rate": 2.0928764661282068e-06, "logits/chosen": -2.8200762271881104, "logits/rejected": -2.761756181716919, "logps/chosen": -418.6720275878906, "logps/rejected": -258.8283386230469, "loss": 0.4177, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9178218841552734, "rewards/margins": 1.3490447998046875, "rewards/rejected": -2.26686692237854, "step": 3720 }, { "epoch": 0.5984757320497393, "grad_norm": 11.375, "learning_rate": 2.0790649196043157e-06, "logits/chosen": -2.78727388381958, "logits/rejected": -2.735844850540161, "logps/chosen": -379.0087890625, "logps/rejected": -238.69290161132812, "loss": 0.4666, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9689494371414185, "rewards/margins": 1.4774259328842163, "rewards/rejected": -2.4463753700256348, "step": 3730 }, { "epoch": 0.600080224628961, "grad_norm": 4.3125, "learning_rate": 2.0652665829253916e-06, "logits/chosen": -2.8110642433166504, "logits/rejected": -2.7456202507019043, "logps/chosen": -480.2578125, "logps/rejected": -271.13677978515625, "loss": 0.4349, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7119728922843933, "rewards/margins": 1.5833182334899902, "rewards/rejected": -2.295290946960449, "step": 3740 }, { "epoch": 0.601684717208183, "grad_norm": 12.1875, "learning_rate": 2.0514818891128134e-06, "logits/chosen": -2.7864766120910645, "logits/rejected": -2.76149320602417, "logps/chosen": -387.9775695800781, "logps/rejected": -295.6427001953125, "loss": 0.4794, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9436639547348022, "rewards/margins": 1.2249329090118408, "rewards/rejected": -2.1685969829559326, "step": 3750 }, { "epoch": 0.6032892097874047, "grad_norm": 10.625, "learning_rate": 2.037711270759816e-06, "logits/chosen": -2.832904100418091, "logits/rejected": -2.758803367614746, "logps/chosen": -439.9232482910156, "logps/rejected": -254.01840209960938, "loss": 0.3563, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6350158452987671, "rewards/margins": 1.8828365802764893, "rewards/rejected": -2.517852306365967, "step": 3760 }, { "epoch": 0.6048937023666265, "grad_norm": 9.125, "learning_rate": 2.023955160017916e-06, "logits/chosen": -2.802201747894287, "logits/rejected": -2.7584195137023926, "logps/chosen": -403.06842041015625, "logps/rejected": -288.36724853515625, "loss": 0.4094, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7723891735076904, "rewards/margins": 1.431028127670288, "rewards/rejected": -2.2034173011779785, "step": 3770 }, { "epoch": 0.6064981949458483, "grad_norm": 8.625, "learning_rate": 2.010213988583351e-06, "logits/chosen": -2.7889671325683594, "logits/rejected": -2.7367241382598877, "logps/chosen": -385.10882568359375, "logps/rejected": -248.38931274414062, "loss": 0.3873, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8519245982170105, "rewards/margins": 1.4799481630325317, "rewards/rejected": -2.3318727016448975, "step": 3780 }, { "epoch": 0.6081026875250702, "grad_norm": 10.875, "learning_rate": 1.99648818768353e-06, "logits/chosen": -2.8192174434661865, "logits/rejected": -2.7767531871795654, "logps/chosen": -407.5501403808594, "logps/rejected": -303.4102783203125, "loss": 0.4393, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7725669145584106, "rewards/margins": 1.3266018629074097, "rewards/rejected": -2.0991690158843994, "step": 3790 }, { "epoch": 0.609707180104292, "grad_norm": 12.375, "learning_rate": 1.9827781880635037e-06, "logits/chosen": -2.802733898162842, "logits/rejected": -2.7493672370910645, "logps/chosen": -415.912109375, "logps/rejected": -269.6252746582031, "loss": 0.4271, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9625340700149536, "rewards/margins": 1.3097140789031982, "rewards/rejected": -2.2722480297088623, "step": 3800 }, { "epoch": 0.6113116726835138, "grad_norm": 10.0, "learning_rate": 1.969084419972441e-06, "logits/chosen": -2.8251876831054688, "logits/rejected": -2.761272430419922, "logps/chosen": -435.23712158203125, "logps/rejected": -261.80963134765625, "loss": 0.4014, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8192535638809204, "rewards/margins": 1.5404853820800781, "rewards/rejected": -2.359738826751709, "step": 3810 }, { "epoch": 0.6129161652627356, "grad_norm": 11.4375, "learning_rate": 1.9554073131501346e-06, "logits/chosen": -2.800419569015503, "logits/rejected": -2.743459701538086, "logps/chosen": -407.4580383300781, "logps/rejected": -268.4112548828125, "loss": 0.4129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7697056531906128, "rewards/margins": 1.5844014883041382, "rewards/rejected": -2.35410737991333, "step": 3820 }, { "epoch": 0.6145206578419575, "grad_norm": 6.125, "learning_rate": 1.941747296813509e-06, "logits/chosen": -2.8017678260803223, "logits/rejected": -2.7457785606384277, "logps/chosen": -416.92523193359375, "logps/rejected": -274.69757080078125, "loss": 0.3604, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9049040079116821, "rewards/margins": 1.5968316793441772, "rewards/rejected": -2.5017356872558594, "step": 3830 }, { "epoch": 0.6161251504211793, "grad_norm": 12.1875, "learning_rate": 1.928104799643153e-06, "logits/chosen": -2.7996907234191895, "logits/rejected": -2.7438995838165283, "logps/chosen": -426.6146545410156, "logps/rejected": -262.61370849609375, "loss": 0.3308, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5919214487075806, "rewards/margins": 1.812429666519165, "rewards/rejected": -2.404350996017456, "step": 3840 }, { "epoch": 0.6177296430004011, "grad_norm": 10.375, "learning_rate": 1.914480249769865e-06, "logits/chosen": -2.8010478019714355, "logits/rejected": -2.7515006065368652, "logps/chosen": -385.528564453125, "logps/rejected": -245.79672241210938, "loss": 0.3509, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7448406219482422, "rewards/margins": 1.6024587154388428, "rewards/rejected": -2.347299337387085, "step": 3850 }, { "epoch": 0.6193341355796229, "grad_norm": 7.46875, "learning_rate": 1.9008740747612222e-06, "logits/chosen": -2.8001396656036377, "logits/rejected": -2.761345386505127, "logps/chosen": -394.0040283203125, "logps/rejected": -281.64373779296875, "loss": 0.4988, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8544265031814575, "rewards/margins": 1.4108295440673828, "rewards/rejected": -2.265255928039551, "step": 3860 }, { "epoch": 0.6209386281588448, "grad_norm": 10.0, "learning_rate": 1.887286701608156e-06, "logits/chosen": -2.782724380493164, "logits/rejected": -2.7342724800109863, "logps/chosen": -381.5513916015625, "logps/rejected": -255.05642700195312, "loss": 0.4027, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.827198326587677, "rewards/margins": 1.528525710105896, "rewards/rejected": -2.3557240962982178, "step": 3870 }, { "epoch": 0.6225431207380666, "grad_norm": 9.1875, "learning_rate": 1.8737185567115557e-06, "logits/chosen": -2.8151192665100098, "logits/rejected": -2.752053737640381, "logps/chosen": -442.13092041015625, "logps/rejected": -271.4375, "loss": 0.3274, "rewards/accuracies": 0.875, "rewards/chosen": -0.6357272863388062, "rewards/margins": 1.7539234161376953, "rewards/rejected": -2.389650583267212, "step": 3880 }, { "epoch": 0.6241476133172884, "grad_norm": 4.59375, "learning_rate": 1.8601700658688887e-06, "logits/chosen": -2.796959400177002, "logits/rejected": -2.740633487701416, "logps/chosen": -405.79046630859375, "logps/rejected": -240.43087768554688, "loss": 0.4031, "rewards/accuracies": 0.8125, "rewards/chosen": -0.849949836730957, "rewards/margins": 1.612457036972046, "rewards/rejected": -2.462407350540161, "step": 3890 }, { "epoch": 0.6257521058965102, "grad_norm": 8.625, "learning_rate": 1.8466416542608338e-06, "logits/chosen": -2.80003023147583, "logits/rejected": -2.7316930294036865, "logps/chosen": -438.61419677734375, "logps/rejected": -267.50921630859375, "loss": 0.3552, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5875496864318848, "rewards/margins": 1.692122220993042, "rewards/rejected": -2.279672145843506, "step": 3900 }, { "epoch": 0.627356598475732, "grad_norm": 14.1875, "learning_rate": 1.8331337464379407e-06, "logits/chosen": -2.8307840824127197, "logits/rejected": -2.7464661598205566, "logps/chosen": -456.8536071777344, "logps/rejected": -255.9669952392578, "loss": 0.3793, "rewards/accuracies": 0.84375, "rewards/chosen": -0.45472201704978943, "rewards/margins": 1.880006194114685, "rewards/rejected": -2.334728240966797, "step": 3910 }, { "epoch": 0.6289610910549539, "grad_norm": 8.625, "learning_rate": 1.8196467663073081e-06, "logits/chosen": -2.795959949493408, "logits/rejected": -2.7440707683563232, "logps/chosen": -407.6551208496094, "logps/rejected": -268.53533935546875, "loss": 0.3886, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7915475368499756, "rewards/margins": 1.6384340524673462, "rewards/rejected": -2.4299817085266113, "step": 3920 }, { "epoch": 0.6305655836341757, "grad_norm": 11.5625, "learning_rate": 1.8061811371192775e-06, "logits/chosen": -2.8122072219848633, "logits/rejected": -2.769970655441284, "logps/chosen": -392.17962646484375, "logps/rejected": -294.79644775390625, "loss": 0.3327, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5803684592247009, "rewards/margins": 1.71274733543396, "rewards/rejected": -2.2931158542633057, "step": 3930 }, { "epoch": 0.6321700762133975, "grad_norm": 9.625, "learning_rate": 1.7927372814541522e-06, "logits/chosen": -2.795044422149658, "logits/rejected": -2.7566821575164795, "logps/chosen": -403.79852294921875, "logps/rejected": -298.9261779785156, "loss": 0.4715, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8045080900192261, "rewards/margins": 1.3144686222076416, "rewards/rejected": -2.1189768314361572, "step": 3940 }, { "epoch": 0.6337745687926193, "grad_norm": 11.875, "learning_rate": 1.7793156212089352e-06, "logits/chosen": -2.7902801036834717, "logits/rejected": -2.7436933517456055, "logps/chosen": -392.6355895996094, "logps/rejected": -261.9256286621094, "loss": 0.4482, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8570887446403503, "rewards/margins": 1.7191225290298462, "rewards/rejected": -2.576211452484131, "step": 3950 }, { "epoch": 0.6353790613718412, "grad_norm": 13.0625, "learning_rate": 1.7659165775840908e-06, "logits/chosen": -2.805176258087158, "logits/rejected": -2.7722439765930176, "logps/chosen": -372.74688720703125, "logps/rejected": -282.3138732910156, "loss": 0.45, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7092429995536804, "rewards/margins": 1.336705207824707, "rewards/rejected": -2.0459482669830322, "step": 3960 }, { "epoch": 0.636983553951063, "grad_norm": 13.125, "learning_rate": 1.7525405710703237e-06, "logits/chosen": -2.801262855529785, "logits/rejected": -2.7363991737365723, "logps/chosen": -473.7757873535156, "logps/rejected": -280.92608642578125, "loss": 0.405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6490505337715149, "rewards/margins": 1.60300612449646, "rewards/rejected": -2.25205659866333, "step": 3970 }, { "epoch": 0.6385880465302848, "grad_norm": 8.0, "learning_rate": 1.7391880214353869e-06, "logits/chosen": -2.77987003326416, "logits/rejected": -2.7330288887023926, "logps/chosen": -386.94366455078125, "logps/rejected": -275.6929016113281, "loss": 0.3933, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7192014455795288, "rewards/margins": 1.5411689281463623, "rewards/rejected": -2.2603704929351807, "step": 3980 }, { "epoch": 0.6401925391095066, "grad_norm": 10.75, "learning_rate": 1.7258593477109043e-06, "logits/chosen": -2.801795244216919, "logits/rejected": -2.7691497802734375, "logps/chosen": -391.75494384765625, "logps/rejected": -291.85235595703125, "loss": 0.481, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8905754089355469, "rewards/margins": 1.2898279428482056, "rewards/rejected": -2.180403232574463, "step": 3990 }, { "epoch": 0.6417970316887285, "grad_norm": 9.3125, "learning_rate": 1.7125549681792233e-06, "logits/chosen": -2.8172945976257324, "logits/rejected": -2.7663023471832275, "logps/chosen": -393.18670654296875, "logps/rejected": -271.5379638671875, "loss": 0.4329, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7562323808670044, "rewards/margins": 1.4369302988052368, "rewards/rejected": -2.193162679672241, "step": 4000 }, { "epoch": 0.6434015242679503, "grad_norm": 5.96875, "learning_rate": 1.699275300360288e-06, "logits/chosen": -2.7967867851257324, "logits/rejected": -2.752283811569214, "logps/chosen": -403.39202880859375, "logps/rejected": -274.4188537597656, "loss": 0.3669, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6664668321609497, "rewards/margins": 1.6767393350601196, "rewards/rejected": -2.3432061672210693, "step": 4010 }, { "epoch": 0.6450060168471721, "grad_norm": 12.5, "learning_rate": 1.6860207609985355e-06, "logits/chosen": -2.801706075668335, "logits/rejected": -2.7509706020355225, "logps/chosen": -403.73394775390625, "logps/rejected": -290.6875915527344, "loss": 0.4541, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8625680804252625, "rewards/margins": 1.4129666090011597, "rewards/rejected": -2.2755348682403564, "step": 4020 }, { "epoch": 0.6466105094263939, "grad_norm": 9.5625, "learning_rate": 1.6727917660498169e-06, "logits/chosen": -2.788971424102783, "logits/rejected": -2.7410120964050293, "logps/chosen": -403.1820983886719, "logps/rejected": -273.03753662109375, "loss": 0.45, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7781115770339966, "rewards/margins": 1.3518060445785522, "rewards/rejected": -2.129917621612549, "step": 4030 }, { "epoch": 0.6482150020056158, "grad_norm": 10.375, "learning_rate": 1.6595887306683473e-06, "logits/chosen": -2.7828314304351807, "logits/rejected": -2.755638360977173, "logps/chosen": -385.1922607421875, "logps/rejected": -316.51531982421875, "loss": 0.434, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8118616342544556, "rewards/margins": 1.2999560832977295, "rewards/rejected": -2.1118178367614746, "step": 4040 }, { "epoch": 0.6498194945848376, "grad_norm": 9.4375, "learning_rate": 1.646412069193673e-06, "logits/chosen": -2.808457851409912, "logits/rejected": -2.7556493282318115, "logps/chosen": -426.4422302246094, "logps/rejected": -281.2369079589844, "loss": 0.3933, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.7109230756759644, "rewards/margins": 1.5413486957550049, "rewards/rejected": -2.2522716522216797, "step": 4050 }, { "epoch": 0.6514239871640594, "grad_norm": 11.5, "learning_rate": 1.6332621951376709e-06, "logits/chosen": -2.791311264038086, "logits/rejected": -2.7577075958251953, "logps/chosen": -398.83612060546875, "logps/rejected": -287.30560302734375, "loss": 0.393, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8398586511611938, "rewards/margins": 1.472285509109497, "rewards/rejected": -2.3121440410614014, "step": 4060 }, { "epoch": 0.6530284797432812, "grad_norm": 11.25, "learning_rate": 1.6201395211715725e-06, "logits/chosen": -2.8144595623016357, "logits/rejected": -2.7607927322387695, "logps/chosen": -450.253662109375, "logps/rejected": -304.55145263671875, "loss": 0.4401, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6317472457885742, "rewards/margins": 1.522066354751587, "rewards/rejected": -2.153813600540161, "step": 4070 }, { "epoch": 0.6546329723225031, "grad_norm": 9.1875, "learning_rate": 1.6070444591130114e-06, "logits/chosen": -2.79302978515625, "logits/rejected": -2.7369484901428223, "logps/chosen": -410.14471435546875, "logps/rejected": -246.12417602539062, "loss": 0.3721, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.6459525227546692, "rewards/margins": 1.5810879468917847, "rewards/rejected": -2.2270407676696777, "step": 4080 }, { "epoch": 0.6562374649017249, "grad_norm": 9.125, "learning_rate": 1.5939774199130986e-06, "logits/chosen": -2.80903697013855, "logits/rejected": -2.7588717937469482, "logps/chosen": -394.8160400390625, "logps/rejected": -273.0166320800781, "loss": 0.4764, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9074431657791138, "rewards/margins": 1.2947542667388916, "rewards/rejected": -2.202197313308716, "step": 4090 }, { "epoch": 0.6578419574809466, "grad_norm": 8.5625, "learning_rate": 1.580938813643531e-06, "logits/chosen": -2.801699161529541, "logits/rejected": -2.754695415496826, "logps/chosen": -415.01800537109375, "logps/rejected": -275.08245849609375, "loss": 0.355, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5942767858505249, "rewards/margins": 1.651201844215393, "rewards/rejected": -2.245478630065918, "step": 4100 }, { "epoch": 0.6594464500601684, "grad_norm": 8.875, "learning_rate": 1.5679290494837162e-06, "logits/chosen": -2.790424346923828, "logits/rejected": -2.748422384262085, "logps/chosen": -423.26361083984375, "logps/rejected": -275.253173828125, "loss": 0.3973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7378716468811035, "rewards/margins": 1.620374083518982, "rewards/rejected": -2.358245372772217, "step": 4110 }, { "epoch": 0.6610509426393902, "grad_norm": 11.8125, "learning_rate": 1.5549485357079371e-06, "logits/chosen": -2.806856632232666, "logits/rejected": -2.7558820247650146, "logps/chosen": -407.7042541503906, "logps/rejected": -281.6960144042969, "loss": 0.4138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6293872594833374, "rewards/margins": 1.5254137516021729, "rewards/rejected": -2.1548011302948, "step": 4120 }, { "epoch": 0.6626554352186121, "grad_norm": 7.75, "learning_rate": 1.5419976796725357e-06, "logits/chosen": -2.8032069206237793, "logits/rejected": -2.7493045330047607, "logps/chosen": -407.879150390625, "logps/rejected": -270.36407470703125, "loss": 0.4222, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7028517723083496, "rewards/margins": 1.4742443561553955, "rewards/rejected": -2.177096366882324, "step": 4130 }, { "epoch": 0.6642599277978339, "grad_norm": 11.0625, "learning_rate": 1.5290768878031298e-06, "logits/chosen": -2.8034045696258545, "logits/rejected": -2.758371114730835, "logps/chosen": -372.4503479003906, "logps/rejected": -268.8387756347656, "loss": 0.4505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.775677502155304, "rewards/margins": 1.4257248640060425, "rewards/rejected": -2.201402425765991, "step": 4140 }, { "epoch": 0.6658644203770557, "grad_norm": 12.1875, "learning_rate": 1.5161865655818602e-06, "logits/chosen": -2.8029770851135254, "logits/rejected": -2.7562177181243896, "logps/chosen": -446.55206298828125, "logps/rejected": -309.4889221191406, "loss": 0.3604, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5812132954597473, "rewards/margins": 1.6959142684936523, "rewards/rejected": -2.277127742767334, "step": 4150 }, { "epoch": 0.6674689129562775, "grad_norm": 6.375, "learning_rate": 1.503327117534666e-06, "logits/chosen": -2.8109450340270996, "logits/rejected": -2.74576473236084, "logps/chosen": -379.79437255859375, "logps/rejected": -231.78759765625, "loss": 0.4288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7933824062347412, "rewards/margins": 1.4210169315338135, "rewards/rejected": -2.2143993377685547, "step": 4160 }, { "epoch": 0.6690734055354994, "grad_norm": 9.5625, "learning_rate": 1.4904989472185854e-06, "logits/chosen": -2.7928597927093506, "logits/rejected": -2.7351901531219482, "logps/chosen": -419.4825134277344, "logps/rejected": -264.5611267089844, "loss": 0.4502, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7392198443412781, "rewards/margins": 1.3908106088638306, "rewards/rejected": -2.1300301551818848, "step": 4170 }, { "epoch": 0.6706778981147212, "grad_norm": 4.84375, "learning_rate": 1.4777024572090964e-06, "logits/chosen": -2.8204879760742188, "logits/rejected": -2.7638959884643555, "logps/chosen": -407.22357177734375, "logps/rejected": -263.539306640625, "loss": 0.4033, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6842435598373413, "rewards/margins": 1.6940816640853882, "rewards/rejected": -2.3783254623413086, "step": 4180 }, { "epoch": 0.672282390693943, "grad_norm": 13.125, "learning_rate": 1.4649380490874824e-06, "logits/chosen": -2.8005738258361816, "logits/rejected": -2.753756046295166, "logps/chosen": -378.85406494140625, "logps/rejected": -267.27532958984375, "loss": 0.4361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6773296594619751, "rewards/margins": 1.384840726852417, "rewards/rejected": -2.0621705055236816, "step": 4190 }, { "epoch": 0.6738868832731648, "grad_norm": 11.5625, "learning_rate": 1.452206123428225e-06, "logits/chosen": -2.8057713508605957, "logits/rejected": -2.7538790702819824, "logps/chosen": -404.1257019042969, "logps/rejected": -267.3128662109375, "loss": 0.4553, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8507968783378601, "rewards/margins": 1.4706299304962158, "rewards/rejected": -2.3214268684387207, "step": 4200 }, { "epoch": 0.6754913758523867, "grad_norm": 9.0, "learning_rate": 1.4395070797864401e-06, "logits/chosen": -2.8088631629943848, "logits/rejected": -2.7706990242004395, "logps/chosen": -391.8755798339844, "logps/rejected": -298.5284118652344, "loss": 0.3645, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7465029954910278, "rewards/margins": 1.6624934673309326, "rewards/rejected": -2.408996343612671, "step": 4210 }, { "epoch": 0.6770958684316085, "grad_norm": 12.5625, "learning_rate": 1.4268413166853329e-06, "logits/chosen": -2.817951202392578, "logits/rejected": -2.7444024085998535, "logps/chosen": -500.7591247558594, "logps/rejected": -262.9937438964844, "loss": 0.4206, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7506982684135437, "rewards/margins": 1.5855884552001953, "rewards/rejected": -2.336286783218384, "step": 4220 }, { "epoch": 0.6787003610108303, "grad_norm": 12.1875, "learning_rate": 1.4142092316036934e-06, "logits/chosen": -2.8122165203094482, "logits/rejected": -2.766618251800537, "logps/chosen": -400.4319152832031, "logps/rejected": -277.3241271972656, "loss": 0.4859, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8061878085136414, "rewards/margins": 1.3526837825775146, "rewards/rejected": -2.1588714122772217, "step": 4230 }, { "epoch": 0.6803048535900521, "grad_norm": 10.6875, "learning_rate": 1.4016112209634258e-06, "logits/chosen": -2.7965214252471924, "logits/rejected": -2.741909980773926, "logps/chosen": -391.38580322265625, "logps/rejected": -247.2108612060547, "loss": 0.4492, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.791666567325592, "rewards/margins": 1.3734793663024902, "rewards/rejected": -2.1651456356048584, "step": 4240 }, { "epoch": 0.681909346169274, "grad_norm": 13.875, "learning_rate": 1.3890476801171045e-06, "logits/chosen": -2.8129959106445312, "logits/rejected": -2.766711950302124, "logps/chosen": -398.63482666015625, "logps/rejected": -278.9686584472656, "loss": 0.4787, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6726212501525879, "rewards/margins": 1.4514776468276978, "rewards/rejected": -2.124098777770996, "step": 4250 }, { "epoch": 0.6835138387484958, "grad_norm": 10.5, "learning_rate": 1.3765190033355657e-06, "logits/chosen": -2.8034119606018066, "logits/rejected": -2.734060287475586, "logps/chosen": -473.5391540527344, "logps/rejected": -273.9928894042969, "loss": 0.4239, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6124533414840698, "rewards/margins": 1.5313068628311157, "rewards/rejected": -2.1437602043151855, "step": 4260 }, { "epoch": 0.6851183313277176, "grad_norm": 9.25, "learning_rate": 1.3640255837955403e-06, "logits/chosen": -2.8076891899108887, "logits/rejected": -2.7525928020477295, "logps/chosen": -397.5497741699219, "logps/rejected": -254.95108032226562, "loss": 0.4551, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8338388204574585, "rewards/margins": 1.3069957494735718, "rewards/rejected": -2.140834331512451, "step": 4270 }, { "epoch": 0.6867228239069394, "grad_norm": 12.6875, "learning_rate": 1.3515678135673072e-06, "logits/chosen": -2.7849678993225098, "logits/rejected": -2.74548602104187, "logps/chosen": -371.60882568359375, "logps/rejected": -260.60211181640625, "loss": 0.3626, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7687543630599976, "rewards/margins": 1.6653320789337158, "rewards/rejected": -2.434086322784424, "step": 4280 }, { "epoch": 0.6883273164861613, "grad_norm": 14.125, "learning_rate": 1.339146083602396e-06, "logits/chosen": -2.8166301250457764, "logits/rejected": -2.755711793899536, "logps/chosen": -448.94390869140625, "logps/rejected": -285.28448486328125, "loss": 0.43, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6430542469024658, "rewards/margins": 1.4854627847671509, "rewards/rejected": -2.128516674041748, "step": 4290 }, { "epoch": 0.6899318090653831, "grad_norm": 11.0625, "learning_rate": 1.3267607837213166e-06, "logits/chosen": -2.804636001586914, "logits/rejected": -2.7480268478393555, "logps/chosen": -485.5668029785156, "logps/rejected": -305.345458984375, "loss": 0.4478, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5493478775024414, "rewards/margins": 1.5380263328552246, "rewards/rejected": -2.087374210357666, "step": 4300 }, { "epoch": 0.6915363016446049, "grad_norm": 10.5, "learning_rate": 1.3144123026013216e-06, "logits/chosen": -2.7974398136138916, "logits/rejected": -2.749955177307129, "logps/chosen": -403.5362548828125, "logps/rejected": -264.06939697265625, "loss": 0.4487, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7713372111320496, "rewards/margins": 1.3487266302108765, "rewards/rejected": -2.1200637817382812, "step": 4310 }, { "epoch": 0.6931407942238267, "grad_norm": 12.5, "learning_rate": 1.3021010277642145e-06, "logits/chosen": -2.785207748413086, "logits/rejected": -2.761369228363037, "logps/chosen": -330.01873779296875, "logps/rejected": -277.1678466796875, "loss": 0.4993, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0054681301116943, "rewards/margins": 1.174461007118225, "rewards/rejected": -2.179929256439209, "step": 4320 }, { "epoch": 0.6947452868030486, "grad_norm": 8.8125, "learning_rate": 1.2898273455641863e-06, "logits/chosen": -2.813948392868042, "logits/rejected": -2.744499683380127, "logps/chosen": -426.91864013671875, "logps/rejected": -260.5456848144531, "loss": 0.3775, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.651974081993103, "rewards/margins": 1.5958582162857056, "rewards/rejected": -2.2478320598602295, "step": 4330 }, { "epoch": 0.6963497793822704, "grad_norm": 15.125, "learning_rate": 1.277591641175687e-06, "logits/chosen": -2.7950873374938965, "logits/rejected": -2.750697374343872, "logps/chosen": -380.3634948730469, "logps/rejected": -258.3779602050781, "loss": 0.4684, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7944061756134033, "rewards/margins": 1.4210004806518555, "rewards/rejected": -2.215406894683838, "step": 4340 }, { "epoch": 0.6979542719614922, "grad_norm": 13.375, "learning_rate": 1.2653942985813459e-06, "logits/chosen": -2.809816837310791, "logits/rejected": -2.7542736530303955, "logps/chosen": -416.45159912109375, "logps/rejected": -290.12432861328125, "loss": 0.4728, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8704196214675903, "rewards/margins": 1.262623906135559, "rewards/rejected": -2.1330435276031494, "step": 4350 }, { "epoch": 0.699558764540714, "grad_norm": 7.5, "learning_rate": 1.2532357005599127e-06, "logits/chosen": -2.834868907928467, "logits/rejected": -2.770932674407959, "logps/chosen": -416.9718322753906, "logps/rejected": -260.26422119140625, "loss": 0.3538, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6441753506660461, "rewards/margins": 1.5882630348205566, "rewards/rejected": -2.232438564300537, "step": 4360 }, { "epoch": 0.7011632571199358, "grad_norm": 11.375, "learning_rate": 1.241116228674254e-06, "logits/chosen": -2.803190231323242, "logits/rejected": -2.7446179389953613, "logps/chosen": -426.1937561035156, "logps/rejected": -259.77862548828125, "loss": 0.3326, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5243372321128845, "rewards/margins": 1.7193145751953125, "rewards/rejected": -2.243651866912842, "step": 4370 }, { "epoch": 0.7027677496991577, "grad_norm": 9.1875, "learning_rate": 1.2290362632593697e-06, "logits/chosen": -2.806694507598877, "logits/rejected": -2.7578351497650146, "logps/chosen": -380.10235595703125, "logps/rejected": -263.94195556640625, "loss": 0.432, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9417054057121277, "rewards/margins": 1.3510887622833252, "rewards/rejected": -2.2927939891815186, "step": 4380 }, { "epoch": 0.7043722422783795, "grad_norm": 13.5, "learning_rate": 1.2169961834104652e-06, "logits/chosen": -2.8086371421813965, "logits/rejected": -2.767411470413208, "logps/chosen": -420.53582763671875, "logps/rejected": -295.87835693359375, "loss": 0.4494, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8231533765792847, "rewards/margins": 1.3802332878112793, "rewards/rejected": -2.2033867835998535, "step": 4390 }, { "epoch": 0.7059767348576013, "grad_norm": 10.125, "learning_rate": 1.204996366971051e-06, "logits/chosen": -2.8047499656677246, "logits/rejected": -2.7570178508758545, "logps/chosen": -402.4431457519531, "logps/rejected": -292.07733154296875, "loss": 0.4298, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8702141642570496, "rewards/margins": 1.4339237213134766, "rewards/rejected": -2.30413818359375, "step": 4400 }, { "epoch": 0.7075812274368231, "grad_norm": 10.5, "learning_rate": 1.1930371905210835e-06, "logits/chosen": -2.8232624530792236, "logits/rejected": -2.775325059890747, "logps/chosen": -383.77874755859375, "logps/rejected": -254.4361572265625, "loss": 0.4084, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9907976388931274, "rewards/margins": 1.421443223953247, "rewards/rejected": -2.412240505218506, "step": 4410 }, { "epoch": 0.709185720016045, "grad_norm": 9.0, "learning_rate": 1.1811190293651514e-06, "logits/chosen": -2.7877371311187744, "logits/rejected": -2.7363650798797607, "logps/chosen": -430.14208984375, "logps/rejected": -278.2581481933594, "loss": 0.3452, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.718539834022522, "rewards/margins": 1.567847490310669, "rewards/rejected": -2.2863872051239014, "step": 4420 }, { "epoch": 0.7107902125952668, "grad_norm": 8.9375, "learning_rate": 1.1692422575206958e-06, "logits/chosen": -2.817267894744873, "logits/rejected": -2.7536392211914062, "logps/chosen": -386.74688720703125, "logps/rejected": -237.7115020751953, "loss": 0.346, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7511005401611328, "rewards/margins": 1.626056432723999, "rewards/rejected": -2.3771567344665527, "step": 4430 }, { "epoch": 0.7123947051744886, "grad_norm": 12.375, "learning_rate": 1.1574072477062686e-06, "logits/chosen": -2.8182742595672607, "logits/rejected": -2.777460813522339, "logps/chosen": -403.7441101074219, "logps/rejected": -285.347412109375, "loss": 0.4284, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8748002052307129, "rewards/margins": 1.431445837020874, "rewards/rejected": -2.306246280670166, "step": 4440 }, { "epoch": 0.7139991977537103, "grad_norm": 14.5, "learning_rate": 1.1456143713298457e-06, "logits/chosen": -2.8200080394744873, "logits/rejected": -2.7531981468200684, "logps/chosen": -440.8792419433594, "logps/rejected": -253.48287963867188, "loss": 0.3961, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8072819709777832, "rewards/margins": 1.527416706085205, "rewards/rejected": -2.3346986770629883, "step": 4450 }, { "epoch": 0.7156036903329323, "grad_norm": 7.90625, "learning_rate": 1.1338639984771594e-06, "logits/chosen": -2.7980704307556152, "logits/rejected": -2.7625181674957275, "logps/chosen": -394.828125, "logps/rejected": -296.4634094238281, "loss": 0.4301, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7991641759872437, "rewards/margins": 1.427180528640747, "rewards/rejected": -2.2263448238372803, "step": 4460 }, { "epoch": 0.717208182912154, "grad_norm": 16.5, "learning_rate": 1.122156497900094e-06, "logits/chosen": -2.8170361518859863, "logits/rejected": -2.775404214859009, "logps/chosen": -387.80010986328125, "logps/rejected": -287.08172607421875, "loss": 0.4676, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9321502447128296, "rewards/margins": 1.3896796703338623, "rewards/rejected": -2.3218297958374023, "step": 4470 }, { "epoch": 0.7188126754913758, "grad_norm": 18.375, "learning_rate": 1.1104922370051105e-06, "logits/chosen": -2.8073785305023193, "logits/rejected": -2.770172595977783, "logps/chosen": -398.55218505859375, "logps/rejected": -302.452392578125, "loss": 0.4767, "rewards/accuracies": 0.75, "rewards/chosen": -0.7817176580429077, "rewards/margins": 1.360367774963379, "rewards/rejected": -2.142085075378418, "step": 4480 }, { "epoch": 0.7204171680705976, "grad_norm": 14.8125, "learning_rate": 1.0988715818417137e-06, "logits/chosen": -2.7933382987976074, "logits/rejected": -2.738826036453247, "logps/chosen": -436.403076171875, "logps/rejected": -276.77423095703125, "loss": 0.3807, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6646957397460938, "rewards/margins": 1.6066029071807861, "rewards/rejected": -2.27129864692688, "step": 4490 }, { "epoch": 0.7220216606498195, "grad_norm": 11.5, "learning_rate": 1.0872948970909664e-06, "logits/chosen": -2.809124231338501, "logits/rejected": -2.7516136169433594, "logps/chosen": -417.6109924316406, "logps/rejected": -295.6455383300781, "loss": 0.4143, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9587324261665344, "rewards/margins": 1.4250155687332153, "rewards/rejected": -2.3837480545043945, "step": 4500 }, { "epoch": 0.7236261532290413, "grad_norm": 10.4375, "learning_rate": 1.0757625460540483e-06, "logits/chosen": -2.7933425903320312, "logits/rejected": -2.7361536026000977, "logps/chosen": -443.94757080078125, "logps/rejected": -263.0430603027344, "loss": 0.3371, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.8229011297225952, "rewards/margins": 1.6949036121368408, "rewards/rejected": -2.5178046226501465, "step": 4510 }, { "epoch": 0.7252306458082631, "grad_norm": 10.9375, "learning_rate": 1.0642748906408523e-06, "logits/chosen": -2.8089661598205566, "logits/rejected": -2.74162220954895, "logps/chosen": -392.43804931640625, "logps/rejected": -249.172119140625, "loss": 0.3369, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.8470695614814758, "rewards/margins": 1.6172224283218384, "rewards/rejected": -2.464291572570801, "step": 4520 }, { "epoch": 0.7268351383874849, "grad_norm": 9.4375, "learning_rate": 1.0528322913586234e-06, "logits/chosen": -2.8351688385009766, "logits/rejected": -2.7699105739593506, "logps/chosen": -449.22991943359375, "logps/rejected": -291.88616943359375, "loss": 0.439, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8365947008132935, "rewards/margins": 1.4896392822265625, "rewards/rejected": -2.3262341022491455, "step": 4530 }, { "epoch": 0.7284396309667068, "grad_norm": 7.71875, "learning_rate": 1.0414351073006515e-06, "logits/chosen": -2.832261323928833, "logits/rejected": -2.776916027069092, "logps/chosen": -381.5976257324219, "logps/rejected": -259.5262756347656, "loss": 0.4096, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8025308847427368, "rewards/margins": 1.5367860794067383, "rewards/rejected": -2.3393168449401855, "step": 4540 }, { "epoch": 0.7300441235459286, "grad_norm": 8.0625, "learning_rate": 1.030083696135e-06, "logits/chosen": -2.8223702907562256, "logits/rejected": -2.7592432498931885, "logps/chosen": -457.82373046875, "logps/rejected": -285.7713317871094, "loss": 0.3861, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.45495176315307617, "rewards/margins": 1.7584136724472046, "rewards/rejected": -2.2133655548095703, "step": 4550 }, { "epoch": 0.7316486161251504, "grad_norm": 8.8125, "learning_rate": 1.0187784140932775e-06, "logits/chosen": -2.8175137042999268, "logits/rejected": -2.754703998565674, "logps/chosen": -450.48016357421875, "logps/rejected": -272.6295166015625, "loss": 0.4149, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7075556516647339, "rewards/margins": 1.44368314743042, "rewards/rejected": -2.1512389183044434, "step": 4560 }, { "epoch": 0.7332531087043722, "grad_norm": 13.75, "learning_rate": 1.0075196159594647e-06, "logits/chosen": -2.8184738159179688, "logits/rejected": -2.753551483154297, "logps/chosen": -437.98028564453125, "logps/rejected": -260.2247009277344, "loss": 0.4249, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7240505218505859, "rewards/margins": 1.554195761680603, "rewards/rejected": -2.2782464027404785, "step": 4570 }, { "epoch": 0.734857601283594, "grad_norm": 7.6875, "learning_rate": 9.96307655058778e-07, "logits/chosen": -2.7819628715515137, "logits/rejected": -2.734687328338623, "logps/chosen": -408.7232666015625, "logps/rejected": -301.0596923828125, "loss": 0.432, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9796217679977417, "rewards/margins": 1.279337763786316, "rewards/rejected": -2.2589597702026367, "step": 4580 }, { "epoch": 0.7364620938628159, "grad_norm": 14.0, "learning_rate": 9.851428832465778e-07, "logits/chosen": -2.8136870861053467, "logits/rejected": -2.767388105392456, "logps/chosen": -453.814208984375, "logps/rejected": -302.7470703125, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": -0.7576819658279419, "rewards/margins": 1.3704469203948975, "rewards/rejected": -2.12812876701355, "step": 4590 }, { "epoch": 0.7380665864420377, "grad_norm": 8.0625, "learning_rate": 9.740256508973331e-07, "logits/chosen": -2.8012802600860596, "logits/rejected": -2.733184337615967, "logps/chosen": -441.34185791015625, "logps/rejected": -259.69256591796875, "loss": 0.343, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5872552394866943, "rewards/margins": 1.8577486276626587, "rewards/rejected": -2.4450037479400635, "step": 4600 }, { "epoch": 0.7396710790212595, "grad_norm": 7.625, "learning_rate": 9.629563068936217e-07, "logits/chosen": -2.810722589492798, "logits/rejected": -2.759697198867798, "logps/chosen": -384.3507385253906, "logps/rejected": -270.04986572265625, "loss": 0.4093, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9548600912094116, "rewards/margins": 1.6129192113876343, "rewards/rejected": -2.567779302597046, "step": 4610 }, { "epoch": 0.7412755716004813, "grad_norm": 10.625, "learning_rate": 9.519351986151812e-07, "logits/chosen": -2.7945523262023926, "logits/rejected": -2.7523467540740967, "logps/chosen": -395.2113952636719, "logps/rejected": -248.64993286132812, "loss": 0.4123, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8667179942131042, "rewards/margins": 1.5264112949371338, "rewards/rejected": -2.393129348754883, "step": 4620 }, { "epoch": 0.7428800641797032, "grad_norm": 8.125, "learning_rate": 9.409626719280108e-07, "logits/chosen": -2.802006483078003, "logits/rejected": -2.755690813064575, "logps/chosen": -405.69061279296875, "logps/rejected": -291.0460510253906, "loss": 0.3912, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8145783543586731, "rewards/margins": 1.5481194257736206, "rewards/rejected": -2.3626980781555176, "step": 4630 }, { "epoch": 0.744484556758925, "grad_norm": 10.625, "learning_rate": 9.300390711735133e-07, "logits/chosen": -2.7932212352752686, "logits/rejected": -2.7438507080078125, "logps/chosen": -385.64312744140625, "logps/rejected": -261.5687561035156, "loss": 0.4484, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9915562868118286, "rewards/margins": 1.4462177753448486, "rewards/rejected": -2.437774181365967, "step": 4640 }, { "epoch": 0.7460890493381468, "grad_norm": 13.5, "learning_rate": 9.191647391576913e-07, "logits/chosen": -2.8304948806762695, "logits/rejected": -2.77006459236145, "logps/chosen": -402.6106872558594, "logps/rejected": -269.8332824707031, "loss": 0.3605, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8552454710006714, "rewards/margins": 1.646910309791565, "rewards/rejected": -2.5021557807922363, "step": 4650 }, { "epoch": 0.7476935419173686, "grad_norm": 10.625, "learning_rate": 9.0834001714039e-07, "logits/chosen": -2.8250904083251953, "logits/rejected": -2.767458438873291, "logps/chosen": -407.00238037109375, "logps/rejected": -274.154541015625, "loss": 0.3806, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.6669026017189026, "rewards/margins": 1.5693833827972412, "rewards/rejected": -2.236286163330078, "step": 4660 }, { "epoch": 0.7492980344965905, "grad_norm": 6.375, "learning_rate": 8.975652448245867e-07, "logits/chosen": -2.8188765048980713, "logits/rejected": -2.734626293182373, "logps/chosen": -469.0780334472656, "logps/rejected": -237.9582061767578, "loss": 0.3104, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.6751123666763306, "rewards/margins": 1.900902509689331, "rewards/rejected": -2.576014995574951, "step": 4670 }, { "epoch": 0.7509025270758123, "grad_norm": 8.25, "learning_rate": 8.868407603457272e-07, "logits/chosen": -2.8171775341033936, "logits/rejected": -2.7653214931488037, "logps/chosen": -425.9637756347656, "logps/rejected": -310.14056396484375, "loss": 0.3989, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7970841526985168, "rewards/margins": 1.5395481586456299, "rewards/rejected": -2.336632013320923, "step": 4680 }, { "epoch": 0.7525070196550341, "grad_norm": 7.5, "learning_rate": 8.761669002611201e-07, "logits/chosen": -2.804945945739746, "logits/rejected": -2.7537198066711426, "logps/chosen": -466.0166931152344, "logps/rejected": -330.3244934082031, "loss": 0.3887, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.49158304929733276, "rewards/margins": 1.7408682107925415, "rewards/rejected": -2.2324509620666504, "step": 4690 }, { "epoch": 0.7541115122342559, "grad_norm": 14.625, "learning_rate": 8.655439995393714e-07, "logits/chosen": -2.8074443340301514, "logits/rejected": -2.7576651573181152, "logps/chosen": -403.48199462890625, "logps/rejected": -282.7434387207031, "loss": 0.4444, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8128441572189331, "rewards/margins": 1.4252400398254395, "rewards/rejected": -2.238084316253662, "step": 4700 }, { "epoch": 0.7557160048134778, "grad_norm": 10.0625, "learning_rate": 8.549723915498717e-07, "logits/chosen": -2.794193983078003, "logits/rejected": -2.7191898822784424, "logps/chosen": -422.3136291503906, "logps/rejected": -211.6849822998047, "loss": 0.3696, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9110020399093628, "rewards/margins": 1.920965552330017, "rewards/rejected": -2.831968069076538, "step": 4710 }, { "epoch": 0.7573204973926996, "grad_norm": 12.625, "learning_rate": 8.44452408052337e-07, "logits/chosen": -2.8249881267547607, "logits/rejected": -2.7574400901794434, "logps/chosen": -427.70098876953125, "logps/rejected": -247.4449920654297, "loss": 0.3052, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.8516663312911987, "rewards/margins": 1.6736255884170532, "rewards/rejected": -2.525291919708252, "step": 4720 }, { "epoch": 0.7589249899719214, "grad_norm": 8.75, "learning_rate": 8.339843791863972e-07, "logits/chosen": -2.823765277862549, "logits/rejected": -2.767457962036133, "logps/chosen": -410.4039611816406, "logps/rejected": -262.34429931640625, "loss": 0.3691, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6702436208724976, "rewards/margins": 1.8158502578735352, "rewards/rejected": -2.4860939979553223, "step": 4730 }, { "epoch": 0.7605294825511432, "grad_norm": 13.4375, "learning_rate": 8.235686334612308e-07, "logits/chosen": -2.803133964538574, "logits/rejected": -2.746887683868408, "logps/chosen": -437.78082275390625, "logps/rejected": -282.9031677246094, "loss": 0.4283, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8265630006790161, "rewards/margins": 1.518532633781433, "rewards/rejected": -2.345095634460449, "step": 4740 }, { "epoch": 0.7621339751303651, "grad_norm": 11.3125, "learning_rate": 8.132054977452627e-07, "logits/chosen": -2.7960917949676514, "logits/rejected": -2.75126576423645, "logps/chosen": -419.90289306640625, "logps/rejected": -295.45025634765625, "loss": 0.4491, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7044726014137268, "rewards/margins": 1.4898113012313843, "rewards/rejected": -2.194283962249756, "step": 4750 }, { "epoch": 0.7637384677095869, "grad_norm": 9.0625, "learning_rate": 8.028952972559029e-07, "logits/chosen": -2.8335890769958496, "logits/rejected": -2.771564483642578, "logps/chosen": -414.1685485839844, "logps/rejected": -273.7349548339844, "loss": 0.4337, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8614042401313782, "rewards/margins": 1.5476605892181396, "rewards/rejected": -2.409064769744873, "step": 4760 }, { "epoch": 0.7653429602888087, "grad_norm": 11.375, "learning_rate": 7.926383555493386e-07, "logits/chosen": -2.815276622772217, "logits/rejected": -2.745915174484253, "logps/chosen": -430.24359130859375, "logps/rejected": -249.1603240966797, "loss": 0.4177, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9056553840637207, "rewards/margins": 1.4991941452026367, "rewards/rejected": -2.4048492908477783, "step": 4770 }, { "epoch": 0.7669474528680305, "grad_norm": 12.75, "learning_rate": 7.82434994510382e-07, "logits/chosen": -2.8085501194000244, "logits/rejected": -2.7439520359039307, "logps/chosen": -398.37835693359375, "logps/rejected": -225.0926513671875, "loss": 0.4228, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9160206913948059, "rewards/margins": 1.4925036430358887, "rewards/rejected": -2.40852427482605, "step": 4780 }, { "epoch": 0.7685519454472524, "grad_norm": 9.125, "learning_rate": 7.722855343423725e-07, "logits/chosen": -2.7973523139953613, "logits/rejected": -2.752079963684082, "logps/chosen": -410.24072265625, "logps/rejected": -292.62109375, "loss": 0.4391, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6927233338356018, "rewards/margins": 1.5563902854919434, "rewards/rejected": -2.2491135597229004, "step": 4790 }, { "epoch": 0.7701564380264742, "grad_norm": 11.0, "learning_rate": 7.621902935571202e-07, "logits/chosen": -2.796438694000244, "logits/rejected": -2.7655458450317383, "logps/chosen": -367.6105651855469, "logps/rejected": -279.1221008300781, "loss": 0.4285, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0706039667129517, "rewards/margins": 1.3954635858535767, "rewards/rejected": -2.4660675525665283, "step": 4800 }, { "epoch": 0.771760930605696, "grad_norm": 9.3125, "learning_rate": 7.52149588964918e-07, "logits/chosen": -2.8125057220458984, "logits/rejected": -2.7581701278686523, "logps/chosen": -416.6363830566406, "logps/rejected": -287.6896667480469, "loss": 0.347, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7504569888114929, "rewards/margins": 1.662416696548462, "rewards/rejected": -2.4128735065460205, "step": 4810 }, { "epoch": 0.7733654231849177, "grad_norm": 11.1875, "learning_rate": 7.421637356645964e-07, "logits/chosen": -2.8057570457458496, "logits/rejected": -2.7558627128601074, "logps/chosen": -418.97125244140625, "logps/rejected": -279.6209411621094, "loss": 0.4138, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8238815069198608, "rewards/margins": 1.5219407081604004, "rewards/rejected": -2.3458220958709717, "step": 4820 }, { "epoch": 0.7749699157641395, "grad_norm": 11.25, "learning_rate": 7.322330470336314e-07, "logits/chosen": -2.8020358085632324, "logits/rejected": -2.743691921234131, "logps/chosen": -388.35504150390625, "logps/rejected": -251.1291046142578, "loss": 0.3921, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7989343404769897, "rewards/margins": 1.6932752132415771, "rewards/rejected": -2.4922091960906982, "step": 4830 }, { "epoch": 0.7765744083433614, "grad_norm": 6.0, "learning_rate": 7.223578347183167e-07, "logits/chosen": -2.792184591293335, "logits/rejected": -2.7313404083251953, "logps/chosen": -438.1961364746094, "logps/rejected": -276.2207946777344, "loss": 0.3309, "rewards/accuracies": 0.875, "rewards/chosen": -0.6878136396408081, "rewards/margins": 1.8116298913955688, "rewards/rejected": -2.499443292617798, "step": 4840 }, { "epoch": 0.7781789009225832, "grad_norm": 16.375, "learning_rate": 7.1253840862398e-07, "logits/chosen": -2.8089377880096436, "logits/rejected": -2.764955520629883, "logps/chosen": -381.8256530761719, "logps/rejected": -280.2010498046875, "loss": 0.4547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9244373440742493, "rewards/margins": 1.4169368743896484, "rewards/rejected": -2.341374158859253, "step": 4850 }, { "epoch": 0.779783393501805, "grad_norm": 6.6875, "learning_rate": 7.027750769052555e-07, "logits/chosen": -2.814762830734253, "logits/rejected": -2.7603766918182373, "logps/chosen": -450.8922424316406, "logps/rejected": -286.4497375488281, "loss": 0.3534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6029097437858582, "rewards/margins": 1.6924997568130493, "rewards/rejected": -2.295409679412842, "step": 4860 }, { "epoch": 0.7813878860810268, "grad_norm": 7.34375, "learning_rate": 6.930681459564181e-07, "logits/chosen": -2.8057310581207275, "logits/rejected": -2.7384281158447266, "logps/chosen": -445.20550537109375, "logps/rejected": -275.3711242675781, "loss": 0.3424, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.5919303297996521, "rewards/margins": 1.7752958536148071, "rewards/rejected": -2.3672258853912354, "step": 4870 }, { "epoch": 0.7829923786602487, "grad_norm": 7.4375, "learning_rate": 6.834179204017655e-07, "logits/chosen": -2.8050875663757324, "logits/rejected": -2.739454984664917, "logps/chosen": -433.8733825683594, "logps/rejected": -263.2391052246094, "loss": 0.3544, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.8317115902900696, "rewards/margins": 1.7740052938461304, "rewards/rejected": -2.605717182159424, "step": 4880 }, { "epoch": 0.7845968712394705, "grad_norm": 8.75, "learning_rate": 6.738247030860562e-07, "logits/chosen": -2.8118817806243896, "logits/rejected": -2.763822555541992, "logps/chosen": -374.99200439453125, "logps/rejected": -259.0638732910156, "loss": 0.3662, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.750750720500946, "rewards/margins": 1.7818279266357422, "rewards/rejected": -2.532578706741333, "step": 4890 }, { "epoch": 0.7862013638186923, "grad_norm": 17.75, "learning_rate": 6.642887950650112e-07, "logits/chosen": -2.792278289794922, "logits/rejected": -2.735421657562256, "logps/chosen": -422.1402893066406, "logps/rejected": -264.7906494140625, "loss": 0.4786, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8787875175476074, "rewards/margins": 1.543425440788269, "rewards/rejected": -2.422213077545166, "step": 4900 }, { "epoch": 0.7878058563979141, "grad_norm": 11.25, "learning_rate": 6.548104955958612e-07, "logits/chosen": -2.8306758403778076, "logits/rejected": -2.765225887298584, "logps/chosen": -450.4150390625, "logps/rejected": -278.30255126953125, "loss": 0.3911, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7223361730575562, "rewards/margins": 1.609868049621582, "rewards/rejected": -2.3322043418884277, "step": 4910 }, { "epoch": 0.789410348977136, "grad_norm": 8.375, "learning_rate": 6.45390102127956e-07, "logits/chosen": -2.820207118988037, "logits/rejected": -2.758920192718506, "logps/chosen": -448.82354736328125, "logps/rejected": -284.1756896972656, "loss": 0.3858, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6038740873336792, "rewards/margins": 1.7051990032196045, "rewards/rejected": -2.3090732097625732, "step": 4920 }, { "epoch": 0.7910148415563578, "grad_norm": 8.4375, "learning_rate": 6.360279102934335e-07, "logits/chosen": -2.799983263015747, "logits/rejected": -2.7670369148254395, "logps/chosen": -350.095458984375, "logps/rejected": -276.91192626953125, "loss": 0.3791, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9150103330612183, "rewards/margins": 1.4633735418319702, "rewards/rejected": -2.3783836364746094, "step": 4930 }, { "epoch": 0.7926193341355796, "grad_norm": 18.875, "learning_rate": 6.267242138979393e-07, "logits/chosen": -2.792755126953125, "logits/rejected": -2.762261152267456, "logps/chosen": -423.9115295410156, "logps/rejected": -325.4398498535156, "loss": 0.436, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7595723271369934, "rewards/margins": 1.5751006603240967, "rewards/rejected": -2.334672689437866, "step": 4940 }, { "epoch": 0.7942238267148014, "grad_norm": 9.4375, "learning_rate": 6.174793049114042e-07, "logits/chosen": -2.819870710372925, "logits/rejected": -2.751690626144409, "logps/chosen": -445.4422912597656, "logps/rejected": -260.32879638671875, "loss": 0.3886, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7682619690895081, "rewards/margins": 1.7336041927337646, "rewards/rejected": -2.501866102218628, "step": 4950 }, { "epoch": 0.7958283192940233, "grad_norm": 7.03125, "learning_rate": 6.082934734588867e-07, "logits/chosen": -2.8038647174835205, "logits/rejected": -2.745363235473633, "logps/chosen": -408.68463134765625, "logps/rejected": -274.67181396484375, "loss": 0.4051, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9390190839767456, "rewards/margins": 1.5985101461410522, "rewards/rejected": -2.537529468536377, "step": 4960 }, { "epoch": 0.7974328118732451, "grad_norm": 11.375, "learning_rate": 5.991670078114651e-07, "logits/chosen": -2.819920063018799, "logits/rejected": -2.758444309234619, "logps/chosen": -473.35595703125, "logps/rejected": -283.6885986328125, "loss": 0.3786, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.6496865153312683, "rewards/margins": 1.8054357767105103, "rewards/rejected": -2.4551219940185547, "step": 4970 }, { "epoch": 0.7990373044524669, "grad_norm": 5.5625, "learning_rate": 5.901001943771903e-07, "logits/chosen": -2.7976953983306885, "logits/rejected": -2.7535715103149414, "logps/chosen": -395.05364990234375, "logps/rejected": -287.90673828125, "loss": 0.385, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.827285647392273, "rewards/margins": 1.5949347019195557, "rewards/rejected": -2.4222207069396973, "step": 4980 }, { "epoch": 0.8006417970316887, "grad_norm": 13.5625, "learning_rate": 5.810933176921002e-07, "logits/chosen": -2.795240879058838, "logits/rejected": -2.755099058151245, "logps/chosen": -387.1313781738281, "logps/rejected": -282.2012023925781, "loss": 0.4062, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8046302795410156, "rewards/margins": 1.616646409034729, "rewards/rejected": -2.421276807785034, "step": 4990 }, { "epoch": 0.8022462896109106, "grad_norm": 9.625, "learning_rate": 5.721466604112894e-07, "logits/chosen": -2.8130838871002197, "logits/rejected": -2.7531440258026123, "logps/chosen": -434.8575744628906, "logps/rejected": -265.8315124511719, "loss": 0.4762, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8707440495491028, "rewards/margins": 1.463139295578003, "rewards/rejected": -2.333883047103882, "step": 5000 }, { "epoch": 0.8038507821901324, "grad_norm": 11.0, "learning_rate": 5.632605033000363e-07, "logits/chosen": -2.8145740032196045, "logits/rejected": -2.7690517902374268, "logps/chosen": -458.78973388671875, "logps/rejected": -310.5583190917969, "loss": 0.4338, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6770051717758179, "rewards/margins": 1.5068989992141724, "rewards/rejected": -2.183903932571411, "step": 5010 }, { "epoch": 0.8054552747693542, "grad_norm": 8.5625, "learning_rate": 5.544351252249969e-07, "logits/chosen": -2.8062644004821777, "logits/rejected": -2.7377982139587402, "logps/chosen": -474.56591796875, "logps/rejected": -274.51861572265625, "loss": 0.362, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6742371320724487, "rewards/margins": 1.6378129720687866, "rewards/rejected": -2.3120503425598145, "step": 5020 }, { "epoch": 0.807059767348576, "grad_norm": 12.125, "learning_rate": 5.456708031454499e-07, "logits/chosen": -2.8102688789367676, "logits/rejected": -2.758871555328369, "logps/chosen": -374.5927734375, "logps/rejected": -259.72235107421875, "loss": 0.4264, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9330500364303589, "rewards/margins": 1.4185751676559448, "rewards/rejected": -2.3516249656677246, "step": 5030 }, { "epoch": 0.8086642599277978, "grad_norm": 12.0, "learning_rate": 5.369678121046054e-07, "logits/chosen": -2.8132777214050293, "logits/rejected": -2.7517592906951904, "logps/chosen": -403.87255859375, "logps/rejected": -268.4210205078125, "loss": 0.3751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8429446220397949, "rewards/margins": 1.5858006477355957, "rewards/rejected": -2.4287452697753906, "step": 5040 }, { "epoch": 0.8102687525070197, "grad_norm": 9.5625, "learning_rate": 5.283264252209739e-07, "logits/chosen": -2.7996573448181152, "logits/rejected": -2.7516536712646484, "logps/chosen": -445.63232421875, "logps/rejected": -310.7314147949219, "loss": 0.4304, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7545165419578552, "rewards/margins": 1.4638670682907104, "rewards/rejected": -2.218383550643921, "step": 5050 }, { "epoch": 0.8118732450862415, "grad_norm": 11.375, "learning_rate": 5.197469136797967e-07, "logits/chosen": -2.806793689727783, "logits/rejected": -2.751582622528076, "logps/chosen": -407.9443054199219, "logps/rejected": -269.05487060546875, "loss": 0.4448, "rewards/accuracies": 0.75, "rewards/chosen": -1.0222738981246948, "rewards/margins": 1.343806505203247, "rewards/rejected": -2.3660805225372314, "step": 5060 }, { "epoch": 0.8134777376654633, "grad_norm": 16.625, "learning_rate": 5.112295467245326e-07, "logits/chosen": -2.8153493404388428, "logits/rejected": -2.7709174156188965, "logps/chosen": -408.0180358886719, "logps/rejected": -294.43536376953125, "loss": 0.4546, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7921351790428162, "rewards/margins": 1.4281790256500244, "rewards/rejected": -2.2203140258789062, "step": 5070 }, { "epoch": 0.8150822302446851, "grad_norm": 13.75, "learning_rate": 5.02774591648412e-07, "logits/chosen": -2.7976303100585938, "logits/rejected": -2.7505338191986084, "logps/chosen": -437.38616943359375, "logps/rejected": -310.5398864746094, "loss": 0.4299, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7794550657272339, "rewards/margins": 1.5949008464813232, "rewards/rejected": -2.3743557929992676, "step": 5080 }, { "epoch": 0.816686722823907, "grad_norm": 13.125, "learning_rate": 4.943823137860459e-07, "logits/chosen": -2.799306869506836, "logits/rejected": -2.7567639350891113, "logps/chosen": -406.96905517578125, "logps/rejected": -308.3716735839844, "loss": 0.4777, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0190885066986084, "rewards/margins": 1.3996927738189697, "rewards/rejected": -2.418781042098999, "step": 5090 }, { "epoch": 0.8182912154031288, "grad_norm": 9.5, "learning_rate": 4.860529765050986e-07, "logits/chosen": -2.826939582824707, "logits/rejected": -2.769885301589966, "logps/chosen": -440.55303955078125, "logps/rejected": -292.2820739746094, "loss": 0.4236, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8122813105583191, "rewards/margins": 1.38179612159729, "rewards/rejected": -2.194077491760254, "step": 5100 }, { "epoch": 0.8198957079823506, "grad_norm": 7.6875, "learning_rate": 4.777868411980266e-07, "logits/chosen": -2.827862501144409, "logits/rejected": -2.76121187210083, "logps/chosen": -429.16046142578125, "logps/rejected": -270.52423095703125, "loss": 0.3799, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.805324912071228, "rewards/margins": 1.7036962509155273, "rewards/rejected": -2.509021282196045, "step": 5110 }, { "epoch": 0.8215002005615724, "grad_norm": 9.625, "learning_rate": 4.695841672738718e-07, "logits/chosen": -2.7997562885284424, "logits/rejected": -2.7460687160491943, "logps/chosen": -437.3548889160156, "logps/rejected": -285.81341552734375, "loss": 0.4848, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8875702023506165, "rewards/margins": 1.3494956493377686, "rewards/rejected": -2.2370660305023193, "step": 5120 }, { "epoch": 0.8231046931407943, "grad_norm": 10.0, "learning_rate": 4.614452121501209e-07, "logits/chosen": -2.7984540462493896, "logits/rejected": -2.7619614601135254, "logps/chosen": -399.63043212890625, "logps/rejected": -310.70806884765625, "loss": 0.372, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8408582806587219, "rewards/margins": 1.6181284189224243, "rewards/rejected": -2.458986759185791, "step": 5130 }, { "epoch": 0.8247091857200161, "grad_norm": 12.4375, "learning_rate": 4.533702312446295e-07, "logits/chosen": -2.811885356903076, "logits/rejected": -2.7577710151672363, "logps/chosen": -410.259033203125, "logps/rejected": -277.7704772949219, "loss": 0.376, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6400863528251648, "rewards/margins": 1.629621148109436, "rewards/rejected": -2.269707202911377, "step": 5140 }, { "epoch": 0.8263136782992379, "grad_norm": 11.625, "learning_rate": 4.4535947796760514e-07, "logits/chosen": -2.823920726776123, "logits/rejected": -2.7661232948303223, "logps/chosen": -414.277587890625, "logps/rejected": -264.0932312011719, "loss": 0.3542, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7436869740486145, "rewards/margins": 1.6556533575057983, "rewards/rejected": -2.3993401527404785, "step": 5150 }, { "epoch": 0.8279181708784596, "grad_norm": 9.375, "learning_rate": 4.374132037136533e-07, "logits/chosen": -2.8019652366638184, "logits/rejected": -2.7578036785125732, "logps/chosen": -386.10064697265625, "logps/rejected": -272.6463623046875, "loss": 0.4301, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9946942329406738, "rewards/margins": 1.4427231550216675, "rewards/rejected": -2.437417507171631, "step": 5160 }, { "epoch": 0.8295226634576816, "grad_norm": 8.0, "learning_rate": 4.2953165785389104e-07, "logits/chosen": -2.81976056098938, "logits/rejected": -2.7638087272644043, "logps/chosen": -408.5876159667969, "logps/rejected": -278.19818115234375, "loss": 0.3414, "rewards/accuracies": 0.875, "rewards/chosen": -0.7815436720848083, "rewards/margins": 1.7438112497329712, "rewards/rejected": -2.525355100631714, "step": 5170 }, { "epoch": 0.8311271560369033, "grad_norm": 9.8125, "learning_rate": 4.217150877281198e-07, "logits/chosen": -2.8234617710113525, "logits/rejected": -2.7681641578674316, "logps/chosen": -431.0487365722656, "logps/rejected": -286.0492248535156, "loss": 0.4231, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7844264507293701, "rewards/margins": 1.519165277481079, "rewards/rejected": -2.303591728210449, "step": 5180 }, { "epoch": 0.8327316486161251, "grad_norm": 12.6875, "learning_rate": 4.139637386370618e-07, "logits/chosen": -2.823091983795166, "logits/rejected": -2.762180805206299, "logps/chosen": -423.797607421875, "logps/rejected": -281.6072692871094, "loss": 0.3888, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7785607576370239, "rewards/margins": 1.6105117797851562, "rewards/rejected": -2.3890724182128906, "step": 5190 }, { "epoch": 0.8343361411953469, "grad_norm": 16.125, "learning_rate": 4.0627785383466346e-07, "logits/chosen": -2.8171427249908447, "logits/rejected": -2.7706940174102783, "logps/chosen": -376.15740966796875, "logps/rejected": -258.61602783203125, "loss": 0.435, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9728404879570007, "rewards/margins": 1.5463778972625732, "rewards/rejected": -2.5192184448242188, "step": 5200 }, { "epoch": 0.8359406337745688, "grad_norm": 10.9375, "learning_rate": 3.9865767452046287e-07, "logits/chosen": -2.81937837600708, "logits/rejected": -2.758531093597412, "logps/chosen": -424.25958251953125, "logps/rejected": -267.4178161621094, "loss": 0.3608, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.698890209197998, "rewards/margins": 1.6937427520751953, "rewards/rejected": -2.3926329612731934, "step": 5210 }, { "epoch": 0.8375451263537906, "grad_norm": 12.25, "learning_rate": 3.9110343983201906e-07, "logits/chosen": -2.808873414993286, "logits/rejected": -2.7532496452331543, "logps/chosen": -398.35162353515625, "logps/rejected": -275.1828308105469, "loss": 0.3887, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7885446548461914, "rewards/margins": 1.5184005498886108, "rewards/rejected": -2.3069450855255127, "step": 5220 }, { "epoch": 0.8391496189330124, "grad_norm": 12.25, "learning_rate": 3.8361538683740566e-07, "logits/chosen": -2.80633807182312, "logits/rejected": -2.7526285648345947, "logps/chosen": -382.8105163574219, "logps/rejected": -268.76116943359375, "loss": 0.3963, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9611803293228149, "rewards/margins": 1.5241847038269043, "rewards/rejected": -2.485365390777588, "step": 5230 }, { "epoch": 0.8407541115122342, "grad_norm": 9.5625, "learning_rate": 3.761937505277752e-07, "logits/chosen": -2.8122754096984863, "logits/rejected": -2.7356228828430176, "logps/chosen": -422.33740234375, "logps/rejected": -230.3617706298828, "loss": 0.3323, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7025107145309448, "rewards/margins": 1.9366891384124756, "rewards/rejected": -2.639199733734131, "step": 5240 }, { "epoch": 0.8423586040914561, "grad_norm": 8.1875, "learning_rate": 3.688387638099794e-07, "logits/chosen": -2.8197898864746094, "logits/rejected": -2.7610397338867188, "logps/chosen": -433.6830139160156, "logps/rejected": -273.6290588378906, "loss": 0.3521, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7150878310203552, "rewards/margins": 1.6571056842803955, "rewards/rejected": -2.3721930980682373, "step": 5250 }, { "epoch": 0.8439630966706779, "grad_norm": 12.6875, "learning_rate": 3.6155065749926584e-07, "logits/chosen": -2.822070598602295, "logits/rejected": -2.758746862411499, "logps/chosen": -443.93792724609375, "logps/rejected": -282.51910400390625, "loss": 0.4157, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6867502927780151, "rewards/margins": 1.6934669017791748, "rewards/rejected": -2.3802173137664795, "step": 5260 }, { "epoch": 0.8455675892498997, "grad_norm": 10.0625, "learning_rate": 3.543296603120308e-07, "logits/chosen": -2.806684732437134, "logits/rejected": -2.7682671546936035, "logps/chosen": -411.65704345703125, "logps/rejected": -312.7909851074219, "loss": 0.4265, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.837887167930603, "rewards/margins": 1.5839258432388306, "rewards/rejected": -2.4218130111694336, "step": 5270 }, { "epoch": 0.8471720818291215, "grad_norm": 11.9375, "learning_rate": 3.4717599885864037e-07, "logits/chosen": -2.817885160446167, "logits/rejected": -2.7544350624084473, "logps/chosen": -426.88421630859375, "logps/rejected": -270.0431213378906, "loss": 0.3799, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8044511079788208, "rewards/margins": 1.665572166442871, "rewards/rejected": -2.4700231552124023, "step": 5280 }, { "epoch": 0.8487765744083433, "grad_norm": 9.875, "learning_rate": 3.40089897636324e-07, "logits/chosen": -2.7929961681365967, "logits/rejected": -2.7440924644470215, "logps/chosen": -390.76312255859375, "logps/rejected": -270.1073913574219, "loss": 0.3986, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8982022404670715, "rewards/margins": 1.4818925857543945, "rewards/rejected": -2.3800950050354004, "step": 5290 }, { "epoch": 0.8503810669875652, "grad_norm": 7.8125, "learning_rate": 3.330715790221248e-07, "logits/chosen": -2.821324110031128, "logits/rejected": -2.765613079071045, "logps/chosen": -456.3348693847656, "logps/rejected": -283.5493469238281, "loss": 0.3887, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8111359477043152, "rewards/margins": 1.5186651945114136, "rewards/rejected": -2.329801082611084, "step": 5300 }, { "epoch": 0.851985559566787, "grad_norm": 14.5, "learning_rate": 3.261212632659219e-07, "logits/chosen": -2.8117306232452393, "logits/rejected": -2.751755952835083, "logps/chosen": -421.87652587890625, "logps/rejected": -271.7775573730469, "loss": 0.3763, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8682340383529663, "rewards/margins": 1.6525026559829712, "rewards/rejected": -2.5207366943359375, "step": 5310 }, { "epoch": 0.8535900521460088, "grad_norm": 11.25, "learning_rate": 3.192391684835203e-07, "logits/chosen": -2.7990283966064453, "logits/rejected": -2.746638774871826, "logps/chosen": -386.21807861328125, "logps/rejected": -251.02700805664062, "loss": 0.4247, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8555524945259094, "rewards/margins": 1.5267949104309082, "rewards/rejected": -2.382347583770752, "step": 5320 }, { "epoch": 0.8551945447252306, "grad_norm": 11.4375, "learning_rate": 3.124255106498031e-07, "logits/chosen": -2.8273370265960693, "logits/rejected": -2.7825379371643066, "logps/chosen": -424.45074462890625, "logps/rejected": -297.3385925292969, "loss": 0.4109, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8865674138069153, "rewards/margins": 1.4461644887924194, "rewards/rejected": -2.3327319622039795, "step": 5330 }, { "epoch": 0.8567990373044525, "grad_norm": 7.40625, "learning_rate": 3.056805035919569e-07, "logits/chosen": -2.804994583129883, "logits/rejected": -2.7485077381134033, "logps/chosen": -405.656005859375, "logps/rejected": -254.2196807861328, "loss": 0.4476, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8819984197616577, "rewards/margins": 1.5140966176986694, "rewards/rejected": -2.396095037460327, "step": 5340 }, { "epoch": 0.8584035298836743, "grad_norm": 13.0625, "learning_rate": 2.990043589827582e-07, "logits/chosen": -2.8018581867218018, "logits/rejected": -2.759606122970581, "logps/chosen": -391.8536071777344, "logps/rejected": -262.7406921386719, "loss": 0.4714, "rewards/accuracies": 0.75, "rewards/chosen": -0.8817217946052551, "rewards/margins": 1.510636329650879, "rewards/rejected": -2.3923583030700684, "step": 5350 }, { "epoch": 0.8600080224628961, "grad_norm": 7.0, "learning_rate": 2.923972863339336e-07, "logits/chosen": -2.8113226890563965, "logits/rejected": -2.7588014602661133, "logps/chosen": -381.93670654296875, "logps/rejected": -267.99407958984375, "loss": 0.4813, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9466564059257507, "rewards/margins": 1.4503722190856934, "rewards/rejected": -2.397028684616089, "step": 5360 }, { "epoch": 0.8616125150421179, "grad_norm": 12.6875, "learning_rate": 2.858594929895836e-07, "logits/chosen": -2.801490068435669, "logits/rejected": -2.7603392601013184, "logps/chosen": -391.86529541015625, "logps/rejected": -284.83477783203125, "loss": 0.3688, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7361078262329102, "rewards/margins": 1.5831577777862549, "rewards/rejected": -2.319265842437744, "step": 5370 }, { "epoch": 0.8632170076213398, "grad_norm": 10.375, "learning_rate": 2.793911841196742e-07, "logits/chosen": -2.7941253185272217, "logits/rejected": -2.754394292831421, "logps/chosen": -392.01385498046875, "logps/rejected": -291.5173645019531, "loss": 0.3976, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8063796162605286, "rewards/margins": 1.5629125833511353, "rewards/rejected": -2.3692922592163086, "step": 5380 }, { "epoch": 0.8648215002005616, "grad_norm": 8.3125, "learning_rate": 2.729925627136007e-07, "logits/chosen": -2.810976982116699, "logits/rejected": -2.7741477489471436, "logps/chosen": -380.4454345703125, "logps/rejected": -289.1482849121094, "loss": 0.4141, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.9370417594909668, "rewards/margins": 1.371072769165039, "rewards/rejected": -2.308114528656006, "step": 5390 }, { "epoch": 0.8664259927797834, "grad_norm": 9.625, "learning_rate": 2.666638295738169e-07, "logits/chosen": -2.8075249195098877, "logits/rejected": -2.756740093231201, "logps/chosen": -399.0289306640625, "logps/rejected": -275.8835754394531, "loss": 0.3976, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8527340888977051, "rewards/margins": 1.5328179597854614, "rewards/rejected": -2.385551929473877, "step": 5400 }, { "epoch": 0.8680304853590052, "grad_norm": 9.4375, "learning_rate": 2.60405183309532e-07, "logits/chosen": -2.8246920108795166, "logits/rejected": -2.7728168964385986, "logps/chosen": -398.81549072265625, "logps/rejected": -266.559814453125, "loss": 0.4334, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7770450711250305, "rewards/margins": 1.5084846019744873, "rewards/rejected": -2.285529613494873, "step": 5410 }, { "epoch": 0.8696349779382271, "grad_norm": 11.5, "learning_rate": 2.542168203304793e-07, "logits/chosen": -2.8222408294677734, "logits/rejected": -2.759920358657837, "logps/chosen": -458.3328552246094, "logps/rejected": -262.6331787109375, "loss": 0.4089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6602253913879395, "rewards/margins": 1.7139049768447876, "rewards/rejected": -2.3741304874420166, "step": 5420 }, { "epoch": 0.8712394705174489, "grad_norm": 13.5625, "learning_rate": 2.480989348407517e-07, "logits/chosen": -2.793947458267212, "logits/rejected": -2.747966766357422, "logps/chosen": -414.61468505859375, "logps/rejected": -300.35107421875, "loss": 0.3867, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7328456044197083, "rewards/margins": 1.6142371892929077, "rewards/rejected": -2.3470826148986816, "step": 5430 }, { "epoch": 0.8728439630966707, "grad_norm": 13.0, "learning_rate": 2.420517188327079e-07, "logits/chosen": -2.819037437438965, "logits/rejected": -2.780388355255127, "logps/chosen": -423.30548095703125, "logps/rejected": -303.15203857421875, "loss": 0.4434, "rewards/accuracies": 0.8125, "rewards/chosen": -1.090359091758728, "rewards/margins": 1.2667559385299683, "rewards/rejected": -2.3571155071258545, "step": 5440 }, { "epoch": 0.8744484556758925, "grad_norm": 10.0, "learning_rate": 2.3607536208094693e-07, "logits/chosen": -2.798367977142334, "logits/rejected": -2.7441956996917725, "logps/chosen": -430.137939453125, "logps/rejected": -262.30108642578125, "loss": 0.3591, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8217250108718872, "rewards/margins": 1.5615367889404297, "rewards/rejected": -2.3832619190216064, "step": 5450 }, { "epoch": 0.8760529482551144, "grad_norm": 15.1875, "learning_rate": 2.30170052136352e-07, "logits/chosen": -2.8153538703918457, "logits/rejected": -2.767209529876709, "logps/chosen": -425.980224609375, "logps/rejected": -306.6390380859375, "loss": 0.4491, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9282452464103699, "rewards/margins": 1.46097731590271, "rewards/rejected": -2.3892226219177246, "step": 5460 }, { "epoch": 0.8776574408343362, "grad_norm": 12.375, "learning_rate": 2.2433597432020475e-07, "logits/chosen": -2.8114819526672363, "logits/rejected": -2.761840581893921, "logps/chosen": -408.8009948730469, "logps/rejected": -281.20367431640625, "loss": 0.4405, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8083914518356323, "rewards/margins": 1.4942744970321655, "rewards/rejected": -2.302665948867798, "step": 5470 }, { "epoch": 0.879261933413558, "grad_norm": 11.0625, "learning_rate": 2.185733117183711e-07, "logits/chosen": -2.806313991546631, "logits/rejected": -2.75260591506958, "logps/chosen": -410.026611328125, "logps/rejected": -268.67828369140625, "loss": 0.4338, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9882251024246216, "rewards/margins": 1.4381396770477295, "rewards/rejected": -2.4263644218444824, "step": 5480 }, { "epoch": 0.8808664259927798, "grad_norm": 8.9375, "learning_rate": 2.1288224517555438e-07, "logits/chosen": -2.8081443309783936, "logits/rejected": -2.7508559226989746, "logps/chosen": -436.7002868652344, "logps/rejected": -288.77227783203125, "loss": 0.4359, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6698066592216492, "rewards/margins": 1.4876631498336792, "rewards/rejected": -2.1574699878692627, "step": 5490 }, { "epoch": 0.8824709185720016, "grad_norm": 9.5625, "learning_rate": 2.0726295328961865e-07, "logits/chosen": -2.793952226638794, "logits/rejected": -2.743959903717041, "logps/chosen": -441.8058166503906, "logps/rejected": -280.339599609375, "loss": 0.3415, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5820962190628052, "rewards/margins": 2.014848232269287, "rewards/rejected": -2.5969443321228027, "step": 5500 }, { "epoch": 0.8840754111512235, "grad_norm": 11.625, "learning_rate": 2.0171561240598768e-07, "logits/chosen": -2.813645839691162, "logits/rejected": -2.7599101066589355, "logps/chosen": -358.74700927734375, "logps/rejected": -250.6474609375, "loss": 0.4355, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9821175336837769, "rewards/margins": 1.4962762594223022, "rewards/rejected": -2.478394031524658, "step": 5510 }, { "epoch": 0.8856799037304453, "grad_norm": 7.21875, "learning_rate": 1.9624039661210708e-07, "logits/chosen": -2.8077259063720703, "logits/rejected": -2.74383282661438, "logps/chosen": -458.04278564453125, "logps/rejected": -260.79620361328125, "loss": 0.3756, "rewards/accuracies": 0.84375, "rewards/chosen": -0.652388870716095, "rewards/margins": 1.5884355306625366, "rewards/rejected": -2.2408246994018555, "step": 5520 }, { "epoch": 0.887284396309667, "grad_norm": 15.0, "learning_rate": 1.908374777319824e-07, "logits/chosen": -2.8265156745910645, "logits/rejected": -2.769597291946411, "logps/chosen": -403.39898681640625, "logps/rejected": -275.55413818359375, "loss": 0.4323, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7832412719726562, "rewards/margins": 1.4808690547943115, "rewards/rejected": -2.2641100883483887, "step": 5530 }, { "epoch": 0.8888888888888888, "grad_norm": 12.4375, "learning_rate": 1.85507025320788e-07, "logits/chosen": -2.820561647415161, "logits/rejected": -2.780902624130249, "logps/chosen": -408.546142578125, "logps/rejected": -283.93609619140625, "loss": 0.4672, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8986028432846069, "rewards/margins": 1.3237154483795166, "rewards/rejected": -2.222318172454834, "step": 5540 }, { "epoch": 0.8904933814681107, "grad_norm": 13.9375, "learning_rate": 1.8024920665954516e-07, "logits/chosen": -2.817812442779541, "logits/rejected": -2.753812789916992, "logps/chosen": -433.86260986328125, "logps/rejected": -270.40118408203125, "loss": 0.463, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9561187028884888, "rewards/margins": 1.394054651260376, "rewards/rejected": -2.350173234939575, "step": 5550 }, { "epoch": 0.8920978740473325, "grad_norm": 8.0625, "learning_rate": 1.750641867498709e-07, "logits/chosen": -2.8174307346343994, "logits/rejected": -2.763807773590088, "logps/chosen": -443.3783264160156, "logps/rejected": -312.02813720703125, "loss": 0.385, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.688770592212677, "rewards/margins": 1.6890805959701538, "rewards/rejected": -2.3778512477874756, "step": 5560 }, { "epoch": 0.8937023666265543, "grad_norm": 9.6875, "learning_rate": 1.699521283088035e-07, "logits/chosen": -2.7925477027893066, "logits/rejected": -2.7349798679351807, "logps/chosen": -435.9283142089844, "logps/rejected": -257.9076843261719, "loss": 0.4142, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7677469253540039, "rewards/margins": 1.7001559734344482, "rewards/rejected": -2.467902660369873, "step": 5570 }, { "epoch": 0.8953068592057761, "grad_norm": 10.8125, "learning_rate": 1.6491319176369287e-07, "logits/chosen": -2.8021292686462402, "logits/rejected": -2.760989189147949, "logps/chosen": -380.2042541503906, "logps/rejected": -285.10430908203125, "loss": 0.432, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9147303700447083, "rewards/margins": 1.4714778661727905, "rewards/rejected": -2.3862082958221436, "step": 5580 }, { "epoch": 0.896911351784998, "grad_norm": 6.875, "learning_rate": 1.599475352471669e-07, "logits/chosen": -2.819826126098633, "logits/rejected": -2.770707845687866, "logps/chosen": -422.594482421875, "logps/rejected": -274.9994812011719, "loss": 0.3975, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7316409349441528, "rewards/margins": 1.6339470148086548, "rewards/rejected": -2.3655877113342285, "step": 5590 }, { "epoch": 0.8985158443642198, "grad_norm": 13.5625, "learning_rate": 1.5505531459216905e-07, "logits/chosen": -2.808365821838379, "logits/rejected": -2.7469406127929688, "logps/chosen": -422.4600524902344, "logps/rejected": -268.0157775878906, "loss": 0.3626, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.793115496635437, "rewards/margins": 1.5755220651626587, "rewards/rejected": -2.3686375617980957, "step": 5600 }, { "epoch": 0.9001203369434416, "grad_norm": 10.375, "learning_rate": 1.5023668332706937e-07, "logits/chosen": -2.788177967071533, "logits/rejected": -2.7526066303253174, "logps/chosen": -409.0157775878906, "logps/rejected": -300.1987609863281, "loss": 0.3873, "rewards/accuracies": 0.875, "rewards/chosen": -0.7893904447555542, "rewards/margins": 1.5636425018310547, "rewards/rejected": -2.3530328273773193, "step": 5610 }, { "epoch": 0.9017248295226634, "grad_norm": 10.8125, "learning_rate": 1.4549179267084324e-07, "logits/chosen": -2.8116002082824707, "logits/rejected": -2.7637436389923096, "logps/chosen": -424.88189697265625, "logps/rejected": -297.0899353027344, "loss": 0.4799, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0401932001113892, "rewards/margins": 1.2693895101547241, "rewards/rejected": -2.3095829486846924, "step": 5620 }, { "epoch": 0.9033293221018853, "grad_norm": 11.25, "learning_rate": 1.4082079152832912e-07, "logits/chosen": -2.810987710952759, "logits/rejected": -2.7448747158050537, "logps/chosen": -437.52313232421875, "logps/rejected": -260.565673828125, "loss": 0.4184, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.76436847448349, "rewards/margins": 1.5387895107269287, "rewards/rejected": -2.3031582832336426, "step": 5630 }, { "epoch": 0.9049338146811071, "grad_norm": 7.875, "learning_rate": 1.3622382648555422e-07, "logits/chosen": -2.808615207672119, "logits/rejected": -2.745837926864624, "logps/chosen": -431.75518798828125, "logps/rejected": -270.6479187011719, "loss": 0.3756, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.7371909022331238, "rewards/margins": 1.6705642938613892, "rewards/rejected": -2.4077553749084473, "step": 5640 }, { "epoch": 0.9065383072603289, "grad_norm": 8.5625, "learning_rate": 1.3170104180513315e-07, "logits/chosen": -2.812695026397705, "logits/rejected": -2.756497859954834, "logps/chosen": -408.6356506347656, "logps/rejected": -262.015625, "loss": 0.3827, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8543269038200378, "rewards/margins": 1.5779764652252197, "rewards/rejected": -2.4323031902313232, "step": 5650 }, { "epoch": 0.9081427998395507, "grad_norm": 12.6875, "learning_rate": 1.272525794217422e-07, "logits/chosen": -2.839069366455078, "logits/rejected": -2.769153118133545, "logps/chosen": -455.59326171875, "logps/rejected": -255.47805786132812, "loss": 0.3384, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5893365740776062, "rewards/margins": 1.8813693523406982, "rewards/rejected": -2.470705986022949, "step": 5660 }, { "epoch": 0.9097472924187726, "grad_norm": 9.75, "learning_rate": 1.2287857893766574e-07, "logits/chosen": -2.822566509246826, "logits/rejected": -2.7721171379089355, "logps/chosen": -401.98919677734375, "logps/rejected": -256.7367858886719, "loss": 0.427, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6906684041023254, "rewards/margins": 1.4893977642059326, "rewards/rejected": -2.1800663471221924, "step": 5670 }, { "epoch": 0.9113517849979944, "grad_norm": 11.5, "learning_rate": 1.1857917761841226e-07, "logits/chosen": -2.825672149658203, "logits/rejected": -2.7675087451934814, "logps/chosen": -414.2330627441406, "logps/rejected": -265.63519287109375, "loss": 0.3934, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7130271792411804, "rewards/margins": 1.6484267711639404, "rewards/rejected": -2.3614537715911865, "step": 5680 }, { "epoch": 0.9129562775772162, "grad_norm": 7.0, "learning_rate": 1.1435451038841028e-07, "logits/chosen": -2.7964138984680176, "logits/rejected": -2.743408679962158, "logps/chosen": -424.18505859375, "logps/rejected": -264.3424987792969, "loss": 0.3557, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8085341453552246, "rewards/margins": 1.5622847080230713, "rewards/rejected": -2.370818614959717, "step": 5690 }, { "epoch": 0.914560770156438, "grad_norm": 9.8125, "learning_rate": 1.1020470982677167e-07, "logits/chosen": -2.8282032012939453, "logits/rejected": -2.7792210578918457, "logps/chosen": -395.1925354003906, "logps/rejected": -295.00018310546875, "loss": 0.4199, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7470839619636536, "rewards/margins": 1.542876958847046, "rewards/rejected": -2.289961099624634, "step": 5700 }, { "epoch": 0.9161652627356599, "grad_norm": 9.25, "learning_rate": 1.0612990616313185e-07, "logits/chosen": -2.792717456817627, "logits/rejected": -2.749756336212158, "logps/chosen": -399.0223388671875, "logps/rejected": -280.2065124511719, "loss": 0.3644, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.7691539525985718, "rewards/margins": 1.715105652809143, "rewards/rejected": -2.4842591285705566, "step": 5710 }, { "epoch": 0.9177697553148817, "grad_norm": 10.75, "learning_rate": 1.0213022727356248e-07, "logits/chosen": -2.818091869354248, "logits/rejected": -2.7686927318573, "logps/chosen": -409.58660888671875, "logps/rejected": -274.80340576171875, "loss": 0.388, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.757960855960846, "rewards/margins": 1.5082905292510986, "rewards/rejected": -2.2662513256073, "step": 5720 }, { "epoch": 0.9193742478941035, "grad_norm": 11.125, "learning_rate": 9.820579867656027e-08, "logits/chosen": -2.8041443824768066, "logits/rejected": -2.7569096088409424, "logps/chosen": -400.86871337890625, "logps/rejected": -272.6924743652344, "loss": 0.4255, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0120155811309814, "rewards/margins": 1.3953025341033936, "rewards/rejected": -2.407318115234375, "step": 5730 }, { "epoch": 0.9209787404733253, "grad_norm": 7.75, "learning_rate": 9.435674352910401e-08, "logits/chosen": -2.8009579181671143, "logits/rejected": -2.747126579284668, "logps/chosen": -413.3050842285156, "logps/rejected": -274.79278564453125, "loss": 0.3724, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7111916542053223, "rewards/margins": 1.8339523077011108, "rewards/rejected": -2.5451438426971436, "step": 5740 }, { "epoch": 0.9225832330525471, "grad_norm": 8.9375, "learning_rate": 9.058318262279431e-08, "logits/chosen": -2.8065695762634277, "logits/rejected": -2.759350538253784, "logps/chosen": -391.44708251953125, "logps/rejected": -273.9414978027344, "loss": 0.3883, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.825060248374939, "rewards/margins": 1.6950734853744507, "rewards/rejected": -2.5201337337493896, "step": 5750 }, { "epoch": 0.924187725631769, "grad_norm": 14.8125, "learning_rate": 8.688523438005997e-08, "logits/chosen": -2.7982685565948486, "logits/rejected": -2.7458367347717285, "logps/chosen": -431.30181884765625, "logps/rejected": -285.3001403808594, "loss": 0.4145, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5927776098251343, "rewards/margins": 1.6476659774780273, "rewards/rejected": -2.240443706512451, "step": 5760 }, { "epoch": 0.9257922182109908, "grad_norm": 11.5625, "learning_rate": 8.326301485044152e-08, "logits/chosen": -2.8269810676574707, "logits/rejected": -2.7691025733947754, "logps/chosen": -408.160888671875, "logps/rejected": -258.2516784667969, "loss": 0.3867, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8919556736946106, "rewards/margins": 1.5934278964996338, "rewards/rejected": -2.4853835105895996, "step": 5770 }, { "epoch": 0.9273967107902126, "grad_norm": 8.625, "learning_rate": 7.971663770695165e-08, "logits/chosen": -2.801849365234375, "logits/rejected": -2.749263286590576, "logps/chosen": -396.55792236328125, "logps/rejected": -249.450927734375, "loss": 0.4244, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.846328616142273, "rewards/margins": 1.5574018955230713, "rewards/rejected": -2.4037301540374756, "step": 5780 }, { "epoch": 0.9290012033694344, "grad_norm": 15.8125, "learning_rate": 7.624621424250577e-08, "logits/chosen": -2.789332389831543, "logits/rejected": -2.7454895973205566, "logps/chosen": -389.910888671875, "logps/rejected": -260.88946533203125, "loss": 0.4842, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0670676231384277, "rewards/margins": 1.3610090017318726, "rewards/rejected": -2.4280765056610107, "step": 5790 }, { "epoch": 0.9306056959486563, "grad_norm": 13.9375, "learning_rate": 7.285185336642908e-08, "logits/chosen": -2.802990674972534, "logits/rejected": -2.749004364013672, "logps/chosen": -412.0018005371094, "logps/rejected": -261.63848876953125, "loss": 0.441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8234884142875671, "rewards/margins": 1.4407368898391724, "rewards/rejected": -2.264225482940674, "step": 5800 }, { "epoch": 0.9322101885278781, "grad_norm": 5.84375, "learning_rate": 6.95336616010417e-08, "logits/chosen": -2.81256103515625, "logits/rejected": -2.743215799331665, "logps/chosen": -449.7594299316406, "logps/rejected": -255.1123046875, "loss": 0.4272, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7551842927932739, "rewards/margins": 1.5449879169464111, "rewards/rejected": -2.3001723289489746, "step": 5810 }, { "epoch": 0.9338146811070999, "grad_norm": 14.5625, "learning_rate": 6.629174307831221e-08, "logits/chosen": -2.792149782180786, "logits/rejected": -2.74296498298645, "logps/chosen": -424.04449462890625, "logps/rejected": -295.71917724609375, "loss": 0.4897, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8915591239929199, "rewards/margins": 1.3822824954986572, "rewards/rejected": -2.273841619491577, "step": 5820 }, { "epoch": 0.9354191736863217, "grad_norm": 9.9375, "learning_rate": 6.312619953659172e-08, "logits/chosen": -2.80828857421875, "logits/rejected": -2.749077320098877, "logps/chosen": -427.0267028808594, "logps/rejected": -277.28143310546875, "loss": 0.4594, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8568135499954224, "rewards/margins": 1.484554409980774, "rewards/rejected": -2.3413679599761963, "step": 5830 }, { "epoch": 0.9370236662655436, "grad_norm": 10.0, "learning_rate": 6.003713031742131e-08, "logits/chosen": -2.8071742057800293, "logits/rejected": -2.7544360160827637, "logps/chosen": -411.4745178222656, "logps/rejected": -297.9090576171875, "loss": 0.3942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7641955614089966, "rewards/margins": 1.5794193744659424, "rewards/rejected": -2.3436148166656494, "step": 5840 }, { "epoch": 0.9386281588447654, "grad_norm": 14.3125, "learning_rate": 5.702463236241379e-08, "logits/chosen": -2.8057026863098145, "logits/rejected": -2.7581253051757812, "logps/chosen": -418.44903564453125, "logps/rejected": -270.86932373046875, "loss": 0.4896, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8385164141654968, "rewards/margins": 1.352230429649353, "rewards/rejected": -2.1907472610473633, "step": 5850 }, { "epoch": 0.9402326514239872, "grad_norm": 9.75, "learning_rate": 5.4088800210210514e-08, "logits/chosen": -2.7984061241149902, "logits/rejected": -2.7541892528533936, "logps/chosen": -383.7662048339844, "logps/rejected": -275.86553955078125, "loss": 0.434, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8415667414665222, "rewards/margins": 1.3542895317077637, "rewards/rejected": -2.1958565711975098, "step": 5860 }, { "epoch": 0.941837144003209, "grad_norm": 10.75, "learning_rate": 5.122972599351739e-08, "logits/chosen": -2.79818058013916, "logits/rejected": -2.7508370876312256, "logps/chosen": -432.44964599609375, "logps/rejected": -306.43780517578125, "loss": 0.3596, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.6782715320587158, "rewards/margins": 1.6471202373504639, "rewards/rejected": -2.3253917694091797, "step": 5870 }, { "epoch": 0.9434416365824309, "grad_norm": 10.75, "learning_rate": 4.844749943621052e-08, "logits/chosen": -2.816148281097412, "logits/rejected": -2.750479221343994, "logps/chosen": -436.82281494140625, "logps/rejected": -287.916259765625, "loss": 0.3284, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6569616198539734, "rewards/margins": 1.8633569478988647, "rewards/rejected": -2.5203185081481934, "step": 5880 }, { "epoch": 0.9450461291616526, "grad_norm": 13.375, "learning_rate": 4.57422078505218e-08, "logits/chosen": -2.797820568084717, "logits/rejected": -2.7604968547821045, "logps/chosen": -344.79931640625, "logps/rejected": -274.88446044921875, "loss": 0.4026, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9201502799987793, "rewards/margins": 1.4386075735092163, "rewards/rejected": -2.358757972717285, "step": 5890 }, { "epoch": 0.9466506217408744, "grad_norm": 8.8125, "learning_rate": 4.311393613429943e-08, "logits/chosen": -2.7904746532440186, "logits/rejected": -2.732771396636963, "logps/chosen": -450.02520751953125, "logps/rejected": -296.76300048828125, "loss": 0.4189, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6878458857536316, "rewards/margins": 1.5955255031585693, "rewards/rejected": -2.283371686935425, "step": 5900 }, { "epoch": 0.9482551143200962, "grad_norm": 8.8125, "learning_rate": 4.056276676834281e-08, "logits/chosen": -2.8199102878570557, "logits/rejected": -2.769636631011963, "logps/chosen": -397.46044921875, "logps/rejected": -276.52374267578125, "loss": 0.4355, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9463607668876648, "rewards/margins": 1.3626400232315063, "rewards/rejected": -2.3090007305145264, "step": 5910 }, { "epoch": 0.9498596068993181, "grad_norm": 11.3125, "learning_rate": 3.808877981381437e-08, "logits/chosen": -2.8034706115722656, "logits/rejected": -2.7549986839294434, "logps/chosen": -383.4767150878906, "logps/rejected": -255.1459197998047, "loss": 0.3141, "rewards/accuracies": 0.875, "rewards/chosen": -0.7080812454223633, "rewards/margins": 1.757089614868164, "rewards/rejected": -2.4651708602905273, "step": 5920 }, { "epoch": 0.9514640994785399, "grad_norm": 11.9375, "learning_rate": 3.569205290972655e-08, "logits/chosen": -2.8086719512939453, "logits/rejected": -2.763061046600342, "logps/chosen": -351.31781005859375, "logps/rejected": -253.1845245361328, "loss": 0.4173, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8971024751663208, "rewards/margins": 1.6073181629180908, "rewards/rejected": -2.504420757293701, "step": 5930 }, { "epoch": 0.9530685920577617, "grad_norm": 8.5, "learning_rate": 3.337266127050681e-08, "logits/chosen": -2.8063273429870605, "logits/rejected": -2.7670204639434814, "logps/chosen": -433.70770263671875, "logps/rejected": -321.5706481933594, "loss": 0.4269, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8975056409835815, "rewards/margins": 1.4744794368743896, "rewards/rejected": -2.3719849586486816, "step": 5940 }, { "epoch": 0.9546730846369835, "grad_norm": 6.40625, "learning_rate": 3.113067768363509e-08, "logits/chosen": -2.805718421936035, "logits/rejected": -2.7629942893981934, "logps/chosen": -393.8545837402344, "logps/rejected": -277.4226989746094, "loss": 0.4174, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8760774731636047, "rewards/margins": 1.410573959350586, "rewards/rejected": -2.286651134490967, "step": 5950 }, { "epoch": 0.9562775772162053, "grad_norm": 10.25, "learning_rate": 2.8966172507362e-08, "logits/chosen": -2.8265609741210938, "logits/rejected": -2.7805733680725098, "logps/chosen": -437.93377685546875, "logps/rejected": -330.654296875, "loss": 0.4138, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8245475888252258, "rewards/margins": 1.5254485607147217, "rewards/rejected": -2.3499960899353027, "step": 5960 }, { "epoch": 0.9578820697954272, "grad_norm": 12.25, "learning_rate": 2.6879213668498662e-08, "logits/chosen": -2.808868408203125, "logits/rejected": -2.774117946624756, "logps/chosen": -362.60833740234375, "logps/rejected": -288.03076171875, "loss": 0.4555, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9702554941177368, "rewards/margins": 1.4713263511657715, "rewards/rejected": -2.4415817260742188, "step": 5970 }, { "epoch": 0.959486562374649, "grad_norm": 9.875, "learning_rate": 2.4869866660285903e-08, "logits/chosen": -2.8051815032958984, "logits/rejected": -2.740966320037842, "logps/chosen": -458.8876953125, "logps/rejected": -263.69610595703125, "loss": 0.3231, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.6877936124801636, "rewards/margins": 1.8482147455215454, "rewards/rejected": -2.536008596420288, "step": 5980 }, { "epoch": 0.9610910549538708, "grad_norm": 12.0, "learning_rate": 2.293819454033952e-08, "logits/chosen": -2.7994143962860107, "logits/rejected": -2.7571542263031006, "logps/chosen": -398.72113037109375, "logps/rejected": -273.25250244140625, "loss": 0.4602, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0152826309204102, "rewards/margins": 1.3785016536712646, "rewards/rejected": -2.393784284591675, "step": 5990 }, { "epoch": 0.9626955475330926, "grad_norm": 6.4375, "learning_rate": 2.1084257928670748e-08, "logits/chosen": -2.812206745147705, "logits/rejected": -2.7399680614471436, "logps/chosen": -443.63812255859375, "logps/rejected": -267.39141845703125, "loss": 0.3006, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6824522018432617, "rewards/margins": 1.7949758768081665, "rewards/rejected": -2.4774279594421387, "step": 6000 }, { "epoch": 0.9643000401123145, "grad_norm": 8.9375, "learning_rate": 1.9308115005783336e-08, "logits/chosen": -2.8083033561706543, "logits/rejected": -2.769881248474121, "logps/chosen": -387.3377990722656, "logps/rejected": -303.13006591796875, "loss": 0.4144, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8154579401016235, "rewards/margins": 1.4604164361953735, "rewards/rejected": -2.275873899459839, "step": 6010 }, { "epoch": 0.9659045326915363, "grad_norm": 9.4375, "learning_rate": 1.7609821510849444e-08, "logits/chosen": -2.8372035026550293, "logits/rejected": -2.767444610595703, "logps/chosen": -440.1866149902344, "logps/rejected": -289.16876220703125, "loss": 0.3592, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7681699991226196, "rewards/margins": 1.7754240036010742, "rewards/rejected": -2.5435941219329834, "step": 6020 }, { "epoch": 0.9675090252707581, "grad_norm": 13.5, "learning_rate": 1.5989430739958013e-08, "logits/chosen": -2.8201520442962646, "logits/rejected": -2.765552043914795, "logps/chosen": -392.9667663574219, "logps/rejected": -248.07363891601562, "loss": 0.4384, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9499319195747375, "rewards/margins": 1.3894308805465698, "rewards/rejected": -2.339362859725952, "step": 6030 }, { "epoch": 0.9691135178499799, "grad_norm": 6.78125, "learning_rate": 1.4446993544444953e-08, "logits/chosen": -2.8138198852539062, "logits/rejected": -2.7597362995147705, "logps/chosen": -401.30291748046875, "logps/rejected": -269.365966796875, "loss": 0.4031, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.738829493522644, "rewards/margins": 1.7171655893325806, "rewards/rejected": -2.4559950828552246, "step": 6040 }, { "epoch": 0.9707180104292018, "grad_norm": 7.28125, "learning_rate": 1.2982558329294458e-08, "logits/chosen": -2.8117334842681885, "logits/rejected": -2.7551369667053223, "logps/chosen": -439.1939392089844, "logps/rejected": -285.7598876953125, "loss": 0.3827, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5878422260284424, "rewards/margins": 1.657517671585083, "rewards/rejected": -2.2453598976135254, "step": 6050 }, { "epoch": 0.9723225030084236, "grad_norm": 2.421875, "learning_rate": 1.1596171051622974e-08, "logits/chosen": -2.789170026779175, "logits/rejected": -2.7435243129730225, "logps/chosen": -385.2083435058594, "logps/rejected": -255.07101440429688, "loss": 0.3677, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7299814224243164, "rewards/margins": 1.7800886631011963, "rewards/rejected": -2.5100700855255127, "step": 6060 }, { "epoch": 0.9739269955876454, "grad_norm": 14.625, "learning_rate": 1.0287875219233689e-08, "logits/chosen": -2.787628412246704, "logits/rejected": -2.7384421825408936, "logps/chosen": -435.2119140625, "logps/rejected": -301.015869140625, "loss": 0.4615, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9104372262954712, "rewards/margins": 1.3860965967178345, "rewards/rejected": -2.2965338230133057, "step": 6070 }, { "epoch": 0.9755314881668672, "grad_norm": 11.6875, "learning_rate": 9.057711889254583e-09, "logits/chosen": -2.808032512664795, "logits/rejected": -2.746980667114258, "logps/chosen": -485.32958984375, "logps/rejected": -285.4283142089844, "loss": 0.4324, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7121922373771667, "rewards/margins": 1.5647855997085571, "rewards/rejected": -2.276977777481079, "step": 6080 }, { "epoch": 0.9771359807460891, "grad_norm": 8.0, "learning_rate": 7.905719666846945e-09, "logits/chosen": -2.8120369911193848, "logits/rejected": -2.7556660175323486, "logps/chosen": -416.9154357910156, "logps/rejected": -256.4656677246094, "loss": 0.4232, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8832438588142395, "rewards/margins": 1.509702444076538, "rewards/rejected": -2.392946481704712, "step": 6090 }, { "epoch": 0.9787404733253109, "grad_norm": 9.1875, "learning_rate": 6.831934703995513e-09, "logits/chosen": -2.8122801780700684, "logits/rejected": -2.7562625408172607, "logps/chosen": -436.6111755371094, "logps/rejected": -286.6775207519531, "loss": 0.4061, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9249618649482727, "rewards/margins": 1.5305758714675903, "rewards/rejected": -2.4555375576019287, "step": 6100 }, { "epoch": 0.9803449659045327, "grad_norm": 10.4375, "learning_rate": 5.836390698374383e-09, "logits/chosen": -2.806790828704834, "logits/rejected": -2.7585809230804443, "logps/chosen": -404.19268798828125, "logps/rejected": -263.6002502441406, "loss": 0.353, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7716562151908875, "rewards/margins": 1.6588990688323975, "rewards/rejected": -2.430555582046509, "step": 6110 }, { "epoch": 0.9819494584837545, "grad_norm": 7.40625, "learning_rate": 4.919118892287578e-09, "logits/chosen": -2.801802635192871, "logits/rejected": -2.7584471702575684, "logps/chosen": -388.66253662109375, "logps/rejected": -264.1017761230469, "loss": 0.5191, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9955611228942871, "rewards/margins": 1.2606489658355713, "rewards/rejected": -2.2562098503112793, "step": 6120 }, { "epoch": 0.9835539510629764, "grad_norm": 9.6875, "learning_rate": 4.080148071690104e-09, "logits/chosen": -2.8261146545410156, "logits/rejected": -2.7765231132507324, "logps/chosen": -371.9804992675781, "logps/rejected": -265.73162841796875, "loss": 0.3837, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8311249613761902, "rewards/margins": 1.5407880544662476, "rewards/rejected": -2.371912717819214, "step": 6130 }, { "epoch": 0.9851584436421982, "grad_norm": 11.0625, "learning_rate": 3.31950456528507e-09, "logits/chosen": -2.830606698989868, "logits/rejected": -2.767864942550659, "logps/chosen": -417.5298767089844, "logps/rejected": -265.28857421875, "loss": 0.478, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9212249517440796, "rewards/margins": 1.4193239212036133, "rewards/rejected": -2.3405489921569824, "step": 6140 }, { "epoch": 0.98676293622142, "grad_norm": 8.4375, "learning_rate": 2.6372122436951753e-09, "logits/chosen": -2.795029640197754, "logits/rejected": -2.7419021129608154, "logps/chosen": -399.4764709472656, "logps/rejected": -262.98614501953125, "loss": 0.4648, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9087712168693542, "rewards/margins": 1.5010160207748413, "rewards/rejected": -2.40978741645813, "step": 6150 }, { "epoch": 0.9883674288006418, "grad_norm": 7.125, "learning_rate": 2.0332925187163677e-09, "logits/chosen": -2.8202505111694336, "logits/rejected": -2.7406833171844482, "logps/chosen": -463.13214111328125, "logps/rejected": -244.956787109375, "loss": 0.3303, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.6529651284217834, "rewards/margins": 1.8866245746612549, "rewards/rejected": -2.5395898818969727, "step": 6160 }, { "epoch": 0.9899719213798637, "grad_norm": 7.71875, "learning_rate": 1.5077643426436584e-09, "logits/chosen": -2.8256125450134277, "logits/rejected": -2.764892816543579, "logps/chosen": -448.1964416503906, "logps/rejected": -308.63873291015625, "loss": 0.3858, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.797903835773468, "rewards/margins": 1.5279308557510376, "rewards/rejected": -2.3258347511291504, "step": 6170 }, { "epoch": 0.9915764139590855, "grad_norm": 11.4375, "learning_rate": 1.0606442076774304e-09, "logits/chosen": -2.8150360584259033, "logits/rejected": -2.753592014312744, "logps/chosen": -444.58697509765625, "logps/rejected": -270.0126647949219, "loss": 0.3719, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8206769824028015, "rewards/margins": 1.509793758392334, "rewards/rejected": -2.330470561981201, "step": 6180 }, { "epoch": 0.9931809065383073, "grad_norm": 11.0, "learning_rate": 6.919461454057974e-10, "logits/chosen": -2.8238348960876465, "logits/rejected": -2.746014356613159, "logps/chosen": -466.01873779296875, "logps/rejected": -267.83349609375, "loss": 0.4061, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6602118015289307, "rewards/margins": 1.7929067611694336, "rewards/rejected": -2.4531185626983643, "step": 6190 }, { "epoch": 0.9947853991175291, "grad_norm": 10.625, "learning_rate": 4.016817263644002e-10, "logits/chosen": -2.818225860595703, "logits/rejected": -2.759671688079834, "logps/chosen": -435.1194763183594, "logps/rejected": -275.5901794433594, "loss": 0.4083, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.7615712285041809, "rewards/margins": 1.610439658164978, "rewards/rejected": -2.3720109462738037, "step": 6200 }, { "epoch": 0.9963898916967509, "grad_norm": 5.0625, "learning_rate": 1.8986005967253084e-10, "logits/chosen": -2.8127522468566895, "logits/rejected": -2.760911464691162, "logps/chosen": -436.362060546875, "logps/rejected": -280.7018737792969, "loss": 0.3806, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8359395861625671, "rewards/margins": 1.6096508502960205, "rewards/rejected": -2.4455902576446533, "step": 6210 }, { "epoch": 0.9979943842759728, "grad_norm": 10.5, "learning_rate": 5.6487792748360913e-11, "logits/chosen": -2.824465274810791, "logits/rejected": -2.773252487182617, "logps/chosen": -404.9994201660156, "logps/rejected": -267.37274169921875, "loss": 0.4399, "rewards/accuracies": 0.75, "rewards/chosen": -0.9417276382446289, "rewards/margins": 1.36453378200531, "rewards/rejected": -2.3062610626220703, "step": 6220 }, { "epoch": 0.9995988768551946, "grad_norm": 17.125, "learning_rate": 1.5691110991089552e-12, "logits/chosen": -2.827319860458374, "logits/rejected": -2.7658631801605225, "logps/chosen": -432.77203369140625, "logps/rejected": -280.7579345703125, "loss": 0.4597, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9428902864456177, "rewards/margins": 1.4623734951019287, "rewards/rejected": -2.4052634239196777, "step": 6230 }, { "epoch": 0.9999197753710389, "step": 6232, "total_flos": 0.0, "train_loss": 0.44136011642577255, "train_runtime": 57646.808, "train_samples_per_second": 1.73, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 6232, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }