{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.025465732353444, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006836438215689625, "grad_norm": 0.6573465466499329, "learning_rate": 0.0, "logits/chosen": -7.0057244300842285, "logits/rejected": -7.004946231842041, "logps/chosen": -32.191802978515625, "logps/rejected": -32.495689392089844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001367287643137925, "grad_norm": 0.7324891686439514, "learning_rate": 5.017166594399687e-06, "logits/chosen": -6.715972900390625, "logits/rejected": -6.715063571929932, "logps/chosen": -31.3376522064209, "logps/rejected": -32.303123474121094, "loss": 0.681, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04785561561584473, "rewards/margins": 0.027377856895327568, "rewards/rejected": 0.020477760583162308, "step": 2 }, { "epoch": 0.0020509314647068877, "grad_norm": 0.71903395652771, "learning_rate": 7.952020911994375e-06, "logits/chosen": -6.396335124969482, "logits/rejected": -6.396225929260254, "logps/chosen": -30.442445755004883, "logps/rejected": -31.42215347290039, "loss": 0.6796, "rewards/accuracies": 0.875, "rewards/chosen": 0.05951811373233795, "rewards/margins": 0.026405882090330124, "rewards/rejected": 0.03311222791671753, "step": 3 }, { "epoch": 0.00273457528627585, "grad_norm": 0.7144582867622375, "learning_rate": 1.0034333188799373e-05, "logits/chosen": -6.924302101135254, "logits/rejected": -6.924713134765625, "logps/chosen": -32.21989059448242, "logps/rejected": -33.102630615234375, "loss": 0.68, "rewards/accuracies": 0.5625, "rewards/chosen": 0.053176477551460266, "rewards/margins": 0.019765580072999, "rewards/rejected": 0.03341089189052582, "step": 4 }, { "epoch": 0.003418219107844813, "grad_norm": 0.8064276576042175, "learning_rate": 1.164950007226698e-05, "logits/chosen": -6.502096176147461, "logits/rejected": -6.503506660461426, "logps/chosen": -31.625972747802734, "logps/rejected": -33.20111846923828, "loss": 0.6726, "rewards/accuracies": 0.625, "rewards/chosen": 0.06931095570325851, "rewards/margins": 0.02332491986453533, "rewards/rejected": 0.04598603397607803, "step": 5 }, { "epoch": 0.0041018629294137755, "grad_norm": 0.8517821431159973, "learning_rate": 1.2969187506394062e-05, "logits/chosen": -6.211585521697998, "logits/rejected": -6.211213111877441, "logps/chosen": -32.45854187011719, "logps/rejected": -32.847900390625, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": 0.08532875776290894, "rewards/margins": 0.04293126240372658, "rewards/rejected": 0.042397499084472656, "step": 6 }, { "epoch": 0.004785506750982738, "grad_norm": 0.9325186610221863, "learning_rate": 1.4084967333570947e-05, "logits/chosen": -6.404029846191406, "logits/rejected": -6.403735160827637, "logps/chosen": -29.578746795654297, "logps/rejected": -30.90483283996582, "loss": 0.6575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1576327532529831, "rewards/margins": 0.08066244423389435, "rewards/rejected": 0.07697030901908875, "step": 7 }, { "epoch": 0.0054691505725517, "grad_norm": 0.9428483247756958, "learning_rate": 1.505149978319906e-05, "logits/chosen": -7.224880695343018, "logits/rejected": -7.223403453826904, "logps/chosen": -30.641345977783203, "logps/rejected": -31.672088623046875, "loss": 0.646, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21333198249340057, "rewards/margins": 0.10112511366605759, "rewards/rejected": 0.11220687627792358, "step": 8 }, { "epoch": 0.006152794394120663, "grad_norm": 0.9667527675628662, "learning_rate": 1.590404182398875e-05, "logits/chosen": -6.2109599113464355, "logits/rejected": -6.2096028327941895, "logps/chosen": -30.011640548706055, "logps/rejected": -31.606502532958984, "loss": 0.6365, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22182659804821014, "rewards/margins": 0.12651820480823517, "rewards/rejected": 0.09530839323997498, "step": 9 }, { "epoch": 0.006836438215689626, "grad_norm": 1.056377649307251, "learning_rate": 1.666666666666667e-05, "logits/chosen": -6.6696319580078125, "logits/rejected": -6.669058799743652, "logps/chosen": -28.96389389038086, "logps/rejected": -30.700702667236328, "loss": 0.619, "rewards/accuracies": 0.875, "rewards/chosen": 0.2628224492073059, "rewards/margins": 0.14518976211547852, "rewards/rejected": 0.1176326796412468, "step": 10 }, { "epoch": 0.0075200820372585886, "grad_norm": 1.2812103033065796, "learning_rate": 1.7356544752637084e-05, "logits/chosen": -6.967124938964844, "logits/rejected": -6.966259956359863, "logps/chosen": -27.347450256347656, "logps/rejected": -30.543434143066406, "loss": 0.5894, "rewards/accuracies": 0.875, "rewards/chosen": 0.3597058057785034, "rewards/margins": 0.24416151642799377, "rewards/rejected": 0.11554430425167084, "step": 11 }, { "epoch": 0.008203725858827551, "grad_norm": 1.4421536922454834, "learning_rate": 1.7986354100793748e-05, "logits/chosen": -6.84221887588501, "logits/rejected": -6.841176986694336, "logps/chosen": -27.595746994018555, "logps/rejected": -32.4852409362793, "loss": 0.5645, "rewards/accuracies": 0.9375, "rewards/chosen": 0.41505667567253113, "rewards/margins": 0.3527407646179199, "rewards/rejected": 0.062315892428159714, "step": 12 }, { "epoch": 0.008887369680396513, "grad_norm": 1.8045566082000732, "learning_rate": 1.8565722538447282e-05, "logits/chosen": -6.956200122833252, "logits/rejected": -6.9544758796691895, "logps/chosen": -27.44561767578125, "logps/rejected": -31.269689559936523, "loss": 0.5454, "rewards/accuracies": 0.875, "rewards/chosen": 0.4621274173259735, "rewards/margins": 0.3798587918281555, "rewards/rejected": 0.0822686180472374, "step": 13 }, { "epoch": 0.009571013501965476, "grad_norm": 1.886886715888977, "learning_rate": 1.9102133927970633e-05, "logits/chosen": -6.8786821365356445, "logits/rejected": -6.875954627990723, "logps/chosen": -26.161935806274414, "logps/rejected": -32.06349182128906, "loss": 0.4859, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5326406359672546, "rewards/margins": 0.5547691583633423, "rewards/rejected": -0.022128496319055557, "step": 14 }, { "epoch": 0.01025465732353444, "grad_norm": 2.2003629207611084, "learning_rate": 1.9601520984261358e-05, "logits/chosen": -6.757758140563965, "logits/rejected": -6.75602388381958, "logps/chosen": -27.4231014251709, "logps/rejected": -31.931123733520508, "loss": 0.4571, "rewards/accuracies": 0.75, "rewards/chosen": 0.4910184144973755, "rewards/margins": 0.4200911521911621, "rewards/rejected": 0.07092726230621338, "step": 15 }, { "epoch": 0.0109383011451034, "grad_norm": 2.1196155548095703, "learning_rate": 2.0068666377598747e-05, "logits/chosen": -6.866423606872559, "logits/rejected": -6.864083290100098, "logps/chosen": -27.84196662902832, "logps/rejected": -34.77912521362305, "loss": 0.402, "rewards/accuracies": 0.875, "rewards/chosen": 0.4826186001300812, "rewards/margins": 0.6415273547172546, "rewards/rejected": -0.15890881419181824, "step": 16 }, { "epoch": 0.011621944966672364, "grad_norm": 1.8175042867660522, "learning_rate": 2.0507482022971233e-05, "logits/chosen": -7.498910903930664, "logits/rejected": -7.495181083679199, "logps/chosen": -24.013561248779297, "logps/rejected": -34.62643814086914, "loss": 0.3825, "rewards/accuracies": 1.0, "rewards/chosen": 0.7071986198425293, "rewards/margins": 1.0680738687515259, "rewards/rejected": -0.3608752489089966, "step": 17 }, { "epoch": 0.012305588788241326, "grad_norm": 1.748481035232544, "learning_rate": 2.0921208418388435e-05, "logits/chosen": -7.238208770751953, "logits/rejected": -7.235814094543457, "logps/chosen": -25.61489486694336, "logps/rejected": -35.53810119628906, "loss": 0.3223, "rewards/accuracies": 1.0, "rewards/chosen": 0.5300499200820923, "rewards/margins": 0.9734385013580322, "rewards/rejected": -0.44338855147361755, "step": 18 }, { "epoch": 0.012989232609810289, "grad_norm": 1.8208378553390503, "learning_rate": 2.1312560015880482e-05, "logits/chosen": -6.60545015335083, "logits/rejected": -6.601428031921387, "logps/chosen": -25.52379608154297, "logps/rejected": -38.78215789794922, "loss": 0.28, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6990535259246826, "rewards/margins": 1.3325963020324707, "rewards/rejected": -0.6335428357124329, "step": 19 }, { "epoch": 0.013672876431379252, "grad_norm": 1.521155834197998, "learning_rate": 2.1683833261066357e-05, "logits/chosen": -6.858750343322754, "logits/rejected": -6.854181289672852, "logps/chosen": -24.544208526611328, "logps/rejected": -39.63270568847656, "loss": 0.2513, "rewards/accuracies": 1.0, "rewards/chosen": 0.6987884044647217, "rewards/margins": 1.5141072273254395, "rewards/rejected": -0.8153188228607178, "step": 20 }, { "epoch": 0.014356520252948214, "grad_norm": 1.443800449371338, "learning_rate": 2.2036988245565324e-05, "logits/chosen": -7.31809663772583, "logits/rejected": -7.313895225524902, "logps/chosen": -26.21209716796875, "logps/rejected": -43.09148406982422, "loss": 0.1991, "rewards/accuracies": 0.875, "rewards/chosen": 0.586815357208252, "rewards/margins": 1.6051698923110962, "rewards/rejected": -1.0183546543121338, "step": 21 }, { "epoch": 0.015040164074517177, "grad_norm": 1.1066259145736694, "learning_rate": 2.2373711347036773e-05, "logits/chosen": -7.023229122161865, "logits/rejected": -7.016810417175293, "logps/chosen": -25.221637725830078, "logps/rejected": -46.84342956542969, "loss": 0.1786, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7423024773597717, "rewards/margins": 2.2052299976348877, "rewards/rejected": -1.4629275798797607, "step": 22 }, { "epoch": 0.01572380789608614, "grad_norm": 0.9575864672660828, "learning_rate": 2.269546393362655e-05, "logits/chosen": -7.135544300079346, "logits/rejected": -7.132981300354004, "logps/chosen": -25.017555236816406, "logps/rejected": -46.14503479003906, "loss": 0.1561, "rewards/accuracies": 0.875, "rewards/chosen": 0.5971914529800415, "rewards/margins": 2.0711851119995117, "rewards/rejected": -1.4739937782287598, "step": 23 }, { "epoch": 0.016407451717655102, "grad_norm": 1.2985295057296753, "learning_rate": 2.3003520695193437e-05, "logits/chosen": -7.233003616333008, "logits/rejected": -7.229007720947266, "logps/chosen": -25.31537628173828, "logps/rejected": -50.57933044433594, "loss": 0.1568, "rewards/accuracies": 0.875, "rewards/chosen": 0.5849456191062927, "rewards/margins": 2.4819841384887695, "rewards/rejected": -1.8970385789871216, "step": 24 }, { "epoch": 0.017091095539224065, "grad_norm": 1.0652590990066528, "learning_rate": 2.329900014453396e-05, "logits/chosen": -6.885555267333984, "logits/rejected": -6.880931854248047, "logps/chosen": -25.63227653503418, "logps/rejected": -56.085269927978516, "loss": 0.1399, "rewards/accuracies": 1.0, "rewards/chosen": 0.7208855152130127, "rewards/margins": 3.016411066055298, "rewards/rejected": -2.295525550842285, "step": 25 }, { "epoch": 0.017774739360793025, "grad_norm": 1.5297735929489136, "learning_rate": 2.3582889132846968e-05, "logits/chosen": -6.612254619598389, "logits/rejected": -6.608108997344971, "logps/chosen": -26.413724899291992, "logps/rejected": -55.120609283447266, "loss": 0.0955, "rewards/accuracies": 0.9375, "rewards/chosen": 0.637628436088562, "rewards/margins": 2.903761863708496, "rewards/rejected": -2.2661335468292236, "step": 26 }, { "epoch": 0.01845838318236199, "grad_norm": 1.1944119930267334, "learning_rate": 2.3856062735983123e-05, "logits/chosen": -7.416288375854492, "logits/rejected": -7.413111686706543, "logps/chosen": -25.707918167114258, "logps/rejected": -61.42747497558594, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": 0.6266460418701172, "rewards/margins": 3.5334248542785645, "rewards/rejected": -2.9067788124084473, "step": 27 }, { "epoch": 0.019142027003930952, "grad_norm": 1.1104555130004883, "learning_rate": 2.4119300522370322e-05, "logits/chosen": -7.38694429397583, "logits/rejected": -7.384222984313965, "logps/chosen": -24.46674156188965, "logps/rejected": -67.15142059326172, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": 0.6793996095657349, "rewards/margins": 4.214372158050537, "rewards/rejected": -3.534972667694092, "step": 28 }, { "epoch": 0.019825670825499915, "grad_norm": 0.5963255763053894, "learning_rate": 2.4373299964982603e-05, "logits/chosen": -7.037827491760254, "logits/rejected": -7.0339555740356445, "logps/chosen": -24.492443084716797, "logps/rejected": -69.95735168457031, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": 0.7446132898330688, "rewards/margins": 4.586461067199707, "rewards/rejected": -3.8418478965759277, "step": 29 }, { "epoch": 0.02050931464706888, "grad_norm": 0.4133020043373108, "learning_rate": 2.4618687578661044e-05, "logits/chosen": -7.322544574737549, "logits/rejected": -7.32080602645874, "logps/chosen": -26.856666564941406, "logps/rejected": -69.12559509277344, "loss": 0.093, "rewards/accuracies": 0.875, "rewards/chosen": 0.6092227697372437, "rewards/margins": 4.225639820098877, "rewards/rejected": -3.6164169311523438, "step": 30 }, { "epoch": 0.021192958468637838, "grad_norm": 0.32627344131469727, "learning_rate": 2.4856028230571212e-05, "logits/chosen": -7.773868560791016, "logits/rejected": -7.773958206176758, "logps/chosen": -24.731990814208984, "logps/rejected": -77.35662078857422, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 0.5320194363594055, "rewards/margins": 5.137128829956055, "rewards/rejected": -4.605109214782715, "step": 31 }, { "epoch": 0.0218766022902068, "grad_norm": 1.160730242729187, "learning_rate": 2.5085832971998436e-05, "logits/chosen": -7.192861080169678, "logits/rejected": -7.194058418273926, "logps/chosen": -26.026947021484375, "logps/rejected": -79.62892150878906, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 0.5468711256980896, "rewards/margins": 5.227826118469238, "rewards/rejected": -4.680954933166504, "step": 32 }, { "epoch": 0.022560246111775765, "grad_norm": 0.45620834827423096, "learning_rate": 2.530856566463146e-05, "logits/chosen": -7.287026882171631, "logits/rejected": -7.283764839172363, "logps/chosen": -26.603515625, "logps/rejected": -76.04161071777344, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 0.5531632900238037, "rewards/margins": 4.8926849365234375, "rewards/rejected": -4.339521408081055, "step": 33 }, { "epoch": 0.023243889933344728, "grad_norm": 0.1387517899274826, "learning_rate": 2.552464861737092e-05, "logits/chosen": -7.320808410644531, "logits/rejected": -7.31854248046875, "logps/chosen": -27.55302619934082, "logps/rejected": -72.02265930175781, "loss": 0.078, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5267232656478882, "rewards/margins": 4.444221496582031, "rewards/rejected": -3.9174981117248535, "step": 34 }, { "epoch": 0.02392753375491369, "grad_norm": 0.1193920224905014, "learning_rate": 2.5734467405837933e-05, "logits/chosen": -7.437685012817383, "logits/rejected": -7.438587188720703, "logps/chosen": -26.66800308227539, "logps/rejected": -80.46302032470703, "loss": 0.0904, "rewards/accuracies": 0.9375, "rewards/chosen": 0.46477290987968445, "rewards/margins": 5.274158954620361, "rewards/rejected": -4.809386253356934, "step": 35 }, { "epoch": 0.02461117757648265, "grad_norm": 0.4990135133266449, "learning_rate": 2.5938375012788124e-05, "logits/chosen": -7.329445838928223, "logits/rejected": -7.328673362731934, "logps/chosen": -26.750404357910156, "logps/rejected": -82.33059692382812, "loss": 0.0914, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5079987645149231, "rewards/margins": 5.443422317504883, "rewards/rejected": -4.935422897338867, "step": 36 }, { "epoch": 0.025294821398051615, "grad_norm": 0.4337184727191925, "learning_rate": 2.6136695401116585e-05, "logits/chosen": -7.337824821472168, "logits/rejected": -7.334741115570068, "logps/chosen": -28.558757781982422, "logps/rejected": -74.80722045898438, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 0.46932685375213623, "rewards/margins": 4.615245819091797, "rewards/rejected": -4.145918846130371, "step": 37 }, { "epoch": 0.025978465219620578, "grad_norm": 2.4724130630493164, "learning_rate": 2.6329726610280168e-05, "logits/chosen": -7.328121185302734, "logits/rejected": -7.327683448791504, "logps/chosen": -27.459056854248047, "logps/rejected": -81.92739868164062, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 0.4122246503829956, "rewards/margins": 5.367509841918945, "rewards/rejected": -4.955284595489502, "step": 38 }, { "epoch": 0.02666210904118954, "grad_norm": 0.8911828994750977, "learning_rate": 2.651774345044166e-05, "logits/chosen": -7.515947341918945, "logits/rejected": -7.516031265258789, "logps/chosen": -26.169008255004883, "logps/rejected": -82.07453155517578, "loss": 0.0555, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40035659074783325, "rewards/margins": 5.504101276397705, "rewards/rejected": -5.103744983673096, "step": 39 }, { "epoch": 0.027345752862758504, "grad_norm": 0.13207489252090454, "learning_rate": 2.6700999855466042e-05, "logits/chosen": -7.187801361083984, "logits/rejected": -7.188168048858643, "logps/chosen": -26.29627227783203, "logps/rejected": -86.15091705322266, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 0.5883175134658813, "rewards/margins": 5.863428115844727, "rewards/rejected": -5.275110244750977, "step": 40 }, { "epoch": 0.028029396684327464, "grad_norm": 0.13310998678207397, "learning_rate": 2.687973094532893e-05, "logits/chosen": -7.412800312042236, "logits/rejected": -7.413110733032227, "logps/chosen": -25.983245849609375, "logps/rejected": -84.8594970703125, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 0.4887577295303345, "rewards/margins": 5.763038158416748, "rewards/rejected": -5.274280548095703, "step": 41 }, { "epoch": 0.028713040505896428, "grad_norm": 0.29880672693252563, "learning_rate": 2.7054154839965013e-05, "logits/chosen": -7.244831085205078, "logits/rejected": -7.2452192306518555, "logps/chosen": -25.364463806152344, "logps/rejected": -90.02700805664062, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 0.6598736047744751, "rewards/margins": 6.375776767730713, "rewards/rejected": -5.715902805328369, "step": 42 }, { "epoch": 0.02939668432746539, "grad_norm": 0.10732796043157578, "learning_rate": 2.722447425965978e-05, "logits/chosen": -7.2692766189575195, "logits/rejected": -7.267281532287598, "logps/chosen": -27.82178497314453, "logps/rejected": -80.44223022460938, "loss": 0.0521, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44476398825645447, "rewards/margins": 5.239520072937012, "rewards/rejected": -4.794755935668945, "step": 43 }, { "epoch": 0.030080328149034354, "grad_norm": 0.1331896185874939, "learning_rate": 2.739087794143646e-05, "logits/chosen": -7.0041937828063965, "logits/rejected": -7.004157066345215, "logps/chosen": -29.622665405273438, "logps/rejected": -72.31005859375, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 0.32144713401794434, "rewards/margins": 4.167523384094238, "rewards/rejected": -3.8460757732391357, "step": 44 }, { "epoch": 0.030763971970603314, "grad_norm": 0.12770439684391022, "learning_rate": 2.755354189625573e-05, "logits/chosen": -7.697323799133301, "logits/rejected": -7.6963396072387695, "logps/chosen": -26.937419891357422, "logps/rejected": -81.93016052246094, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 0.539887011051178, "rewards/margins": 5.47153377532959, "rewards/rejected": -4.931647300720215, "step": 45 }, { "epoch": 0.03144761579217228, "grad_norm": 0.14341123402118683, "learning_rate": 2.771263052802624e-05, "logits/chosen": -7.66945743560791, "logits/rejected": -7.669432163238525, "logps/chosen": -26.267492294311523, "logps/rejected": -82.2294921875, "loss": 0.0842, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5777949094772339, "rewards/margins": 5.547584056854248, "rewards/rejected": -4.969788551330566, "step": 46 }, { "epoch": 0.03213125961374124, "grad_norm": 0.10382751375436783, "learning_rate": 2.7868297632261957e-05, "logits/chosen": -7.790553092956543, "logits/rejected": -7.791638374328613, "logps/chosen": -23.3753662109375, "logps/rejected": -91.43357849121094, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 0.7374681234359741, "rewards/margins": 6.686313152313232, "rewards/rejected": -5.948844909667969, "step": 47 }, { "epoch": 0.032814903435310204, "grad_norm": 0.1290905922651291, "learning_rate": 2.8020687289593123e-05, "logits/chosen": -7.983250617980957, "logits/rejected": -7.981710433959961, "logps/chosen": -27.796297073364258, "logps/rejected": -76.02406311035156, "loss": 0.0701, "rewards/accuracies": 0.875, "rewards/chosen": 0.45478904247283936, "rewards/margins": 4.8200883865356445, "rewards/rejected": -4.365299224853516, "step": 48 }, { "epoch": 0.03349854725687917, "grad_norm": 0.08999435603618622, "learning_rate": 2.8169934667141895e-05, "logits/chosen": -7.552944183349609, "logits/rejected": -7.55290412902832, "logps/chosen": -25.62673568725586, "logps/rejected": -80.48733520507812, "loss": 0.0636, "rewards/accuracies": 0.875, "rewards/chosen": 0.6193435192108154, "rewards/margins": 5.451657295227051, "rewards/rejected": -4.832313537597656, "step": 49 }, { "epoch": 0.03418219107844813, "grad_norm": 0.09506697207689285, "learning_rate": 2.8316166738933646e-05, "logits/chosen": -7.756778717041016, "logits/rejected": -7.755283832550049, "logps/chosen": -25.00948143005371, "logps/rejected": -86.72069549560547, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 0.7520055174827576, "rewards/margins": 6.214483261108398, "rewards/rejected": -5.462477684020996, "step": 50 }, { "epoch": 0.034865834900017094, "grad_norm": 0.09687194228172302, "learning_rate": 2.845950293496561e-05, "logits/chosen": -7.514195919036865, "logits/rejected": -7.515366554260254, "logps/chosen": -25.607524871826172, "logps/rejected": -82.53189086914062, "loss": 0.0635, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5975551605224609, "rewards/margins": 5.5928215980529785, "rewards/rejected": -4.995266914367676, "step": 51 }, { "epoch": 0.03554947872158605, "grad_norm": 0.08882694691419601, "learning_rate": 2.8600055727246657e-05, "logits/chosen": -7.754547595977783, "logits/rejected": -7.754302978515625, "logps/chosen": -25.166854858398438, "logps/rejected": -82.42953491210938, "loss": 0.0751, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6452956199645996, "rewards/margins": 5.674887657165527, "rewards/rejected": -5.029592037200928, "step": 52 }, { "epoch": 0.036233122543155014, "grad_norm": 0.08006372302770615, "learning_rate": 2.8737931160013153e-05, "logits/chosen": -7.052004814147949, "logits/rejected": -7.0505876541137695, "logps/chosen": -25.255786895751953, "logps/rejected": -80.49116516113281, "loss": 0.0717, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7127678394317627, "rewards/margins": 5.561014175415039, "rewards/rejected": -4.8482465744018555, "step": 53 }, { "epoch": 0.03691676636472398, "grad_norm": 0.09761884808540344, "learning_rate": 2.8873229330382812e-05, "logits/chosen": -7.3734822273254395, "logits/rejected": -7.373893737792969, "logps/chosen": -24.932849884033203, "logps/rejected": -84.38233947753906, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 0.7546302080154419, "rewards/margins": 5.860410690307617, "rewards/rejected": -5.105780601501465, "step": 54 }, { "epoch": 0.03760041018629294, "grad_norm": 0.08598165214061737, "learning_rate": 2.9006044824904066e-05, "logits/chosen": -7.473315238952637, "logits/rejected": -7.4718708992004395, "logps/chosen": -22.600522994995117, "logps/rejected": -89.85809326171875, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9162561893463135, "rewards/margins": 6.733027935028076, "rewards/rejected": -5.816771984100342, "step": 55 }, { "epoch": 0.038284054007861903, "grad_norm": 0.10252334922552109, "learning_rate": 2.913646711677001e-05, "logits/chosen": -7.921880722045898, "logits/rejected": -7.9224114418029785, "logps/chosen": -22.801376342773438, "logps/rejected": -91.75643920898438, "loss": 0.0707, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8891095519065857, "rewards/margins": 6.828681945800781, "rewards/rejected": -5.939572334289551, "step": 56 }, { "epoch": 0.03896769782943087, "grad_norm": 0.09201974421739578, "learning_rate": 2.926458092787486e-05, "logits/chosen": -7.74515438079834, "logits/rejected": -7.745112419128418, "logps/chosen": -24.572185516357422, "logps/rejected": -85.16152954101562, "loss": 0.0737, "rewards/accuracies": 0.875, "rewards/chosen": 0.6914398670196533, "rewards/margins": 5.965743541717529, "rewards/rejected": -5.274303436279297, "step": 57 }, { "epoch": 0.03965134165099983, "grad_norm": 0.11117773503065109, "learning_rate": 2.939046655938229e-05, "logits/chosen": -7.8971848487854, "logits/rejected": -7.898407936096191, "logps/chosen": -25.01128387451172, "logps/rejected": -81.3326416015625, "loss": 0.0988, "rewards/accuracies": 0.875, "rewards/chosen": 0.713032603263855, "rewards/margins": 5.538288116455078, "rewards/rejected": -4.825255870819092, "step": 58 }, { "epoch": 0.04033498547256879, "grad_norm": 0.09400319308042526, "learning_rate": 2.951420019403574e-05, "logits/chosen": -7.640406131744385, "logits/rejected": -7.641009330749512, "logps/chosen": -21.990022659301758, "logps/rejected": -92.40379333496094, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": 0.956412672996521, "rewards/margins": 6.9916486740112305, "rewards/rejected": -6.03523588180542, "step": 59 }, { "epoch": 0.04101862929413776, "grad_norm": 0.09507060796022415, "learning_rate": 2.963585417306073e-05, "logits/chosen": -7.427394390106201, "logits/rejected": -7.4276299476623535, "logps/chosen": -23.418113708496094, "logps/rejected": -89.26277160644531, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9489388465881348, "rewards/margins": 6.542214393615723, "rewards/rejected": -5.593276023864746, "step": 60 }, { "epoch": 0.04170227311570672, "grad_norm": 0.11039169132709503, "learning_rate": 2.9755497250179453e-05, "logits/chosen": -7.191524505615234, "logits/rejected": -7.192315101623535, "logps/chosen": -23.427003860473633, "logps/rejected": -90.32389831542969, "loss": 0.0694, "rewards/accuracies": 0.875, "rewards/chosen": 0.9116308093070984, "rewards/margins": 6.657346725463867, "rewards/rejected": -5.745715141296387, "step": 61 }, { "epoch": 0.042385916937275676, "grad_norm": 0.09080065041780472, "learning_rate": 2.98731948249709e-05, "logits/chosen": -7.313616752624512, "logits/rejected": -7.31312370300293, "logps/chosen": -23.24715805053711, "logps/rejected": -90.35850524902344, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": 0.9455455541610718, "rewards/margins": 6.656339168548584, "rewards/rejected": -5.710793972015381, "step": 62 }, { "epoch": 0.04306956075884464, "grad_norm": 2.4599640369415283, "learning_rate": 2.9989009157559694e-05, "logits/chosen": -7.396790981292725, "logits/rejected": -7.394364356994629, "logps/chosen": -21.666486740112305, "logps/rejected": -92.30471801757812, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": 1.0540430545806885, "rewards/margins": 7.03542947769165, "rewards/rejected": -5.981385707855225, "step": 63 }, { "epoch": 0.0437532045804136, "grad_norm": 0.09549608081579208, "learning_rate": 3.010299956639812e-05, "logits/chosen": -7.422524452209473, "logits/rejected": -7.421130657196045, "logps/chosen": -21.776254653930664, "logps/rejected": -92.33218383789062, "loss": 0.0821, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9801396131515503, "rewards/margins": 6.984930038452148, "rewards/rejected": -6.004790306091309, "step": 64 }, { "epoch": 0.044436848401982566, "grad_norm": 0.1009737178683281, "learning_rate": 3.021522261071426e-05, "logits/chosen": -7.526154518127441, "logits/rejected": -7.525481700897217, "logps/chosen": -21.210092544555664, "logps/rejected": -89.33047485351562, "loss": 0.0632, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028282403945923, "rewards/margins": 6.795907020568848, "rewards/rejected": -5.793078899383545, "step": 65 }, { "epoch": 0.04512049222355153, "grad_norm": 0.06720085442066193, "learning_rate": 3.0325732259031143e-05, "logits/chosen": -7.360430717468262, "logits/rejected": -7.362415790557861, "logps/chosen": -21.520671844482422, "logps/rejected": -94.03379821777344, "loss": 0.0586, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0493947267532349, "rewards/margins": 7.161921501159668, "rewards/rejected": -6.112525939941406, "step": 66 }, { "epoch": 0.04580413604512049, "grad_norm": 0.0918724313378334, "learning_rate": 3.043458004501377e-05, "logits/chosen": -7.500884532928467, "logits/rejected": -7.5007548332214355, "logps/chosen": -21.487873077392578, "logps/rejected": -90.15814208984375, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 1.050102710723877, "rewards/margins": 6.845575332641602, "rewards/rejected": -5.795472145080566, "step": 67 }, { "epoch": 0.046487779866689456, "grad_norm": 4.115209579467773, "learning_rate": 3.054181521177061e-05, "logits/chosen": -7.35923957824707, "logits/rejected": -7.359853267669678, "logps/chosen": -21.237117767333984, "logps/rejected": -94.27864837646484, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 1.0161880254745483, "rewards/margins": 7.2166361808776855, "rewards/rejected": -6.200448989868164, "step": 68 }, { "epoch": 0.04717142368825842, "grad_norm": 0.09905055165290833, "learning_rate": 3.064748484562093e-05, "logits/chosen": -7.53664493560791, "logits/rejected": -7.536221981048584, "logps/chosen": -22.354310989379883, "logps/rejected": -83.48863983154297, "loss": 0.0583, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8953481912612915, "rewards/margins": 6.09709358215332, "rewards/rejected": -5.20174503326416, "step": 69 }, { "epoch": 0.04785506750982738, "grad_norm": 0.12648166716098785, "learning_rate": 3.0751634000237615e-05, "logits/chosen": -7.192230701446533, "logits/rejected": -7.1933112144470215, "logps/chosen": -20.42192268371582, "logps/rejected": -94.28740692138672, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 1.1365737915039062, "rewards/margins": 7.305866241455078, "rewards/rejected": -6.169292449951172, "step": 70 }, { "epoch": 0.04853871133139634, "grad_norm": 0.06842316687107086, "learning_rate": 3.085430581198459e-05, "logits/chosen": -7.584309101104736, "logits/rejected": -7.583706378936768, "logps/chosen": -21.28680419921875, "logps/rejected": -90.64261627197266, "loss": 0.0448, "rewards/accuracies": 0.875, "rewards/chosen": 1.0221006870269775, "rewards/margins": 6.891177177429199, "rewards/rejected": -5.869077205657959, "step": 71 }, { "epoch": 0.0492223551529653, "grad_norm": 0.08683859556913376, "learning_rate": 3.095554160718781e-05, "logits/chosen": -7.437884330749512, "logits/rejected": -7.439953804016113, "logps/chosen": -22.36798095703125, "logps/rejected": -91.54447174072266, "loss": 0.0616, "rewards/accuracies": 0.875, "rewards/chosen": 0.8819618821144104, "rewards/margins": 6.812735557556152, "rewards/rejected": -5.930773735046387, "step": 72 }, { "epoch": 0.049905998974534266, "grad_norm": 0.06327193975448608, "learning_rate": 3.10553810020076e-05, "logits/chosen": -7.545299053192139, "logits/rejected": -7.544283866882324, "logps/chosen": -20.753253936767578, "logps/rejected": -90.43821716308594, "loss": 0.0411, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0926127433776855, "rewards/margins": 7.023766040802002, "rewards/rejected": -5.931153297424316, "step": 73 }, { "epoch": 0.05058964279610323, "grad_norm": 0.08810155093669891, "learning_rate": 3.115386199551628e-05, "logits/chosen": -7.737239837646484, "logits/rejected": -7.7376017570495605, "logps/chosen": -18.4809513092041, "logps/rejected": -98.85256958007812, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 1.3373682498931885, "rewards/margins": 7.9697113037109375, "rewards/rejected": -6.63234281539917, "step": 74 }, { "epoch": 0.05127328661767219, "grad_norm": 0.08683203905820847, "learning_rate": 3.1251021056528336e-05, "logits/chosen": -7.516335487365723, "logits/rejected": -7.517772197723389, "logps/chosen": -22.348384857177734, "logps/rejected": -87.68864440917969, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 0.943874716758728, "rewards/margins": 6.453696250915527, "rewards/rejected": -5.50982141494751, "step": 75 }, { "epoch": 0.051956930439241156, "grad_norm": 0.09710966050624847, "learning_rate": 3.134689320467986e-05, "logits/chosen": -7.58569860458374, "logits/rejected": -7.585268974304199, "logps/chosen": -20.910079956054688, "logps/rejected": -92.8994369506836, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 1.169753074645996, "rewards/margins": 7.1611552238464355, "rewards/rejected": -5.9914021492004395, "step": 76 }, { "epoch": 0.05264057426081012, "grad_norm": 0.0988495871424675, "learning_rate": 3.144151208620804e-05, "logits/chosen": -7.528024196624756, "logits/rejected": -7.528131484985352, "logps/chosen": -19.5706729888916, "logps/rejected": -96.01200866699219, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 1.1593225002288818, "rewards/margins": 7.587392807006836, "rewards/rejected": -6.428070068359375, "step": 77 }, { "epoch": 0.05332421808237908, "grad_norm": 0.08449404686689377, "learning_rate": 3.1534910044841344e-05, "logits/chosen": -7.387535095214844, "logits/rejected": -7.388700485229492, "logps/chosen": -18.77089500427246, "logps/rejected": -99.61615753173828, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 1.2509894371032715, "rewards/margins": 7.981078147888184, "rewards/rejected": -6.73008918762207, "step": 78 }, { "epoch": 0.054007861903948046, "grad_norm": 0.0721382200717926, "learning_rate": 3.1627118188174024e-05, "logits/chosen": -7.321064472198486, "logits/rejected": -7.321576118469238, "logps/chosen": -19.325206756591797, "logps/rejected": -96.33390045166016, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 1.2141629457473755, "rewards/margins": 7.639944076538086, "rewards/rejected": -6.42578125, "step": 79 }, { "epoch": 0.05469150572551701, "grad_norm": 0.1042429730296135, "learning_rate": 3.171816644986573e-05, "logits/chosen": -7.758232593536377, "logits/rejected": -7.757414817810059, "logps/chosen": -18.23053550720215, "logps/rejected": -99.88102722167969, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": 1.3277846574783325, "rewards/margins": 8.16386604309082, "rewards/rejected": -6.836081504821777, "step": 80 }, { "epoch": 0.055375149547085965, "grad_norm": 0.11650659143924713, "learning_rate": 3.18080836479775e-05, "logits/chosen": -7.587674140930176, "logits/rejected": -7.5873637199401855, "logps/chosen": -18.872041702270508, "logps/rejected": -95.72862243652344, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 1.2188632488250732, "rewards/margins": 7.694483757019043, "rewards/rejected": -6.475620269775391, "step": 81 }, { "epoch": 0.05605879336865493, "grad_norm": 0.11029443144798279, "learning_rate": 3.1896897539728616e-05, "logits/chosen": -7.491011619567871, "logits/rejected": -7.490875244140625, "logps/chosen": -19.344589233398438, "logps/rejected": -96.44403076171875, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 1.2758269309997559, "rewards/margins": 7.66909646987915, "rewards/rejected": -6.3932695388793945, "step": 82 }, { "epoch": 0.05674243719022389, "grad_norm": 0.13270649313926697, "learning_rate": 3.198463487293457e-05, "logits/chosen": -7.277115821838379, "logits/rejected": -7.275373458862305, "logps/chosen": -19.66107940673828, "logps/rejected": -91.6268081665039, "loss": 0.0715, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2624590396881104, "rewards/margins": 7.217123985290527, "rewards/rejected": -5.954665184020996, "step": 83 }, { "epoch": 0.057426081011792855, "grad_norm": 0.06585494428873062, "learning_rate": 3.207132143436469e-05, "logits/chosen": -7.557405471801758, "logits/rejected": -7.557989597320557, "logps/chosen": -20.918153762817383, "logps/rejected": -93.34274291992188, "loss": 0.0376, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1160836219787598, "rewards/margins": 7.160100936889648, "rewards/rejected": -6.0440168380737305, "step": 84 }, { "epoch": 0.05810972483336182, "grad_norm": 2.3244948387145996, "learning_rate": 3.215698209523821e-05, "logits/chosen": -7.626119136810303, "logits/rejected": -7.624427795410156, "logps/chosen": -19.504154205322266, "logps/rejected": -92.35759735107422, "loss": 0.094, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3124630451202393, "rewards/margins": 7.299938678741455, "rewards/rejected": -5.987476348876953, "step": 85 }, { "epoch": 0.05879336865493078, "grad_norm": 0.11498862504959106, "learning_rate": 3.224164085405946e-05, "logits/chosen": -7.351897239685059, "logits/rejected": -7.350225448608398, "logps/chosen": -19.02114486694336, "logps/rejected": -95.23216247558594, "loss": 0.0658, "rewards/accuracies": 0.9375, "rewards/chosen": 1.406022071838379, "rewards/margins": 7.666587829589844, "rewards/rejected": -6.260565757751465, "step": 86 }, { "epoch": 0.059477012476499745, "grad_norm": 0.07824338227510452, "learning_rate": 3.232532087697698e-05, "logits/chosen": -7.115013599395752, "logits/rejected": -7.115718364715576, "logps/chosen": -19.142730712890625, "logps/rejected": -96.41107177734375, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 1.2631222009658813, "rewards/margins": 7.63470458984375, "rewards/rejected": -6.371582984924316, "step": 87 }, { "epoch": 0.06016065629806871, "grad_norm": 0.09130487591028214, "learning_rate": 3.240804453583615e-05, "logits/chosen": -7.13962984085083, "logits/rejected": -7.139452934265137, "logps/chosen": -17.780141830444336, "logps/rejected": -100.65348052978516, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 1.3889027833938599, "rewards/margins": 8.222355842590332, "rewards/rejected": -6.833453178405762, "step": 88 }, { "epoch": 0.06084430011963767, "grad_norm": 0.12131084501743317, "learning_rate": 3.248983344408188e-05, "logits/chosen": -7.446634292602539, "logits/rejected": -7.446708679199219, "logps/chosen": -20.685550689697266, "logps/rejected": -92.48131561279297, "loss": 0.0783, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1794323921203613, "rewards/margins": 7.15550422668457, "rewards/rejected": -5.976071357727051, "step": 89 }, { "epoch": 0.06152794394120663, "grad_norm": 0.08219660818576813, "learning_rate": 3.2570708490655414e-05, "logits/chosen": -7.088200569152832, "logits/rejected": -7.087198734283447, "logps/chosen": -17.598615646362305, "logps/rejected": -99.73519134521484, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 1.4213733673095703, "rewards/margins": 8.207895278930664, "rewards/rejected": -6.786520957946777, "step": 90 }, { "epoch": 0.06221158776277559, "grad_norm": 0.09153630584478378, "learning_rate": 3.265068987201822e-05, "logits/chosen": -7.715055465698242, "logits/rejected": -7.715088367462158, "logps/chosen": -19.998382568359375, "logps/rejected": -92.73628234863281, "loss": 0.0524, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1777880191802979, "rewards/margins": 7.247180938720703, "rewards/rejected": -6.069393157958984, "step": 91 }, { "epoch": 0.06289523158434455, "grad_norm": 0.12608392536640167, "learning_rate": 3.2729797122425925e-05, "logits/chosen": -7.4480977058410645, "logits/rejected": -7.4508056640625, "logps/chosen": -20.724735260009766, "logps/rejected": -93.53631591796875, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": 1.142298936843872, "rewards/margins": 7.21593713760376, "rewards/rejected": -6.073637962341309, "step": 92 }, { "epoch": 0.06357887540591352, "grad_norm": 3.3807501792907715, "learning_rate": 3.280804914256559e-05, "logits/chosen": -7.5917768478393555, "logits/rejected": -7.591421604156494, "logps/chosen": -19.73674774169922, "logps/rejected": -92.22866821289062, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 1.2420586347579956, "rewards/margins": 7.210859298706055, "rewards/rejected": -5.968801021575928, "step": 93 }, { "epoch": 0.06426251922748248, "grad_norm": 0.11912574619054794, "learning_rate": 3.288546422666164e-05, "logits/chosen": -7.586514472961426, "logits/rejected": -7.586696147918701, "logps/chosen": -17.544946670532227, "logps/rejected": -100.37984466552734, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 1.3552052974700928, "rewards/margins": 8.262289047241211, "rewards/rejected": -6.907083988189697, "step": 94 }, { "epoch": 0.06494616304905144, "grad_norm": 1.1544617414474487, "learning_rate": 3.2962060088147464e-05, "logits/chosen": -7.45679235458374, "logits/rejected": -7.4574127197265625, "logps/chosen": -19.37907600402832, "logps/rejected": -94.9927978515625, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 1.1419525146484375, "rewards/margins": 7.467925071716309, "rewards/rejected": -6.325973033905029, "step": 95 }, { "epoch": 0.06562980687062041, "grad_norm": 0.09871792793273926, "learning_rate": 3.3037853883992805e-05, "logits/chosen": -7.384148597717285, "logits/rejected": -7.382119178771973, "logps/chosen": -18.0347843170166, "logps/rejected": -96.03829956054688, "loss": 0.0635, "rewards/accuracies": 1.0, "rewards/chosen": 1.3461589813232422, "rewards/margins": 7.826671600341797, "rewards/rejected": -6.480513095855713, "step": 96 }, { "epoch": 0.06631345069218937, "grad_norm": 0.12841422855854034, "learning_rate": 3.3112862237770756e-05, "logits/chosen": -7.802553176879883, "logits/rejected": -7.804030418395996, "logps/chosen": -22.69752311706543, "logps/rejected": -85.21568298339844, "loss": 0.0721, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9288662672042847, "rewards/margins": 6.20576810836792, "rewards/rejected": -5.276902675628662, "step": 97 }, { "epoch": 0.06699709451375833, "grad_norm": 0.13374429941177368, "learning_rate": 3.3187101261541584e-05, "logits/chosen": -7.646255970001221, "logits/rejected": -7.647965908050537, "logps/chosen": -18.896333694458008, "logps/rejected": -97.10340881347656, "loss": 0.0751, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1753861904144287, "rewards/margins": 7.7367353439331055, "rewards/rejected": -6.561348915100098, "step": 98 }, { "epoch": 0.0676807383353273, "grad_norm": 0.12442179769277573, "learning_rate": 3.326058657662584e-05, "logits/chosen": -7.475363254547119, "logits/rejected": -7.474472999572754, "logps/chosen": -16.79752540588379, "logps/rejected": -100.84737396240234, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 1.5430591106414795, "rewards/margins": 8.412934303283691, "rewards/rejected": -6.869875431060791, "step": 99 }, { "epoch": 0.06836438215689626, "grad_norm": 0.09981726109981537, "learning_rate": 3.333333333333334e-05, "logits/chosen": -7.96346378326416, "logits/rejected": -7.96453857421875, "logps/chosen": -17.097410202026367, "logps/rejected": -101.76985168457031, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 1.3636903762817383, "rewards/margins": 8.410021781921387, "rewards/rejected": -7.046331405639648, "step": 100 }, { "epoch": 0.06904802597846522, "grad_norm": 0.14249370992183685, "learning_rate": 3.340535622971072e-05, "logits/chosen": -7.692659378051758, "logits/rejected": -7.692529678344727, "logps/chosen": -19.71304702758789, "logps/rejected": -93.3577651977539, "loss": 0.0768, "rewards/accuracies": 0.9375, "rewards/chosen": 1.21677565574646, "rewards/margins": 7.322096347808838, "rewards/rejected": -6.105320453643799, "step": 101 }, { "epoch": 0.06973166980003419, "grad_norm": 0.08949767798185349, "learning_rate": 3.3476669529365295e-05, "logits/chosen": -7.8225202560424805, "logits/rejected": -7.823861122131348, "logps/chosen": -19.30390167236328, "logps/rejected": -93.31916046142578, "loss": 0.0506, "rewards/accuracies": 0.875, "rewards/chosen": 1.2236952781677246, "rewards/margins": 7.2992682456970215, "rewards/rejected": -6.075573921203613, "step": 102 }, { "epoch": 0.07041531362160315, "grad_norm": 0.10781916230916977, "learning_rate": 3.3547287078419544e-05, "logits/chosen": -7.317572593688965, "logits/rejected": -7.317484378814697, "logps/chosen": -18.614221572875977, "logps/rejected": -95.10206604003906, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 1.3085881471633911, "rewards/margins": 7.580963134765625, "rewards/rejected": -6.272375106811523, "step": 103 }, { "epoch": 0.0710989574431721, "grad_norm": 0.11865044385194778, "learning_rate": 3.361722232164634e-05, "logits/chosen": -7.513178825378418, "logits/rejected": -7.512757301330566, "logps/chosen": -18.320579528808594, "logps/rejected": -97.46134185791016, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 1.386275053024292, "rewards/margins": 7.891357421875, "rewards/rejected": -6.505082130432129, "step": 104 }, { "epoch": 0.07178260126474106, "grad_norm": 0.07373511791229248, "learning_rate": 3.3686488317832306e-05, "logits/chosen": -7.951755523681641, "logits/rejected": -7.953590393066406, "logps/chosen": -18.2440128326416, "logps/rejected": -98.76408386230469, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 1.302478551864624, "rewards/margins": 7.94611120223999, "rewards/rejected": -6.6436333656311035, "step": 105 }, { "epoch": 0.07246624508631003, "grad_norm": 1.5532358884811401, "learning_rate": 3.375509775441284e-05, "logits/chosen": -7.568193435668945, "logits/rejected": -7.567607402801514, "logps/chosen": -16.235000610351562, "logps/rejected": -101.59893798828125, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": 1.5100927352905273, "rewards/margins": 8.552789688110352, "rewards/rejected": -7.042696952819824, "step": 106 }, { "epoch": 0.07314988890787899, "grad_norm": 0.13293661177158356, "learning_rate": 3.382306296142016e-05, "logits/chosen": -7.723272800445557, "logits/rejected": -7.723997116088867, "logps/chosen": -20.71710205078125, "logps/rejected": -89.44551086425781, "loss": 0.0708, "rewards/accuracies": 0.875, "rewards/chosen": 1.110649585723877, "rewards/margins": 6.830158233642578, "rewards/rejected": -5.719508647918701, "step": 107 }, { "epoch": 0.07383353272944795, "grad_norm": 0.15443122386932373, "learning_rate": 3.38903959247825e-05, "logits/chosen": -8.132146835327148, "logits/rejected": -8.131288528442383, "logps/chosen": -22.574689865112305, "logps/rejected": -85.0447998046875, "loss": 0.064, "rewards/accuracies": 0.75, "rewards/chosen": 1.059913992881775, "rewards/margins": 6.3161468505859375, "rewards/rejected": -5.256232738494873, "step": 108 }, { "epoch": 0.07451717655101692, "grad_norm": 0.1491561233997345, "learning_rate": 3.395710829901039e-05, "logits/chosen": -7.564889907836914, "logits/rejected": -7.566476345062256, "logps/chosen": -20.87273597717285, "logps/rejected": -90.21101379394531, "loss": 0.0782, "rewards/accuracies": 0.875, "rewards/chosen": 1.0596002340316772, "rewards/margins": 6.8261919021606445, "rewards/rejected": -5.766592025756836, "step": 109 }, { "epoch": 0.07520082037258588, "grad_norm": 0.11730918288230896, "learning_rate": 3.402321141930376e-05, "logits/chosen": -7.329423427581787, "logits/rejected": -7.3300862312316895, "logps/chosen": -16.67871856689453, "logps/rejected": -101.65621948242188, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 1.4635248184204102, "rewards/margins": 8.43840217590332, "rewards/rejected": -6.974877834320068, "step": 110 }, { "epoch": 0.07588446419415484, "grad_norm": 0.11601746827363968, "learning_rate": 3.4088716313110955e-05, "logits/chosen": -7.879857540130615, "logits/rejected": -7.880573272705078, "logps/chosen": -16.62565803527832, "logps/rejected": -101.74634552001953, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 1.464207649230957, "rewards/margins": 8.480425834655762, "rewards/rejected": -7.016218185424805, "step": 111 }, { "epoch": 0.07656810801572381, "grad_norm": 0.14769485592842102, "learning_rate": 3.415363371116969e-05, "logits/chosen": -7.270676612854004, "logits/rejected": -7.270108222961426, "logps/chosen": -20.194072723388672, "logps/rejected": -86.42461395263672, "loss": 0.0858, "rewards/accuracies": 1.0, "rewards/chosen": 1.1820605993270874, "rewards/margins": 6.60344123840332, "rewards/rejected": -5.421380996704102, "step": 112 }, { "epoch": 0.07725175183729277, "grad_norm": 0.0928109809756279, "learning_rate": 3.4217974058057e-05, "logits/chosen": -7.692039966583252, "logits/rejected": -7.693043231964111, "logps/chosen": -17.157081604003906, "logps/rejected": -102.34419250488281, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 1.441861629486084, "rewards/margins": 8.44139289855957, "rewards/rejected": -6.999531269073486, "step": 113 }, { "epoch": 0.07793539565886173, "grad_norm": 0.11043312400579453, "learning_rate": 3.428174752227455e-05, "logits/chosen": -7.269835948944092, "logits/rejected": -7.271697998046875, "logps/chosen": -17.108781814575195, "logps/rejected": -102.29440307617188, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": 1.4802643060684204, "rewards/margins": 8.454120635986328, "rewards/rejected": -6.9738569259643555, "step": 114 }, { "epoch": 0.0786190394804307, "grad_norm": 0.13979148864746094, "learning_rate": 3.434496400589353e-05, "logits/chosen": -7.706134796142578, "logits/rejected": -7.706099033355713, "logps/chosen": -17.465288162231445, "logps/rejected": -97.67521667480469, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": 1.333103060722351, "rewards/margins": 7.9706315994262695, "rewards/rejected": -6.637528419494629, "step": 115 }, { "epoch": 0.07930268330199966, "grad_norm": 0.11743710935115814, "learning_rate": 3.440763315378198e-05, "logits/chosen": -7.552059650421143, "logits/rejected": -7.5531086921691895, "logps/chosen": -20.67180824279785, "logps/rejected": -94.96894836425781, "loss": 0.0753, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2231521606445312, "rewards/margins": 7.382992744445801, "rewards/rejected": -6.1598405838012695, "step": 116 }, { "epoch": 0.07998632712356862, "grad_norm": 0.11024728417396545, "learning_rate": 3.446976436243603e-05, "logits/chosen": -7.415781497955322, "logits/rejected": -7.41645622253418, "logps/chosen": -18.19115447998047, "logps/rejected": -97.71493530273438, "loss": 0.0508, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2914677858352661, "rewards/margins": 7.898283958435059, "rewards/rejected": -6.606816291809082, "step": 117 }, { "epoch": 0.08066997094513759, "grad_norm": 0.10720206797122955, "learning_rate": 3.4531366788435425e-05, "logits/chosen": -7.632894515991211, "logits/rejected": -7.6337432861328125, "logps/chosen": -18.204254150390625, "logps/rejected": -98.41789245605469, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 1.465652585029602, "rewards/margins": 8.01313591003418, "rewards/rejected": -6.547483921051025, "step": 118 }, { "epoch": 0.08135361476670655, "grad_norm": 0.11392402648925781, "learning_rate": 3.459244935654219e-05, "logits/chosen": -7.693991661071777, "logits/rejected": -7.695905685424805, "logps/chosen": -18.35617446899414, "logps/rejected": -99.2543716430664, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 1.343968391418457, "rewards/margins": 8.003668785095215, "rewards/rejected": -6.659700393676758, "step": 119 }, { "epoch": 0.08203725858827551, "grad_norm": 0.13252444565296173, "learning_rate": 3.465302076746041e-05, "logits/chosen": -7.717644691467285, "logits/rejected": -7.717978477478027, "logps/chosen": -22.85373306274414, "logps/rejected": -82.64521026611328, "loss": 0.0767, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9066770672798157, "rewards/margins": 5.908365249633789, "rewards/rejected": -5.001687526702881, "step": 120 }, { "epoch": 0.08272090240984448, "grad_norm": 0.20404450595378876, "learning_rate": 3.471308950527417e-05, "logits/chosen": -7.401839733123779, "logits/rejected": -7.403050899505615, "logps/chosen": -19.973419189453125, "logps/rejected": -94.93260955810547, "loss": 0.0867, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2695695161819458, "rewards/margins": 7.4327192306518555, "rewards/rejected": -6.163149833679199, "step": 121 }, { "epoch": 0.08340454623141344, "grad_norm": 0.11667439341545105, "learning_rate": 3.477266384457914e-05, "logits/chosen": -7.804750442504883, "logits/rejected": -7.805251121520996, "logps/chosen": -19.139328002929688, "logps/rejected": -97.03215026855469, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 1.2094199657440186, "rewards/margins": 7.789541244506836, "rewards/rejected": -6.580121994018555, "step": 122 }, { "epoch": 0.08408819005298239, "grad_norm": 0.10520108044147491, "learning_rate": 3.48317518573233e-05, "logits/chosen": -7.593397617340088, "logits/rejected": -7.593630790710449, "logps/chosen": -16.382091522216797, "logps/rejected": -102.4710693359375, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 1.5828684568405151, "rewards/margins": 8.575075149536133, "rewards/rejected": -6.992206573486328, "step": 123 }, { "epoch": 0.08477183387455135, "grad_norm": 0.15834353864192963, "learning_rate": 3.489036141937059e-05, "logits/chosen": -7.617155075073242, "logits/rejected": -7.618190765380859, "logps/chosen": -16.784595489501953, "logps/rejected": -102.46844482421875, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": 1.4801852703094482, "rewards/margins": 8.504718780517578, "rewards/rejected": -7.024533271789551, "step": 124 }, { "epoch": 0.08545547769612032, "grad_norm": 0.12366563826799393, "learning_rate": 3.494850021680094e-05, "logits/chosen": -7.635287761688232, "logits/rejected": -7.637993335723877, "logps/chosen": -21.137802124023438, "logps/rejected": -92.3879165649414, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 0.9592160582542419, "rewards/margins": 7.035329818725586, "rewards/rejected": -6.076113700866699, "step": 125 }, { "epoch": 0.08613912151768928, "grad_norm": 0.12332063913345337, "learning_rate": 3.500617575195938e-05, "logits/chosen": -7.566810607910156, "logits/rejected": -7.564739227294922, "logps/chosen": -16.802335739135742, "logps/rejected": -97.70240020751953, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 1.569395899772644, "rewards/margins": 8.159868240356445, "rewards/rejected": -6.590473651885986, "step": 126 }, { "epoch": 0.08682276533925824, "grad_norm": 0.12621140480041504, "learning_rate": 3.5063395349265945e-05, "logits/chosen": -7.188416957855225, "logits/rejected": -7.188292503356934, "logps/chosen": -17.01995277404785, "logps/rejected": -98.91438293457031, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 1.437026023864746, "rewards/margins": 8.172849655151367, "rewards/rejected": -6.735823631286621, "step": 127 }, { "epoch": 0.0875064091608272, "grad_norm": 0.41836610436439514, "learning_rate": 3.5120166160797804e-05, "logits/chosen": -7.61025333404541, "logits/rejected": -7.610452651977539, "logps/chosen": -19.344051361083984, "logps/rejected": -96.10441589355469, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 1.2827006578445435, "rewards/margins": 7.643736839294434, "rewards/rejected": -6.36103630065918, "step": 128 }, { "epoch": 0.08819005298239617, "grad_norm": 0.11097653210163116, "learning_rate": 3.517649517165415e-05, "logits/chosen": -7.840906143188477, "logits/rejected": -7.842407703399658, "logps/chosen": -16.759475708007812, "logps/rejected": -103.30362701416016, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 1.4418132305145264, "rewards/margins": 8.587120056152344, "rewards/rejected": -7.145305633544922, "step": 129 }, { "epoch": 0.08887369680396513, "grad_norm": 0.22647659480571747, "learning_rate": 3.523238920511395e-05, "logits/chosen": -7.327139854431152, "logits/rejected": -7.327320098876953, "logps/chosen": -17.898441314697266, "logps/rejected": -98.65927124023438, "loss": 0.0395, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4518043994903564, "rewards/margins": 8.047662734985352, "rewards/rejected": -6.595858097076416, "step": 130 }, { "epoch": 0.0895573406255341, "grad_norm": 0.191778302192688, "learning_rate": 3.528785492759607e-05, "logits/chosen": -7.822339057922363, "logits/rejected": -7.822516441345215, "logps/chosen": -18.092702865600586, "logps/rejected": -98.55856323242188, "loss": 0.0713, "rewards/accuracies": 0.9375, "rewards/chosen": 1.305895447731018, "rewards/margins": 7.983418941497803, "rewards/rejected": -6.677522659301758, "step": 131 }, { "epoch": 0.09024098444710306, "grad_norm": 0.13501940667629242, "learning_rate": 3.5342898853430836e-05, "logits/chosen": -7.540192127227783, "logits/rejected": -7.541743755340576, "logps/chosen": -17.532081604003906, "logps/rejected": -98.4784927368164, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": 1.390265703201294, "rewards/margins": 8.037988662719727, "rewards/rejected": -6.6477227210998535, "step": 132 }, { "epoch": 0.09092462826867202, "grad_norm": 0.11638744920492172, "learning_rate": 3.539752734945143e-05, "logits/chosen": -8.162822723388672, "logits/rejected": -8.164789199829102, "logps/chosen": -19.07799530029297, "logps/rejected": -96.11477661132812, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 1.2257963418960571, "rewards/margins": 7.651865482330322, "rewards/rejected": -6.4260687828063965, "step": 133 }, { "epoch": 0.09160827209024099, "grad_norm": 0.1322089433670044, "learning_rate": 3.5451746639413466e-05, "logits/chosen": -7.506465435028076, "logits/rejected": -7.507235527038574, "logps/chosen": -17.30217933654785, "logps/rejected": -102.72630310058594, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": 1.4213387966156006, "rewards/margins": 8.464402198791504, "rewards/rejected": -7.043063640594482, "step": 134 }, { "epoch": 0.09229191591180995, "grad_norm": 0.250150591135025, "learning_rate": 3.550556280825011e-05, "logits/chosen": -7.750781536102295, "logits/rejected": -7.75086784362793, "logps/chosen": -18.838207244873047, "logps/rejected": -97.46882629394531, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 1.3429428339004517, "rewards/margins": 7.8138427734375, "rewards/rejected": -6.470900535583496, "step": 135 }, { "epoch": 0.09297555973337891, "grad_norm": 0.1744592934846878, "learning_rate": 3.55589818061703e-05, "logits/chosen": -7.778878211975098, "logits/rejected": -7.779552459716797, "logps/chosen": -17.146638870239258, "logps/rejected": -102.00773620605469, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 1.5183310508728027, "rewards/margins": 8.463294982910156, "rewards/rejected": -6.9449639320373535, "step": 136 }, { "epoch": 0.09365920355494788, "grad_norm": 0.11025436222553253, "learning_rate": 3.561200945260678e-05, "logits/chosen": -7.66995906829834, "logits/rejected": -7.670340538024902, "logps/chosen": -18.37664031982422, "logps/rejected": -98.5259780883789, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 1.312633991241455, "rewards/margins": 7.959234714508057, "rewards/rejected": -6.646600723266602, "step": 137 }, { "epoch": 0.09434284737651684, "grad_norm": 0.19019198417663574, "learning_rate": 3.5664651440020616e-05, "logits/chosen": -7.378687858581543, "logits/rejected": -7.378683090209961, "logps/chosen": -20.617521286010742, "logps/rejected": -93.98561096191406, "loss": 0.0574, "rewards/accuracies": 0.875, "rewards/chosen": 1.2167807817459106, "rewards/margins": 7.349053859710693, "rewards/rejected": -6.1322736740112305, "step": 138 }, { "epoch": 0.0950264911980858, "grad_norm": 0.17032332718372345, "learning_rate": 3.571691333756825e-05, "logits/chosen": -7.3158345222473145, "logits/rejected": -7.315281867980957, "logps/chosen": -20.878231048583984, "logps/rejected": -93.40975952148438, "loss": 0.0824, "rewards/accuracies": 0.875, "rewards/chosen": 1.1321200132369995, "rewards/margins": 7.288637161254883, "rewards/rejected": -6.156517028808594, "step": 139 }, { "epoch": 0.09571013501965477, "grad_norm": 0.14482393860816956, "learning_rate": 3.5768800594637304e-05, "logits/chosen": -7.678028106689453, "logits/rejected": -7.67942476272583, "logps/chosen": -18.689659118652344, "logps/rejected": -99.18344116210938, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 1.227282166481018, "rewards/margins": 7.923852920532227, "rewards/rejected": -6.696571350097656, "step": 140 }, { "epoch": 0.09639377884122373, "grad_norm": 0.2931736409664154, "learning_rate": 3.582031854425634e-05, "logits/chosen": -7.7913055419921875, "logits/rejected": -7.791632652282715, "logps/chosen": -20.5013427734375, "logps/rejected": -94.43547821044922, "loss": 0.071, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1698055267333984, "rewards/margins": 7.352179527282715, "rewards/rejected": -6.182374000549316, "step": 141 }, { "epoch": 0.09707742266279268, "grad_norm": 0.20394925773143768, "learning_rate": 3.587147240638428e-05, "logits/chosen": -7.594935894012451, "logits/rejected": -7.596588134765625, "logps/chosen": -17.43857192993164, "logps/rejected": -103.00689697265625, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 1.4111446142196655, "rewards/margins": 8.468502044677734, "rewards/rejected": -7.057357311248779, "step": 142 }, { "epoch": 0.09776106648436164, "grad_norm": 0.30563512444496155, "learning_rate": 3.5922267291084366e-05, "logits/chosen": -8.063751220703125, "logits/rejected": -8.064957618713379, "logps/chosen": -23.479555130004883, "logps/rejected": -84.12142944335938, "loss": 0.055, "rewards/accuracies": 0.9375, "rewards/chosen": 0.785645604133606, "rewards/margins": 6.064492702484131, "rewards/rejected": -5.278846740722656, "step": 143 }, { "epoch": 0.0984447103059306, "grad_norm": 0.16285376250743866, "learning_rate": 3.5972708201587496e-05, "logits/chosen": -7.488504886627197, "logits/rejected": -7.488586902618408, "logps/chosen": -18.626934051513672, "logps/rejected": -97.39193725585938, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 1.3367723226547241, "rewards/margins": 7.839698314666748, "rewards/rejected": -6.502925872802734, "step": 144 }, { "epoch": 0.09912835412749957, "grad_norm": 0.14638574421405792, "learning_rate": 3.6022800037249585e-05, "logits/chosen": -7.697809219360352, "logits/rejected": -7.697718620300293, "logps/chosen": -21.11046600341797, "logps/rejected": -94.22920227050781, "loss": 0.0413, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1546366214752197, "rewards/margins": 7.325389385223389, "rewards/rejected": -6.17075252532959, "step": 145 }, { "epoch": 0.09981199794906853, "grad_norm": 0.24265240132808685, "learning_rate": 3.607254759640729e-05, "logits/chosen": -7.784355163574219, "logits/rejected": -7.7849321365356445, "logps/chosen": -21.9410343170166, "logps/rejected": -88.00791931152344, "loss": 0.0475, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0140339136123657, "rewards/margins": 6.551701545715332, "rewards/rejected": -5.537667274475098, "step": 146 }, { "epoch": 0.1004956417706375, "grad_norm": 0.2780419588088989, "learning_rate": 3.612195557913627e-05, "logits/chosen": -7.697455406188965, "logits/rejected": -7.697352409362793, "logps/chosen": -19.691911697387695, "logps/rejected": -93.80331420898438, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 1.2452080249786377, "rewards/margins": 7.372466564178467, "rewards/rejected": -6.12725830078125, "step": 147 }, { "epoch": 0.10117928559220646, "grad_norm": 0.27359965443611145, "learning_rate": 3.6171028589915954e-05, "logits/chosen": -8.16602897644043, "logits/rejected": -8.165872573852539, "logps/chosen": -22.096904754638672, "logps/rejected": -87.45498657226562, "loss": 0.046, "rewards/accuracies": 0.9375, "rewards/chosen": 1.028223991394043, "rewards/margins": 6.476921081542969, "rewards/rejected": -5.448697090148926, "step": 148 }, { "epoch": 0.10186292941377542, "grad_norm": 0.30895331501960754, "learning_rate": 3.6219771140204575e-05, "logits/chosen": -7.5725626945495605, "logits/rejected": -7.573117256164551, "logps/chosen": -20.80474853515625, "logps/rejected": -96.0263671875, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 1.141845941543579, "rewards/margins": 7.436041355133057, "rewards/rejected": -6.294195175170898, "step": 149 }, { "epoch": 0.10254657323534438, "grad_norm": 0.3591630458831787, "learning_rate": 3.626818765092802e-05, "logits/chosen": -7.885051250457764, "logits/rejected": -7.884799003601074, "logps/chosen": -20.16381072998047, "logps/rejected": -94.33290100097656, "loss": 0.0342, "rewards/accuracies": 0.9375, "rewards/chosen": 1.206000804901123, "rewards/margins": 7.393273830413818, "rewards/rejected": -6.187272548675537, "step": 150 }, { "epoch": 0.10323021705691335, "grad_norm": 0.27674874663352966, "learning_rate": 3.6316282454886157e-05, "logits/chosen": -7.0784010887146, "logits/rejected": -7.078832626342773, "logps/chosen": -20.406280517578125, "logps/rejected": -95.46565246582031, "loss": 0.051, "rewards/accuracies": 0.9375, "rewards/chosen": 1.187001347541809, "rewards/margins": 7.440572738647461, "rewards/rejected": -6.2535719871521, "step": 151 }, { "epoch": 0.10391386087848231, "grad_norm": 0.26770979166030884, "learning_rate": 3.636405979907955e-05, "logits/chosen": -7.879206657409668, "logits/rejected": -7.87844705581665, "logps/chosen": -15.92536735534668, "logps/rejected": -100.56085968017578, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 1.5342926979064941, "rewards/margins": 8.494837760925293, "rewards/rejected": -6.960544586181641, "step": 152 }, { "epoch": 0.10459750470005127, "grad_norm": 0.44545653462409973, "learning_rate": 3.6411523846959985e-05, "logits/chosen": -7.965452671051025, "logits/rejected": -7.96440315246582, "logps/chosen": -18.854663848876953, "logps/rejected": -93.66927337646484, "loss": 0.0638, "rewards/accuracies": 0.9375, "rewards/chosen": 1.336904525756836, "rewards/margins": 7.525118827819824, "rewards/rejected": -6.188214302062988, "step": 153 }, { "epoch": 0.10528114852162024, "grad_norm": 0.33168119192123413, "learning_rate": 3.645867868060772e-05, "logits/chosen": -7.651296615600586, "logits/rejected": -7.651303291320801, "logps/chosen": -19.033597946166992, "logps/rejected": -99.38988494873047, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 1.2813955545425415, "rewards/margins": 7.988890647888184, "rewards/rejected": -6.707495212554932, "step": 154 }, { "epoch": 0.1059647923431892, "grad_norm": 0.4232638478279114, "learning_rate": 3.6505528302838193e-05, "logits/chosen": -7.240283966064453, "logits/rejected": -7.239809513092041, "logps/chosen": -21.992456436157227, "logps/rejected": -91.47148132324219, "loss": 0.0586, "rewards/accuracies": 0.875, "rewards/chosen": 0.9570145606994629, "rewards/margins": 6.889449119567871, "rewards/rejected": -5.932435035705566, "step": 155 }, { "epoch": 0.10664843616475816, "grad_norm": 0.5165214538574219, "learning_rate": 3.6552076639241027e-05, "logits/chosen": -7.81994104385376, "logits/rejected": -7.82083797454834, "logps/chosen": -24.452449798583984, "logps/rejected": -90.20696258544922, "loss": 0.0533, "rewards/accuracies": 0.875, "rewards/chosen": 0.8499817252159119, "rewards/margins": 6.5041985511779785, "rewards/rejected": -5.654216289520264, "step": 156 }, { "epoch": 0.10733207998632713, "grad_norm": 0.7076581716537476, "learning_rate": 3.65983275401539e-05, "logits/chosen": -8.065098762512207, "logits/rejected": -8.066797256469727, "logps/chosen": -18.442283630371094, "logps/rejected": -101.3883056640625, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": 1.2687057256698608, "rewards/margins": 8.170927047729492, "rewards/rejected": -6.9022216796875, "step": 157 }, { "epoch": 0.10801572380789609, "grad_norm": 0.29401659965515137, "learning_rate": 3.664428478257371e-05, "logits/chosen": -8.066473960876465, "logits/rejected": -8.065850257873535, "logps/chosen": -16.36079216003418, "logps/rejected": -102.85159301757812, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 1.5252478122711182, "rewards/margins": 8.640026092529297, "rewards/rejected": -7.114778518676758, "step": 158 }, { "epoch": 0.10869936762946505, "grad_norm": 0.5832663774490356, "learning_rate": 3.668995207200753e-05, "logits/chosen": -7.812388896942139, "logits/rejected": -7.812767505645752, "logps/chosen": -16.74033546447754, "logps/rejected": -102.89926147460938, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 1.5028982162475586, "rewards/margins": 8.569622039794922, "rewards/rejected": -7.0667243003845215, "step": 159 }, { "epoch": 0.10938301145103402, "grad_norm": 0.5297597646713257, "learning_rate": 3.673533304426541e-05, "logits/chosen": -8.45863151550293, "logits/rejected": -8.460529327392578, "logps/chosen": -17.308664321899414, "logps/rejected": -105.03002166748047, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 1.450836181640625, "rewards/margins": 8.663894653320312, "rewards/rejected": -7.2130584716796875, "step": 160 }, { "epoch": 0.11006665527260297, "grad_norm": 0.3816565275192261, "learning_rate": 3.67804312671975e-05, "logits/chosen": -7.8850579261779785, "logits/rejected": -7.884626865386963, "logps/chosen": -20.961528778076172, "logps/rejected": -95.75830841064453, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 1.1899170875549316, "rewards/margins": 7.40888786315918, "rewards/rejected": -6.218971252441406, "step": 161 }, { "epoch": 0.11075029909417193, "grad_norm": 0.5430158972740173, "learning_rate": 3.682525024237719e-05, "logits/chosen": -7.64088249206543, "logits/rejected": -7.641401290893555, "logps/chosen": -20.025287628173828, "logps/rejected": -98.82681274414062, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 1.1818149089813232, "rewards/margins": 7.814040184020996, "rewards/rejected": -6.632225513458252, "step": 162 }, { "epoch": 0.1114339429157409, "grad_norm": 0.48071861267089844, "learning_rate": 3.6869793406732636e-05, "logits/chosen": -7.4610819816589355, "logits/rejected": -7.458802223205566, "logps/chosen": -17.84487533569336, "logps/rejected": -98.92069244384766, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 1.4160375595092773, "rewards/margins": 8.154064178466797, "rewards/rejected": -6.738027572631836, "step": 163 }, { "epoch": 0.11211758673730986, "grad_norm": 0.8646880984306335, "learning_rate": 3.69140641341283e-05, "logits/chosen": -8.2379150390625, "logits/rejected": -8.237663269042969, "logps/chosen": -16.636028289794922, "logps/rejected": -103.89271545410156, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 1.5125583410263062, "rewards/margins": 8.63797378540039, "rewards/rejected": -7.125414848327637, "step": 164 }, { "epoch": 0.11280123055887882, "grad_norm": 0.20733320713043213, "learning_rate": 3.695806573689844e-05, "logits/chosen": -8.326507568359375, "logits/rejected": -8.327356338500977, "logps/chosen": -20.02138900756836, "logps/rejected": -97.39637756347656, "loss": 0.017, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1690669059753418, "rewards/margins": 7.647305011749268, "rewards/rejected": -6.478238105773926, "step": 165 }, { "epoch": 0.11348487438044778, "grad_norm": 0.41143208742141724, "learning_rate": 3.700180146733426e-05, "logits/chosen": -7.953182220458984, "logits/rejected": -7.949914932250977, "logps/chosen": -17.868085861206055, "logps/rejected": -98.61404418945312, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 1.4629055261611938, "rewards/margins": 8.070277214050293, "rewards/rejected": -6.6073713302612305, "step": 166 }, { "epoch": 0.11416851820201675, "grad_norm": 0.5451831221580505, "learning_rate": 3.704527451912639e-05, "logits/chosen": -7.821101188659668, "logits/rejected": -7.81982421875, "logps/chosen": -17.389408111572266, "logps/rejected": -104.10852813720703, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 1.4619227647781372, "rewards/margins": 8.58312702178955, "rewards/rejected": -7.121204376220703, "step": 167 }, { "epoch": 0.11485216202358571, "grad_norm": 0.12763865292072296, "learning_rate": 3.708848802876438e-05, "logits/chosen": -8.490657806396484, "logits/rejected": -8.486953735351562, "logps/chosen": -17.16522789001465, "logps/rejected": -102.01580047607422, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 1.421800971031189, "rewards/margins": 8.451091766357422, "rewards/rejected": -7.029290199279785, "step": 168 }, { "epoch": 0.11553580584515467, "grad_norm": 0.5917484164237976, "learning_rate": 3.7131445076894564e-05, "logits/chosen": -8.748453140258789, "logits/rejected": -8.747041702270508, "logps/chosen": -20.921695709228516, "logps/rejected": -99.76713562011719, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 1.0824133157730103, "rewards/margins": 7.870259761810303, "rewards/rejected": -6.787846565246582, "step": 169 }, { "epoch": 0.11621944966672364, "grad_norm": 0.5661998987197876, "learning_rate": 3.717414868963791e-05, "logits/chosen": -7.718993186950684, "logits/rejected": -7.716735363006592, "logps/chosen": -19.703975677490234, "logps/rejected": -103.44842529296875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 1.2001471519470215, "rewards/margins": 8.323488235473633, "rewards/rejected": -7.123341083526611, "step": 170 }, { "epoch": 0.1169030934882926, "grad_norm": 0.9373929500579834, "learning_rate": 3.721660183986924e-05, "logits/chosen": -8.67609977722168, "logits/rejected": -8.672961235046387, "logps/chosen": -19.657581329345703, "logps/rejected": -100.23347473144531, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 1.2649264335632324, "rewards/margins": 8.053326606750488, "rewards/rejected": -6.788400650024414, "step": 171 }, { "epoch": 0.11758673730986156, "grad_norm": 0.5518049597740173, "learning_rate": 3.725880744845915e-05, "logits/chosen": -8.385232925415039, "logits/rejected": -8.381747245788574, "logps/chosen": -19.375560760498047, "logps/rejected": -98.32029724121094, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.3120272159576416, "rewards/margins": 7.921546936035156, "rewards/rejected": -6.609519958496094, "step": 172 }, { "epoch": 0.11827038113143053, "grad_norm": 0.22000877559185028, "learning_rate": 3.730076838547993e-05, "logits/chosen": -7.889025688171387, "logits/rejected": -7.888116836547852, "logps/chosen": -19.39897346496582, "logps/rejected": -101.45340728759766, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 1.2079771757125854, "rewards/margins": 8.12338638305664, "rewards/rejected": -6.915409088134766, "step": 173 }, { "epoch": 0.11895402495299949, "grad_norm": 0.3859174847602844, "learning_rate": 3.734248747137666e-05, "logits/chosen": -8.359643936157227, "logits/rejected": -8.357237815856934, "logps/chosen": -17.061134338378906, "logps/rejected": -99.92786407470703, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 1.5039126873016357, "rewards/margins": 8.271928787231445, "rewards/rejected": -6.768016338348389, "step": 174 }, { "epoch": 0.11963766877456845, "grad_norm": 0.5424525737762451, "learning_rate": 3.738396747810492e-05, "logits/chosen": -8.363338470458984, "logits/rejected": -8.361252784729004, "logps/chosen": -17.28545379638672, "logps/rejected": -97.44685363769531, "loss": 0.0154, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4006482362747192, "rewards/margins": 8.041190147399902, "rewards/rejected": -6.640542030334473, "step": 175 }, { "epoch": 0.12032131259613742, "grad_norm": 0.03304045647382736, "learning_rate": 3.7425211130235834e-05, "logits/chosen": -8.225688934326172, "logits/rejected": -8.223316192626953, "logps/chosen": -16.089670181274414, "logps/rejected": -101.75882720947266, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.5851690769195557, "rewards/margins": 8.51107120513916, "rewards/rejected": -6.925901412963867, "step": 176 }, { "epoch": 0.12100495641770638, "grad_norm": 1.7313191890716553, "learning_rate": 3.7466221106030115e-05, "logits/chosen": -8.463229179382324, "logits/rejected": -8.460272789001465, "logps/chosen": -19.636816024780273, "logps/rejected": -93.89808654785156, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 1.3034287691116333, "rewards/margins": 7.466072082519531, "rewards/rejected": -6.1626434326171875, "step": 177 }, { "epoch": 0.12168860023927534, "grad_norm": 0.8916001319885254, "learning_rate": 3.750700003848157e-05, "logits/chosen": -7.753788471221924, "logits/rejected": -7.750584125518799, "logps/chosen": -13.993510246276855, "logps/rejected": -101.43766784667969, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 1.7478210926055908, "rewards/margins": 8.73172378540039, "rewards/rejected": -6.983902454376221, "step": 178 }, { "epoch": 0.1223722440608443, "grad_norm": 0.04780713468790054, "learning_rate": 3.7547550516331555e-05, "logits/chosen": -8.450684547424316, "logits/rejected": -8.448432922363281, "logps/chosen": -16.98444366455078, "logps/rejected": -98.80672454833984, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.4703868627548218, "rewards/margins": 8.165711402893066, "rewards/rejected": -6.695323944091797, "step": 179 }, { "epoch": 0.12305588788241326, "grad_norm": 0.14325806498527527, "learning_rate": 3.75878750850551e-05, "logits/chosen": -8.262131690979004, "logits/rejected": -8.257766723632812, "logps/chosen": -18.086978912353516, "logps/rejected": -99.60597229003906, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.482719898223877, "rewards/margins": 8.118690490722656, "rewards/rejected": -6.635970115661621, "step": 180 }, { "epoch": 0.12373953170398222, "grad_norm": 0.07480663061141968, "learning_rate": 3.7627976247819744e-05, "logits/chosen": -7.811366081237793, "logits/rejected": -7.808693885803223, "logps/chosen": -17.63628387451172, "logps/rejected": -96.19336700439453, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.5149433612823486, "rewards/margins": 7.8282012939453125, "rewards/rejected": -6.313257694244385, "step": 181 }, { "epoch": 0.12442317552555118, "grad_norm": 0.03415043279528618, "learning_rate": 3.766785646641792e-05, "logits/chosen": -7.995752334594727, "logits/rejected": -7.993330478668213, "logps/chosen": -17.392295837402344, "logps/rejected": -102.66300964355469, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.499037742614746, "rewards/margins": 8.492423057556152, "rewards/rejected": -6.993385314941406, "step": 182 }, { "epoch": 0.12510681934712015, "grad_norm": 0.5668272376060486, "learning_rate": 3.770751816217383e-05, "logits/chosen": -7.9024505615234375, "logits/rejected": -7.899229049682617, "logps/chosen": -20.076175689697266, "logps/rejected": -94.67852020263672, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 1.2374000549316406, "rewards/margins": 7.441464424133301, "rewards/rejected": -6.204064846038818, "step": 183 }, { "epoch": 0.1257904631686891, "grad_norm": 0.13100677728652954, "learning_rate": 3.7746963716825615e-05, "logits/chosen": -8.027771949768066, "logits/rejected": -8.024057388305664, "logps/chosen": -17.10294532775879, "logps/rejected": -98.41409301757812, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.473846435546875, "rewards/margins": 8.118437767028809, "rewards/rejected": -6.644590377807617, "step": 184 }, { "epoch": 0.12647410699025807, "grad_norm": 0.04159475490450859, "learning_rate": 3.778619547338356e-05, "logits/chosen": -8.272957801818848, "logits/rejected": -8.26996898651123, "logps/chosen": -16.866832733154297, "logps/rejected": -101.72177124023438, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.5579899549484253, "rewards/margins": 8.453716278076172, "rewards/rejected": -6.895726203918457, "step": 185 }, { "epoch": 0.12715775081182704, "grad_norm": 0.06655889004468918, "learning_rate": 3.782521573696528e-05, "logits/chosen": -8.077256202697754, "logits/rejected": -8.072854995727539, "logps/chosen": -16.95366668701172, "logps/rejected": -101.21292114257812, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.523773193359375, "rewards/margins": 8.40956974029541, "rewards/rejected": -6.885796546936035, "step": 186 }, { "epoch": 0.127841394633396, "grad_norm": 0.13407842814922333, "learning_rate": 3.786402677560832e-05, "logits/chosen": -8.47122859954834, "logits/rejected": -8.467863082885742, "logps/chosen": -14.448579788208008, "logps/rejected": -103.32573699951172, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 1.7809392213821411, "rewards/margins": 8.873130798339844, "rewards/rejected": -7.092191696166992, "step": 187 }, { "epoch": 0.12852503845496496, "grad_norm": 0.05499279871582985, "learning_rate": 3.790263082106134e-05, "logits/chosen": -8.469802856445312, "logits/rejected": -8.467157363891602, "logps/chosen": -14.456302642822266, "logps/rejected": -106.60621643066406, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.670943021774292, "rewards/margins": 9.149016380310059, "rewards/rejected": -7.478074073791504, "step": 188 }, { "epoch": 0.12920868227653393, "grad_norm": 0.04532068222761154, "learning_rate": 3.794103006955407e-05, "logits/chosen": -8.465027809143066, "logits/rejected": -8.460269927978516, "logps/chosen": -11.203868865966797, "logps/rejected": -105.0894775390625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.079151153564453, "rewards/margins": 9.403820991516113, "rewards/rejected": -7.324670314788818, "step": 189 }, { "epoch": 0.1298923260981029, "grad_norm": 0.31707510352134705, "learning_rate": 3.797922668254715e-05, "logits/chosen": -8.34368896484375, "logits/rejected": -8.340235710144043, "logps/chosen": -20.915908813476562, "logps/rejected": -92.619873046875, "loss": 0.0201, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1484891176223755, "rewards/margins": 7.130973815917969, "rewards/rejected": -5.982484817504883, "step": 190 }, { "epoch": 0.13057596991967185, "grad_norm": 0.03287724032998085, "learning_rate": 3.801722278746213e-05, "logits/chosen": -8.373847007751465, "logits/rejected": -8.369441986083984, "logps/chosen": -14.578975677490234, "logps/rejected": -101.68533325195312, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.7981818914413452, "rewards/margins": 8.716939926147461, "rewards/rejected": -6.918757915496826, "step": 191 }, { "epoch": 0.13125961374124082, "grad_norm": 0.05886462330818176, "learning_rate": 3.8055020478392495e-05, "logits/chosen": -8.64394760131836, "logits/rejected": -8.641085624694824, "logps/chosen": -12.684776306152344, "logps/rejected": -106.15833282470703, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.8510715961456299, "rewards/margins": 9.277555465698242, "rewards/rejected": -7.426484107971191, "step": 192 }, { "epoch": 0.13194325756280978, "grad_norm": 0.1320420801639557, "learning_rate": 3.809262181679623e-05, "logits/chosen": -8.223373413085938, "logits/rejected": -8.2202730178833, "logps/chosen": -11.263542175292969, "logps/rejected": -103.8349609375, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 1.9400476217269897, "rewards/margins": 9.166744232177734, "rewards/rejected": -7.226696968078613, "step": 193 }, { "epoch": 0.13262690138437874, "grad_norm": 0.04411447048187256, "learning_rate": 3.813002883217044e-05, "logits/chosen": -8.446036338806152, "logits/rejected": -8.442571640014648, "logps/chosen": -15.496932983398438, "logps/rejected": -100.54895782470703, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.715937614440918, "rewards/margins": 8.45447063446045, "rewards/rejected": -6.738533020019531, "step": 194 }, { "epoch": 0.1333105452059477, "grad_norm": 0.1057153195142746, "learning_rate": 3.816724352270863e-05, "logits/chosen": -8.709799766540527, "logits/rejected": -8.704505920410156, "logps/chosen": -10.11043643951416, "logps/rejected": -102.29596710205078, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 2.22012996673584, "rewards/margins": 9.25549030303955, "rewards/rejected": -7.035360336303711, "step": 195 }, { "epoch": 0.13399418902751667, "grad_norm": 0.03638846427202225, "learning_rate": 3.8204267855941266e-05, "logits/chosen": -8.291994094848633, "logits/rejected": -8.289484024047852, "logps/chosen": -11.70936107635498, "logps/rejected": -104.7098388671875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.9255836009979248, "rewards/margins": 9.197661399841309, "rewards/rejected": -7.272077560424805, "step": 196 }, { "epoch": 0.13467783284908563, "grad_norm": 0.713524580001831, "learning_rate": 3.824110376935989e-05, "logits/chosen": -8.156685829162598, "logits/rejected": -8.154230117797852, "logps/chosen": -12.385478973388672, "logps/rejected": -103.24797058105469, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 2.0237860679626465, "rewards/margins": 9.006911277770996, "rewards/rejected": -6.983124732971191, "step": 197 }, { "epoch": 0.1353614766706546, "grad_norm": 0.08417079597711563, "learning_rate": 3.827775317102552e-05, "logits/chosen": -8.275992393493652, "logits/rejected": -8.273635864257812, "logps/chosen": -11.70821762084961, "logps/rejected": -103.21771240234375, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 1.8978358507156372, "rewards/margins": 8.999814987182617, "rewards/rejected": -7.1019792556762695, "step": 198 }, { "epoch": 0.13604512049222356, "grad_norm": 0.40099260210990906, "learning_rate": 3.831421794016178e-05, "logits/chosen": -8.486023902893066, "logits/rejected": -8.482569694519043, "logps/chosen": -9.836915016174316, "logps/rejected": -101.75154876708984, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 2.1302390098571777, "rewards/margins": 9.154228210449219, "rewards/rejected": -7.023989200592041, "step": 199 }, { "epoch": 0.13672876431379252, "grad_norm": 0.05916576087474823, "learning_rate": 3.835049992773302e-05, "logits/chosen": -8.584416389465332, "logits/rejected": -8.580307006835938, "logps/chosen": -8.2374906539917, "logps/rejected": -101.10768127441406, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 2.4480528831481934, "rewards/margins": 9.305997848510742, "rewards/rejected": -6.857944965362549, "step": 200 }, { "epoch": 0.13741240813536149, "grad_norm": 0.04555201157927513, "learning_rate": 3.838660095700815e-05, "logits/chosen": -8.217769622802734, "logits/rejected": -8.213760375976562, "logps/chosen": -7.83418607711792, "logps/rejected": -103.20330810546875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 2.4305121898651123, "rewards/margins": 9.490331649780273, "rewards/rejected": -7.05981969833374, "step": 201 }, { "epoch": 0.13809605195693045, "grad_norm": 0.13070212304592133, "learning_rate": 3.84225228241104e-05, "logits/chosen": -8.474465370178223, "logits/rejected": -8.47018814086914, "logps/chosen": -8.596515655517578, "logps/rejected": -100.32512664794922, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": 2.2865867614746094, "rewards/margins": 9.18923282623291, "rewards/rejected": -6.902646064758301, "step": 202 }, { "epoch": 0.1387796957784994, "grad_norm": 0.020547082647681236, "learning_rate": 3.8458267298553554e-05, "logits/chosen": -7.878048896789551, "logits/rejected": -7.874508857727051, "logps/chosen": -6.0634589195251465, "logps/rejected": -105.30085754394531, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.5648725032806396, "rewards/margins": 9.846035957336426, "rewards/rejected": -7.281163215637207, "step": 203 }, { "epoch": 0.13946333960006838, "grad_norm": 0.17609404027462006, "learning_rate": 3.8493836123764984e-05, "logits/chosen": -8.118544578552246, "logits/rejected": -8.114500999450684, "logps/chosen": -8.899144172668457, "logps/rejected": -99.8304443359375, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 2.472893238067627, "rewards/margins": 9.110271453857422, "rewards/rejected": -6.637377738952637, "step": 204 }, { "epoch": 0.14014698342163734, "grad_norm": 0.07016999274492264, "learning_rate": 3.852923101759591e-05, "logits/chosen": -8.526727676391602, "logits/rejected": -8.524152755737305, "logps/chosen": -7.122186660766602, "logps/rejected": -106.50407409667969, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.472133159637451, "rewards/margins": 9.874606132507324, "rewards/rejected": -7.402473449707031, "step": 205 }, { "epoch": 0.1408306272432063, "grad_norm": 0.034353841096162796, "learning_rate": 3.856445367281923e-05, "logits/chosen": -8.38800048828125, "logits/rejected": -8.386088371276855, "logps/chosen": -12.259008407592773, "logps/rejected": -100.57447814941406, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 1.9847642183303833, "rewards/margins": 8.768875122070312, "rewards/rejected": -6.784111022949219, "step": 206 }, { "epoch": 0.14151427106477527, "grad_norm": 0.055477894842624664, "learning_rate": 3.859950575761529e-05, "logits/chosen": -7.9228434562683105, "logits/rejected": -7.919261932373047, "logps/chosen": -11.90867805480957, "logps/rejected": -101.13788604736328, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.946126937866211, "rewards/margins": 8.841601371765137, "rewards/rejected": -6.895474433898926, "step": 207 }, { "epoch": 0.1421979148863442, "grad_norm": 0.06514305621385574, "learning_rate": 3.8634388916046025e-05, "logits/chosen": -8.198057174682617, "logits/rejected": -8.19321060180664, "logps/chosen": -4.697771072387695, "logps/rejected": -104.84072875976562, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 2.768718719482422, "rewards/margins": 9.994787216186523, "rewards/rejected": -7.226068496704102, "step": 208 }, { "epoch": 0.14288155870791316, "grad_norm": 0.0668146163225174, "learning_rate": 3.866910476851757e-05, "logits/chosen": -8.353008270263672, "logits/rejected": -8.350777626037598, "logps/chosen": -7.890315055847168, "logps/rejected": -104.07974243164062, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.41140079498291, "rewards/margins": 9.587574005126953, "rewards/rejected": -7.176174163818359, "step": 209 }, { "epoch": 0.14356520252948213, "grad_norm": 0.1168762817978859, "learning_rate": 3.870365491223199e-05, "logits/chosen": -8.43730640411377, "logits/rejected": -8.434577941894531, "logps/chosen": -12.974372863769531, "logps/rejected": -97.06291198730469, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 1.9716050624847412, "rewards/margins": 8.403085708618164, "rewards/rejected": -6.431480407714844, "step": 210 }, { "epoch": 0.1442488463510511, "grad_norm": 0.8783552050590515, "learning_rate": 3.8738040921628215e-05, "logits/chosen": -8.58105754852295, "logits/rejected": -8.576727867126465, "logps/chosen": -7.948564529418945, "logps/rejected": -99.81754302978516, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 2.4459357261657715, "rewards/margins": 9.19393539428711, "rewards/rejected": -6.748000144958496, "step": 211 }, { "epoch": 0.14493249017262005, "grad_norm": 0.029262151569128036, "learning_rate": 3.877226434881253e-05, "logits/chosen": -8.532116889953613, "logits/rejected": -8.530303955078125, "logps/chosen": -15.963510513305664, "logps/rejected": -97.82894897460938, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.5905934572219849, "rewards/margins": 8.12971019744873, "rewards/rejected": -6.539116859436035, "step": 212 }, { "epoch": 0.14561613399418902, "grad_norm": 0.014391157776117325, "learning_rate": 3.880632672397897e-05, "logits/chosen": -8.495813369750977, "logits/rejected": -8.492183685302734, "logps/chosen": -6.07093620300293, "logps/rejected": -105.68872833251953, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.6261110305786133, "rewards/margins": 9.885531425476074, "rewards/rejected": -7.259420871734619, "step": 213 }, { "epoch": 0.14629977781575798, "grad_norm": 0.04713955521583557, "learning_rate": 3.884022955581985e-05, "logits/chosen": -8.221561431884766, "logits/rejected": -8.21699047088623, "logps/chosen": -5.479066848754883, "logps/rejected": -102.21693420410156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.675259590148926, "rewards/margins": 9.69130802154541, "rewards/rejected": -7.016048431396484, "step": 214 }, { "epoch": 0.14698342163732694, "grad_norm": 0.024686770513653755, "learning_rate": 3.887397433192676e-05, "logits/chosen": -8.252704620361328, "logits/rejected": -8.24954605102539, "logps/chosen": -10.352910995483398, "logps/rejected": -102.15940856933594, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.9884012937545776, "rewards/margins": 9.128252029418945, "rewards/rejected": -7.139850616455078, "step": 215 }, { "epoch": 0.1476670654588959, "grad_norm": 0.050120823085308075, "learning_rate": 3.890756251918219e-05, "logits/chosen": -7.982952117919922, "logits/rejected": -7.978579521179199, "logps/chosen": -8.58204460144043, "logps/rejected": -99.87626647949219, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.453202247619629, "rewards/margins": 9.15585994720459, "rewards/rejected": -6.702658176422119, "step": 216 }, { "epoch": 0.14835070928046487, "grad_norm": 0.06030161306262016, "learning_rate": 3.894099556414216e-05, "logits/chosen": -8.383129119873047, "logits/rejected": -8.37984848022461, "logps/chosen": -9.718710899353027, "logps/rejected": -97.4630355834961, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 2.236280679702759, "rewards/margins": 8.830961227416992, "rewards/rejected": -6.5946807861328125, "step": 217 }, { "epoch": 0.14903435310203383, "grad_norm": 0.01801532879471779, "learning_rate": 3.897427489341009e-05, "logits/chosen": -8.785791397094727, "logits/rejected": -8.781442642211914, "logps/chosen": -11.8729887008667, "logps/rejected": -97.82402038574219, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.104036331176758, "rewards/margins": 8.606888771057129, "rewards/rejected": -6.502852439880371, "step": 218 }, { "epoch": 0.1497179969236028, "grad_norm": 0.10364816337823868, "learning_rate": 3.900740191400198e-05, "logits/chosen": -7.541139602661133, "logits/rejected": -7.538855075836182, "logps/chosen": -8.860525131225586, "logps/rejected": -106.07164001464844, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.2772674560546875, "rewards/margins": 9.635967254638672, "rewards/rejected": -7.358699798583984, "step": 219 }, { "epoch": 0.15040164074517176, "grad_norm": 0.4188813865184784, "learning_rate": 3.904037801370344e-05, "logits/chosen": -8.379356384277344, "logits/rejected": -8.375568389892578, "logps/chosen": -8.459717750549316, "logps/rejected": -100.66346740722656, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.2179229259490967, "rewards/margins": 9.176994323730469, "rewards/rejected": -6.959071159362793, "step": 220 }, { "epoch": 0.15108528456674072, "grad_norm": 0.032806459814310074, "learning_rate": 3.9073204561418514e-05, "logits/chosen": -7.9838056564331055, "logits/rejected": -7.981409072875977, "logps/chosen": -7.871054172515869, "logps/rejected": -104.08758544921875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.396609306335449, "rewards/margins": 9.59764575958252, "rewards/rejected": -7.201035976409912, "step": 221 }, { "epoch": 0.1517689283883097, "grad_norm": 0.19950072467327118, "learning_rate": 3.9105882907510644e-05, "logits/chosen": -8.120723724365234, "logits/rejected": -8.117892265319824, "logps/chosen": -7.074044227600098, "logps/rejected": -103.61315155029297, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.479006767272949, "rewards/margins": 9.645233154296875, "rewards/rejected": -7.166225910186768, "step": 222 }, { "epoch": 0.15245257220987865, "grad_norm": 0.03674686327576637, "learning_rate": 3.913841438413601e-05, "logits/chosen": -8.537965774536133, "logits/rejected": -8.536073684692383, "logps/chosen": -9.582359313964844, "logps/rejected": -106.09123229980469, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.127206802368164, "rewards/margins": 9.563614845275879, "rewards/rejected": -7.436408042907715, "step": 223 }, { "epoch": 0.15313621603144761, "grad_norm": 0.07351710647344589, "learning_rate": 3.917080030556938e-05, "logits/chosen": -8.171476364135742, "logits/rejected": -8.165907859802246, "logps/chosen": -3.515740394592285, "logps/rejected": -104.58570861816406, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 2.9159340858459473, "rewards/margins": 10.13833999633789, "rewards/rejected": -7.222405433654785, "step": 224 }, { "epoch": 0.15381985985301658, "grad_norm": 0.022211385890841484, "learning_rate": 3.9203041968522716e-05, "logits/chosen": -8.543351173400879, "logits/rejected": -8.540630340576172, "logps/chosen": -16.233308792114258, "logps/rejected": -100.25698852539062, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.6105947494506836, "rewards/margins": 8.302591323852539, "rewards/rejected": -6.6919965744018555, "step": 225 }, { "epoch": 0.15450350367458554, "grad_norm": 0.01738407276570797, "learning_rate": 3.923514065245669e-05, "logits/chosen": -8.173864364624023, "logits/rejected": -8.17070198059082, "logps/chosen": -5.184375286102295, "logps/rejected": -106.73165130615234, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.601736307144165, "rewards/margins": 10.104515075683594, "rewards/rejected": -7.502779006958008, "step": 226 }, { "epoch": 0.1551871474961545, "grad_norm": 0.02448827400803566, "learning_rate": 3.926709761988538e-05, "logits/chosen": -8.471630096435547, "logits/rejected": -8.469100952148438, "logps/chosen": -12.979081153869629, "logps/rejected": -101.88265991210938, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.9619406461715698, "rewards/margins": 8.83372974395752, "rewards/rejected": -6.871788501739502, "step": 227 }, { "epoch": 0.15587079131772347, "grad_norm": 0.025002509355545044, "learning_rate": 3.929891411667424e-05, "logits/chosen": -8.178688049316406, "logits/rejected": -8.175941467285156, "logps/chosen": -5.3819990158081055, "logps/rejected": -105.35797119140625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.6262099742889404, "rewards/margins": 9.97558307647705, "rewards/rejected": -7.3493733406066895, "step": 228 }, { "epoch": 0.15655443513929243, "grad_norm": 0.024281619116663933, "learning_rate": 3.933059137233147e-05, "logits/chosen": -8.357263565063477, "logits/rejected": -8.352420806884766, "logps/chosen": -7.502842903137207, "logps/rejected": -100.9969482421875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.493363857269287, "rewards/margins": 9.29828929901123, "rewards/rejected": -6.804925441741943, "step": 229 }, { "epoch": 0.1572380789608614, "grad_norm": 0.09216087311506271, "learning_rate": 3.9362130600293214e-05, "logits/chosen": -8.181447982788086, "logits/rejected": -8.178281784057617, "logps/chosen": -6.930170059204102, "logps/rejected": -104.61915588378906, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4413774013519287, "rewards/margins": 9.665424346923828, "rewards/rejected": -7.2240471839904785, "step": 230 }, { "epoch": 0.15792172278243036, "grad_norm": 0.02818569727241993, "learning_rate": 3.9393532998202405e-05, "logits/chosen": -9.020523071289062, "logits/rejected": -9.016397476196289, "logps/chosen": -4.084857940673828, "logps/rejected": -104.82252502441406, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.7633750438690186, "rewards/margins": 10.059284210205078, "rewards/rejected": -7.2959089279174805, "step": 231 }, { "epoch": 0.15860536660399932, "grad_norm": 0.4159286916255951, "learning_rate": 3.942479974818166e-05, "logits/chosen": -7.816359043121338, "logits/rejected": -7.812115669250488, "logps/chosen": -7.282954692840576, "logps/rejected": -101.0228042602539, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 2.6354498863220215, "rewards/margins": 9.440237045288086, "rewards/rejected": -6.804786682128906, "step": 232 }, { "epoch": 0.15928901042556828, "grad_norm": 0.15103107690811157, "learning_rate": 3.945593201710032e-05, "logits/chosen": -8.360222816467285, "logits/rejected": -8.356467247009277, "logps/chosen": -5.6452250480651855, "logps/rejected": -104.01358032226562, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 2.5380759239196777, "rewards/margins": 9.765132904052734, "rewards/rejected": -7.227055549621582, "step": 233 }, { "epoch": 0.15997265424713725, "grad_norm": 0.047917772084474564, "learning_rate": 3.9486930956835724e-05, "logits/chosen": -8.375377655029297, "logits/rejected": -8.371950149536133, "logps/chosen": -10.015028953552246, "logps/rejected": -102.67449951171875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.119246006011963, "rewards/margins": 9.232309341430664, "rewards/rejected": -7.113063812255859, "step": 234 }, { "epoch": 0.1606562980687062, "grad_norm": 0.0444156676530838, "learning_rate": 3.951779770452894e-05, "logits/chosen": -8.369431495666504, "logits/rejected": -8.364815711975098, "logps/chosen": -7.758355140686035, "logps/rejected": -98.44317626953125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.534336566925049, "rewards/margins": 9.105522155761719, "rewards/rejected": -6.57118558883667, "step": 235 }, { "epoch": 0.16133994189027517, "grad_norm": 0.20916448533535004, "learning_rate": 3.954853338283512e-05, "logits/chosen": -8.344764709472656, "logits/rejected": -8.343189239501953, "logps/chosen": -13.698495864868164, "logps/rejected": -99.83573913574219, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.9027576446533203, "rewards/margins": 8.478544235229492, "rewards/rejected": -6.575786590576172, "step": 236 }, { "epoch": 0.16202358571184414, "grad_norm": 0.13743674755096436, "learning_rate": 3.9579139100168404e-05, "logits/chosen": -8.981305122375488, "logits/rejected": -8.976263046264648, "logps/chosen": -6.324738025665283, "logps/rejected": -103.09501647949219, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 2.599616289138794, "rewards/margins": 9.680988311767578, "rewards/rejected": -7.081371307373047, "step": 237 }, { "epoch": 0.1627072295334131, "grad_norm": 0.08360245823860168, "learning_rate": 3.960961595094187e-05, "logits/chosen": -8.525315284729004, "logits/rejected": -8.520843505859375, "logps/chosen": -8.224271774291992, "logps/rejected": -102.49725341796875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3865067958831787, "rewards/margins": 9.36349868774414, "rewards/rejected": -6.976992130279541, "step": 238 }, { "epoch": 0.16339087335498206, "grad_norm": 0.04367003217339516, "learning_rate": 3.96399650158023e-05, "logits/chosen": -8.643949508666992, "logits/rejected": -8.638500213623047, "logps/chosen": -6.464688301086426, "logps/rejected": -102.88957214355469, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 2.439134120941162, "rewards/margins": 9.574888229370117, "rewards/rejected": -7.135754585266113, "step": 239 }, { "epoch": 0.16407451717655103, "grad_norm": 1.2583684921264648, "learning_rate": 3.96701873618601e-05, "logits/chosen": -8.107290267944336, "logits/rejected": -8.103104591369629, "logps/chosen": -6.6438446044921875, "logps/rejected": -104.96497344970703, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 2.563300132751465, "rewards/margins": 9.769927024841309, "rewards/rejected": -7.2066264152526855, "step": 240 }, { "epoch": 0.16475816099812, "grad_norm": 0.04479165002703667, "learning_rate": 3.970028404291448e-05, "logits/chosen": -8.257513046264648, "logits/rejected": -8.2544584274292, "logps/chosen": -5.661798477172852, "logps/rejected": -105.31851196289062, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.5816543102264404, "rewards/margins": 9.88718032836914, "rewards/rejected": -7.305525302886963, "step": 241 }, { "epoch": 0.16544180481968895, "grad_norm": 0.03931509330868721, "learning_rate": 3.9730256099673865e-05, "logits/chosen": -8.302347183227539, "logits/rejected": -8.297323226928711, "logps/chosen": -8.33007526397705, "logps/rejected": -96.14693450927734, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.4316368103027344, "rewards/margins": 8.783146858215332, "rewards/rejected": -6.351510047912598, "step": 242 }, { "epoch": 0.16612544864125792, "grad_norm": 0.08228328824043274, "learning_rate": 3.976010455997187e-05, "logits/chosen": -9.019020080566406, "logits/rejected": -9.014392852783203, "logps/chosen": -7.488541603088379, "logps/rejected": -99.01887512207031, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.5606272220611572, "rewards/margins": 9.262109756469727, "rewards/rejected": -6.701482772827148, "step": 243 }, { "epoch": 0.16680909246282688, "grad_norm": 0.05169620364904404, "learning_rate": 3.978983043897883e-05, "logits/chosen": -8.282466888427734, "logits/rejected": -8.278918266296387, "logps/chosen": -8.882709503173828, "logps/rejected": -102.46783447265625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.278691530227661, "rewards/margins": 9.359025955200195, "rewards/rejected": -7.080333709716797, "step": 244 }, { "epoch": 0.16749273628439584, "grad_norm": 0.12298784404993057, "learning_rate": 3.981943473940888e-05, "logits/chosen": -8.599406242370605, "logits/rejected": -8.596295356750488, "logps/chosen": -4.549351692199707, "logps/rejected": -101.99437713623047, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.777820587158203, "rewards/margins": 9.71375846862793, "rewards/rejected": -6.935937881469727, "step": 245 }, { "epoch": 0.16817638010596478, "grad_norm": 0.17390774190425873, "learning_rate": 3.984891845172299e-05, "logits/chosen": -7.868966579437256, "logits/rejected": -7.866180419921875, "logps/chosen": -5.417235374450684, "logps/rejected": -103.10110473632812, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.65805721282959, "rewards/margins": 9.724164009094238, "rewards/rejected": -7.066106796264648, "step": 246 }, { "epoch": 0.16886002392753374, "grad_norm": 0.0699414387345314, "learning_rate": 3.987828255432777e-05, "logits/chosen": -8.337913513183594, "logits/rejected": -8.334329605102539, "logps/chosen": -7.363188743591309, "logps/rejected": -102.97398376464844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4324111938476562, "rewards/margins": 9.505845069885254, "rewards/rejected": -7.073434829711914, "step": 247 }, { "epoch": 0.1695436677491027, "grad_norm": 0.06334469467401505, "learning_rate": 3.9907528013770276e-05, "logits/chosen": -8.148207664489746, "logits/rejected": -8.14399528503418, "logps/chosen": -9.502059936523438, "logps/rejected": -101.30354309082031, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.3183770179748535, "rewards/margins": 9.172561645507812, "rewards/rejected": -6.854184150695801, "step": 248 }, { "epoch": 0.17022731157067167, "grad_norm": 0.03188035264611244, "learning_rate": 3.993665578492894e-05, "logits/chosen": -8.251775741577148, "logits/rejected": -8.24781608581543, "logps/chosen": -5.739389896392822, "logps/rejected": -104.40091705322266, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.699907064437866, "rewards/margins": 9.870400428771973, "rewards/rejected": -7.170493125915527, "step": 249 }, { "epoch": 0.17091095539224063, "grad_norm": 0.037794724106788635, "learning_rate": 3.9965666811200624e-05, "logits/chosen": -9.004220962524414, "logits/rejected": -9.001818656921387, "logps/chosen": -11.435211181640625, "logps/rejected": -101.93569946289062, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.0413670539855957, "rewards/margins": 8.956275939941406, "rewards/rejected": -6.914908409118652, "step": 250 }, { "epoch": 0.1715945992138096, "grad_norm": 0.07556581497192383, "learning_rate": 3.999456202468397e-05, "logits/chosen": -8.686074256896973, "logits/rejected": -8.67982006072998, "logps/chosen": -2.212963104248047, "logps/rejected": -104.65968322753906, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.9556403160095215, "rewards/margins": 10.300156593322754, "rewards/rejected": -7.344515323638916, "step": 251 }, { "epoch": 0.17227824303537856, "grad_norm": 0.06053835526108742, "learning_rate": 4.002334234635907e-05, "logits/chosen": -7.849226474761963, "logits/rejected": -7.846704483032227, "logps/chosen": -5.7618184089660645, "logps/rejected": -105.13851928710938, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.57078218460083, "rewards/margins": 9.82876205444336, "rewards/rejected": -7.257979869842529, "step": 252 }, { "epoch": 0.17296188685694752, "grad_norm": 0.03628980368375778, "learning_rate": 4.005200868626364e-05, "logits/chosen": -9.012542724609375, "logits/rejected": -9.006736755371094, "logps/chosen": -2.851478099822998, "logps/rejected": -105.56766510009766, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.9292049407958984, "rewards/margins": 10.281583786010742, "rewards/rejected": -7.352378845214844, "step": 253 }, { "epoch": 0.17364553067851649, "grad_norm": 0.019014902412891388, "learning_rate": 4.008056194366564e-05, "logits/chosen": -8.828091621398926, "logits/rejected": -8.824596405029297, "logps/chosen": -5.623663425445557, "logps/rejected": -104.88127136230469, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.6681244373321533, "rewards/margins": 9.911198616027832, "rewards/rejected": -7.243073463439941, "step": 254 }, { "epoch": 0.17432917450008545, "grad_norm": 0.033682871609926224, "learning_rate": 4.010900300723259e-05, "logits/chosen": -8.874814987182617, "logits/rejected": -8.871262550354004, "logps/chosen": -16.0875301361084, "logps/rejected": -100.15528106689453, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.7377955913543701, "rewards/margins": 8.339900970458984, "rewards/rejected": -6.602106094360352, "step": 255 }, { "epoch": 0.1750128183216544, "grad_norm": 0.027486352249979973, "learning_rate": 4.013733275519749e-05, "logits/chosen": -8.249382019042969, "logits/rejected": -8.243680000305176, "logps/chosen": -4.707180023193359, "logps/rejected": -104.88072967529297, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.7324421405792236, "rewards/margins": 9.98560905456543, "rewards/rejected": -7.253166675567627, "step": 256 }, { "epoch": 0.17569646214322338, "grad_norm": 0.04209812730550766, "learning_rate": 4.016555205552158e-05, "logits/chosen": -8.72976303100586, "logits/rejected": -8.725276947021484, "logps/chosen": -3.990940570831299, "logps/rejected": -107.79875946044922, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.7377421855926514, "rewards/margins": 10.281922340393066, "rewards/rejected": -7.544179916381836, "step": 257 }, { "epoch": 0.17638010596479234, "grad_norm": 0.024933626875281334, "learning_rate": 4.0193661766053834e-05, "logits/chosen": -8.472516059875488, "logits/rejected": -8.467544555664062, "logps/chosen": -2.7546589374542236, "logps/rejected": -105.87759399414062, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.953883647918701, "rewards/margins": 10.339229583740234, "rewards/rejected": -7.385346412658691, "step": 258 }, { "epoch": 0.1770637497863613, "grad_norm": 0.017040524631738663, "learning_rate": 4.022166273468753e-05, "logits/chosen": -8.161469459533691, "logits/rejected": -8.157236099243164, "logps/chosen": -6.1945481300354, "logps/rejected": -102.48176574707031, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.597806930541992, "rewards/margins": 9.62875747680664, "rewards/rejected": -7.030951499938965, "step": 259 }, { "epoch": 0.17774739360793027, "grad_norm": 0.027180947363376617, "learning_rate": 4.024955579951363e-05, "logits/chosen": -8.605232238769531, "logits/rejected": -8.60123062133789, "logps/chosen": -9.235639572143555, "logps/rejected": -101.55787658691406, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.2358222007751465, "rewards/margins": 9.21670150756836, "rewards/rejected": -6.980879783630371, "step": 260 }, { "epoch": 0.17843103742949923, "grad_norm": 0.022306393831968307, "learning_rate": 4.027734178897136e-05, "logits/chosen": -8.147948265075684, "logits/rejected": -8.142163276672363, "logps/chosen": -6.806973457336426, "logps/rejected": -101.57087707519531, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.5903916358947754, "rewards/margins": 9.481873512268066, "rewards/rejected": -6.891481399536133, "step": 261 }, { "epoch": 0.1791146812510682, "grad_norm": 0.029604194685816765, "learning_rate": 4.030502152199576e-05, "logits/chosen": -8.81578540802002, "logits/rejected": -8.810745239257812, "logps/chosen": -4.072345733642578, "logps/rejected": -105.47872924804688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.870894432067871, "rewards/margins": 10.130743980407715, "rewards/rejected": -7.259849548339844, "step": 262 }, { "epoch": 0.17979832507263716, "grad_norm": 0.06167689338326454, "learning_rate": 4.033259580816264e-05, "logits/chosen": -8.9613037109375, "logits/rejected": -8.957178115844727, "logps/chosen": -6.773739814758301, "logps/rejected": -104.23841857910156, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.421252965927124, "rewards/margins": 9.695924758911133, "rewards/rejected": -7.27467155456543, "step": 263 }, { "epoch": 0.18048196889420612, "grad_norm": 0.03306131064891815, "learning_rate": 4.036006544783052e-05, "logits/chosen": -8.078629493713379, "logits/rejected": -8.075881958007812, "logps/chosen": -7.308015823364258, "logps/rejected": -107.13836669921875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.383667230606079, "rewards/margins": 9.907800674438477, "rewards/rejected": -7.52413272857666, "step": 264 }, { "epoch": 0.18116561271577508, "grad_norm": 0.027957206591963768, "learning_rate": 4.0387431232280135e-05, "logits/chosen": -8.47479248046875, "logits/rejected": -8.470832824707031, "logps/chosen": -7.415696144104004, "logps/rejected": -101.78993225097656, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.4211812019348145, "rewards/margins": 9.355667114257812, "rewards/rejected": -6.934486389160156, "step": 265 }, { "epoch": 0.18184925653734405, "grad_norm": 0.04745970293879509, "learning_rate": 4.041469394385112e-05, "logits/chosen": -8.774014472961426, "logits/rejected": -8.769125938415527, "logps/chosen": -2.181617259979248, "logps/rejected": -105.043212890625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.936923027038574, "rewards/margins": 10.271211624145508, "rewards/rejected": -7.334289073944092, "step": 266 }, { "epoch": 0.182532900358913, "grad_norm": 0.041147515177726746, "learning_rate": 4.0441854356076257e-05, "logits/chosen": -8.335017204284668, "logits/rejected": -8.330684661865234, "logps/chosen": -4.583469390869141, "logps/rejected": -103.73300170898438, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.721442222595215, "rewards/margins": 9.83963394165039, "rewards/rejected": -7.118191719055176, "step": 267 }, { "epoch": 0.18321654418048197, "grad_norm": 0.05953797325491905, "learning_rate": 4.046891323381315e-05, "logits/chosen": -8.943140029907227, "logits/rejected": -8.93960189819336, "logps/chosen": -6.565011024475098, "logps/rejected": -105.8424072265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4593706130981445, "rewards/margins": 9.836982727050781, "rewards/rejected": -7.377612590789795, "step": 268 }, { "epoch": 0.18390018800205093, "grad_norm": 0.020360393449664116, "learning_rate": 4.049587133337347e-05, "logits/chosen": -8.598892211914062, "logits/rejected": -8.595266342163086, "logps/chosen": -6.403928756713867, "logps/rejected": -104.73531341552734, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.465615749359131, "rewards/margins": 9.72228717803955, "rewards/rejected": -7.256670951843262, "step": 269 }, { "epoch": 0.1845838318236199, "grad_norm": 0.028418293222784996, "learning_rate": 4.0522729402649793e-05, "logits/chosen": -9.044174194335938, "logits/rejected": -9.038290977478027, "logps/chosen": -2.2672080993652344, "logps/rejected": -105.06085205078125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.981029510498047, "rewards/margins": 10.260021209716797, "rewards/rejected": -7.27899169921875, "step": 270 }, { "epoch": 0.18526747564518886, "grad_norm": 0.05478236451745033, "learning_rate": 4.0549488181240096e-05, "logits/chosen": -8.490236282348633, "logits/rejected": -8.487151145935059, "logps/chosen": -6.394977569580078, "logps/rejected": -105.73877716064453, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.5518229007720947, "rewards/margins": 9.850275039672852, "rewards/rejected": -7.298452854156494, "step": 271 }, { "epoch": 0.18595111946675782, "grad_norm": 0.030646532773971558, "learning_rate": 4.057614840056998e-05, "logits/chosen": -8.826255798339844, "logits/rejected": -8.82142162322998, "logps/chosen": -4.965938568115234, "logps/rejected": -100.94479370117188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.6246557235717773, "rewards/margins": 9.525517463684082, "rewards/rejected": -6.900861740112305, "step": 272 }, { "epoch": 0.1866347632883268, "grad_norm": 0.08209877461194992, "learning_rate": 4.06027107840126e-05, "logits/chosen": -8.773487091064453, "logits/rejected": -8.770679473876953, "logps/chosen": -4.593759536743164, "logps/rejected": -106.18574523925781, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.6502466201782227, "rewards/margins": 10.043479919433594, "rewards/rejected": -7.3932342529296875, "step": 273 }, { "epoch": 0.18731840710989575, "grad_norm": 10.889665603637695, "learning_rate": 4.0629176047006474e-05, "logits/chosen": -8.254095077514648, "logits/rejected": -8.247110366821289, "logps/chosen": -6.797707557678223, "logps/rejected": -100.97221374511719, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 2.6295084953308105, "rewards/margins": 9.377124786376953, "rewards/rejected": -6.747617721557617, "step": 274 }, { "epoch": 0.18800205093146471, "grad_norm": 0.030123209580779076, "learning_rate": 4.065554489717105e-05, "logits/chosen": -8.908941268920898, "logits/rejected": -8.901674270629883, "logps/chosen": -3.4439258575439453, "logps/rejected": -103.22508239746094, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.9526548385620117, "rewards/margins": 9.99195671081543, "rewards/rejected": -7.03930139541626, "step": 275 }, { "epoch": 0.18868569475303368, "grad_norm": 0.01289369072765112, "learning_rate": 4.068181803442029e-05, "logits/chosen": -8.13956069946289, "logits/rejected": -8.133371353149414, "logps/chosen": -1.1480457782745361, "logps/rejected": -104.66770935058594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.141758918762207, "rewards/margins": 10.380437850952148, "rewards/rejected": -7.238677978515625, "step": 276 }, { "epoch": 0.18936933857460264, "grad_norm": 0.014317058958113194, "learning_rate": 4.0707996151074147e-05, "logits/chosen": -8.829742431640625, "logits/rejected": -8.822722434997559, "logps/chosen": -3.7229087352752686, "logps/rejected": -103.54573059082031, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.8499279022216797, "rewards/margins": 9.988204956054688, "rewards/rejected": -7.138277530670166, "step": 277 }, { "epoch": 0.1900529823961716, "grad_norm": 0.010053861886262894, "learning_rate": 4.073407993196794e-05, "logits/chosen": -9.307626724243164, "logits/rejected": -9.303621292114258, "logps/chosen": -2.278756618499756, "logps/rejected": -107.1941909790039, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.8583335876464844, "rewards/margins": 10.4075345993042, "rewards/rejected": -7.549201011657715, "step": 278 }, { "epoch": 0.19073662621774057, "grad_norm": 0.014993913471698761, "learning_rate": 4.076007005455996e-05, "logits/chosen": -8.420897483825684, "logits/rejected": -8.417263984680176, "logps/chosen": -3.8983936309814453, "logps/rejected": -105.41680908203125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.799379348754883, "rewards/margins": 10.032327651977539, "rewards/rejected": -7.232948303222656, "step": 279 }, { "epoch": 0.19142027003930953, "grad_norm": 0.07939931750297546, "learning_rate": 4.0785967189036986e-05, "logits/chosen": -8.433990478515625, "logits/rejected": -8.428629875183105, "logps/chosen": -6.111316680908203, "logps/rejected": -100.03262329101562, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.608341693878174, "rewards/margins": 9.380504608154297, "rewards/rejected": -6.7721638679504395, "step": 280 }, { "epoch": 0.1921039138608785, "grad_norm": 0.019475743174552917, "learning_rate": 4.0811771998418e-05, "logits/chosen": -8.59628677368164, "logits/rejected": -8.588144302368164, "logps/chosen": -5.926364421844482, "logps/rejected": -98.92656707763672, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.5851821899414062, "rewards/margins": 9.325597763061523, "rewards/rejected": -6.740416526794434, "step": 281 }, { "epoch": 0.19278755768244746, "grad_norm": 0.015625622123479843, "learning_rate": 4.083748513865602e-05, "logits/chosen": -8.696459770202637, "logits/rejected": -8.690286636352539, "logps/chosen": -6.886237144470215, "logps/rejected": -101.00117492675781, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.467906951904297, "rewards/margins": 9.403078079223633, "rewards/rejected": -6.935171604156494, "step": 282 }, { "epoch": 0.19347120150401642, "grad_norm": 0.012177601456642151, "learning_rate": 4.086310725873818e-05, "logits/chosen": -8.644604682922363, "logits/rejected": -8.639453887939453, "logps/chosen": -5.047406196594238, "logps/rejected": -102.75769805908203, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.7579779624938965, "rewards/margins": 9.762666702270508, "rewards/rejected": -7.004688262939453, "step": 283 }, { "epoch": 0.19415484532558536, "grad_norm": 0.010010492987930775, "learning_rate": 4.0888639000783966e-05, "logits/chosen": -8.579455375671387, "logits/rejected": -8.575119018554688, "logps/chosen": -4.244923114776611, "logps/rejected": -105.35026550292969, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.8279690742492676, "rewards/margins": 10.066014289855957, "rewards/rejected": -7.238044738769531, "step": 284 }, { "epoch": 0.19483848914715432, "grad_norm": 0.4569879472255707, "learning_rate": 4.0914081000141844e-05, "logits/chosen": -8.379860877990723, "logits/rejected": -8.374613761901855, "logps/chosen": -2.8748466968536377, "logps/rejected": -104.66427612304688, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 2.851515293121338, "rewards/margins": 10.15652847290039, "rewards/rejected": -7.305013179779053, "step": 285 }, { "epoch": 0.19552213296872328, "grad_norm": 1.4287073612213135, "learning_rate": 4.0939433885484055e-05, "logits/chosen": -8.359570503234863, "logits/rejected": -8.355611801147461, "logps/chosen": -3.670346260070801, "logps/rejected": -105.14372253417969, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.7654409408569336, "rewards/margins": 10.029167175292969, "rewards/rejected": -7.263726711273193, "step": 286 }, { "epoch": 0.19620577679029225, "grad_norm": 6.490729808807373, "learning_rate": 4.0964698278899874e-05, "logits/chosen": -8.376953125, "logits/rejected": -8.372331619262695, "logps/chosen": -8.67423152923584, "logps/rejected": -97.91778564453125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 2.4146525859832764, "rewards/margins": 8.929595947265625, "rewards/rejected": -6.514942646026611, "step": 287 }, { "epoch": 0.1968894206118612, "grad_norm": 0.02953021042048931, "learning_rate": 4.0989874795987185e-05, "logits/chosen": -8.4850492477417, "logits/rejected": -8.480751991271973, "logps/chosen": -3.733102321624756, "logps/rejected": -103.34707641601562, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.7891886234283447, "rewards/margins": 9.851264953613281, "rewards/rejected": -7.062076568603516, "step": 288 }, { "epoch": 0.19757306443343017, "grad_norm": 0.053641676902770996, "learning_rate": 4.1014964045942465e-05, "logits/chosen": -9.235803604125977, "logits/rejected": -9.231438636779785, "logps/chosen": -1.8991869688034058, "logps/rejected": -103.08662414550781, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.861818552017212, "rewards/margins": 10.05466079711914, "rewards/rejected": -7.192841529846191, "step": 289 }, { "epoch": 0.19825670825499914, "grad_norm": 0.036926183849573135, "learning_rate": 4.103996663164927e-05, "logits/chosen": -8.293672561645508, "logits/rejected": -8.289783477783203, "logps/chosen": -6.575809478759766, "logps/rejected": -97.37762451171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.607670307159424, "rewards/margins": 9.101040840148926, "rewards/rejected": -6.493370056152344, "step": 290 }, { "epoch": 0.1989403520765681, "grad_norm": 0.03192468360066414, "learning_rate": 4.106488314976513e-05, "logits/chosen": -8.284808158874512, "logits/rejected": -8.280096054077148, "logps/chosen": -4.614950180053711, "logps/rejected": -99.5863037109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 2.622579336166382, "rewards/margins": 9.482839584350586, "rewards/rejected": -6.860260963439941, "step": 291 }, { "epoch": 0.19962399589813706, "grad_norm": 1.26917564868927, "learning_rate": 4.108971419080698e-05, "logits/chosen": -8.343595504760742, "logits/rejected": -8.340750694274902, "logps/chosen": -7.399903774261475, "logps/rejected": -94.69021606445312, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 2.558232307434082, "rewards/margins": 8.743986129760742, "rewards/rejected": -6.18575382232666, "step": 292 }, { "epoch": 0.20030763971970603, "grad_norm": 0.7614583373069763, "learning_rate": 4.111446033923516e-05, "logits/chosen": -9.294757843017578, "logits/rejected": -9.291974067687988, "logps/chosen": -9.216413497924805, "logps/rejected": -95.37604522705078, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.354346990585327, "rewards/margins": 8.619769096374512, "rewards/rejected": -6.2654218673706055, "step": 293 }, { "epoch": 0.200991283541275, "grad_norm": 0.24066519737243652, "learning_rate": 4.113912217353596e-05, "logits/chosen": -8.610675811767578, "logits/rejected": -8.607412338256836, "logps/chosen": -8.673480987548828, "logps/rejected": -98.28018188476562, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 2.4006662368774414, "rewards/margins": 8.955381393432617, "rewards/rejected": -6.554714679718018, "step": 294 }, { "epoch": 0.20167492736284395, "grad_norm": 0.03011869452893734, "learning_rate": 4.116370026630272e-05, "logits/chosen": -8.345272064208984, "logits/rejected": -8.343594551086426, "logps/chosen": -8.59842586517334, "logps/rejected": -101.7082748413086, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.2400622367858887, "rewards/margins": 9.24290657043457, "rewards/rejected": -7.002844333648682, "step": 295 }, { "epoch": 0.20235857118441292, "grad_norm": 0.163138285279274, "learning_rate": 4.118819518431564e-05, "logits/chosen": -7.962790489196777, "logits/rejected": -7.959268569946289, "logps/chosen": -14.458796501159668, "logps/rejected": -97.48594665527344, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.7801904678344727, "rewards/margins": 8.262779235839844, "rewards/rejected": -6.482588768005371, "step": 296 }, { "epoch": 0.20304221500598188, "grad_norm": 0.05990254878997803, "learning_rate": 4.121260748862021e-05, "logits/chosen": -8.65965747833252, "logits/rejected": -8.655905723571777, "logps/chosen": -17.323686599731445, "logps/rejected": -96.90819549560547, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.5773522853851318, "rewards/margins": 7.906074047088623, "rewards/rejected": -6.32872200012207, "step": 297 }, { "epoch": 0.20372585882755084, "grad_norm": 1.2142362594604492, "learning_rate": 4.123693773460426e-05, "logits/chosen": -8.697903633117676, "logits/rejected": -8.694701194763184, "logps/chosen": -13.892496109008789, "logps/rejected": -93.68260955810547, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8767781257629395, "rewards/margins": 7.901645183563232, "rewards/rejected": -6.024867057800293, "step": 298 }, { "epoch": 0.2044095026491198, "grad_norm": 0.3420475721359253, "learning_rate": 4.126118647207383e-05, "logits/chosen": -8.982627868652344, "logits/rejected": -8.97832202911377, "logps/chosen": -10.814003944396973, "logps/rejected": -94.80226135253906, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.1781091690063477, "rewards/margins": 8.412004470825195, "rewards/rejected": -6.233895301818848, "step": 299 }, { "epoch": 0.20509314647068877, "grad_norm": 0.09126918017864227, "learning_rate": 4.1285354245327715e-05, "logits/chosen": -8.705171585083008, "logits/rejected": -8.701096534729004, "logps/chosen": -11.411449432373047, "logps/rejected": -96.93439483642578, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.029691696166992, "rewards/margins": 8.520964622497559, "rewards/rejected": -6.491273403167725, "step": 300 }, { "epoch": 0.20577679029225773, "grad_norm": 0.13730818033218384, "learning_rate": 4.1309441593230726e-05, "logits/chosen": -8.327005386352539, "logits/rejected": -8.323317527770996, "logps/chosen": -11.307563781738281, "logps/rejected": -99.98284912109375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.9995715618133545, "rewards/margins": 8.841209411621094, "rewards/rejected": -6.841638088226318, "step": 301 }, { "epoch": 0.2064604341138267, "grad_norm": 0.09373831003904343, "learning_rate": 4.133344904928585e-05, "logits/chosen": -8.190007209777832, "logits/rejected": -8.18551254272461, "logps/chosen": -14.083927154541016, "logps/rejected": -100.15040588378906, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.743296504020691, "rewards/margins": 8.539588928222656, "rewards/rejected": -6.796292304992676, "step": 302 }, { "epoch": 0.20714407793539566, "grad_norm": 0.1988580971956253, "learning_rate": 4.1357377141705084e-05, "logits/chosen": -8.769279479980469, "logits/rejected": -8.765240669250488, "logps/chosen": -9.256983757019043, "logps/rejected": -102.64897918701172, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.199514389038086, "rewards/margins": 9.256864547729492, "rewards/rejected": -7.057350158691406, "step": 303 }, { "epoch": 0.20782772175696462, "grad_norm": 0.11368526518344879, "learning_rate": 4.1381226393479236e-05, "logits/chosen": -8.23236083984375, "logits/rejected": -8.227498054504395, "logps/chosen": -8.846726417541504, "logps/rejected": -101.45098876953125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.451237201690674, "rewards/margins": 9.227636337280273, "rewards/rejected": -6.776399612426758, "step": 304 }, { "epoch": 0.2085113655785336, "grad_norm": 0.0871858149766922, "learning_rate": 4.1404997322446435e-05, "logits/chosen": -8.883060455322266, "logits/rejected": -8.878778457641602, "logps/chosen": -16.9862003326416, "logps/rejected": -98.52128601074219, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.6407771110534668, "rewards/margins": 8.099433898925781, "rewards/rejected": -6.4586567878723145, "step": 305 }, { "epoch": 0.20919500940010255, "grad_norm": 1.2580318450927734, "learning_rate": 4.142869044135967e-05, "logits/chosen": -8.519518852233887, "logits/rejected": -8.514738082885742, "logps/chosen": -8.119943618774414, "logps/rejected": -96.44764709472656, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 2.523684024810791, "rewards/margins": 8.90868854522705, "rewards/rejected": -6.385005474090576, "step": 306 }, { "epoch": 0.2098786532216715, "grad_norm": 0.11371597647666931, "learning_rate": 4.145230625795311e-05, "logits/chosen": -8.672273635864258, "logits/rejected": -8.66752815246582, "logps/chosen": -5.848682403564453, "logps/rejected": -100.53289794921875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.5373082160949707, "rewards/margins": 9.406913757324219, "rewards/rejected": -6.869605541229248, "step": 307 }, { "epoch": 0.21056229704324048, "grad_norm": 0.01239006407558918, "learning_rate": 4.14758452750074e-05, "logits/chosen": -8.480913162231445, "logits/rejected": -8.477119445800781, "logps/chosen": -4.9329833984375, "logps/rejected": -99.86244201660156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.7430009841918945, "rewards/margins": 9.478775024414062, "rewards/rejected": -6.735773086547852, "step": 308 }, { "epoch": 0.21124594086480944, "grad_norm": 0.019843559712171555, "learning_rate": 4.149930799041392e-05, "logits/chosen": -8.187057495117188, "logits/rejected": -8.18299674987793, "logps/chosen": -8.721134185791016, "logps/rejected": -98.82756042480469, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.429429054260254, "rewards/margins": 8.99527359008789, "rewards/rejected": -6.565844535827637, "step": 309 }, { "epoch": 0.2119295846863784, "grad_norm": 1.3388104438781738, "learning_rate": 4.152269489723788e-05, "logits/chosen": -8.213985443115234, "logits/rejected": -8.209246635437012, "logps/chosen": -5.780764102935791, "logps/rejected": -101.44791412353516, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 2.687356948852539, "rewards/margins": 9.528800964355469, "rewards/rejected": -6.84144401550293, "step": 310 }, { "epoch": 0.21261322850794737, "grad_norm": 0.12263625860214233, "learning_rate": 4.1546006483780626e-05, "logits/chosen": -8.425986289978027, "logits/rejected": -8.421585083007812, "logps/chosen": -3.8563942909240723, "logps/rejected": -100.71022033691406, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.82028865814209, "rewards/margins": 9.665689468383789, "rewards/rejected": -6.845401763916016, "step": 311 }, { "epoch": 0.21329687232951633, "grad_norm": 0.07973676919937134, "learning_rate": 4.156924323364072e-05, "logits/chosen": -8.260736465454102, "logits/rejected": -8.256143569946289, "logps/chosen": -5.931116104125977, "logps/rejected": -93.93976593017578, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.662308692932129, "rewards/margins": 8.795787811279297, "rewards/rejected": -6.133479118347168, "step": 312 }, { "epoch": 0.2139805161510853, "grad_norm": 0.07003989070653915, "learning_rate": 4.1592405625774144e-05, "logits/chosen": -8.499881744384766, "logits/rejected": -8.49638557434082, "logps/chosen": -8.712963104248047, "logps/rejected": -96.14463806152344, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.426731586456299, "rewards/margins": 8.754321098327637, "rewards/rejected": -6.327589988708496, "step": 313 }, { "epoch": 0.21466415997265426, "grad_norm": 0.03364497795701027, "learning_rate": 4.161549413455358e-05, "logits/chosen": -8.793960571289062, "logits/rejected": -8.789958953857422, "logps/chosen": -7.100704193115234, "logps/rejected": -97.70849609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.4999358654022217, "rewards/margins": 9.01588249206543, "rewards/rejected": -6.515946865081787, "step": 314 }, { "epoch": 0.21534780379422322, "grad_norm": 0.11617843806743622, "learning_rate": 4.163850922982668e-05, "logits/chosen": -8.551559448242188, "logits/rejected": -8.547842025756836, "logps/chosen": -5.287615776062012, "logps/rejected": -96.11817932128906, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 2.7725396156311035, "rewards/margins": 9.12806510925293, "rewards/rejected": -6.355525970458984, "step": 315 }, { "epoch": 0.21603144761579218, "grad_norm": 0.07314585149288177, "learning_rate": 4.16614513769734e-05, "logits/chosen": -8.732671737670898, "logits/rejected": -8.728841781616211, "logps/chosen": -1.4478777647018433, "logps/rejected": -101.91832733154297, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 3.0449962615966797, "rewards/margins": 10.019905090332031, "rewards/rejected": -6.97490930557251, "step": 316 }, { "epoch": 0.21671509143736115, "grad_norm": 0.18940572440624237, "learning_rate": 4.1684321036962526e-05, "logits/chosen": -8.822675704956055, "logits/rejected": -8.820021629333496, "logps/chosen": -2.2057621479034424, "logps/rejected": -105.27336883544922, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 2.838040351867676, "rewards/margins": 10.169377326965332, "rewards/rejected": -7.3313374519348145, "step": 317 }, { "epoch": 0.2173987352589301, "grad_norm": 0.13475686311721802, "learning_rate": 4.170711866640721e-05, "logits/chosen": -9.191324234008789, "logits/rejected": -9.187318801879883, "logps/chosen": -4.336188793182373, "logps/rejected": -98.95465087890625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.6916451454162598, "rewards/margins": 9.47972297668457, "rewards/rejected": -6.7880778312683105, "step": 318 }, { "epoch": 0.21808237908049907, "grad_norm": 0.03434986248612404, "learning_rate": 4.1729844717619684e-05, "logits/chosen": -8.948854446411133, "logits/rejected": -8.944478988647461, "logps/chosen": -6.603644371032715, "logps/rejected": -98.9333724975586, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.569068193435669, "rewards/margins": 9.164411544799805, "rewards/rejected": -6.595343589782715, "step": 319 }, { "epoch": 0.21876602290206804, "grad_norm": 1.0198897123336792, "learning_rate": 4.17524996386651e-05, "logits/chosen": -8.363971710205078, "logits/rejected": -8.360854148864746, "logps/chosen": -4.589636325836182, "logps/rejected": -102.8074951171875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 2.7248377799987793, "rewards/margins": 9.7890043258667, "rewards/rejected": -7.064166069030762, "step": 320 }, { "epoch": 0.21944966672363697, "grad_norm": 0.009596343152225018, "learning_rate": 4.177508387341454e-05, "logits/chosen": -8.922504425048828, "logits/rejected": -8.918219566345215, "logps/chosen": -5.276576519012451, "logps/rejected": -100.50144958496094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6093087196350098, "rewards/margins": 9.530210494995117, "rewards/rejected": -6.920901298522949, "step": 321 }, { "epoch": 0.22013331054520593, "grad_norm": 0.01504669152200222, "learning_rate": 4.179759786159719e-05, "logits/chosen": -8.583145141601562, "logits/rejected": -8.578999519348145, "logps/chosen": -4.4279961585998535, "logps/rejected": -101.67122650146484, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.817744493484497, "rewards/margins": 9.725510597229004, "rewards/rejected": -6.907766342163086, "step": 322 }, { "epoch": 0.2208169543667749, "grad_norm": 1.1949272155761719, "learning_rate": 4.182004203885172e-05, "logits/chosen": -8.52472972869873, "logits/rejected": -8.519828796386719, "logps/chosen": -3.7368946075439453, "logps/rejected": -101.96250915527344, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 2.7287728786468506, "rewards/margins": 9.763341903686523, "rewards/rejected": -7.03456974029541, "step": 323 }, { "epoch": 0.22150059818834386, "grad_norm": 0.016684161499142647, "learning_rate": 4.184241683677687e-05, "logits/chosen": -8.843542098999023, "logits/rejected": -8.84028434753418, "logps/chosen": -10.01787281036377, "logps/rejected": -102.78367614746094, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.2099363803863525, "rewards/margins": 9.155183792114258, "rewards/rejected": -6.945247650146484, "step": 324 }, { "epoch": 0.22218424200991282, "grad_norm": 0.08492986857891083, "learning_rate": 4.1864722682981245e-05, "logits/chosen": -8.838151931762695, "logits/rejected": -8.834270477294922, "logps/chosen": -13.82537841796875, "logps/rejected": -99.69621276855469, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.9135446548461914, "rewards/margins": 8.576009750366211, "rewards/rejected": -6.662464141845703, "step": 325 }, { "epoch": 0.2228678858314818, "grad_norm": 0.025570135563611984, "learning_rate": 4.188696000113232e-05, "logits/chosen": -8.716621398925781, "logits/rejected": -8.713323593139648, "logps/chosen": -10.355022430419922, "logps/rejected": -102.15589904785156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.089292049407959, "rewards/margins": 9.126069068908691, "rewards/rejected": -7.036777019500732, "step": 326 }, { "epoch": 0.22355152965305075, "grad_norm": 0.013680006377398968, "learning_rate": 4.190912921100477e-05, "logits/chosen": -8.137861251831055, "logits/rejected": -8.134795188903809, "logps/chosen": -10.09477424621582, "logps/rejected": -102.18336486816406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2361204624176025, "rewards/margins": 9.223740577697754, "rewards/rejected": -6.987619876861572, "step": 327 }, { "epoch": 0.22423517347461971, "grad_norm": 0.02012096531689167, "learning_rate": 4.1931230728527994e-05, "logits/chosen": -8.611162185668945, "logits/rejected": -8.608440399169922, "logps/chosen": -17.986562728881836, "logps/rejected": -101.1778564453125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.2484841346740723, "rewards/margins": 8.21912956237793, "rewards/rejected": -6.970646381378174, "step": 328 }, { "epoch": 0.22491881729618868, "grad_norm": 0.1113714948296547, "learning_rate": 4.195326496583291e-05, "logits/chosen": -7.975112438201904, "logits/rejected": -7.972050666809082, "logps/chosen": -17.740175247192383, "logps/rejected": -100.5086669921875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.4924429655075073, "rewards/margins": 8.251178741455078, "rewards/rejected": -6.758735179901123, "step": 329 }, { "epoch": 0.22560246111775764, "grad_norm": 0.13353674113750458, "learning_rate": 4.1975232331298125e-05, "logits/chosen": -8.195783615112305, "logits/rejected": -8.191679954528809, "logps/chosen": -18.386669158935547, "logps/rejected": -100.36058044433594, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.3295905590057373, "rewards/margins": 8.125635147094727, "rewards/rejected": -6.79604434967041, "step": 330 }, { "epoch": 0.2262861049393266, "grad_norm": 0.1170080229640007, "learning_rate": 4.1997133229595316e-05, "logits/chosen": -8.810148239135742, "logits/rejected": -8.804279327392578, "logps/chosen": -11.892565727233887, "logps/rejected": -98.70195007324219, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.0924603939056396, "rewards/margins": 8.775979995727539, "rewards/rejected": -6.68351936340332, "step": 331 }, { "epoch": 0.22696974876089557, "grad_norm": 0.0500829853117466, "learning_rate": 4.201896806173394e-05, "logits/chosen": -8.134411811828613, "logits/rejected": -8.131871223449707, "logps/chosen": -17.463855743408203, "logps/rejected": -101.35227966308594, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.4014705419540405, "rewards/margins": 8.30337905883789, "rewards/rejected": -6.9019083976745605, "step": 332 }, { "epoch": 0.22765339258246453, "grad_norm": 0.21678948402404785, "learning_rate": 4.2040737225105335e-05, "logits/chosen": -9.004213333129883, "logits/rejected": -9.001480102539062, "logps/chosen": -21.144948959350586, "logps/rejected": -100.83013916015625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.1842622756958008, "rewards/margins": 7.905330657958984, "rewards/rejected": -6.721068382263184, "step": 333 }, { "epoch": 0.2283370364040335, "grad_norm": 0.03928687795996666, "learning_rate": 4.206244111352608e-05, "logits/chosen": -8.586760520935059, "logits/rejected": -8.584331512451172, "logps/chosen": -15.021004676818848, "logps/rejected": -102.17549896240234, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.6610112190246582, "rewards/margins": 8.713460922241211, "rewards/rejected": -7.0524492263793945, "step": 334 }, { "epoch": 0.22902068022560246, "grad_norm": 0.11412420868873596, "learning_rate": 4.2084080117280756e-05, "logits/chosen": -8.443252563476562, "logits/rejected": -8.440590858459473, "logps/chosen": -13.42440414428711, "logps/rejected": -103.371337890625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.8653513193130493, "rewards/margins": 8.921977996826172, "rewards/rejected": -7.056626319885254, "step": 335 }, { "epoch": 0.22970432404717142, "grad_norm": 0.016360599547624588, "learning_rate": 4.210565462316407e-05, "logits/chosen": -8.501632690429688, "logits/rejected": -8.497930526733398, "logps/chosen": -10.854573249816895, "logps/rejected": -103.44879913330078, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.0531115531921387, "rewards/margins": 9.175830841064453, "rewards/rejected": -7.122718811035156, "step": 336 }, { "epoch": 0.23038796786874038, "grad_norm": 0.035550203174352646, "learning_rate": 4.2127165014522315e-05, "logits/chosen": -8.361442565917969, "logits/rejected": -8.356088638305664, "logps/chosen": -6.107236385345459, "logps/rejected": -103.59524536132812, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.5661606788635254, "rewards/margins": 9.75135612487793, "rewards/rejected": -7.185195446014404, "step": 337 }, { "epoch": 0.23107161169030935, "grad_norm": 0.014255646616220474, "learning_rate": 4.214861167129425e-05, "logits/chosen": -8.220901489257812, "logits/rejected": -8.215441703796387, "logps/chosen": -6.71097993850708, "logps/rejected": -100.93061065673828, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.592446804046631, "rewards/margins": 9.438172340393066, "rewards/rejected": -6.845726013183594, "step": 338 }, { "epoch": 0.2317552555118783, "grad_norm": 0.00880985613912344, "learning_rate": 4.2169994970051365e-05, "logits/chosen": -9.30769157409668, "logits/rejected": -9.301285743713379, "logps/chosen": -7.185589790344238, "logps/rejected": -101.14218139648438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.626293659210205, "rewards/margins": 9.399776458740234, "rewards/rejected": -6.7734832763671875, "step": 339 }, { "epoch": 0.23243889933344727, "grad_norm": 0.09166064858436584, "learning_rate": 4.219131528403759e-05, "logits/chosen": -8.124578475952148, "logits/rejected": -8.118850708007812, "logps/chosen": -3.450153112411499, "logps/rejected": -103.247314453125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.933073043823242, "rewards/margins": 9.99491024017334, "rewards/rejected": -7.061838150024414, "step": 340 }, { "epoch": 0.23312254315501624, "grad_norm": 0.014122388325631618, "learning_rate": 4.22125729832083e-05, "logits/chosen": -7.962584972381592, "logits/rejected": -7.956790924072266, "logps/chosen": -5.193010330200195, "logps/rejected": -103.72929382324219, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.713582992553711, "rewards/margins": 9.803380966186523, "rewards/rejected": -7.0897979736328125, "step": 341 }, { "epoch": 0.2338061869765852, "grad_norm": 0.006464731879532337, "learning_rate": 4.2233768434268914e-05, "logits/chosen": -8.946870803833008, "logits/rejected": -8.939178466796875, "logps/chosen": -6.6159586906433105, "logps/rejected": -102.74836730957031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.60830020904541, "rewards/margins": 9.65874195098877, "rewards/rejected": -7.050442218780518, "step": 342 }, { "epoch": 0.23448983079815416, "grad_norm": 0.0038375267758965492, "learning_rate": 4.225490200071284e-05, "logits/chosen": -8.594236373901367, "logits/rejected": -8.587889671325684, "logps/chosen": -8.829078674316406, "logps/rejected": -101.09379577636719, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3256402015686035, "rewards/margins": 9.19018840789795, "rewards/rejected": -6.864548683166504, "step": 343 }, { "epoch": 0.23517347461972313, "grad_norm": 0.0016771839000284672, "learning_rate": 4.227597404285883e-05, "logits/chosen": -8.525121688842773, "logits/rejected": -8.519166946411133, "logps/chosen": -2.8276920318603516, "logps/rejected": -104.4154052734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.849639654159546, "rewards/margins": 10.098280906677246, "rewards/rejected": -7.248641490936279, "step": 344 }, { "epoch": 0.2358571184412921, "grad_norm": 0.00383757334202528, "learning_rate": 4.229698491788791e-05, "logits/chosen": -8.293754577636719, "logits/rejected": -8.28618049621582, "logps/chosen": -2.738898754119873, "logps/rejected": -103.82846069335938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.93245792388916, "rewards/margins": 10.100936889648438, "rewards/rejected": -7.168478965759277, "step": 345 }, { "epoch": 0.23654076226286105, "grad_norm": 0.052315495908260345, "learning_rate": 4.231793497987961e-05, "logits/chosen": -8.604242324829102, "logits/rejected": -8.597562789916992, "logps/chosen": -1.4767374992370605, "logps/rejected": -105.09549713134766, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 3.0288679599761963, "rewards/margins": 10.37572193145752, "rewards/rejected": -7.346853733062744, "step": 346 }, { "epoch": 0.23722440608443002, "grad_norm": 0.24622400104999542, "learning_rate": 4.2338824579847904e-05, "logits/chosen": -9.0902099609375, "logits/rejected": -9.082121849060059, "logps/chosen": -3.319505453109741, "logps/rejected": -104.91400909423828, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.8535261154174805, "rewards/margins": 10.085920333862305, "rewards/rejected": -7.232395172119141, "step": 347 }, { "epoch": 0.23790804990599898, "grad_norm": 0.013927134685218334, "learning_rate": 4.235965406577636e-05, "logits/chosen": -8.298728942871094, "logits/rejected": -8.292978286743164, "logps/chosen": -4.7771172523498535, "logps/rejected": -102.5966796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7202770709991455, "rewards/margins": 9.765715599060059, "rewards/rejected": -7.045438289642334, "step": 348 }, { "epoch": 0.23859169372756794, "grad_norm": 0.0025245817378163338, "learning_rate": 4.2380423782653e-05, "logits/chosen": -9.296743392944336, "logits/rejected": -9.290021896362305, "logps/chosen": -2.7512807846069336, "logps/rejected": -106.54434204101562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9062998294830322, "rewards/margins": 10.229927062988281, "rewards/rejected": -7.323627471923828, "step": 349 }, { "epoch": 0.2392753375491369, "grad_norm": 0.011586092412471771, "learning_rate": 4.240113407250459e-05, "logits/chosen": -8.272429466247559, "logits/rejected": -8.266352653503418, "logps/chosen": -7.844491481781006, "logps/rejected": -101.4205093383789, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.403277635574341, "rewards/margins": 9.281621932983398, "rewards/rejected": -6.87834358215332, "step": 350 }, { "epoch": 0.23995898137070587, "grad_norm": 0.00912454817444086, "learning_rate": 4.24217852744304e-05, "logits/chosen": -8.6158447265625, "logits/rejected": -8.609745025634766, "logps/chosen": -8.236944198608398, "logps/rejected": -100.05511474609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.4283463954925537, "rewards/margins": 9.164316177368164, "rewards/rejected": -6.735969543457031, "step": 351 }, { "epoch": 0.24064262519227483, "grad_norm": 0.026029134169220924, "learning_rate": 4.244237772463552e-05, "logits/chosen": -9.03452205657959, "logits/rejected": -9.027360916137695, "logps/chosen": -0.9546108841896057, "logps/rejected": -105.0675048828125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.0672996044158936, "rewards/margins": 10.362009048461914, "rewards/rejected": -7.294709205627441, "step": 352 }, { "epoch": 0.2413262690138438, "grad_norm": 0.014573350548744202, "learning_rate": 4.246291175646371e-05, "logits/chosen": -8.923337936401367, "logits/rejected": -8.917699813842773, "logps/chosen": -2.369967460632324, "logps/rejected": -104.56361389160156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.8961074352264404, "rewards/margins": 10.181365966796875, "rewards/rejected": -7.2852582931518555, "step": 353 }, { "epoch": 0.24200991283541276, "grad_norm": 0.007469063624739647, "learning_rate": 4.24833877004298e-05, "logits/chosen": -8.895869255065918, "logits/rejected": -8.890003204345703, "logps/chosen": -10.532219886779785, "logps/rejected": -101.01882934570312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2189297676086426, "rewards/margins": 9.02490234375, "rewards/rejected": -6.805972099304199, "step": 354 }, { "epoch": 0.24269355665698172, "grad_norm": 0.009191561490297318, "learning_rate": 4.250380588425157e-05, "logits/chosen": -8.169561386108398, "logits/rejected": -8.164226531982422, "logps/chosen": -2.259254217147827, "logps/rejected": -104.84899139404297, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.97993803024292, "rewards/margins": 10.260005950927734, "rewards/rejected": -7.280068397521973, "step": 355 }, { "epoch": 0.2433772004785507, "grad_norm": 0.0015140968607738614, "learning_rate": 4.2524166632881255e-05, "logits/chosen": -9.046843528747559, "logits/rejected": -9.040699005126953, "logps/chosen": -6.729586124420166, "logps/rejected": -101.90473937988281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4953386783599854, "rewards/margins": 9.46906852722168, "rewards/rejected": -6.973729133605957, "step": 356 }, { "epoch": 0.24406084430011965, "grad_norm": 0.09261881560087204, "learning_rate": 4.254447026853656e-05, "logits/chosen": -8.426233291625977, "logits/rejected": -8.420336723327637, "logps/chosen": -4.568697929382324, "logps/rejected": -103.64239501953125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.7278389930725098, "rewards/margins": 9.849444389343262, "rewards/rejected": -7.121604919433594, "step": 357 }, { "epoch": 0.2447444881216886, "grad_norm": 0.0022088228724896908, "learning_rate": 4.2564717110731244e-05, "logits/chosen": -8.827827453613281, "logits/rejected": -8.82178783416748, "logps/chosen": -5.338776111602783, "logps/rejected": -102.88825988769531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7577106952667236, "rewards/margins": 9.79307746887207, "rewards/rejected": -7.035366058349609, "step": 358 }, { "epoch": 0.24542813194325755, "grad_norm": 0.0015070593217387795, "learning_rate": 4.258490747630532e-05, "logits/chosen": -8.992010116577148, "logits/rejected": -8.986349105834961, "logps/chosen": -3.1666922569274902, "logps/rejected": -104.47599792480469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.832685947418213, "rewards/margins": 10.084892272949219, "rewards/rejected": -7.2522053718566895, "step": 359 }, { "epoch": 0.2461117757648265, "grad_norm": 0.009475558996200562, "learning_rate": 4.260504167945479e-05, "logits/chosen": -8.11173152923584, "logits/rejected": -8.103973388671875, "logps/chosen": -2.852238655090332, "logps/rejected": -103.7747573852539, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.9347872734069824, "rewards/margins": 10.06855583190918, "rewards/rejected": -7.1337690353393555, "step": 360 }, { "epoch": 0.24679541958639548, "grad_norm": 0.0074405609630048275, "learning_rate": 4.2625120031760965e-05, "logits/chosen": -8.313879013061523, "logits/rejected": -8.304850578308105, "logps/chosen": -2.4912452697753906, "logps/rejected": -105.95854949951172, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8547310829162598, "rewards/margins": 10.184301376342773, "rewards/rejected": -7.329569339752197, "step": 361 }, { "epoch": 0.24747906340796444, "grad_norm": 0.010705859400331974, "learning_rate": 4.264514284221944e-05, "logits/chosen": -8.78600025177002, "logits/rejected": -8.779751777648926, "logps/chosen": -4.412062168121338, "logps/rejected": -105.20565032958984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.681218147277832, "rewards/margins": 9.934383392333984, "rewards/rejected": -7.2531657218933105, "step": 362 }, { "epoch": 0.2481627072295334, "grad_norm": 0.005037416238337755, "learning_rate": 4.266511041726854e-05, "logits/chosen": -8.700928688049316, "logits/rejected": -8.694415092468262, "logps/chosen": -2.101236343383789, "logps/rejected": -105.54911804199219, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.9573659896850586, "rewards/margins": 10.281152725219727, "rewards/rejected": -7.323787689208984, "step": 363 }, { "epoch": 0.24884635105110237, "grad_norm": 0.014074170030653477, "learning_rate": 4.26850230608176e-05, "logits/chosen": -8.300941467285156, "logits/rejected": -8.293091773986816, "logps/chosen": -0.8257774114608765, "logps/rejected": -105.16627502441406, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.0987019538879395, "rewards/margins": 10.368069648742676, "rewards/rejected": -7.2693681716918945, "step": 364 }, { "epoch": 0.24952999487267133, "grad_norm": 0.013920050114393234, "learning_rate": 4.2704881074274584e-05, "logits/chosen": -8.609626770019531, "logits/rejected": -8.603206634521484, "logps/chosen": -5.136411666870117, "logps/rejected": -98.89659881591797, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.5628669261932373, "rewards/margins": 9.32121467590332, "rewards/rejected": -6.758347511291504, "step": 365 }, { "epoch": 0.2502136386942403, "grad_norm": 0.003282644785940647, "learning_rate": 4.272468475657351e-05, "logits/chosen": -8.873819351196289, "logits/rejected": -8.86739444732666, "logps/chosen": -3.555722236633301, "logps/rejected": -101.79352569580078, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.761658191680908, "rewards/margins": 9.809206008911133, "rewards/rejected": -7.047548770904541, "step": 366 }, { "epoch": 0.2508972825158093, "grad_norm": 0.004974666517227888, "learning_rate": 4.2744434404201497e-05, "logits/chosen": -8.235498428344727, "logits/rejected": -8.228861808776855, "logps/chosen": -2.0045526027679443, "logps/rejected": -104.42395782470703, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.0013670921325684, "rewards/margins": 10.18748950958252, "rewards/rejected": -7.186122417449951, "step": 367 }, { "epoch": 0.2515809263373782, "grad_norm": 0.08620072901248932, "learning_rate": 4.27641303112253e-05, "logits/chosen": -8.469443321228027, "logits/rejected": -8.462420463562012, "logps/chosen": -2.155086040496826, "logps/rejected": -103.15151977539062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9877982139587402, "rewards/margins": 10.090206146240234, "rewards/rejected": -7.102407455444336, "step": 368 }, { "epoch": 0.2522645701589472, "grad_norm": 0.011522979475557804, "learning_rate": 4.278377276931767e-05, "logits/chosen": -8.372723579406738, "logits/rejected": -8.366986274719238, "logps/chosen": -3.6897411346435547, "logps/rejected": -102.85868835449219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7980620861053467, "rewards/margins": 9.859740257263184, "rewards/rejected": -7.061678886413574, "step": 369 }, { "epoch": 0.25294821398051615, "grad_norm": 0.011201800778508186, "learning_rate": 4.2803362067783256e-05, "logits/chosen": -8.581323623657227, "logits/rejected": -8.575149536132812, "logps/chosen": -4.473511695861816, "logps/rejected": -104.60588073730469, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.7718162536621094, "rewards/margins": 9.941810607910156, "rewards/rejected": -7.169994354248047, "step": 370 }, { "epoch": 0.25363185780208514, "grad_norm": 0.008363377302885056, "learning_rate": 4.2822898493584104e-05, "logits/chosen": -8.934724807739258, "logits/rejected": -8.924216270446777, "logps/chosen": -0.7700201272964478, "logps/rejected": -103.1575927734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.173459768295288, "rewards/margins": 10.321311950683594, "rewards/rejected": -7.147852897644043, "step": 371 }, { "epoch": 0.25431550162365407, "grad_norm": 0.003633183194324374, "learning_rate": 4.284238233136496e-05, "logits/chosen": -8.32390022277832, "logits/rejected": -8.317218780517578, "logps/chosen": -6.153616428375244, "logps/rejected": -103.22695922851562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.642779588699341, "rewards/margins": 9.606148719787598, "rewards/rejected": -6.963369369506836, "step": 372 }, { "epoch": 0.25499914544522306, "grad_norm": 0.01647544652223587, "learning_rate": 4.286181386347813e-05, "logits/chosen": -8.445460319519043, "logits/rejected": -8.439926147460938, "logps/chosen": -4.8091044425964355, "logps/rejected": -103.44429016113281, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.713170051574707, "rewards/margins": 9.844462394714355, "rewards/rejected": -7.131292343139648, "step": 373 }, { "epoch": 0.255682789266792, "grad_norm": 0.005007846746593714, "learning_rate": 4.288119337000801e-05, "logits/chosen": -8.180243492126465, "logits/rejected": -8.17289924621582, "logps/chosen": -2.690537929534912, "logps/rejected": -103.44280242919922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9424285888671875, "rewards/margins": 10.00843334197998, "rewards/rejected": -7.066004753112793, "step": 374 }, { "epoch": 0.256366433088361, "grad_norm": 0.0044053359888494015, "learning_rate": 4.2900521128795315e-05, "logits/chosen": -8.434185028076172, "logits/rejected": -8.427495002746582, "logps/chosen": -4.177757263183594, "logps/rejected": -104.64283752441406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8427345752716064, "rewards/margins": 10.032227516174316, "rewards/rejected": -7.189492702484131, "step": 375 }, { "epoch": 0.2570500769099299, "grad_norm": 0.004466106183826923, "learning_rate": 4.291979741546102e-05, "logits/chosen": -8.43555736541748, "logits/rejected": -8.429043769836426, "logps/chosen": -5.929464340209961, "logps/rejected": -103.10155487060547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5981554985046387, "rewards/margins": 9.671316146850586, "rewards/rejected": -7.0731611251831055, "step": 376 }, { "epoch": 0.2577337207314989, "grad_norm": 0.004511342383921146, "learning_rate": 4.293902250342989e-05, "logits/chosen": -8.640222549438477, "logits/rejected": -8.633859634399414, "logps/chosen": -6.426881790161133, "logps/rejected": -99.48416900634766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.551762104034424, "rewards/margins": 9.309947967529297, "rewards/rejected": -6.758186340332031, "step": 377 }, { "epoch": 0.25841736455306785, "grad_norm": 0.0056973714381456375, "learning_rate": 4.295819666395376e-05, "logits/chosen": -9.204008102416992, "logits/rejected": -9.196491241455078, "logps/chosen": -6.414488315582275, "logps/rejected": -103.96736145019531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.594527244567871, "rewards/margins": 9.66720199584961, "rewards/rejected": -7.072674751281738, "step": 378 }, { "epoch": 0.2591010083746368, "grad_norm": 0.008507749065756798, "learning_rate": 4.297732016613454e-05, "logits/chosen": -8.835928916931152, "logits/rejected": -8.829147338867188, "logps/chosen": -9.92297649383545, "logps/rejected": -97.75117492675781, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.1494839191436768, "rewards/margins": 8.69780445098877, "rewards/rejected": -6.548320293426514, "step": 379 }, { "epoch": 0.2597846521962058, "grad_norm": 0.013283228501677513, "learning_rate": 4.299639327694684e-05, "logits/chosen": -8.062751770019531, "logits/rejected": -8.054314613342285, "logps/chosen": -2.5462586879730225, "logps/rejected": -104.9714584350586, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.986588954925537, "rewards/margins": 10.251060485839844, "rewards/rejected": -7.264471530914307, "step": 380 }, { "epoch": 0.2604682960177747, "grad_norm": 0.004787680692970753, "learning_rate": 4.3015416261260325e-05, "logits/chosen": -9.001537322998047, "logits/rejected": -8.99302864074707, "logps/chosen": -2.091101884841919, "logps/rejected": -105.40074157714844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0550270080566406, "rewards/margins": 10.327376365661621, "rewards/rejected": -7.2723493576049805, "step": 381 }, { "epoch": 0.2611519398393437, "grad_norm": 0.006214487366378307, "learning_rate": 4.303438938186182e-05, "logits/chosen": -8.611860275268555, "logits/rejected": -8.6060209274292, "logps/chosen": -5.507991790771484, "logps/rejected": -105.48660278320312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4624009132385254, "rewards/margins": 9.877710342407227, "rewards/rejected": -7.415309906005859, "step": 382 }, { "epoch": 0.26183558366091264, "grad_norm": 0.0017556549282744527, "learning_rate": 4.305331289947705e-05, "logits/chosen": -8.785684585571289, "logits/rejected": -8.778698921203613, "logps/chosen": -3.8884031772613525, "logps/rejected": -104.55636596679688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7400662899017334, "rewards/margins": 9.985477447509766, "rewards/rejected": -7.245410442352295, "step": 383 }, { "epoch": 0.26251922748248163, "grad_norm": 1.5215156078338623, "learning_rate": 4.3072187072792184e-05, "logits/chosen": -8.794901847839355, "logits/rejected": -8.7871732711792, "logps/chosen": -6.543410301208496, "logps/rejected": -103.23709869384766, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 2.52718186378479, "rewards/margins": 9.563182830810547, "rewards/rejected": -7.036002159118652, "step": 384 }, { "epoch": 0.26320287130405057, "grad_norm": 0.003366302466019988, "learning_rate": 4.309101215847502e-05, "logits/chosen": -8.372238159179688, "logits/rejected": -8.366689682006836, "logps/chosen": -4.55429220199585, "logps/rejected": -103.15303039550781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7188568115234375, "rewards/margins": 9.817707061767578, "rewards/rejected": -7.098851203918457, "step": 385 }, { "epoch": 0.26388651512561956, "grad_norm": 0.0013468762626871467, "learning_rate": 4.3109788411195924e-05, "logits/chosen": -8.25357723236084, "logits/rejected": -8.245850563049316, "logps/chosen": -2.178378105163574, "logps/rejected": -105.17503356933594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.994547128677368, "rewards/margins": 10.292365074157715, "rewards/rejected": -7.297817230224609, "step": 386 }, { "epoch": 0.2645701589471885, "grad_norm": 1.1559160947799683, "learning_rate": 4.312851608364853e-05, "logits/chosen": -9.062223434448242, "logits/rejected": -9.056803703308105, "logps/chosen": -9.114971160888672, "logps/rejected": -100.82421875, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 2.3741579055786133, "rewards/margins": 9.214780807495117, "rewards/rejected": -6.840623378753662, "step": 387 }, { "epoch": 0.2652538027687575, "grad_norm": 1.8409600257873535, "learning_rate": 4.314719542657013e-05, "logits/chosen": -8.718587875366211, "logits/rejected": -8.712072372436523, "logps/chosen": -1.1216042041778564, "logps/rejected": -103.86744689941406, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 3.052365779876709, "rewards/margins": 10.235300064086914, "rewards/rejected": -7.182933807373047, "step": 388 }, { "epoch": 0.2659374465903264, "grad_norm": 0.0015149613609537482, "learning_rate": 4.3165826688761796e-05, "logits/chosen": -8.365497589111328, "logits/rejected": -8.35645580291748, "logps/chosen": -1.056126356124878, "logps/rejected": -105.66840362548828, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0275237560272217, "rewards/margins": 10.435328483581543, "rewards/rejected": -7.407804012298584, "step": 389 }, { "epoch": 0.2666210904118954, "grad_norm": 0.0014328897232189775, "learning_rate": 4.318441011710833e-05, "logits/chosen": -8.651143074035645, "logits/rejected": -8.643198013305664, "logps/chosen": -2.782489776611328, "logps/rejected": -104.9280014038086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.914445400238037, "rewards/margins": 10.190406799316406, "rewards/rejected": -7.275961399078369, "step": 390 }, { "epoch": 0.26730473423346435, "grad_norm": 0.004804642871022224, "learning_rate": 4.3202945956597786e-05, "logits/chosen": -8.556671142578125, "logits/rejected": -8.549764633178711, "logps/chosen": -5.289373397827148, "logps/rejected": -104.13321685791016, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.69921875, "rewards/margins": 9.870359420776367, "rewards/rejected": -7.171140193939209, "step": 391 }, { "epoch": 0.26798837805503334, "grad_norm": 0.015577034093439579, "learning_rate": 4.3221434450340956e-05, "logits/chosen": -8.594712257385254, "logits/rejected": -8.588298797607422, "logps/chosen": -7.976094722747803, "logps/rejected": -103.6680908203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.2585253715515137, "rewards/margins": 9.437847137451172, "rewards/rejected": -7.179322719573975, "step": 392 }, { "epoch": 0.2686720218766023, "grad_norm": 1.09415864944458, "learning_rate": 4.323987583959045e-05, "logits/chosen": -8.699881553649902, "logits/rejected": -8.693279266357422, "logps/chosen": -3.243112087249756, "logps/rejected": -103.77877807617188, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.8886663913726807, "rewards/margins": 10.02401351928711, "rewards/rejected": -7.135346412658691, "step": 393 }, { "epoch": 0.26935566569817126, "grad_norm": 0.00414937362074852, "learning_rate": 4.325827036375957e-05, "logits/chosen": -8.403953552246094, "logits/rejected": -8.395954132080078, "logps/chosen": -7.14552116394043, "logps/rejected": -101.45437622070312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5869476795196533, "rewards/margins": 9.441308975219727, "rewards/rejected": -6.8543620109558105, "step": 394 }, { "epoch": 0.2700393095197402, "grad_norm": 0.006160000339150429, "learning_rate": 4.327661826044101e-05, "logits/chosen": -8.3131685256958, "logits/rejected": -8.305975914001465, "logps/chosen": -5.041403770446777, "logps/rejected": -102.01606750488281, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7272772789001465, "rewards/margins": 9.73137092590332, "rewards/rejected": -7.004094123840332, "step": 395 }, { "epoch": 0.2707229533413092, "grad_norm": 0.004751343745738268, "learning_rate": 4.329491976542521e-05, "logits/chosen": -8.609884262084961, "logits/rejected": -8.602437019348145, "logps/chosen": -2.0396876335144043, "logps/rejected": -104.72236633300781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.040825366973877, "rewards/margins": 10.256433486938477, "rewards/rejected": -7.215608596801758, "step": 396 }, { "epoch": 0.2714065971628781, "grad_norm": 0.046780820935964584, "learning_rate": 4.331317511271859e-05, "logits/chosen": -8.806650161743164, "logits/rejected": -8.800368309020996, "logps/chosen": -9.1046142578125, "logps/rejected": -100.94709777832031, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.34073543548584, "rewards/margins": 9.179146766662598, "rewards/rejected": -6.838411331176758, "step": 397 }, { "epoch": 0.2720902409844471, "grad_norm": 0.07823255658149719, "learning_rate": 4.333138453456147e-05, "logits/chosen": -8.292383193969727, "logits/rejected": -8.28640079498291, "logps/chosen": -6.904983997344971, "logps/rejected": -101.0579833984375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.480107545852661, "rewards/margins": 9.334122657775879, "rewards/rejected": -6.854015350341797, "step": 398 }, { "epoch": 0.27277388480601605, "grad_norm": 0.027452733367681503, "learning_rate": 4.334954826144581e-05, "logits/chosen": -8.262672424316406, "logits/rejected": -8.255694389343262, "logps/chosen": -9.623405456542969, "logps/rejected": -99.9922103881836, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2889957427978516, "rewards/margins": 9.070050239562988, "rewards/rejected": -6.781054496765137, "step": 399 }, { "epoch": 0.27345752862758504, "grad_norm": 0.005676618777215481, "learning_rate": 4.336766652213271e-05, "logits/chosen": -8.361352920532227, "logits/rejected": -8.353755950927734, "logps/chosen": -4.206287384033203, "logps/rejected": -104.26679992675781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.819005012512207, "rewards/margins": 9.937519073486328, "rewards/rejected": -7.118513584136963, "step": 400 }, { "epoch": 0.274141172449154, "grad_norm": 0.018793685361742973, "learning_rate": 4.338573954366971e-05, "logits/chosen": -8.689802169799805, "logits/rejected": -8.681695938110352, "logps/chosen": -4.9471049308776855, "logps/rejected": -102.94046783447266, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.6427292823791504, "rewards/margins": 9.774005889892578, "rewards/rejected": -7.131276607513428, "step": 401 }, { "epoch": 0.27482481627072297, "grad_norm": 0.3951297104358673, "learning_rate": 4.340376755140784e-05, "logits/chosen": -9.440359115600586, "logits/rejected": -9.433698654174805, "logps/chosen": -9.67077922821045, "logps/rejected": -98.35834503173828, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.143448829650879, "rewards/margins": 8.817878723144531, "rewards/rejected": -6.6744303703308105, "step": 402 }, { "epoch": 0.2755084600922919, "grad_norm": 0.007711814250797033, "learning_rate": 4.342175076901849e-05, "logits/chosen": -8.727655410766602, "logits/rejected": -8.722369194030762, "logps/chosen": -9.267398834228516, "logps/rejected": -99.5816421508789, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.22213077545166, "rewards/margins": 8.972776412963867, "rewards/rejected": -6.750646591186523, "step": 403 }, { "epoch": 0.2761921039138609, "grad_norm": 0.0034751149360090494, "learning_rate": 4.343968941851009e-05, "logits/chosen": -9.089895248413086, "logits/rejected": -9.083243370056152, "logps/chosen": -4.531732082366943, "logps/rejected": -103.2743911743164, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6432225704193115, "rewards/margins": 9.831244468688965, "rewards/rejected": -7.188021659851074, "step": 404 }, { "epoch": 0.27687574773542983, "grad_norm": 0.014763365499675274, "learning_rate": 4.345758372024448e-05, "logits/chosen": -8.875761032104492, "logits/rejected": -8.867890357971191, "logps/chosen": -7.588079929351807, "logps/rejected": -100.28427124023438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3676087856292725, "rewards/margins": 9.26114273071289, "rewards/rejected": -6.893533706665039, "step": 405 }, { "epoch": 0.2775593915569988, "grad_norm": 0.0037145749665796757, "learning_rate": 4.347543389295324e-05, "logits/chosen": -8.38283634185791, "logits/rejected": -8.376840591430664, "logps/chosen": -7.916836738586426, "logps/rejected": -100.87419128417969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4288134574890137, "rewards/margins": 9.2294921875, "rewards/rejected": -6.800678253173828, "step": 406 }, { "epoch": 0.27824303537856776, "grad_norm": 0.0731847807765007, "learning_rate": 4.3493240153753666e-05, "logits/chosen": -9.013548851013184, "logits/rejected": -9.005868911743164, "logps/chosen": -2.3298182487487793, "logps/rejected": -104.09090423583984, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 3.028059482574463, "rewards/margins": 10.189071655273438, "rewards/rejected": -7.161011695861816, "step": 407 }, { "epoch": 0.27892667920013675, "grad_norm": 0.004454337526112795, "learning_rate": 4.3511002718164666e-05, "logits/chosen": -8.610910415649414, "logits/rejected": -8.602648735046387, "logps/chosen": -2.9312126636505127, "logps/rejected": -104.59634399414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.881288528442383, "rewards/margins": 10.12816047668457, "rewards/rejected": -7.2468719482421875, "step": 408 }, { "epoch": 0.2796103230217057, "grad_norm": 0.02575918287038803, "learning_rate": 4.352872180012237e-05, "logits/chosen": -8.183395385742188, "logits/rejected": -8.176310539245605, "logps/chosen": -2.801544189453125, "logps/rejected": -105.08318328857422, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.9611222743988037, "rewards/margins": 10.211971282958984, "rewards/rejected": -7.250848770141602, "step": 409 }, { "epoch": 0.2802939668432747, "grad_norm": 0.005058853421360254, "learning_rate": 4.35463976119956e-05, "logits/chosen": -8.374162673950195, "logits/rejected": -8.365734100341797, "logps/chosen": -3.4828178882598877, "logps/rejected": -103.94667053222656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7522459030151367, "rewards/margins": 9.998052597045898, "rewards/rejected": -7.245806694030762, "step": 410 }, { "epoch": 0.2809776106648436, "grad_norm": 0.0021976998541504145, "learning_rate": 4.356403036460115e-05, "logits/chosen": -8.349347114562988, "logits/rejected": -8.342382431030273, "logps/chosen": -2.9652316570281982, "logps/rejected": -104.9738540649414, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8867568969726562, "rewards/margins": 10.177942276000977, "rewards/rejected": -7.29118537902832, "step": 411 }, { "epoch": 0.2816612544864126, "grad_norm": 0.0363687165081501, "learning_rate": 4.3581620267218916e-05, "logits/chosen": -8.396289825439453, "logits/rejected": -8.38779354095459, "logps/chosen": -2.260495662689209, "logps/rejected": -103.69731140136719, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.9925460815429688, "rewards/margins": 10.130373001098633, "rewards/rejected": -7.137826919555664, "step": 412 }, { "epoch": 0.28234489830798154, "grad_norm": 0.02711641602218151, "learning_rate": 4.359916752760669e-05, "logits/chosen": -9.223090171813965, "logits/rejected": -9.215396881103516, "logps/chosen": -0.7421690821647644, "logps/rejected": -104.41729736328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.099992036819458, "rewards/margins": 10.408863067626953, "rewards/rejected": -7.308871746063232, "step": 413 }, { "epoch": 0.28302854212955053, "grad_norm": 0.017193064093589783, "learning_rate": 4.361667235201499e-05, "logits/chosen": -8.552892684936523, "logits/rejected": -8.545613288879395, "logps/chosen": -0.6103758811950684, "logps/rejected": -106.28846740722656, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.0354037284851074, "rewards/margins": 10.471359252929688, "rewards/rejected": -7.435954570770264, "step": 414 }, { "epoch": 0.28371218595111947, "grad_norm": 0.0018228114349767566, "learning_rate": 4.363413494520154e-05, "logits/chosen": -8.620177268981934, "logits/rejected": -8.613823890686035, "logps/chosen": -2.7492642402648926, "logps/rejected": -106.82447814941406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9633092880249023, "rewards/margins": 10.30150032043457, "rewards/rejected": -7.338191032409668, "step": 415 }, { "epoch": 0.2843958297726884, "grad_norm": 0.0030648300889879465, "learning_rate": 4.365155551044572e-05, "logits/chosen": -8.609830856323242, "logits/rejected": -8.601987838745117, "logps/chosen": -5.235803127288818, "logps/rejected": -104.69035339355469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.621227264404297, "rewards/margins": 9.800393104553223, "rewards/rejected": -7.179165840148926, "step": 416 }, { "epoch": 0.2850794735942574, "grad_norm": 0.0073739103972911835, "learning_rate": 4.366893424956263e-05, "logits/chosen": -8.6229248046875, "logits/rejected": -8.615910530090332, "logps/chosen": -4.149497985839844, "logps/rejected": -103.97555541992188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7901530265808105, "rewards/margins": 9.941476821899414, "rewards/rejected": -7.151323318481445, "step": 417 }, { "epoch": 0.28576311741582633, "grad_norm": 0.0051852744072675705, "learning_rate": 4.368627136291726e-05, "logits/chosen": -8.856842041015625, "logits/rejected": -8.849176406860352, "logps/chosen": -4.011946201324463, "logps/rejected": -103.90495300292969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.823066234588623, "rewards/margins": 9.928596496582031, "rewards/rejected": -7.10552978515625, "step": 418 }, { "epoch": 0.2864467612373953, "grad_norm": 0.001903603202663362, "learning_rate": 4.370356704943825e-05, "logits/chosen": -8.376588821411133, "logits/rejected": -8.370501518249512, "logps/chosen": -4.010472297668457, "logps/rejected": -100.32624816894531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.760188579559326, "rewards/margins": 9.659926414489746, "rewards/rejected": -6.899738311767578, "step": 419 }, { "epoch": 0.28713040505896426, "grad_norm": 0.003611641237512231, "learning_rate": 4.372082150663168e-05, "logits/chosen": -8.419270515441895, "logits/rejected": -8.411884307861328, "logps/chosen": -3.8073537349700928, "logps/rejected": -104.60127258300781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7402071952819824, "rewards/margins": 10.009723663330078, "rewards/rejected": -7.269516944885254, "step": 420 }, { "epoch": 0.28781404888053325, "grad_norm": 0.016695190221071243, "learning_rate": 4.3738034930594475e-05, "logits/chosen": -8.755483627319336, "logits/rejected": -8.747870445251465, "logps/chosen": -3.547536611557007, "logps/rejected": -102.96208190917969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.908595561981201, "rewards/margins": 9.909878730773926, "rewards/rejected": -7.001282691955566, "step": 421 }, { "epoch": 0.2884976927021022, "grad_norm": 0.002444524085149169, "learning_rate": 4.3755207516027904e-05, "logits/chosen": -8.686174392700195, "logits/rejected": -8.678109169006348, "logps/chosen": -2.6382813453674316, "logps/rejected": -105.20079040527344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.914067029953003, "rewards/margins": 10.218762397766113, "rewards/rejected": -7.304695129394531, "step": 422 }, { "epoch": 0.2891813365236712, "grad_norm": 0.012840681709349155, "learning_rate": 4.377233945625071e-05, "logits/chosen": -8.440995216369629, "logits/rejected": -8.432981491088867, "logps/chosen": -2.0710036754608154, "logps/rejected": -104.67202758789062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.0162057876586914, "rewards/margins": 10.241601943969727, "rewards/rejected": -7.225396156311035, "step": 423 }, { "epoch": 0.2898649803452401, "grad_norm": 0.002462374046444893, "learning_rate": 4.378943094321221e-05, "logits/chosen": -8.920100212097168, "logits/rejected": -8.911172866821289, "logps/chosen": -2.872682571411133, "logps/rejected": -105.52049255371094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.85443115234375, "rewards/margins": 10.191970825195312, "rewards/rejected": -7.3375396728515625, "step": 424 }, { "epoch": 0.2905486241668091, "grad_norm": 0.0017940844409167767, "learning_rate": 4.3806482167505196e-05, "logits/chosen": -8.431417465209961, "logits/rejected": -8.424173355102539, "logps/chosen": -4.0090012550354, "logps/rejected": -102.52439880371094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.858539342880249, "rewards/margins": 9.865900993347168, "rewards/rejected": -7.00736141204834, "step": 425 }, { "epoch": 0.29123226798837804, "grad_norm": 0.04515918716788292, "learning_rate": 4.382349331837866e-05, "logits/chosen": -8.639028549194336, "logits/rejected": -8.629929542541504, "logps/chosen": -5.2822771072387695, "logps/rejected": -102.51902770996094, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.6343228816986084, "rewards/margins": 9.662965774536133, "rewards/rejected": -7.028642654418945, "step": 426 }, { "epoch": 0.291915911809947, "grad_norm": 0.24295847117900848, "learning_rate": 4.3840464583750404e-05, "logits/chosen": -8.728076934814453, "logits/rejected": -8.72086238861084, "logps/chosen": -8.267617225646973, "logps/rejected": -104.31537628173828, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.4282138347625732, "rewards/margins": 9.449258804321289, "rewards/rejected": -7.021044731140137, "step": 427 }, { "epoch": 0.29259955563151596, "grad_norm": 0.4113301634788513, "learning_rate": 4.385739615021954e-05, "logits/chosen": -9.143360137939453, "logits/rejected": -9.135485649108887, "logps/chosen": -2.1134378910064697, "logps/rejected": -105.4035873413086, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 2.8666858673095703, "rewards/margins": 10.231256484985352, "rewards/rejected": -7.3645710945129395, "step": 428 }, { "epoch": 0.29328319945308495, "grad_norm": 0.03981515020132065, "learning_rate": 4.387428820307874e-05, "logits/chosen": -8.273298263549805, "logits/rejected": -8.266387939453125, "logps/chosen": -5.626396179199219, "logps/rejected": -101.99627685546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.7467331886291504, "rewards/margins": 9.60647201538086, "rewards/rejected": -6.859739303588867, "step": 429 }, { "epoch": 0.2939668432746539, "grad_norm": 0.0015278402715921402, "learning_rate": 4.3891140926326446e-05, "logits/chosen": -9.16115951538086, "logits/rejected": -9.153755187988281, "logps/chosen": -0.34946173429489136, "logps/rejected": -104.79254150390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1363108158111572, "rewards/margins": 10.46337890625, "rewards/rejected": -7.3270673751831055, "step": 430 }, { "epoch": 0.2946504870962229, "grad_norm": 0.0021791725885123014, "learning_rate": 4.390795450267886e-05, "logits/chosen": -9.256216049194336, "logits/rejected": -9.247282028198242, "logps/chosen": -2.289412021636963, "logps/rejected": -103.77095794677734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.887864112854004, "rewards/margins": 10.134910583496094, "rewards/rejected": -7.247046947479248, "step": 431 }, { "epoch": 0.2953341309177918, "grad_norm": 0.009418662637472153, "learning_rate": 4.3924729113581876e-05, "logits/chosen": -7.909134864807129, "logits/rejected": -7.901614189147949, "logps/chosen": -4.175797939300537, "logps/rejected": -103.27705383300781, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.861025810241699, "rewards/margins": 9.861093521118164, "rewards/rejected": -7.000067710876465, "step": 432 }, { "epoch": 0.2960177747393608, "grad_norm": 0.006180360447615385, "learning_rate": 4.394146493922276e-05, "logits/chosen": -8.985454559326172, "logits/rejected": -8.97504711151123, "logps/chosen": -0.538193941116333, "logps/rejected": -106.45734405517578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0092451572418213, "rewards/margins": 10.500663757324219, "rewards/rejected": -7.491418838500977, "step": 433 }, { "epoch": 0.29670141856092974, "grad_norm": 0.0036949731875211, "learning_rate": 4.395816215854185e-05, "logits/chosen": -8.462194442749023, "logits/rejected": -8.45510196685791, "logps/chosen": -3.0134265422821045, "logps/rejected": -105.28936004638672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.903017520904541, "rewards/margins": 10.154800415039062, "rewards/rejected": -7.2517828941345215, "step": 434 }, { "epoch": 0.29738506238249873, "grad_norm": 0.014123302884399891, "learning_rate": 4.397482094924396e-05, "logits/chosen": -8.624835014343262, "logits/rejected": -8.61728572845459, "logps/chosen": -5.437878608703613, "logps/rejected": -103.14376831054688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6321299076080322, "rewards/margins": 9.640804290771484, "rewards/rejected": -7.008674621582031, "step": 435 }, { "epoch": 0.29806870620406767, "grad_norm": 0.02297816053032875, "learning_rate": 4.399144148780977e-05, "logits/chosen": -8.4743013381958, "logits/rejected": -8.467833518981934, "logps/chosen": -5.964591026306152, "logps/rejected": -100.55735778808594, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.6668224334716797, "rewards/margins": 9.452790260314941, "rewards/rejected": -6.785967826843262, "step": 436 }, { "epoch": 0.29875235002563666, "grad_norm": 0.018093885853886604, "learning_rate": 4.400802394950703e-05, "logits/chosen": -9.071942329406738, "logits/rejected": -9.063803672790527, "logps/chosen": -1.8545482158660889, "logps/rejected": -104.4587631225586, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.9711055755615234, "rewards/margins": 10.197751998901367, "rewards/rejected": -7.226646900177002, "step": 437 }, { "epoch": 0.2994359938472056, "grad_norm": 0.01483860518783331, "learning_rate": 4.402456850840166e-05, "logits/chosen": -8.231962203979492, "logits/rejected": -8.224666595458984, "logps/chosen": -2.774177074432373, "logps/rejected": -103.09608459472656, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.78359055519104, "rewards/margins": 9.982662200927734, "rewards/rejected": -7.199071884155273, "step": 438 }, { "epoch": 0.3001196376687746, "grad_norm": 0.006202701944857836, "learning_rate": 4.4041075337368695e-05, "logits/chosen": -8.689013481140137, "logits/rejected": -8.681581497192383, "logps/chosen": -0.262413889169693, "logps/rejected": -105.01823425292969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.202761173248291, "rewards/margins": 10.47207260131836, "rewards/rejected": -7.269311904907227, "step": 439 }, { "epoch": 0.3008032814903435, "grad_norm": 0.04866080358624458, "learning_rate": 4.405754460810312e-05, "logits/chosen": -7.923452377319336, "logits/rejected": -7.915256023406982, "logps/chosen": -4.198235034942627, "logps/rejected": -103.98109436035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.8263795375823975, "rewards/margins": 9.915066719055176, "rewards/rejected": -7.088686943054199, "step": 440 }, { "epoch": 0.3014869253119125, "grad_norm": 0.020256653428077698, "learning_rate": 4.407397649113065e-05, "logits/chosen": -8.247723579406738, "logits/rejected": -8.240986824035645, "logps/chosen": -5.8653950691223145, "logps/rejected": -105.0443115234375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.6038293838500977, "rewards/margins": 9.837307929992676, "rewards/rejected": -7.233478546142578, "step": 441 }, { "epoch": 0.30217056913348145, "grad_norm": 0.09919893741607666, "learning_rate": 4.40903711558182e-05, "logits/chosen": -8.514164924621582, "logits/rejected": -8.505867958068848, "logps/chosen": -2.2341458797454834, "logps/rejected": -102.83536529541016, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.8598525524139404, "rewards/margins": 9.995038032531738, "rewards/rejected": -7.135185718536377, "step": 442 }, { "epoch": 0.30285421295505044, "grad_norm": 0.008221643976867199, "learning_rate": 4.41067287703845e-05, "logits/chosen": -8.201640129089355, "logits/rejected": -8.194480895996094, "logps/chosen": -5.872988224029541, "logps/rejected": -101.70741271972656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.651071071624756, "rewards/margins": 9.546727180480957, "rewards/rejected": -6.895656585693359, "step": 443 }, { "epoch": 0.3035378567766194, "grad_norm": 0.0041684964671730995, "learning_rate": 4.412304950191033e-05, "logits/chosen": -8.810373306274414, "logits/rejected": -8.802671432495117, "logps/chosen": -2.1036629676818848, "logps/rejected": -105.12593078613281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.091256856918335, "rewards/margins": 10.266637802124023, "rewards/rejected": -7.175381660461426, "step": 444 }, { "epoch": 0.30422150059818837, "grad_norm": 0.008970328606665134, "learning_rate": 4.413933351634886e-05, "logits/chosen": -8.730385780334473, "logits/rejected": -8.723445892333984, "logps/chosen": -0.6290568113327026, "logps/rejected": -106.90171813964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.0456457138061523, "rewards/margins": 10.537764549255371, "rewards/rejected": -7.492118835449219, "step": 445 }, { "epoch": 0.3049051444197573, "grad_norm": 0.025316383689641953, "learning_rate": 4.4155580978535707e-05, "logits/chosen": -8.563749313354492, "logits/rejected": -8.556403160095215, "logps/chosen": -3.2873964309692383, "logps/rejected": -102.51420593261719, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.8757858276367188, "rewards/margins": 9.908005714416504, "rewards/rejected": -7.032219886779785, "step": 446 }, { "epoch": 0.3055887882413263, "grad_norm": 0.0021601759362965822, "learning_rate": 4.417179205219895e-05, "logits/chosen": -8.384568214416504, "logits/rejected": -8.377700805664062, "logps/chosen": -5.649431228637695, "logps/rejected": -103.21467590332031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.719381809234619, "rewards/margins": 9.723543167114258, "rewards/rejected": -7.0041608810424805, "step": 447 }, { "epoch": 0.30627243206289523, "grad_norm": 0.008732028305530548, "learning_rate": 4.418796689996907e-05, "logits/chosen": -8.423553466796875, "logits/rejected": -8.415849685668945, "logps/chosen": -4.7347283363342285, "logps/rejected": -102.67100524902344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7291178703308105, "rewards/margins": 9.767168045043945, "rewards/rejected": -7.038050174713135, "step": 448 }, { "epoch": 0.3069560758844642, "grad_norm": 0.00761454226449132, "learning_rate": 4.420410568338872e-05, "logits/chosen": -8.205127716064453, "logits/rejected": -8.19650936126709, "logps/chosen": -3.6675803661346436, "logps/rejected": -101.73629760742188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9035677909851074, "rewards/margins": 9.837669372558594, "rewards/rejected": -6.934101104736328, "step": 449 }, { "epoch": 0.30763971970603315, "grad_norm": 0.020028289407491684, "learning_rate": 4.42202085629224e-05, "logits/chosen": -8.467580795288086, "logits/rejected": -8.45896053314209, "logps/chosen": -0.29333826899528503, "logps/rejected": -105.71141815185547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.2800166606903076, "rewards/margins": 10.527706146240234, "rewards/rejected": -7.247690200805664, "step": 450 }, { "epoch": 0.30832336352760215, "grad_norm": 0.0019741621799767017, "learning_rate": 4.423627569796601e-05, "logits/chosen": -8.294241905212402, "logits/rejected": -8.287430763244629, "logps/chosen": -2.616419792175293, "logps/rejected": -103.9027328491211, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.961172580718994, "rewards/margins": 10.089062690734863, "rewards/rejected": -7.127889633178711, "step": 451 }, { "epoch": 0.3090070073491711, "grad_norm": 0.00491661112755537, "learning_rate": 4.425230724685638e-05, "logits/chosen": -8.5132417678833, "logits/rejected": -8.50566577911377, "logps/chosen": -5.443904399871826, "logps/rejected": -102.46366882324219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.708801031112671, "rewards/margins": 9.696992874145508, "rewards/rejected": -6.988192081451416, "step": 452 }, { "epoch": 0.30969065117074, "grad_norm": 0.0063985800370574, "learning_rate": 4.4268303366880536e-05, "logits/chosen": -8.73978042602539, "logits/rejected": -8.733410835266113, "logps/chosen": -6.017031669616699, "logps/rejected": -103.2100601196289, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.65048885345459, "rewards/margins": 9.663922309875488, "rewards/rejected": -7.013433456420898, "step": 453 }, { "epoch": 0.310374294992309, "grad_norm": 0.002273548161610961, "learning_rate": 4.428426421428507e-05, "logits/chosen": -9.019403457641602, "logits/rejected": -9.012088775634766, "logps/chosen": -5.619483947753906, "logps/rejected": -103.96014404296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.62160062789917, "rewards/margins": 9.735528945922852, "rewards/rejected": -7.113928318023682, "step": 454 }, { "epoch": 0.31105793881387794, "grad_norm": 1.0235414505004883, "learning_rate": 4.430018994428521e-05, "logits/chosen": -8.100796699523926, "logits/rejected": -8.093568801879883, "logps/chosen": -3.2148375511169434, "logps/rejected": -104.29463195800781, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 2.8730216026306152, "rewards/margins": 10.079814910888672, "rewards/rejected": -7.206792831420898, "step": 455 }, { "epoch": 0.31174158263544693, "grad_norm": 0.004153348505496979, "learning_rate": 4.431608071107392e-05, "logits/chosen": -8.597708702087402, "logits/rejected": -8.587472915649414, "logps/chosen": -2.2851154804229736, "logps/rejected": -105.79315948486328, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.982032299041748, "rewards/margins": 10.334674835205078, "rewards/rejected": -7.35264253616333, "step": 456 }, { "epoch": 0.31242522645701587, "grad_norm": 0.007395236752927303, "learning_rate": 4.433193666783084e-05, "logits/chosen": -8.310531616210938, "logits/rejected": -8.303681373596191, "logps/chosen": -5.149104118347168, "logps/rejected": -103.15540313720703, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.7130279541015625, "rewards/margins": 9.751060485839844, "rewards/rejected": -7.038032531738281, "step": 457 }, { "epoch": 0.31310887027858486, "grad_norm": 0.006010998971760273, "learning_rate": 4.4347757966731156e-05, "logits/chosen": -8.540982246398926, "logits/rejected": -8.533578872680664, "logps/chosen": -0.3366570770740509, "logps/rejected": -107.26311492919922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0217599868774414, "rewards/margins": 10.627141952514648, "rewards/rejected": -7.605382442474365, "step": 458 }, { "epoch": 0.3137925141001538, "grad_norm": 0.033957481384277344, "learning_rate": 4.436354475895436e-05, "logits/chosen": -7.949348449707031, "logits/rejected": -7.942412376403809, "logps/chosen": -2.9271676540374756, "logps/rejected": -106.64265441894531, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.869744300842285, "rewards/margins": 10.296711921691895, "rewards/rejected": -7.426967144012451, "step": 459 }, { "epoch": 0.3144761579217228, "grad_norm": 0.009128553792834282, "learning_rate": 4.437929719469291e-05, "logits/chosen": -9.132242202758789, "logits/rejected": -9.124337196350098, "logps/chosen": -5.388453006744385, "logps/rejected": -101.93492126464844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6185786724090576, "rewards/margins": 9.543310165405273, "rewards/rejected": -6.924731254577637, "step": 460 }, { "epoch": 0.3151598017432917, "grad_norm": 0.003514249576255679, "learning_rate": 4.4395015423160807e-05, "logits/chosen": -9.358850479125977, "logits/rejected": -9.350238800048828, "logps/chosen": -6.114445686340332, "logps/rejected": -102.04658508300781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.602713108062744, "rewards/margins": 9.563714981079102, "rewards/rejected": -6.961000442504883, "step": 461 }, { "epoch": 0.3158434455648607, "grad_norm": 0.016789816319942474, "learning_rate": 4.4410699592602094e-05, "logits/chosen": -9.559629440307617, "logits/rejected": -9.552355766296387, "logps/chosen": -2.6146042346954346, "logps/rejected": -104.5404052734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.9078707695007324, "rewards/margins": 10.248326301574707, "rewards/rejected": -7.340455055236816, "step": 462 }, { "epoch": 0.31652708938642965, "grad_norm": 0.0037055097054690123, "learning_rate": 4.442634985029922e-05, "logits/chosen": -8.29887866973877, "logits/rejected": -8.29250717163086, "logps/chosen": -5.643272399902344, "logps/rejected": -102.46156311035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.74391508102417, "rewards/margins": 9.725446701049805, "rewards/rejected": -6.981531143188477, "step": 463 }, { "epoch": 0.31721073320799864, "grad_norm": 0.15369649231433868, "learning_rate": 4.444196634258136e-05, "logits/chosen": -8.70723819732666, "logits/rejected": -8.699256896972656, "logps/chosen": -2.009288787841797, "logps/rejected": -106.18553161621094, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.798741579055786, "rewards/margins": 10.345163345336914, "rewards/rejected": -7.546422004699707, "step": 464 }, { "epoch": 0.3178943770295676, "grad_norm": 0.010893861763179302, "learning_rate": 4.4457549214832566e-05, "logits/chosen": -8.263656616210938, "logits/rejected": -8.255218505859375, "logps/chosen": -5.337066650390625, "logps/rejected": -103.0721664428711, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.682435989379883, "rewards/margins": 9.700052261352539, "rewards/rejected": -7.017616271972656, "step": 465 }, { "epoch": 0.31857802085113657, "grad_norm": 1.7722548246383667, "learning_rate": 4.44730986115e-05, "logits/chosen": -7.794482231140137, "logits/rejected": -7.7873311042785645, "logps/chosen": -2.336664915084839, "logps/rejected": -106.20601654052734, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 2.9559683799743652, "rewards/margins": 10.354446411132812, "rewards/rejected": -7.398478031158447, "step": 466 }, { "epoch": 0.3192616646727055, "grad_norm": 0.005831338930875063, "learning_rate": 4.448861467610187e-05, "logits/chosen": -8.930891036987305, "logits/rejected": -8.92378044128418, "logps/chosen": -0.2828085124492645, "logps/rejected": -107.51061248779297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.116548538208008, "rewards/margins": 10.642037391662598, "rewards/rejected": -7.525489330291748, "step": 467 }, { "epoch": 0.3199453084942745, "grad_norm": 8.820595741271973, "learning_rate": 4.4504097551235406e-05, "logits/chosen": -8.49001693725586, "logits/rejected": -8.482492446899414, "logps/chosen": -0.30729401111602783, "logps/rejected": -106.62168884277344, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 3.0916454792022705, "rewards/margins": 10.608511924743652, "rewards/rejected": -7.516866683959961, "step": 468 }, { "epoch": 0.32062895231584343, "grad_norm": 0.0018817499512806535, "learning_rate": 4.4519547378584725e-05, "logits/chosen": -8.87216567993164, "logits/rejected": -8.863203048706055, "logps/chosen": -0.739941418170929, "logps/rejected": -107.391845703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.037013530731201, "rewards/margins": 10.587583541870117, "rewards/rejected": -7.550570487976074, "step": 469 }, { "epoch": 0.3213125961374124, "grad_norm": 0.00554692605510354, "learning_rate": 4.453496429892863e-05, "logits/chosen": -8.761054992675781, "logits/rejected": -8.753409385681152, "logps/chosen": -2.0775136947631836, "logps/rejected": -105.81187438964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9159111976623535, "rewards/margins": 10.329763412475586, "rewards/rejected": -7.413851737976074, "step": 470 }, { "epoch": 0.32199623995898136, "grad_norm": 0.002091691130772233, "learning_rate": 4.455034845214827e-05, "logits/chosen": -8.762371063232422, "logits/rejected": -8.754973411560059, "logps/chosen": -3.61396861076355, "logps/rejected": -104.44828033447266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7650954723358154, "rewards/margins": 9.999674797058105, "rewards/rejected": -7.234579086303711, "step": 471 }, { "epoch": 0.32267988378055035, "grad_norm": 0.005783146247267723, "learning_rate": 4.4565699977234796e-05, "logits/chosen": -7.742964744567871, "logits/rejected": -7.7345685958862305, "logps/chosen": -3.629291534423828, "logps/rejected": -104.32504272460938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.904996871948242, "rewards/margins": 10.066434860229492, "rewards/rejected": -7.161437034606934, "step": 472 }, { "epoch": 0.3233635276021193, "grad_norm": 0.14060890674591064, "learning_rate": 4.458101901229686e-05, "logits/chosen": -9.0277738571167, "logits/rejected": -9.01858901977539, "logps/chosen": -2.585986614227295, "logps/rejected": -106.68061828613281, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.912820339202881, "rewards/margins": 10.336631774902344, "rewards/rejected": -7.423811912536621, "step": 473 }, { "epoch": 0.3240471714236883, "grad_norm": 0.0013258624821901321, "learning_rate": 4.459630569456809e-05, "logits/chosen": -8.564998626708984, "logits/rejected": -8.555011749267578, "logps/chosen": -1.68840754032135, "logps/rejected": -104.91932678222656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9576756954193115, "rewards/margins": 10.303991317749023, "rewards/rejected": -7.346314907073975, "step": 474 }, { "epoch": 0.3247308152452572, "grad_norm": 0.0015278668142855167, "learning_rate": 4.461156016041444e-05, "logits/chosen": -8.558924674987793, "logits/rejected": -8.55068588256836, "logps/chosen": -5.515765190124512, "logps/rejected": -103.09732055664062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6014914512634277, "rewards/margins": 9.724061012268066, "rewards/rejected": -7.122570037841797, "step": 475 }, { "epoch": 0.3254144590668262, "grad_norm": 0.0014825885882601142, "learning_rate": 4.462678254534156e-05, "logits/chosen": -8.697341918945312, "logits/rejected": -8.686885833740234, "logps/chosen": -4.869255542755127, "logps/rejected": -103.71391296386719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6683764457702637, "rewards/margins": 9.838724136352539, "rewards/rejected": -7.170347690582275, "step": 476 }, { "epoch": 0.32609810288839514, "grad_norm": 0.002014046534895897, "learning_rate": 4.464197298400191e-05, "logits/chosen": -8.38284683227539, "logits/rejected": -8.373446464538574, "logps/chosen": -3.422443389892578, "logps/rejected": -101.76657104492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7969813346862793, "rewards/margins": 9.865921974182129, "rewards/rejected": -7.06894063949585, "step": 477 }, { "epoch": 0.3267817467099641, "grad_norm": 0.0014426767593249679, "learning_rate": 4.4657131610201994e-05, "logits/chosen": -8.738237380981445, "logits/rejected": -8.72602653503418, "logps/chosen": -2.416172981262207, "logps/rejected": -104.72158813476562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9565024375915527, "rewards/margins": 10.216925621032715, "rewards/rejected": -7.260423183441162, "step": 478 }, { "epoch": 0.32746539053153306, "grad_norm": 0.004093154799193144, "learning_rate": 4.467225855690939e-05, "logits/chosen": -8.258980751037598, "logits/rejected": -8.24978256225586, "logps/chosen": -2.350992441177368, "logps/rejected": -104.8758316040039, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9377365112304688, "rewards/margins": 10.133366584777832, "rewards/rejected": -7.195630073547363, "step": 479 }, { "epoch": 0.32814903435310205, "grad_norm": 0.0013650651089847088, "learning_rate": 4.468735395625979e-05, "logits/chosen": -8.250252723693848, "logits/rejected": -8.242077827453613, "logps/chosen": -0.49852651357650757, "logps/rejected": -106.38642883300781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.121192455291748, "rewards/margins": 10.513965606689453, "rewards/rejected": -7.392772674560547, "step": 480 }, { "epoch": 0.328832678174671, "grad_norm": 0.0061868708580732346, "learning_rate": 4.470241793956387e-05, "logits/chosen": -8.537254333496094, "logits/rejected": -8.528968811035156, "logps/chosen": -5.901983737945557, "logps/rejected": -103.16006469726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5430474281311035, "rewards/margins": 9.64773941040039, "rewards/rejected": -7.104691505432129, "step": 481 }, { "epoch": 0.32951632199624, "grad_norm": 0.003750113770365715, "learning_rate": 4.471745063731416e-05, "logits/chosen": -8.029670715332031, "logits/rejected": -8.022723197937012, "logps/chosen": -7.158431053161621, "logps/rejected": -101.67037200927734, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.538825511932373, "rewards/margins": 9.417570114135742, "rewards/rejected": -6.8787455558776855, "step": 482 }, { "epoch": 0.3301999658178089, "grad_norm": 0.01933920755982399, "learning_rate": 4.473245217919187e-05, "logits/chosen": -8.87919807434082, "logits/rejected": -8.871953964233398, "logps/chosen": -3.2212586402893066, "logps/rejected": -103.41313171386719, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.771517753601074, "rewards/margins": 9.895627975463867, "rewards/rejected": -7.124109745025635, "step": 483 }, { "epoch": 0.3308836096393779, "grad_norm": 0.0021278061904013157, "learning_rate": 4.474742269407355e-05, "logits/chosen": -8.930269241333008, "logits/rejected": -8.922283172607422, "logps/chosen": -0.9535489678382874, "logps/rejected": -106.35649108886719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0349509716033936, "rewards/margins": 10.501429557800293, "rewards/rejected": -7.46647834777832, "step": 484 }, { "epoch": 0.33156725346094684, "grad_norm": 0.00489907618612051, "learning_rate": 4.476236231003773e-05, "logits/chosen": -8.423600196838379, "logits/rejected": -8.414560317993164, "logps/chosen": -4.081560134887695, "logps/rejected": -102.44493103027344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.866814613342285, "rewards/margins": 9.844215393066406, "rewards/rejected": -6.977401256561279, "step": 485 }, { "epoch": 0.33225089728251583, "grad_norm": 0.014903204515576363, "learning_rate": 4.477727115437156e-05, "logits/chosen": -8.745259284973145, "logits/rejected": -8.732925415039062, "logps/chosen": -2.7057366371154785, "logps/rejected": -105.11150360107422, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.9714460372924805, "rewards/margins": 10.227808952331543, "rewards/rejected": -7.2563629150390625, "step": 486 }, { "epoch": 0.33293454110408477, "grad_norm": 0.002235093154013157, "learning_rate": 4.479214935357724e-05, "logits/chosen": -8.540664672851562, "logits/rejected": -8.531951904296875, "logps/chosen": -6.398951053619385, "logps/rejected": -103.16231536865234, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6921651363372803, "rewards/margins": 9.742839813232422, "rewards/rejected": -7.050674915313721, "step": 487 }, { "epoch": 0.33361818492565376, "grad_norm": 0.0009667477570474148, "learning_rate": 4.480699703337852e-05, "logits/chosen": -8.236481666564941, "logits/rejected": -8.228665351867676, "logps/chosen": -2.105247974395752, "logps/rejected": -105.45651245117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9786887168884277, "rewards/margins": 10.250114440917969, "rewards/rejected": -7.271424293518066, "step": 488 }, { "epoch": 0.3343018287472227, "grad_norm": 0.0032008292619138956, "learning_rate": 4.4821814318727016e-05, "logits/chosen": -8.048897743225098, "logits/rejected": -8.039741516113281, "logps/chosen": -3.01436710357666, "logps/rejected": -105.15585327148438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9299824237823486, "rewards/margins": 10.18440055847168, "rewards/rejected": -7.25441837310791, "step": 489 }, { "epoch": 0.3349854725687917, "grad_norm": 0.002589550567790866, "learning_rate": 4.483660133380856e-05, "logits/chosen": -8.643340110778809, "logits/rejected": -8.633667945861816, "logps/chosen": -2.0513951778411865, "logps/rejected": -104.131103515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9950079917907715, "rewards/margins": 10.220212936401367, "rewards/rejected": -7.2252044677734375, "step": 490 }, { "epoch": 0.3356691163903606, "grad_norm": 0.0014033180195838213, "learning_rate": 4.485135820204948e-05, "logits/chosen": -8.453453063964844, "logits/rejected": -8.447285652160645, "logps/chosen": -3.4667301177978516, "logps/rejected": -103.73129272460938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8766846656799316, "rewards/margins": 10.030512809753418, "rewards/rejected": -7.1538286209106445, "step": 491 }, { "epoch": 0.33635276021192956, "grad_norm": 0.007230238523334265, "learning_rate": 4.486608504612267e-05, "logits/chosen": -8.043546676635742, "logits/rejected": -8.035528182983398, "logps/chosen": -3.675070285797119, "logps/rejected": -104.3658676147461, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.818315029144287, "rewards/margins": 10.111940383911133, "rewards/rejected": -7.293625831604004, "step": 492 }, { "epoch": 0.33703640403349855, "grad_norm": 0.0011577564291656017, "learning_rate": 4.488078198795383e-05, "logits/chosen": -7.988910675048828, "logits/rejected": -7.980679035186768, "logps/chosen": -2.145134210586548, "logps/rejected": -104.7978515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9695188999176025, "rewards/margins": 10.253246307373047, "rewards/rejected": -7.283728122711182, "step": 493 }, { "epoch": 0.3377200478550675, "grad_norm": 0.001039127353578806, "learning_rate": 4.489544914872745e-05, "logits/chosen": -8.660932540893555, "logits/rejected": -8.650224685668945, "logps/chosen": -5.825237274169922, "logps/rejected": -103.39875030517578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.621295690536499, "rewards/margins": 9.725727081298828, "rewards/rejected": -7.104431629180908, "step": 494 }, { "epoch": 0.3384036916766365, "grad_norm": 0.01348673366010189, "learning_rate": 4.4910086648892815e-05, "logits/chosen": -8.039382934570312, "logits/rejected": -8.032678604125977, "logps/chosen": -6.533967971801758, "logps/rejected": -103.76960754394531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.534627676010132, "rewards/margins": 9.5947265625, "rewards/rejected": -7.060098648071289, "step": 495 }, { "epoch": 0.3390873354982054, "grad_norm": 0.04675036296248436, "learning_rate": 4.4924694608169965e-05, "logits/chosen": -7.563284873962402, "logits/rejected": -7.5536885261535645, "logps/chosen": -2.3267688751220703, "logps/rejected": -104.27783203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.0490903854370117, "rewards/margins": 10.177108764648438, "rewards/rejected": -7.128018379211426, "step": 496 }, { "epoch": 0.3397709793197744, "grad_norm": 0.8958771824836731, "learning_rate": 4.4939273145555536e-05, "logits/chosen": -8.5501127243042, "logits/rejected": -8.540250778198242, "logps/chosen": -2.9380013942718506, "logps/rejected": -103.09528350830078, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.8277928829193115, "rewards/margins": 9.982556343078613, "rewards/rejected": -7.154764175415039, "step": 497 }, { "epoch": 0.34045462314134334, "grad_norm": 0.0012584005016833544, "learning_rate": 4.495382237932863e-05, "logits/chosen": -8.299640655517578, "logits/rejected": -8.292192459106445, "logps/chosen": -3.6002590656280518, "logps/rejected": -104.08901977539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8089542388916016, "rewards/margins": 10.087903022766113, "rewards/rejected": -7.2789483070373535, "step": 498 }, { "epoch": 0.34113826696291233, "grad_norm": 0.001834243768826127, "learning_rate": 4.4968342427056505e-05, "logits/chosen": -8.431312561035156, "logits/rejected": -8.423611640930176, "logps/chosen": -9.939682960510254, "logps/rejected": -100.28660583496094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.352928876876831, "rewards/margins": 9.104085922241211, "rewards/rejected": -6.751155853271484, "step": 499 }, { "epoch": 0.34182191078448126, "grad_norm": 0.037215836346149445, "learning_rate": 4.498283340560031e-05, "logits/chosen": -8.711947441101074, "logits/rejected": -8.69975757598877, "logps/chosen": -2.4903714656829834, "logps/rejected": -106.04417419433594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9623427391052246, "rewards/margins": 10.290369987487793, "rewards/rejected": -7.328027725219727, "step": 500 }, { "epoch": 0.34250555460605026, "grad_norm": 0.5547650456428528, "learning_rate": 4.499729543112076e-05, "logits/chosen": -8.24644947052002, "logits/rejected": -8.23978042602539, "logps/chosen": -2.3066964149475098, "logps/rejected": -106.0586166381836, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.9819116592407227, "rewards/margins": 10.313545227050781, "rewards/rejected": -7.331634044647217, "step": 501 }, { "epoch": 0.3431891984276192, "grad_norm": 0.007571065332740545, "learning_rate": 4.501172861908366e-05, "logits/chosen": -8.468841552734375, "logits/rejected": -8.46197509765625, "logps/chosen": -2.615532398223877, "logps/rejected": -106.12149047851562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.849536895751953, "rewards/margins": 10.251347541809082, "rewards/rejected": -7.401810646057129, "step": 502 }, { "epoch": 0.3438728422491882, "grad_norm": 0.015846312046051025, "learning_rate": 4.502613308426546e-05, "logits/chosen": -8.91657829284668, "logits/rejected": -8.90576171875, "logps/chosen": -6.97099494934082, "logps/rejected": -101.73868560791016, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.5278360843658447, "rewards/margins": 9.477986335754395, "rewards/rejected": -6.950150489807129, "step": 503 }, { "epoch": 0.3445564860707571, "grad_norm": 0.001375861931592226, "learning_rate": 4.504050894075876e-05, "logits/chosen": -8.676509857177734, "logits/rejected": -8.668481826782227, "logps/chosen": -0.327106773853302, "logps/rejected": -106.3055419921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.170165777206421, "rewards/margins": 10.620999336242676, "rewards/rejected": -7.450833797454834, "step": 504 }, { "epoch": 0.3452401298923261, "grad_norm": 0.0020000154618173838, "learning_rate": 4.5054856301977696e-05, "logits/chosen": -9.122840881347656, "logits/rejected": -9.114381790161133, "logps/chosen": -1.9594602584838867, "logps/rejected": -104.32960510253906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0264248847961426, "rewards/margins": 10.27796745300293, "rewards/rejected": -7.251543045043945, "step": 505 }, { "epoch": 0.34592377371389504, "grad_norm": 0.003062107600271702, "learning_rate": 4.506917528066332e-05, "logits/chosen": -8.640966415405273, "logits/rejected": -8.634054183959961, "logps/chosen": -2.033053159713745, "logps/rejected": -106.69282531738281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.982987642288208, "rewards/margins": 10.416561126708984, "rewards/rejected": -7.4335737228393555, "step": 506 }, { "epoch": 0.34660741753546404, "grad_norm": 0.00285928207449615, "learning_rate": 4.508346598888894e-05, "logits/chosen": -8.909124374389648, "logits/rejected": -8.900798797607422, "logps/chosen": -0.3890218138694763, "logps/rejected": -106.06564331054688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.085998058319092, "rewards/margins": 10.539165496826172, "rewards/rejected": -7.453167915344238, "step": 507 }, { "epoch": 0.34729106135703297, "grad_norm": 0.0009102821350097656, "learning_rate": 4.509772853806532e-05, "logits/chosen": -8.774336814880371, "logits/rejected": -8.768088340759277, "logps/chosen": -2.6880226135253906, "logps/rejected": -104.37442016601562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.908202886581421, "rewards/margins": 10.152679443359375, "rewards/rejected": -7.244477272033691, "step": 508 }, { "epoch": 0.34797470517860196, "grad_norm": 0.0036198990419507027, "learning_rate": 4.511196303894598e-05, "logits/chosen": -8.268594741821289, "logits/rejected": -8.260845184326172, "logps/chosen": -4.112466812133789, "logps/rejected": -105.69254302978516, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.825169801712036, "rewards/margins": 10.11925220489502, "rewards/rejected": -7.294083118438721, "step": 509 }, { "epoch": 0.3486583490001709, "grad_norm": 0.0013479518238455057, "learning_rate": 4.512616960163227e-05, "logits/chosen": -8.522378921508789, "logits/rejected": -8.514694213867188, "logps/chosen": -0.2214025855064392, "logps/rejected": -107.65446472167969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0971834659576416, "rewards/margins": 10.682053565979004, "rewards/rejected": -7.584869384765625, "step": 510 }, { "epoch": 0.3493419928217399, "grad_norm": 0.001095878193154931, "learning_rate": 4.5140348335578547e-05, "logits/chosen": -8.628227233886719, "logits/rejected": -8.622039794921875, "logps/chosen": -0.36658817529678345, "logps/rejected": -105.71177673339844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.145615816116333, "rewards/margins": 10.53858757019043, "rewards/rejected": -7.392972946166992, "step": 511 }, { "epoch": 0.3500256366433088, "grad_norm": 0.001534723211079836, "learning_rate": 4.515449934959718e-05, "logits/chosen": -8.196242332458496, "logits/rejected": -8.189290046691895, "logps/chosen": -2.016763210296631, "logps/rejected": -103.82477569580078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0176162719726562, "rewards/margins": 10.158855438232422, "rewards/rejected": -7.141238689422607, "step": 512 }, { "epoch": 0.3507092804648778, "grad_norm": 0.026840349659323692, "learning_rate": 4.516862275186361e-05, "logits/chosen": -8.746345520019531, "logits/rejected": -8.740405082702637, "logps/chosen": -2.0899293422698975, "logps/rejected": -106.89317321777344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.926164388656616, "rewards/margins": 10.363093376159668, "rewards/rejected": -7.436928749084473, "step": 513 }, { "epoch": 0.35139292428644675, "grad_norm": 0.0024204105138778687, "learning_rate": 4.518271864992127e-05, "logits/chosen": -8.779780387878418, "logits/rejected": -8.774059295654297, "logps/chosen": -2.343595027923584, "logps/rejected": -105.1497802734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9416275024414062, "rewards/margins": 10.277658462524414, "rewards/rejected": -7.336030960083008, "step": 514 }, { "epoch": 0.35207656810801574, "grad_norm": 0.0018259048229083419, "learning_rate": 4.519678715068652e-05, "logits/chosen": -9.061904907226562, "logits/rejected": -9.053589820861816, "logps/chosen": -0.5819038152694702, "logps/rejected": -106.97761535644531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.112555503845215, "rewards/margins": 10.553955078125, "rewards/rejected": -7.441399097442627, "step": 515 }, { "epoch": 0.3527602119295847, "grad_norm": 0.009637483395636082, "learning_rate": 4.521082836045353e-05, "logits/chosen": -9.004085540771484, "logits/rejected": -8.996991157531738, "logps/chosen": -2.1073989868164062, "logps/rejected": -105.78080749511719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9963436126708984, "rewards/margins": 10.33294677734375, "rewards/rejected": -7.336603164672852, "step": 516 }, { "epoch": 0.35344385575115367, "grad_norm": 0.005179519299417734, "learning_rate": 4.5224842384899045e-05, "logits/chosen": -8.54949951171875, "logits/rejected": -8.542850494384766, "logps/chosen": -0.2195740044116974, "logps/rejected": -107.4166030883789, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.223278045654297, "rewards/margins": 10.665200233459473, "rewards/rejected": -7.441922187805176, "step": 517 }, { "epoch": 0.3541274995727226, "grad_norm": 0.0030595571734011173, "learning_rate": 4.523882932908722e-05, "logits/chosen": -9.063653945922852, "logits/rejected": -9.055839538574219, "logps/chosen": -2.1390576362609863, "logps/rejected": -104.51939392089844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.037348985671997, "rewards/margins": 10.306814193725586, "rewards/rejected": -7.269465446472168, "step": 518 }, { "epoch": 0.3548111433942916, "grad_norm": 0.0030745009426027536, "learning_rate": 4.52527892974743e-05, "logits/chosen": -8.814664840698242, "logits/rejected": -8.807222366333008, "logps/chosen": -2.2190515995025635, "logps/rejected": -103.7955551147461, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.050616979598999, "rewards/margins": 10.222452163696289, "rewards/rejected": -7.171834945678711, "step": 519 }, { "epoch": 0.35549478721586053, "grad_norm": 0.002484948141500354, "learning_rate": 4.526672239391333e-05, "logits/chosen": -8.779367446899414, "logits/rejected": -8.773014068603516, "logps/chosen": -4.210451602935791, "logps/rejected": -104.68485260009766, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8560290336608887, "rewards/margins": 10.089742660522461, "rewards/rejected": -7.233713150024414, "step": 520 }, { "epoch": 0.3561784310374295, "grad_norm": 0.0030445915181189775, "learning_rate": 4.528062872165875e-05, "logits/chosen": -9.03580093383789, "logits/rejected": -9.028921127319336, "logps/chosen": -3.6832785606384277, "logps/rejected": -105.81210327148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7440707683563232, "rewards/margins": 10.099363327026367, "rewards/rejected": -7.355292320251465, "step": 521 }, { "epoch": 0.35686207485899846, "grad_norm": 0.01798449642956257, "learning_rate": 4.529450838337104e-05, "logits/chosen": -8.569169998168945, "logits/rejected": -8.562253952026367, "logps/chosen": -2.2889931201934814, "logps/rejected": -105.79234313964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8429834842681885, "rewards/margins": 10.250594139099121, "rewards/rejected": -7.407610893249512, "step": 522 }, { "epoch": 0.35754571868056745, "grad_norm": 0.017598913982510567, "learning_rate": 4.530836148112124e-05, "logits/chosen": -8.44102668762207, "logits/rejected": -8.43248176574707, "logps/chosen": -2.363553285598755, "logps/rejected": -107.67010498046875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.026621103286743, "rewards/margins": 10.457855224609375, "rewards/rejected": -7.431234359741211, "step": 523 }, { "epoch": 0.3582293625021364, "grad_norm": 0.007089692167937756, "learning_rate": 4.532218811639545e-05, "logits/chosen": -8.386029243469238, "logits/rejected": -8.380964279174805, "logps/chosen": -5.226341247558594, "logps/rejected": -105.19155883789062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6615748405456543, "rewards/margins": 9.892965316772461, "rewards/rejected": -7.231390953063965, "step": 524 }, { "epoch": 0.3589130063237054, "grad_norm": 0.003910040948539972, "learning_rate": 4.5335988390099284e-05, "logits/chosen": -8.716702461242676, "logits/rejected": -8.709980964660645, "logps/chosen": -2.842343330383301, "logps/rejected": -103.74392700195312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9060425758361816, "rewards/margins": 10.125288963317871, "rewards/rejected": -7.219246864318848, "step": 525 }, { "epoch": 0.3595966501452743, "grad_norm": 0.12534207105636597, "learning_rate": 4.534976240256232e-05, "logits/chosen": -8.458158493041992, "logits/rejected": -8.451277732849121, "logps/chosen": -0.2034679651260376, "logps/rejected": -107.31441497802734, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.194084644317627, "rewards/margins": 10.661066055297852, "rewards/rejected": -7.466981887817383, "step": 526 }, { "epoch": 0.3602802939668433, "grad_norm": 0.002347145928069949, "learning_rate": 4.536351025354245e-05, "logits/chosen": -8.932251930236816, "logits/rejected": -8.926216125488281, "logps/chosen": -6.3815460205078125, "logps/rejected": -104.78129577636719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5657718181610107, "rewards/margins": 9.837814331054688, "rewards/rejected": -7.272042274475098, "step": 527 }, { "epoch": 0.36096393778841224, "grad_norm": 0.004644155967980623, "learning_rate": 4.537723204223021e-05, "logits/chosen": -8.162765502929688, "logits/rejected": -8.156951904296875, "logps/chosen": -2.3011324405670166, "logps/rejected": -108.49044799804688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.866445541381836, "rewards/margins": 10.424814224243164, "rewards/rejected": -7.55836820602417, "step": 528 }, { "epoch": 0.3616475816099812, "grad_norm": 0.10573361814022064, "learning_rate": 4.53909278672531e-05, "logits/chosen": -8.624829292297363, "logits/rejected": -8.618780136108398, "logps/chosen": -6.4251909255981445, "logps/rejected": -104.890625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.6115946769714355, "rewards/margins": 9.819175720214844, "rewards/rejected": -7.207581520080566, "step": 529 }, { "epoch": 0.36233122543155016, "grad_norm": 0.014966655522584915, "learning_rate": 4.5404597826679824e-05, "logits/chosen": -8.710149765014648, "logits/rejected": -8.702912330627441, "logps/chosen": -4.090182781219482, "logps/rejected": -104.3074951171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.822023391723633, "rewards/margins": 9.991340637207031, "rewards/rejected": -7.169317245483398, "step": 530 }, { "epoch": 0.3630148692531191, "grad_norm": 0.0013991504674777389, "learning_rate": 4.541824201802449e-05, "logits/chosen": -8.357665061950684, "logits/rejected": -8.348608016967773, "logps/chosen": -0.455402135848999, "logps/rejected": -107.76449584960938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1156625747680664, "rewards/margins": 10.683145523071289, "rewards/rejected": -7.5674824714660645, "step": 531 }, { "epoch": 0.3636985130746881, "grad_norm": 0.1360078901052475, "learning_rate": 4.543186053825081e-05, "logits/chosen": -9.073970794677734, "logits/rejected": -9.067255020141602, "logps/chosen": -6.493519306182861, "logps/rejected": -101.43251037597656, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.4746761322021484, "rewards/margins": 9.446413040161133, "rewards/rejected": -6.971736431121826, "step": 532 }, { "epoch": 0.364382156896257, "grad_norm": 0.0012016879627481103, "learning_rate": 4.544545348377621e-05, "logits/chosen": -8.85019588470459, "logits/rejected": -8.844279289245605, "logps/chosen": -2.8336009979248047, "logps/rejected": -106.91740417480469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.893784523010254, "rewards/margins": 10.381479263305664, "rewards/rejected": -7.487694263458252, "step": 533 }, { "epoch": 0.365065800717826, "grad_norm": 0.0013644045684486628, "learning_rate": 4.5459020950475946e-05, "logits/chosen": -8.693999290466309, "logits/rejected": -8.686568260192871, "logps/chosen": -5.384868621826172, "logps/rejected": -105.42316436767578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5882716178894043, "rewards/margins": 9.919539451599121, "rewards/rejected": -7.331268787384033, "step": 534 }, { "epoch": 0.36574944453939495, "grad_norm": 0.19662976264953613, "learning_rate": 4.5472563033687145e-05, "logits/chosen": -7.862525939941406, "logits/rejected": -7.85363245010376, "logps/chosen": -0.24509553611278534, "logps/rejected": -107.10560607910156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 3.2135324478149414, "rewards/margins": 10.639017105102539, "rewards/rejected": -7.425485134124756, "step": 535 }, { "epoch": 0.36643308836096394, "grad_norm": 0.005831377115100622, "learning_rate": 4.548607982821284e-05, "logits/chosen": -8.940006256103516, "logits/rejected": -8.931025505065918, "logps/chosen": -5.2350873947143555, "logps/rejected": -104.00788116455078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.711857318878174, "rewards/margins": 9.861034393310547, "rewards/rejected": -7.149176597595215, "step": 536 }, { "epoch": 0.3671167321825329, "grad_norm": 0.0020434055477380753, "learning_rate": 4.5499571428325935e-05, "logits/chosen": -7.958423614501953, "logits/rejected": -7.9507737159729, "logps/chosen": -5.039041042327881, "logps/rejected": -103.1528091430664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.721285104751587, "rewards/margins": 9.856602668762207, "rewards/rejected": -7.135318279266357, "step": 537 }, { "epoch": 0.36780037600410187, "grad_norm": 0.0027917856350541115, "learning_rate": 4.5513037927773155e-05, "logits/chosen": -8.918725967407227, "logits/rejected": -8.912227630615234, "logps/chosen": -4.297214031219482, "logps/rejected": -105.8448486328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6667566299438477, "rewards/margins": 10.095367431640625, "rewards/rejected": -7.428611755371094, "step": 538 }, { "epoch": 0.3684840198256708, "grad_norm": 0.0015323086408898234, "learning_rate": 4.5526479419778986e-05, "logits/chosen": -9.015233993530273, "logits/rejected": -9.006407737731934, "logps/chosen": -5.570869445800781, "logps/rejected": -103.89583587646484, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5463967323303223, "rewards/margins": 9.823688507080078, "rewards/rejected": -7.277291774749756, "step": 539 }, { "epoch": 0.3691676636472398, "grad_norm": 0.0026783794164657593, "learning_rate": 4.553989599704948e-05, "logits/chosen": -9.076767921447754, "logits/rejected": -9.066667556762695, "logps/chosen": -1.7292388677597046, "logps/rejected": -106.40653991699219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9628357887268066, "rewards/margins": 10.44752311706543, "rewards/rejected": -7.484687328338623, "step": 540 }, { "epoch": 0.36985130746880873, "grad_norm": 0.001596634741872549, "learning_rate": 4.555328775177616e-05, "logits/chosen": -8.075600624084473, "logits/rejected": -8.06791877746582, "logps/chosen": -7.225759983062744, "logps/rejected": -102.36463165283203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4857876300811768, "rewards/margins": 9.507375717163086, "rewards/rejected": -7.021587371826172, "step": 541 }, { "epoch": 0.3705349512903777, "grad_norm": 0.0013423098716884851, "learning_rate": 4.5566654775639785e-05, "logits/chosen": -8.757363319396973, "logits/rejected": -8.749910354614258, "logps/chosen": -4.235930442810059, "logps/rejected": -104.26500701904297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.819624662399292, "rewards/margins": 10.016908645629883, "rewards/rejected": -7.197283744812012, "step": 542 }, { "epoch": 0.37121859511194666, "grad_norm": 0.19696085155010223, "learning_rate": 4.5579997159814117e-05, "logits/chosen": -8.57602310180664, "logits/rejected": -8.569262504577637, "logps/chosen": -6.078829288482666, "logps/rejected": -105.01226806640625, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": 2.6462416648864746, "rewards/margins": 9.826871871948242, "rewards/rejected": -7.180630207061768, "step": 543 }, { "epoch": 0.37190223893351565, "grad_norm": 0.0009205901296809316, "learning_rate": 4.5593314994969665e-05, "logits/chosen": -8.260759353637695, "logits/rejected": -8.253074645996094, "logps/chosen": -2.8376593589782715, "logps/rejected": -107.60362243652344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.94307279586792, "rewards/margins": 10.438429832458496, "rewards/rejected": -7.495357513427734, "step": 544 }, { "epoch": 0.3725858827550846, "grad_norm": 0.0022142576053738594, "learning_rate": 4.560660837127738e-05, "logits/chosen": -8.802131652832031, "logits/rejected": -8.795625686645508, "logps/chosen": -0.4525069296360016, "logps/rejected": -108.47467041015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.050459384918213, "rewards/margins": 10.716374397277832, "rewards/rejected": -7.665914535522461, "step": 545 }, { "epoch": 0.3732695265766536, "grad_norm": 0.0013103248784318566, "learning_rate": 4.561987737841229e-05, "logits/chosen": -8.30688762664795, "logits/rejected": -8.299917221069336, "logps/chosen": -3.467480182647705, "logps/rejected": -103.85676574707031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.736145496368408, "rewards/margins": 9.978313446044922, "rewards/rejected": -7.242167949676514, "step": 546 }, { "epoch": 0.3739531703982225, "grad_norm": 0.0021970246452838182, "learning_rate": 4.563312210555719e-05, "logits/chosen": -8.67757511138916, "logits/rejected": -8.666135787963867, "logps/chosen": -2.8361434936523438, "logps/rejected": -105.41587829589844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0015244483947754, "rewards/margins": 10.247262954711914, "rewards/rejected": -7.2457380294799805, "step": 547 }, { "epoch": 0.3746368142197915, "grad_norm": 0.001669823075644672, "learning_rate": 4.564634264140616e-05, "logits/chosen": -8.999870300292969, "logits/rejected": -8.989389419555664, "logps/chosen": -0.698437511920929, "logps/rejected": -106.5791015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.053706169128418, "rewards/margins": 10.576841354370117, "rewards/rejected": -7.523136138916016, "step": 548 }, { "epoch": 0.37532045804136044, "grad_norm": 0.0013778910506516695, "learning_rate": 4.56595390741682e-05, "logits/chosen": -8.743714332580566, "logits/rejected": -8.735590934753418, "logps/chosen": -3.1071789264678955, "logps/rejected": -106.09569549560547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8997299671173096, "rewards/margins": 10.253467559814453, "rewards/rejected": -7.353737831115723, "step": 549 }, { "epoch": 0.37600410186292943, "grad_norm": 0.9665376543998718, "learning_rate": 4.567271149157073e-05, "logits/chosen": -8.333063125610352, "logits/rejected": -8.32824420928955, "logps/chosen": -8.114344596862793, "logps/rejected": -104.68082427978516, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 2.3675384521484375, "rewards/margins": 9.609511375427246, "rewards/rejected": -7.241972923278809, "step": 550 }, { "epoch": 0.37668774568449837, "grad_norm": 0.007263954728841782, "learning_rate": 4.5685859980863086e-05, "logits/chosen": -8.855039596557617, "logits/rejected": -8.845767974853516, "logps/chosen": -1.2399609088897705, "logps/rejected": -106.70069885253906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.075157642364502, "rewards/margins": 10.521895408630371, "rewards/rejected": -7.446737766265869, "step": 551 }, { "epoch": 0.37737138950606736, "grad_norm": 0.004008229356259108, "learning_rate": 4.569898462881999e-05, "logits/chosen": -8.860137939453125, "logits/rejected": -8.851652145385742, "logps/chosen": -3.5987207889556885, "logps/rejected": -105.0776596069336, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7817349433898926, "rewards/margins": 10.117851257324219, "rewards/rejected": -7.336116790771484, "step": 552 }, { "epoch": 0.3780550333276363, "grad_norm": 0.005858825985342264, "learning_rate": 4.571208552174497e-05, "logits/chosen": -8.812792778015137, "logits/rejected": -8.806192398071289, "logps/chosen": -9.174994468688965, "logps/rejected": -101.59281158447266, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.283297538757324, "rewards/margins": 9.177817344665527, "rewards/rejected": -6.8945207595825195, "step": 553 }, { "epoch": 0.3787386771492053, "grad_norm": 0.1876385509967804, "learning_rate": 4.572516274547383e-05, "logits/chosen": -8.669267654418945, "logits/rejected": -8.661458015441895, "logps/chosen": -3.6457464694976807, "logps/rejected": -105.7276611328125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.851086139678955, "rewards/margins": 10.192622184753418, "rewards/rejected": -7.341536521911621, "step": 554 }, { "epoch": 0.3794223209707742, "grad_norm": 0.43734100461006165, "learning_rate": 4.573821638537794e-05, "logits/chosen": -8.221056938171387, "logits/rejected": -8.21543025970459, "logps/chosen": -7.783557415008545, "logps/rejected": -103.34688568115234, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.520139694213867, "rewards/margins": 9.543603897094727, "rewards/rejected": -7.023464679718018, "step": 555 }, { "epoch": 0.3801059647923432, "grad_norm": 0.07461749762296677, "learning_rate": 4.575124652636763e-05, "logits/chosen": -8.62582015991211, "logits/rejected": -8.61740493774414, "logps/chosen": -7.623571395874023, "logps/rejected": -101.0118179321289, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.567636489868164, "rewards/margins": 9.42699909210205, "rewards/rejected": -6.85936164855957, "step": 556 }, { "epoch": 0.38078960861391214, "grad_norm": 0.017381882295012474, "learning_rate": 4.5764253252895486e-05, "logits/chosen": -8.886857032775879, "logits/rejected": -8.878725051879883, "logps/chosen": -8.122917175292969, "logps/rejected": -103.88660430908203, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.403275728225708, "rewards/margins": 9.486910820007324, "rewards/rejected": -7.083635330200195, "step": 557 }, { "epoch": 0.38147325243548114, "grad_norm": 0.009558760561048985, "learning_rate": 4.577723664895965e-05, "logits/chosen": -8.360452651977539, "logits/rejected": -8.353442192077637, "logps/chosen": -9.06432056427002, "logps/rejected": -103.78009033203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.281027317047119, "rewards/margins": 9.338869094848633, "rewards/rejected": -7.0578413009643555, "step": 558 }, { "epoch": 0.38215689625705007, "grad_norm": 0.014928596094250679, "learning_rate": 4.579019679810706e-05, "logits/chosen": -8.317472457885742, "logits/rejected": -8.31100082397461, "logps/chosen": -7.856249809265137, "logps/rejected": -103.80247497558594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.4019904136657715, "rewards/margins": 9.572214126586914, "rewards/rejected": -7.170223236083984, "step": 559 }, { "epoch": 0.38284054007861906, "grad_norm": 0.04886111244559288, "learning_rate": 4.5803133783436676e-05, "logits/chosen": -8.262016296386719, "logits/rejected": -8.254585266113281, "logps/chosen": -7.028374195098877, "logps/rejected": -104.12940979003906, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.505631923675537, "rewards/margins": 9.65554428100586, "rewards/rejected": -7.1499128341674805, "step": 560 }, { "epoch": 0.383524183900188, "grad_norm": 0.012986937537789345, "learning_rate": 4.581604768760269e-05, "logits/chosen": -8.254681587219238, "logits/rejected": -8.24558162689209, "logps/chosen": -4.371557235717773, "logps/rejected": -104.9013671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8006577491760254, "rewards/margins": 10.062606811523438, "rewards/rejected": -7.2619500160217285, "step": 561 }, { "epoch": 0.384207827721757, "grad_norm": 0.008311575278639793, "learning_rate": 4.582893859281769e-05, "logits/chosen": -7.894365310668945, "logits/rejected": -7.889013290405273, "logps/chosen": -8.379609107971191, "logps/rejected": -104.8883285522461, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3424105644226074, "rewards/margins": 9.538131713867188, "rewards/rejected": -7.19572114944458, "step": 562 }, { "epoch": 0.3848914715433259, "grad_norm": 0.006131183821707964, "learning_rate": 4.584180658085578e-05, "logits/chosen": -8.107142448425293, "logits/rejected": -8.096502304077148, "logps/chosen": -3.2123143672943115, "logps/rejected": -106.4113540649414, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8114352226257324, "rewards/margins": 10.295516014099121, "rewards/rejected": -7.484080791473389, "step": 563 }, { "epoch": 0.3855751153648949, "grad_norm": 0.004662070423364639, "learning_rate": 4.585465173305571e-05, "logits/chosen": -9.062649726867676, "logits/rejected": -9.053556442260742, "logps/chosen": -6.10276985168457, "logps/rejected": -103.59041595458984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.560807704925537, "rewards/margins": 9.770605087280273, "rewards/rejected": -7.209796905517578, "step": 564 }, { "epoch": 0.38625875918646385, "grad_norm": 0.0024933405220508575, "learning_rate": 4.5867474130323984e-05, "logits/chosen": -8.054591178894043, "logits/rejected": -8.047430038452148, "logps/chosen": -3.8450570106506348, "logps/rejected": -106.15379333496094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.78342342376709, "rewards/margins": 10.17796516418457, "rewards/rejected": -7.394542217254639, "step": 565 }, { "epoch": 0.38694240300803284, "grad_norm": 0.006151636131107807, "learning_rate": 4.588027385313786e-05, "logits/chosen": -8.169829368591309, "logits/rejected": -8.163496971130371, "logps/chosen": -5.6698479652404785, "logps/rejected": -105.5827407836914, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5689215660095215, "rewards/margins": 9.920917510986328, "rewards/rejected": -7.351995944976807, "step": 566 }, { "epoch": 0.3876260468296018, "grad_norm": 5.168415069580078, "learning_rate": 4.5893050981548446e-05, "logits/chosen": -9.0006103515625, "logits/rejected": -8.992657661437988, "logps/chosen": -8.390092849731445, "logps/rejected": -104.20333862304688, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 2.2624621391296387, "rewards/margins": 9.526378631591797, "rewards/rejected": -7.263917446136475, "step": 567 }, { "epoch": 0.3883096906511707, "grad_norm": 0.40208378434181213, "learning_rate": 4.5905805595183656e-05, "logits/chosen": -9.33488941192627, "logits/rejected": -9.323884963989258, "logps/chosen": -3.062016487121582, "logps/rejected": -106.27412414550781, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.8704464435577393, "rewards/margins": 10.292535781860352, "rewards/rejected": -7.422089099884033, "step": 568 }, { "epoch": 0.3889933344727397, "grad_norm": 0.004512095358222723, "learning_rate": 4.591853777325119e-05, "logits/chosen": -8.960912704467773, "logits/rejected": -8.952515602111816, "logps/chosen": -10.488362312316895, "logps/rejected": -102.77323913574219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2130544185638428, "rewards/margins": 9.120040893554688, "rewards/rejected": -6.906986713409424, "step": 569 }, { "epoch": 0.38967697829430864, "grad_norm": 0.0059547447599470615, "learning_rate": 4.593124759454153e-05, "logits/chosen": -8.343918800354004, "logits/rejected": -8.337284088134766, "logps/chosen": -13.361282348632812, "logps/rejected": -101.08283996582031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1572937965393066, "rewards/margins": 8.759539604187012, "rewards/rejected": -6.602246284484863, "step": 570 }, { "epoch": 0.39036062211587763, "grad_norm": 0.003797155572101474, "learning_rate": 4.5943935137430806e-05, "logits/chosen": -8.468061447143555, "logits/rejected": -8.457059860229492, "logps/chosen": -6.84923791885376, "logps/rejected": -103.3392333984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.508284568786621, "rewards/margins": 9.58808422088623, "rewards/rejected": -7.079799652099609, "step": 571 }, { "epoch": 0.39104426593744657, "grad_norm": 0.0024280259385704994, "learning_rate": 4.595660047988374e-05, "logits/chosen": -8.285135269165039, "logits/rejected": -8.276823997497559, "logps/chosen": -3.869678497314453, "logps/rejected": -105.55516052246094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.714939594268799, "rewards/margins": 10.14161205291748, "rewards/rejected": -7.426671981811523, "step": 572 }, { "epoch": 0.39172790975901556, "grad_norm": 0.002518144901841879, "learning_rate": 4.59692436994565e-05, "logits/chosen": -8.42386245727539, "logits/rejected": -8.415853500366211, "logps/chosen": -4.778499603271484, "logps/rejected": -105.35393524169922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7317118644714355, "rewards/margins": 10.040302276611328, "rewards/rejected": -7.308589935302734, "step": 573 }, { "epoch": 0.3924115535805845, "grad_norm": 0.006311685312539339, "learning_rate": 4.5981864873299563e-05, "logits/chosen": -8.874204635620117, "logits/rejected": -8.863118171691895, "logps/chosen": -2.0156564712524414, "logps/rejected": -105.76942443847656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.945424795150757, "rewards/margins": 10.388139724731445, "rewards/rejected": -7.442714214324951, "step": 574 }, { "epoch": 0.3930951974021535, "grad_norm": 0.014740025624632835, "learning_rate": 4.599446407816052e-05, "logits/chosen": -8.024564743041992, "logits/rejected": -8.011886596679688, "logps/chosen": -4.820056438446045, "logps/rejected": -103.82994079589844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6856627464294434, "rewards/margins": 9.785124778747559, "rewards/rejected": -7.099461555480957, "step": 575 }, { "epoch": 0.3937788412237224, "grad_norm": 0.0038888133130967617, "learning_rate": 4.6007041390386874e-05, "logits/chosen": -8.4778470993042, "logits/rejected": -8.469890594482422, "logps/chosen": -9.351346969604492, "logps/rejected": -103.89117431640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.316455364227295, "rewards/margins": 9.395227432250977, "rewards/rejected": -7.07877254486084, "step": 576 }, { "epoch": 0.3944624850452914, "grad_norm": 0.03431423753499985, "learning_rate": 4.601959688592886e-05, "logits/chosen": -8.282942771911621, "logits/rejected": -8.273542404174805, "logps/chosen": -11.958642959594727, "logps/rejected": -102.07725524902344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.0872135162353516, "rewards/margins": 8.997941017150879, "rewards/rejected": -6.910727500915527, "step": 577 }, { "epoch": 0.39514612886686035, "grad_norm": 0.003184944624081254, "learning_rate": 4.603213064034216e-05, "logits/chosen": -8.198872566223145, "logits/rejected": -8.189592361450195, "logps/chosen": -5.108485698699951, "logps/rejected": -104.42049407958984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.720396041870117, "rewards/margins": 9.966577529907227, "rewards/rejected": -7.246180534362793, "step": 578 }, { "epoch": 0.39582977268842934, "grad_norm": 0.0026459884829819202, "learning_rate": 4.604464272879061e-05, "logits/chosen": -8.726604461669922, "logits/rejected": -8.717840194702148, "logps/chosen": -5.164083957672119, "logps/rejected": -105.4052734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5914320945739746, "rewards/margins": 9.939166069030762, "rewards/rejected": -7.347734451293945, "step": 579 }, { "epoch": 0.3965134165099983, "grad_norm": 0.0029460766818374395, "learning_rate": 4.605713322604896e-05, "logits/chosen": -8.560563087463379, "logits/rejected": -8.552568435668945, "logps/chosen": -6.519986152648926, "logps/rejected": -103.46198272705078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.548583745956421, "rewards/margins": 9.622475624084473, "rewards/rejected": -7.073892116546631, "step": 580 }, { "epoch": 0.39719706033156726, "grad_norm": 0.0013321733567863703, "learning_rate": 4.606960220650551e-05, "logits/chosen": -8.359171867370605, "logits/rejected": -8.34779167175293, "logps/chosen": -2.507463216781616, "logps/rejected": -105.7034912109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0159690380096436, "rewards/margins": 10.3662691116333, "rewards/rejected": -7.350300312042236, "step": 581 }, { "epoch": 0.3978807041531362, "grad_norm": 0.003979991655796766, "learning_rate": 4.608204974416481e-05, "logits/chosen": -7.301906585693359, "logits/rejected": -7.294061183929443, "logps/chosen": -7.414824485778809, "logps/rejected": -102.34539794921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5049726963043213, "rewards/margins": 9.477119445800781, "rewards/rejected": -6.972146987915039, "step": 582 }, { "epoch": 0.3985643479747052, "grad_norm": 0.0016385484486818314, "learning_rate": 4.6094475912650234e-05, "logits/chosen": -8.292871475219727, "logits/rejected": -8.285087585449219, "logps/chosen": -4.511925220489502, "logps/rejected": -104.4433822631836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.716689109802246, "rewards/margins": 10.001952171325684, "rewards/rejected": -7.285263538360596, "step": 583 }, { "epoch": 0.3992479917962741, "grad_norm": 0.005751208867877722, "learning_rate": 4.610688078520666e-05, "logits/chosen": -8.044489860534668, "logits/rejected": -8.037750244140625, "logps/chosen": -6.3470611572265625, "logps/rejected": -104.4365234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.588156223297119, "rewards/margins": 9.749639511108398, "rewards/rejected": -7.161483287811279, "step": 584 }, { "epoch": 0.3999316356178431, "grad_norm": 0.0031950161792337894, "learning_rate": 4.611926443470301e-05, "logits/chosen": -7.472258567810059, "logits/rejected": -7.465157508850098, "logps/chosen": -6.837857723236084, "logps/rejected": -103.01193237304688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5669338703155518, "rewards/margins": 9.610733985900879, "rewards/rejected": -7.043799877166748, "step": 585 }, { "epoch": 0.40061527943941205, "grad_norm": 0.0074254730716347694, "learning_rate": 4.6131626933634844e-05, "logits/chosen": -8.576961517333984, "logits/rejected": -8.569276809692383, "logps/chosen": -6.358741760253906, "logps/rejected": -104.70413970947266, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.559098243713379, "rewards/margins": 9.737351417541504, "rewards/rejected": -7.178253173828125, "step": 586 }, { "epoch": 0.40129892326098104, "grad_norm": 0.031281232833862305, "learning_rate": 4.6143968354126914e-05, "logits/chosen": -8.032002449035645, "logits/rejected": -8.024921417236328, "logps/chosen": -5.586743354797363, "logps/rejected": -103.89324951171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.6771154403686523, "rewards/margins": 9.839521408081055, "rewards/rejected": -7.162405967712402, "step": 587 }, { "epoch": 0.40198256708255, "grad_norm": 0.0029978954698890448, "learning_rate": 4.6156288767935646e-05, "logits/chosen": -8.560604095458984, "logits/rejected": -8.55235767364502, "logps/chosen": -3.347517728805542, "logps/rejected": -104.95110321044922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.919206142425537, "rewards/margins": 10.151535034179688, "rewards/rejected": -7.232329368591309, "step": 588 }, { "epoch": 0.40266621090411897, "grad_norm": 0.4118789732456207, "learning_rate": 4.61685882464517e-05, "logits/chosen": -8.162120819091797, "logits/rejected": -8.154633522033691, "logps/chosen": -2.639763832092285, "logps/rejected": -106.35176849365234, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.7827415466308594, "rewards/margins": 10.368711471557617, "rewards/rejected": -7.585968971252441, "step": 589 }, { "epoch": 0.4033498547256879, "grad_norm": 4.377540111541748, "learning_rate": 4.61808668607024e-05, "logits/chosen": -8.099590301513672, "logits/rejected": -8.091910362243652, "logps/chosen": -4.08021879196167, "logps/rejected": -104.2010498046875, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 2.755794048309326, "rewards/margins": 9.95073127746582, "rewards/rejected": -7.194937705993652, "step": 590 }, { "epoch": 0.4040334985472569, "grad_norm": 0.002147893188521266, "learning_rate": 4.619312468135426e-05, "logits/chosen": -8.152549743652344, "logits/rejected": -8.14564037322998, "logps/chosen": -5.391055107116699, "logps/rejected": -104.48399353027344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.652204751968384, "rewards/margins": 9.875570297241211, "rewards/rejected": -7.223365306854248, "step": 591 }, { "epoch": 0.40471714236882583, "grad_norm": 0.0020603423472493887, "learning_rate": 4.620536177871533e-05, "logits/chosen": -8.392439842224121, "logits/rejected": -8.384027481079102, "logps/chosen": -2.7295682430267334, "logps/rejected": -105.89027404785156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9637300968170166, "rewards/margins": 10.334489822387695, "rewards/rejected": -7.370759010314941, "step": 592 }, { "epoch": 0.4054007861903948, "grad_norm": 0.00187546422239393, "learning_rate": 4.621757822273772e-05, "logits/chosen": -8.736177444458008, "logits/rejected": -8.727310180664062, "logps/chosen": -1.461037039756775, "logps/rejected": -107.03590393066406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.950974464416504, "rewards/margins": 10.479513168334961, "rewards/rejected": -7.528538703918457, "step": 593 }, { "epoch": 0.40608443001196376, "grad_norm": 0.0015048424247652292, "learning_rate": 4.62297740830199e-05, "logits/chosen": -8.82970142364502, "logits/rejected": -8.821441650390625, "logps/chosen": -6.536367416381836, "logps/rejected": -103.48062133789062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.618603229522705, "rewards/margins": 9.72909164428711, "rewards/rejected": -7.1104888916015625, "step": 594 }, { "epoch": 0.40676807383353275, "grad_norm": 0.008835145272314548, "learning_rate": 4.6241949428809165e-05, "logits/chosen": -8.069320678710938, "logits/rejected": -8.061511993408203, "logps/chosen": -4.8421244621276855, "logps/rejected": -105.01075744628906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7052366733551025, "rewards/margins": 9.957635879516602, "rewards/rejected": -7.2524003982543945, "step": 595 }, { "epoch": 0.4074517176551017, "grad_norm": 0.001872926251962781, "learning_rate": 4.625410432900395e-05, "logits/chosen": -7.592142581939697, "logits/rejected": -7.584121227264404, "logps/chosen": -0.9792262315750122, "logps/rejected": -106.9080581665039, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1351542472839355, "rewards/margins": 10.59819221496582, "rewards/rejected": -7.463038444519043, "step": 596 }, { "epoch": 0.4081353614766707, "grad_norm": 0.0017921682447195053, "learning_rate": 4.626623885215616e-05, "logits/chosen": -8.930887222290039, "logits/rejected": -8.921416282653809, "logps/chosen": -4.312730312347412, "logps/rejected": -103.09434509277344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5800282955169678, "rewards/margins": 9.805492401123047, "rewards/rejected": -7.225464344024658, "step": 597 }, { "epoch": 0.4088190052982396, "grad_norm": 2.488905668258667, "learning_rate": 4.627835306647352e-05, "logits/chosen": -8.696121215820312, "logits/rejected": -8.687654495239258, "logps/chosen": -5.071111679077148, "logps/rejected": -104.13282775878906, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 2.671191692352295, "rewards/margins": 9.88160514831543, "rewards/rejected": -7.210412979125977, "step": 598 }, { "epoch": 0.4095026491198086, "grad_norm": 0.0012533715926110744, "learning_rate": 4.629044703982186e-05, "logits/chosen": -8.272012710571289, "logits/rejected": -8.262380599975586, "logps/chosen": -0.7332795858383179, "logps/rejected": -107.1965103149414, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1669435501098633, "rewards/margins": 10.609651565551758, "rewards/rejected": -7.4427080154418945, "step": 599 }, { "epoch": 0.41018629294137754, "grad_norm": 0.0014637598069384694, "learning_rate": 4.63025208397274e-05, "logits/chosen": -8.637378692626953, "logits/rejected": -8.628328323364258, "logps/chosen": -5.213522434234619, "logps/rejected": -103.84986114501953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6739892959594727, "rewards/margins": 9.806371688842773, "rewards/rejected": -7.132383346557617, "step": 600 }, { "epoch": 0.41086993676294653, "grad_norm": 0.0016425189096480608, "learning_rate": 4.6314574533379e-05, "logits/chosen": -8.58425521850586, "logits/rejected": -8.576434135437012, "logps/chosen": -1.6509530544281006, "logps/rejected": -107.93331909179688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.974886894226074, "rewards/margins": 10.48930835723877, "rewards/rejected": -7.5144219398498535, "step": 601 }, { "epoch": 0.41155358058451547, "grad_norm": 0.0007946584955789149, "learning_rate": 4.632660818763041e-05, "logits/chosen": -8.129392623901367, "logits/rejected": -8.11850357055664, "logps/chosen": -2.6048786640167236, "logps/rejected": -106.1905288696289, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1686065196990967, "rewards/margins": 10.457315444946289, "rewards/rejected": -7.288708686828613, "step": 602 }, { "epoch": 0.41223722440608446, "grad_norm": 0.0015301024541258812, "learning_rate": 4.633862186900253e-05, "logits/chosen": -8.566434860229492, "logits/rejected": -8.557917594909668, "logps/chosen": -4.047150611877441, "logps/rejected": -106.55728149414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7293829917907715, "rewards/margins": 10.222867965698242, "rewards/rejected": -7.4934844970703125, "step": 603 }, { "epoch": 0.4129208682276534, "grad_norm": 0.003750710980966687, "learning_rate": 4.6350615643685535e-05, "logits/chosen": -8.298046112060547, "logits/rejected": -8.287761688232422, "logps/chosen": -3.159313917160034, "logps/rejected": -107.782470703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.912578582763672, "rewards/margins": 10.411989212036133, "rewards/rejected": -7.4994096755981445, "step": 604 }, { "epoch": 0.41360451204922233, "grad_norm": 0.0015571832191199064, "learning_rate": 4.6362589577541154e-05, "logits/chosen": -7.986322402954102, "logits/rejected": -7.9749932289123535, "logps/chosen": -2.90818452835083, "logps/rejected": -106.37124633789062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8461575508117676, "rewards/margins": 10.281166076660156, "rewards/rejected": -7.4350080490112305, "step": 605 }, { "epoch": 0.4142881558707913, "grad_norm": 0.00179735803976655, "learning_rate": 4.637454373610477e-05, "logits/chosen": -7.918172836303711, "logits/rejected": -7.9080328941345215, "logps/chosen": -3.4594502449035645, "logps/rejected": -107.17668151855469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8640050888061523, "rewards/margins": 10.300156593322754, "rewards/rejected": -7.436151504516602, "step": 606 }, { "epoch": 0.41497179969236025, "grad_norm": 0.0015229909913614392, "learning_rate": 4.638647818458763e-05, "logits/chosen": -7.945550441741943, "logits/rejected": -7.936903953552246, "logps/chosen": -2.6364641189575195, "logps/rejected": -106.5941162109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9193601608276367, "rewards/margins": 10.37005615234375, "rewards/rejected": -7.450695991516113, "step": 607 }, { "epoch": 0.41565544351392925, "grad_norm": 0.007012188900262117, "learning_rate": 4.639839298787892e-05, "logits/chosen": -8.245428085327148, "logits/rejected": -8.235103607177734, "logps/chosen": -3.6372783184051514, "logps/rejected": -106.87487030029297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.897979736328125, "rewards/margins": 10.351425170898438, "rewards/rejected": -7.4534454345703125, "step": 608 }, { "epoch": 0.4163390873354982, "grad_norm": 0.007217418402433395, "learning_rate": 4.641028821054793e-05, "logits/chosen": -7.720359802246094, "logits/rejected": -7.712087154388428, "logps/chosen": -7.713535785675049, "logps/rejected": -102.59449005126953, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.535747766494751, "rewards/margins": 9.491642951965332, "rewards/rejected": -6.955894947052002, "step": 609 }, { "epoch": 0.4170227311570672, "grad_norm": 0.00192848383449018, "learning_rate": 4.6422163916846124e-05, "logits/chosen": -7.897371292114258, "logits/rejected": -7.887155532836914, "logps/chosen": -3.4518628120422363, "logps/rejected": -107.12396240234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.948718309402466, "rewards/margins": 10.379631042480469, "rewards/rejected": -7.430912971496582, "step": 610 }, { "epoch": 0.4177063749786361, "grad_norm": 0.0019544237293303013, "learning_rate": 4.643402017070924e-05, "logits/chosen": -7.779003143310547, "logits/rejected": -7.770967483520508, "logps/chosen": -4.065620422363281, "logps/rejected": -105.56593322753906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7040297985076904, "rewards/margins": 10.067255020141602, "rewards/rejected": -7.363224983215332, "step": 611 }, { "epoch": 0.4183900188002051, "grad_norm": 0.0034359307028353214, "learning_rate": 4.644585703575936e-05, "logits/chosen": -8.18880844116211, "logits/rejected": -8.177088737487793, "logps/chosen": -5.203794479370117, "logps/rejected": -105.65762329101562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7141265869140625, "rewards/margins": 9.969755172729492, "rewards/rejected": -7.25562858581543, "step": 612 }, { "epoch": 0.41907366262177403, "grad_norm": 0.0017649437068030238, "learning_rate": 4.645767457530692e-05, "logits/chosen": -9.304546356201172, "logits/rejected": -9.293478012084961, "logps/chosen": -5.148870468139648, "logps/rejected": -106.22356414794922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6151843070983887, "rewards/margins": 10.066971778869629, "rewards/rejected": -7.45178747177124, "step": 613 }, { "epoch": 0.419757306443343, "grad_norm": 0.05492188781499863, "learning_rate": 4.64694728523528e-05, "logits/chosen": -7.638477802276611, "logits/rejected": -7.6282267570495605, "logps/chosen": -4.8635969161987305, "logps/rejected": -106.23712921142578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7532784938812256, "rewards/margins": 10.08159065246582, "rewards/rejected": -7.328312397003174, "step": 614 }, { "epoch": 0.42044095026491196, "grad_norm": 0.40518712997436523, "learning_rate": 4.648125192959028e-05, "logits/chosen": -8.816143989562988, "logits/rejected": -8.802261352539062, "logps/chosen": -1.2561192512512207, "logps/rejected": -108.02069091796875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 3.051705837249756, "rewards/margins": 10.659786224365234, "rewards/rejected": -7.608080863952637, "step": 615 }, { "epoch": 0.42112459408648095, "grad_norm": 0.005355834495276213, "learning_rate": 4.649301186940709e-05, "logits/chosen": -8.685707092285156, "logits/rejected": -8.675680160522461, "logps/chosen": -4.369622230529785, "logps/rejected": -106.85824584960938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.659575939178467, "rewards/margins": 10.163527488708496, "rewards/rejected": -7.5039520263671875, "step": 616 }, { "epoch": 0.4218082379080499, "grad_norm": 0.0014539557741954923, "learning_rate": 4.650475273388737e-05, "logits/chosen": -8.050531387329102, "logits/rejected": -8.039979934692383, "logps/chosen": -2.6578781604766846, "logps/rejected": -107.22073364257812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9132843017578125, "rewards/margins": 10.405319213867188, "rewards/rejected": -7.492033958435059, "step": 617 }, { "epoch": 0.4224918817296189, "grad_norm": 0.0013185343705117702, "learning_rate": 4.651647458481359e-05, "logits/chosen": -8.13132381439209, "logits/rejected": -8.121657371520996, "logps/chosen": -2.707611083984375, "logps/rejected": -106.81291198730469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9544689655303955, "rewards/margins": 10.374098777770996, "rewards/rejected": -7.41963005065918, "step": 618 }, { "epoch": 0.4231755255511878, "grad_norm": 0.1959184855222702, "learning_rate": 4.652817748366864e-05, "logits/chosen": -8.38038444519043, "logits/rejected": -8.370586395263672, "logps/chosen": -1.4755134582519531, "logps/rejected": -108.21654510498047, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 3.003787040710449, "rewards/margins": 10.637664794921875, "rewards/rejected": -7.633877754211426, "step": 619 }, { "epoch": 0.4238591693727568, "grad_norm": 0.6070986986160278, "learning_rate": 4.653986149163757e-05, "logits/chosen": -7.807539463043213, "logits/rejected": -7.794714450836182, "logps/chosen": -1.1600244045257568, "logps/rejected": -107.02631378173828, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 3.171736240386963, "rewards/margins": 10.63200569152832, "rewards/rejected": -7.460269451141357, "step": 620 }, { "epoch": 0.42454281319432574, "grad_norm": 0.002985526341944933, "learning_rate": 4.655152666960967e-05, "logits/chosen": -7.925712585449219, "logits/rejected": -7.9156999588012695, "logps/chosen": -2.475722074508667, "logps/rejected": -106.34619140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0203189849853516, "rewards/margins": 10.407691955566406, "rewards/rejected": -7.3873724937438965, "step": 621 }, { "epoch": 0.42522645701589473, "grad_norm": 0.001744079519994557, "learning_rate": 4.6563173078180315e-05, "logits/chosen": -8.068453788757324, "logits/rejected": -8.05902099609375, "logps/chosen": -5.6676435470581055, "logps/rejected": -104.12444305419922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7117671966552734, "rewards/margins": 9.861989974975586, "rewards/rejected": -7.1502227783203125, "step": 622 }, { "epoch": 0.42591010083746367, "grad_norm": 0.002396911848336458, "learning_rate": 4.657480077765283e-05, "logits/chosen": -8.636311531066895, "logits/rejected": -8.627917289733887, "logps/chosen": -3.8943963050842285, "logps/rejected": -105.62800598144531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7278480529785156, "rewards/margins": 10.187816619873047, "rewards/rejected": -7.459968566894531, "step": 623 }, { "epoch": 0.42659374465903266, "grad_norm": 0.004694013856351376, "learning_rate": 4.6586409828040405e-05, "logits/chosen": -8.189339637756348, "logits/rejected": -8.18036937713623, "logps/chosen": -3.072343111038208, "logps/rejected": -107.00157928466797, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.912203073501587, "rewards/margins": 10.347129821777344, "rewards/rejected": -7.434926986694336, "step": 624 }, { "epoch": 0.4272773884806016, "grad_norm": 0.0011894130147993565, "learning_rate": 4.659800028906792e-05, "logits/chosen": -8.522075653076172, "logits/rejected": -8.510181427001953, "logps/chosen": -4.1352081298828125, "logps/rejected": -105.50617980957031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.855008602142334, "rewards/margins": 10.126768112182617, "rewards/rejected": -7.271759986877441, "step": 625 }, { "epoch": 0.4279610323021706, "grad_norm": 0.0010934221791103482, "learning_rate": 4.660957222017383e-05, "logits/chosen": -7.838116645812988, "logits/rejected": -7.8301310539245605, "logps/chosen": -3.7500839233398438, "logps/rejected": -105.1845932006836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.861330509185791, "rewards/margins": 10.129945755004883, "rewards/rejected": -7.268615245819092, "step": 626 }, { "epoch": 0.4286446761237395, "grad_norm": 0.0023983458522707224, "learning_rate": 4.662112568051194e-05, "logits/chosen": -7.897002220153809, "logits/rejected": -7.888054847717285, "logps/chosen": -0.655996561050415, "logps/rejected": -107.52599334716797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0418930053710938, "rewards/margins": 10.618558883666992, "rewards/rejected": -7.576667308807373, "step": 627 }, { "epoch": 0.4293283199453085, "grad_norm": 0.002339074620977044, "learning_rate": 4.663266072895327e-05, "logits/chosen": -7.563567161560059, "logits/rejected": -7.554030418395996, "logps/chosen": -7.071746349334717, "logps/rejected": -103.56379699707031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.689978837966919, "rewards/margins": 9.632989883422852, "rewards/rejected": -6.943011283874512, "step": 628 }, { "epoch": 0.43001196376687745, "grad_norm": 0.007866540923714638, "learning_rate": 4.664417742408782e-05, "logits/chosen": -8.800668716430664, "logits/rejected": -8.792113304138184, "logps/chosen": -9.348047256469727, "logps/rejected": -100.20880126953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.330822467803955, "rewards/margins": 9.112377166748047, "rewards/rejected": -6.781554698944092, "step": 629 }, { "epoch": 0.43069560758844644, "grad_norm": 0.02250785380601883, "learning_rate": 4.665567582422637e-05, "logits/chosen": -7.790220260620117, "logits/rejected": -7.781975746154785, "logps/chosen": -3.5560011863708496, "logps/rejected": -105.42109680175781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.847094774246216, "rewards/margins": 10.113374710083008, "rewards/rejected": -7.266280174255371, "step": 630 }, { "epoch": 0.4313792514100154, "grad_norm": 0.002127178246155381, "learning_rate": 4.666715598740224e-05, "logits/chosen": -8.387374877929688, "logits/rejected": -8.37803840637207, "logps/chosen": -5.929864883422852, "logps/rejected": -104.68219757080078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6286377906799316, "rewards/margins": 9.799800872802734, "rewards/rejected": -7.171163558959961, "step": 631 }, { "epoch": 0.43206289523158437, "grad_norm": 0.001809819252230227, "learning_rate": 4.667861797137309e-05, "logits/chosen": -8.606306076049805, "logits/rejected": -8.597049713134766, "logps/chosen": -4.044001579284668, "logps/rejected": -103.84457397460938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8000733852386475, "rewards/margins": 10.017450332641602, "rewards/rejected": -7.217376708984375, "step": 632 }, { "epoch": 0.4327465390531533, "grad_norm": 0.0016966351540759206, "learning_rate": 4.669006183362258e-05, "logits/chosen": -8.318706512451172, "logits/rejected": -8.309415817260742, "logps/chosen": -0.7654773592948914, "logps/rejected": -108.46417236328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9966259002685547, "rewards/margins": 10.669849395751953, "rewards/rejected": -7.673223972320557, "step": 633 }, { "epoch": 0.4334301828747223, "grad_norm": 0.0018062597373500466, "learning_rate": 4.670148763136221e-05, "logits/chosen": -8.222513198852539, "logits/rejected": -8.213122367858887, "logps/chosen": -1.0121705532073975, "logps/rejected": -107.0772705078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.046203136444092, "rewards/margins": 10.531135559082031, "rewards/rejected": -7.484931945800781, "step": 634 }, { "epoch": 0.4341138266962912, "grad_norm": 0.006436067633330822, "learning_rate": 4.671289542153293e-05, "logits/chosen": -8.249629974365234, "logits/rejected": -8.239599227905273, "logps/chosen": -4.186458587646484, "logps/rejected": -105.7552719116211, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.744842052459717, "rewards/margins": 10.06043529510498, "rewards/rejected": -7.315593242645264, "step": 635 }, { "epoch": 0.4347974705178602, "grad_norm": 0.0015189540572464466, "learning_rate": 4.672428526080691e-05, "logits/chosen": -8.568038940429688, "logits/rejected": -8.557464599609375, "logps/chosen": -4.934256553649902, "logps/rejected": -105.13912963867188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6899187564849854, "rewards/margins": 9.999547958374023, "rewards/rejected": -7.309628486633301, "step": 636 }, { "epoch": 0.43548111433942915, "grad_norm": 0.059913743287324905, "learning_rate": 4.673565720558918e-05, "logits/chosen": -7.738733768463135, "logits/rejected": -7.728980541229248, "logps/chosen": -2.4801461696624756, "logps/rejected": -106.89192199707031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0306172370910645, "rewards/margins": 10.40225601196289, "rewards/rejected": -7.371639251708984, "step": 637 }, { "epoch": 0.43616475816099814, "grad_norm": 0.0015033251838758588, "learning_rate": 4.6747011312019374e-05, "logits/chosen": -7.727455139160156, "logits/rejected": -7.718120574951172, "logps/chosen": -3.319288492202759, "logps/rejected": -105.90872192382812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9245715141296387, "rewards/margins": 10.263715744018555, "rewards/rejected": -7.339143753051758, "step": 638 }, { "epoch": 0.4368484019825671, "grad_norm": 0.004491225816309452, "learning_rate": 4.6758347635973334e-05, "logits/chosen": -7.992136001586914, "logits/rejected": -7.981237411499023, "logps/chosen": -0.7147228717803955, "logps/rejected": -107.17578125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1263427734375, "rewards/margins": 10.66911506652832, "rewards/rejected": -7.5427727699279785, "step": 639 }, { "epoch": 0.43753204580413607, "grad_norm": 0.002228789497166872, "learning_rate": 4.676966623306479e-05, "logits/chosen": -8.021259307861328, "logits/rejected": -8.011384963989258, "logps/chosen": -2.5446887016296387, "logps/rejected": -107.53973388671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9110043048858643, "rewards/margins": 10.36148452758789, "rewards/rejected": -7.450479507446289, "step": 640 }, { "epoch": 0.438215689625705, "grad_norm": 0.011855927295982838, "learning_rate": 4.678096715864696e-05, "logits/chosen": -8.123444557189941, "logits/rejected": -8.112960815429688, "logps/chosen": -5.733158588409424, "logps/rejected": -104.95639038085938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.719849109649658, "rewards/margins": 9.923224449157715, "rewards/rejected": -7.203374862670898, "step": 641 }, { "epoch": 0.43889933344727394, "grad_norm": 0.0013029164401814342, "learning_rate": 4.679225046781422e-05, "logits/chosen": -7.785754203796387, "logits/rejected": -7.778971195220947, "logps/chosen": -5.556894779205322, "logps/rejected": -103.1944580078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6762099266052246, "rewards/margins": 9.771913528442383, "rewards/rejected": -7.095703125, "step": 642 }, { "epoch": 0.43958297726884293, "grad_norm": 0.0014392153825610876, "learning_rate": 4.68035162154037e-05, "logits/chosen": -8.362053871154785, "logits/rejected": -8.353281021118164, "logps/chosen": -2.309937000274658, "logps/rejected": -106.65938568115234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.882521152496338, "rewards/margins": 10.356989860534668, "rewards/rejected": -7.474468231201172, "step": 643 }, { "epoch": 0.44026662109041187, "grad_norm": 0.0014100566040724516, "learning_rate": 4.681476445599687e-05, "logits/chosen": -8.362162590026855, "logits/rejected": -8.354913711547852, "logps/chosen": -5.293179988861084, "logps/rejected": -102.8038101196289, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6426455974578857, "rewards/margins": 9.752397537231445, "rewards/rejected": -7.1097517013549805, "step": 644 }, { "epoch": 0.44095026491198086, "grad_norm": 0.0026908516883850098, "learning_rate": 4.6825995243921137e-05, "logits/chosen": -8.354543685913086, "logits/rejected": -8.34355354309082, "logps/chosen": -1.3949227333068848, "logps/rejected": -106.16683959960938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0317342281341553, "rewards/margins": 10.478947639465332, "rewards/rejected": -7.447213172912598, "step": 645 }, { "epoch": 0.4416339087335498, "grad_norm": 0.001323995995335281, "learning_rate": 4.683720863325141e-05, "logits/chosen": -8.697856903076172, "logits/rejected": -8.688383102416992, "logps/chosen": -4.03721809387207, "logps/rejected": -104.5935287475586, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8393216133117676, "rewards/margins": 10.082682609558105, "rewards/rejected": -7.243361473083496, "step": 646 }, { "epoch": 0.4423175525551188, "grad_norm": 0.0013239918043836951, "learning_rate": 4.684840467781168e-05, "logits/chosen": -7.980260848999023, "logits/rejected": -7.972324371337891, "logps/chosen": -1.7197599411010742, "logps/rejected": -105.26658630371094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.013543128967285, "rewards/margins": 10.332744598388672, "rewards/rejected": -7.319201469421387, "step": 647 }, { "epoch": 0.4430011963766877, "grad_norm": 0.0017890805611386895, "learning_rate": 4.685958343117656e-05, "logits/chosen": -8.50792407989502, "logits/rejected": -8.499551773071289, "logps/chosen": -3.884697914123535, "logps/rejected": -105.65689086914062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8714404106140137, "rewards/margins": 10.234334945678711, "rewards/rejected": -7.362894058227539, "step": 648 }, { "epoch": 0.4436848401982567, "grad_norm": 0.0012149381218478084, "learning_rate": 4.6870744946672826e-05, "logits/chosen": -7.942140102386475, "logits/rejected": -7.933757305145264, "logps/chosen": -2.7354984283447266, "logps/rejected": -107.49845886230469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9206957817077637, "rewards/margins": 10.390966415405273, "rewards/rejected": -7.470270156860352, "step": 649 }, { "epoch": 0.44436848401982565, "grad_norm": 0.0011531213531270623, "learning_rate": 4.688188927738093e-05, "logits/chosen": -8.231632232666016, "logits/rejected": -8.222159385681152, "logps/chosen": -2.066807985305786, "logps/rejected": -107.73614501953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9245283603668213, "rewards/margins": 10.451997756958008, "rewards/rejected": -7.527470588684082, "step": 650 }, { "epoch": 0.44505212784139464, "grad_norm": 0.00156882603187114, "learning_rate": 4.689301647613653e-05, "logits/chosen": -8.201997756958008, "logits/rejected": -8.19373893737793, "logps/chosen": -2.9446053504943848, "logps/rejected": -105.99998474121094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.768482208251953, "rewards/margins": 10.225954055786133, "rewards/rejected": -7.45747184753418, "step": 651 }, { "epoch": 0.4457357716629636, "grad_norm": 0.0032573314383625984, "learning_rate": 4.6904126595532014e-05, "logits/chosen": -8.136311531066895, "logits/rejected": -8.127165794372559, "logps/chosen": -8.296357154846191, "logps/rejected": -102.09619140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.450711727142334, "rewards/margins": 9.320980072021484, "rewards/rejected": -6.870266914367676, "step": 652 }, { "epoch": 0.44641941548453257, "grad_norm": 0.001009328872896731, "learning_rate": 4.69152196879179e-05, "logits/chosen": -8.328792572021484, "logits/rejected": -8.320201873779297, "logps/chosen": -2.5707449913024902, "logps/rejected": -106.32523345947266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.977811336517334, "rewards/margins": 10.345619201660156, "rewards/rejected": -7.3678083419799805, "step": 653 }, { "epoch": 0.4471030593061015, "grad_norm": 0.008735932409763336, "learning_rate": 4.692629580540446e-05, "logits/chosen": -8.25289535522461, "logits/rejected": -8.242178916931152, "logps/chosen": -5.36268424987793, "logps/rejected": -107.11383819580078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.759403705596924, "rewards/margins": 10.11590576171875, "rewards/rejected": -7.356501579284668, "step": 654 }, { "epoch": 0.4477867031276705, "grad_norm": 0.001519752317108214, "learning_rate": 4.693735499986305e-05, "logits/chosen": -7.855658531188965, "logits/rejected": -7.848461627960205, "logps/chosen": -4.284889221191406, "logps/rejected": -106.23924255371094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.69523024559021, "rewards/margins": 10.15200138092041, "rewards/rejected": -7.456770420074463, "step": 655 }, { "epoch": 0.44847034694923943, "grad_norm": 0.001563094207085669, "learning_rate": 4.694839732292767e-05, "logits/chosen": -8.48218822479248, "logits/rejected": -8.47243881225586, "logps/chosen": -4.3241376876831055, "logps/rejected": -106.0663070678711, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8265085220336914, "rewards/margins": 10.184122085571289, "rewards/rejected": -7.357614040374756, "step": 656 }, { "epoch": 0.4491539907708084, "grad_norm": 0.0036476803943514824, "learning_rate": 4.6959422825996345e-05, "logits/chosen": -8.16054630279541, "logits/rejected": -8.150506019592285, "logps/chosen": -4.524077415466309, "logps/rejected": -104.2166519165039, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7913248538970947, "rewards/margins": 10.020915985107422, "rewards/rejected": -7.2295918464660645, "step": 657 }, { "epoch": 0.44983763459237736, "grad_norm": 0.0018403275171294808, "learning_rate": 4.69704315602326e-05, "logits/chosen": -7.201643943786621, "logits/rejected": -7.192437648773193, "logps/chosen": -3.0495543479919434, "logps/rejected": -104.0179214477539, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0237603187561035, "rewards/margins": 10.126012802124023, "rewards/rejected": -7.10225248336792, "step": 658 }, { "epoch": 0.45052127841394635, "grad_norm": 0.0014999682316556573, "learning_rate": 4.698142357656684e-05, "logits/chosen": -8.847084999084473, "logits/rejected": -8.838239669799805, "logps/chosen": -6.70492696762085, "logps/rejected": -103.50083923339844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5569024085998535, "rewards/margins": 9.596895217895508, "rewards/rejected": -7.039992332458496, "step": 659 }, { "epoch": 0.4512049222355153, "grad_norm": 0.0008918237290345132, "learning_rate": 4.6992398925697814e-05, "logits/chosen": -8.125909805297852, "logits/rejected": -8.117348670959473, "logps/chosen": -0.375044047832489, "logps/rejected": -109.12510681152344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1436963081359863, "rewards/margins": 10.816034317016602, "rewards/rejected": -7.672337532043457, "step": 660 }, { "epoch": 0.4518885660570843, "grad_norm": 0.0018144403584301472, "learning_rate": 4.7003357658094e-05, "logits/chosen": -8.724529266357422, "logits/rejected": -8.714594841003418, "logps/chosen": -2.1819212436676025, "logps/rejected": -106.67544555664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.089984178543091, "rewards/margins": 10.494285583496094, "rewards/rejected": -7.404301166534424, "step": 661 }, { "epoch": 0.4525722098786532, "grad_norm": 0.0017859518993645906, "learning_rate": 4.7014299823995005e-05, "logits/chosen": -8.204028129577637, "logits/rejected": -8.19211196899414, "logps/chosen": -0.7380673885345459, "logps/rejected": -108.33647155761719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1361398696899414, "rewards/margins": 10.810609817504883, "rewards/rejected": -7.674469947814941, "step": 662 }, { "epoch": 0.4532558537002222, "grad_norm": 0.001090395380742848, "learning_rate": 4.702522547341289e-05, "logits/chosen": -7.687248706817627, "logits/rejected": -7.679068088531494, "logps/chosen": -3.665461540222168, "logps/rejected": -106.3669662475586, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8626575469970703, "rewards/margins": 10.149602890014648, "rewards/rejected": -7.286944389343262, "step": 663 }, { "epoch": 0.45393949752179114, "grad_norm": 0.004840615671128035, "learning_rate": 4.703613465613363e-05, "logits/chosen": -9.237913131713867, "logits/rejected": -9.228812217712402, "logps/chosen": -9.372472763061523, "logps/rejected": -103.56379699707031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.229844093322754, "rewards/margins": 9.272995948791504, "rewards/rejected": -7.04315185546875, "step": 664 }, { "epoch": 0.4546231413433601, "grad_norm": 0.0015646663960069418, "learning_rate": 4.704702742171841e-05, "logits/chosen": -8.202930450439453, "logits/rejected": -8.191713333129883, "logps/chosen": -2.2362523078918457, "logps/rejected": -107.54962158203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9946208000183105, "rewards/margins": 10.518720626831055, "rewards/rejected": -7.524099349975586, "step": 665 }, { "epoch": 0.45530678516492906, "grad_norm": 0.0010211966000497341, "learning_rate": 4.7057903819505024e-05, "logits/chosen": -7.998904705047607, "logits/rejected": -7.989501953125, "logps/chosen": -4.373772621154785, "logps/rejected": -106.32440185546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.828124523162842, "rewards/margins": 10.203109741210938, "rewards/rejected": -7.374986171722412, "step": 666 }, { "epoch": 0.45599042898649805, "grad_norm": 0.0008046280709095299, "learning_rate": 4.7068763898609154e-05, "logits/chosen": -8.75006103515625, "logits/rejected": -8.740378379821777, "logps/chosen": -1.9228732585906982, "logps/rejected": -107.07870483398438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9750473499298096, "rewards/margins": 10.492757797241211, "rewards/rejected": -7.5177106857299805, "step": 667 }, { "epoch": 0.456674072808067, "grad_norm": 0.001086143427528441, "learning_rate": 4.707960770792576e-05, "logits/chosen": -8.620080947875977, "logits/rejected": -8.60882568359375, "logps/chosen": -0.6180664896965027, "logps/rejected": -108.17129516601562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1563425064086914, "rewards/margins": 10.762445449829102, "rewards/rejected": -7.60610294342041, "step": 668 }, { "epoch": 0.457357716629636, "grad_norm": 0.0007976078777574003, "learning_rate": 4.709043529613039e-05, "logits/chosen": -8.520782470703125, "logits/rejected": -8.513232231140137, "logps/chosen": -2.7031972408294678, "logps/rejected": -107.41541290283203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.823629379272461, "rewards/margins": 10.440393447875977, "rewards/rejected": -7.616764068603516, "step": 669 }, { "epoch": 0.4580413604512049, "grad_norm": 0.001052489154972136, "learning_rate": 4.710124671168044e-05, "logits/chosen": -8.352490425109863, "logits/rejected": -8.342756271362305, "logps/chosen": -2.021857261657715, "logps/rejected": -106.99288940429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.979400396347046, "rewards/margins": 10.447965621948242, "rewards/rejected": -7.468564987182617, "step": 670 }, { "epoch": 0.4587250042727739, "grad_norm": 0.0012831392232328653, "learning_rate": 4.711204200281654e-05, "logits/chosen": -8.179638862609863, "logits/rejected": -8.170108795166016, "logps/chosen": -7.866863250732422, "logps/rejected": -102.32186889648438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.492614984512329, "rewards/margins": 9.458054542541504, "rewards/rejected": -6.965439796447754, "step": 671 }, { "epoch": 0.45940864809434284, "grad_norm": 0.0012516663409769535, "learning_rate": 4.712282121756376e-05, "logits/chosen": -7.8158650398254395, "logits/rejected": -7.807015419006348, "logps/chosen": -0.5090246200561523, "logps/rejected": -108.73420715332031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.085714817047119, "rewards/margins": 10.773140907287598, "rewards/rejected": -7.68742561340332, "step": 672 }, { "epoch": 0.46009229191591183, "grad_norm": 0.0010294998064637184, "learning_rate": 4.713358440373295e-05, "logits/chosen": -8.29403305053711, "logits/rejected": -8.283514022827148, "logps/chosen": -0.2994798421859741, "logps/rejected": -108.70299530029297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.036842107772827, "rewards/margins": 10.819134712219238, "rewards/rejected": -7.782292366027832, "step": 673 }, { "epoch": 0.46077593573748077, "grad_norm": 0.001025501056574285, "learning_rate": 4.7144331608922e-05, "logits/chosen": -7.605380535125732, "logits/rejected": -7.5972900390625, "logps/chosen": -3.514976739883423, "logps/rejected": -105.88878631591797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9285876750946045, "rewards/margins": 10.293889999389648, "rewards/rejected": -7.365302085876465, "step": 674 }, { "epoch": 0.46145957955904976, "grad_norm": 0.010140892118215561, "learning_rate": 4.715506288051709e-05, "logits/chosen": -8.431896209716797, "logits/rejected": -8.420933723449707, "logps/chosen": -4.241971015930176, "logps/rejected": -106.79314422607422, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.868494987487793, "rewards/margins": 10.260313034057617, "rewards/rejected": -7.391817569732666, "step": 675 }, { "epoch": 0.4621432233806187, "grad_norm": 0.003816541749984026, "learning_rate": 4.7165778265693935e-05, "logits/chosen": -8.681777954101562, "logits/rejected": -8.671908378601074, "logps/chosen": -0.3580332100391388, "logps/rejected": -108.39139556884766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1796860694885254, "rewards/margins": 10.83378791809082, "rewards/rejected": -7.654101848602295, "step": 676 }, { "epoch": 0.4628268672021877, "grad_norm": 0.0013220239197835326, "learning_rate": 4.7176477811419076e-05, "logits/chosen": -8.915156364440918, "logits/rejected": -8.90614128112793, "logps/chosen": -4.327639102935791, "logps/rejected": -106.73416137695312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7856297492980957, "rewards/margins": 10.236716270446777, "rewards/rejected": -7.451086044311523, "step": 677 }, { "epoch": 0.4635105110237566, "grad_norm": 0.0012013550149276853, "learning_rate": 4.718716156445106e-05, "logits/chosen": -8.38157844543457, "logits/rejected": -8.372312545776367, "logps/chosen": -4.363598346710205, "logps/rejected": -104.82454681396484, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.829237937927246, "rewards/margins": 10.05633544921875, "rewards/rejected": -7.227097511291504, "step": 678 }, { "epoch": 0.4641941548453256, "grad_norm": 0.0008339948835782707, "learning_rate": 4.7197829571341704e-05, "logits/chosen": -8.507811546325684, "logits/rejected": -8.499732971191406, "logps/chosen": -3.80600643157959, "logps/rejected": -106.40218353271484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.799650192260742, "rewards/margins": 10.230144500732422, "rewards/rejected": -7.430494785308838, "step": 679 }, { "epoch": 0.46487779866689455, "grad_norm": 0.0019077680772170424, "learning_rate": 4.720848187843727e-05, "logits/chosen": -8.180133819580078, "logits/rejected": -8.171496391296387, "logps/chosen": -0.31853827834129333, "logps/rejected": -109.10690307617188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1006650924682617, "rewards/margins": 10.829742431640625, "rewards/rejected": -7.729077339172363, "step": 680 }, { "epoch": 0.4655614424884635, "grad_norm": 0.0011713261483237147, "learning_rate": 4.721911853187975e-05, "logits/chosen": -8.901090621948242, "logits/rejected": -8.891423225402832, "logps/chosen": -1.8903906345367432, "logps/rejected": -107.52394104003906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9330971240997314, "rewards/margins": 10.467355728149414, "rewards/rejected": -7.534257888793945, "step": 681 }, { "epoch": 0.4662450863100325, "grad_norm": 0.0027333437465131283, "learning_rate": 4.722973957760799e-05, "logits/chosen": -8.024504661560059, "logits/rejected": -8.014790534973145, "logps/chosen": -4.092985153198242, "logps/rejected": -105.65291595458984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7578954696655273, "rewards/margins": 10.091751098632812, "rewards/rejected": -7.333854675292969, "step": 682 }, { "epoch": 0.4669287301316014, "grad_norm": 0.000935609161388129, "learning_rate": 4.724034506135888e-05, "logits/chosen": -8.327143669128418, "logits/rejected": -8.316909790039062, "logps/chosen": -1.8907629251480103, "logps/rejected": -108.05915069580078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.94240140914917, "rewards/margins": 10.576119422912598, "rewards/rejected": -7.633718013763428, "step": 683 }, { "epoch": 0.4676123739531704, "grad_norm": 0.0008314246078953147, "learning_rate": 4.725093502866861e-05, "logits/chosen": -8.873047828674316, "logits/rejected": -8.865547180175781, "logps/chosen": -4.275880813598633, "logps/rejected": -106.90864562988281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7168378829956055, "rewards/margins": 10.198728561401367, "rewards/rejected": -7.481889724731445, "step": 684 }, { "epoch": 0.46829601777473934, "grad_norm": 0.001093894476071, "learning_rate": 4.7261509524873764e-05, "logits/chosen": -8.056530952453613, "logits/rejected": -8.04553508758545, "logps/chosen": -7.0981550216674805, "logps/rejected": -103.64517211914062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.52966046333313, "rewards/margins": 9.698058128356934, "rewards/rejected": -7.168396949768066, "step": 685 }, { "epoch": 0.46897966159630833, "grad_norm": 0.0010665907757356763, "learning_rate": 4.727206859511253e-05, "logits/chosen": -9.027381896972656, "logits/rejected": -9.016711235046387, "logps/chosen": -0.2983025312423706, "logps/rejected": -108.90772247314453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.037731170654297, "rewards/margins": 10.813218116760254, "rewards/rejected": -7.775487899780273, "step": 686 }, { "epoch": 0.46966330541787726, "grad_norm": 0.0015889713540673256, "learning_rate": 4.7282612284325846e-05, "logits/chosen": -8.83301067352295, "logits/rejected": -8.823369026184082, "logps/chosen": -4.534211158752441, "logps/rejected": -106.51017761230469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8820135593414307, "rewards/margins": 10.214557647705078, "rewards/rejected": -7.33254337310791, "step": 687 }, { "epoch": 0.47034694923944625, "grad_norm": 0.0009390591294504702, "learning_rate": 4.729314063725853e-05, "logits/chosen": -8.12787914276123, "logits/rejected": -8.120015144348145, "logps/chosen": -6.629327774047852, "logps/rejected": -104.00148010253906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5723071098327637, "rewards/margins": 9.660274505615234, "rewards/rejected": -7.087967395782471, "step": 688 }, { "epoch": 0.4710305930610152, "grad_norm": 0.0008340045460499823, "learning_rate": 4.730365369846044e-05, "logits/chosen": -8.499711990356445, "logits/rejected": -8.489737510681152, "logps/chosen": -3.4662086963653564, "logps/rejected": -106.02233123779297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8142318725585938, "rewards/margins": 10.236635208129883, "rewards/rejected": -7.422403335571289, "step": 689 }, { "epoch": 0.4717142368825842, "grad_norm": 0.001040459843352437, "learning_rate": 4.7314151512287594e-05, "logits/chosen": -8.298806190490723, "logits/rejected": -8.290054321289062, "logps/chosen": -2.7972052097320557, "logps/rejected": -107.24905395507812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9209189414978027, "rewards/margins": 10.381271362304688, "rewards/rejected": -7.460352420806885, "step": 690 }, { "epoch": 0.4723978807041531, "grad_norm": 0.0011902523692697287, "learning_rate": 4.732463412290331e-05, "logits/chosen": -7.7163238525390625, "logits/rejected": -7.706830024719238, "logps/chosen": -1.6869031190872192, "logps/rejected": -108.07369232177734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0249242782592773, "rewards/margins": 10.634295463562012, "rewards/rejected": -7.609371185302734, "step": 691 }, { "epoch": 0.4730815245257221, "grad_norm": 0.0009579909383319318, "learning_rate": 4.73351015742793e-05, "logits/chosen": -8.779144287109375, "logits/rejected": -8.76916790008545, "logps/chosen": -1.7769347429275513, "logps/rejected": -106.92496490478516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9442973136901855, "rewards/margins": 10.503393173217773, "rewards/rejected": -7.55909538269043, "step": 692 }, { "epoch": 0.47376516834729104, "grad_norm": 0.001105062197893858, "learning_rate": 4.7345553910196785e-05, "logits/chosen": -8.324189186096191, "logits/rejected": -8.315547943115234, "logps/chosen": -5.414868354797363, "logps/rejected": -104.90518188476562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7042322158813477, "rewards/margins": 9.946975708007812, "rewards/rejected": -7.242743968963623, "step": 693 }, { "epoch": 0.47444881216886003, "grad_norm": 0.0007615566137246788, "learning_rate": 4.735599117424759e-05, "logits/chosen": -8.313612937927246, "logits/rejected": -8.30469799041748, "logps/chosen": -0.3021045923233032, "logps/rejected": -109.444091796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.122605323791504, "rewards/margins": 10.848892211914062, "rewards/rejected": -7.7262864112854, "step": 694 }, { "epoch": 0.47513245599042897, "grad_norm": 0.0011062478879466653, "learning_rate": 4.7366413409835235e-05, "logits/chosen": -8.313570022583008, "logits/rejected": -8.301855087280273, "logps/chosen": -4.624068737030029, "logps/rejected": -106.61598205566406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.793625593185425, "rewards/margins": 10.138141632080078, "rewards/rejected": -7.344516277313232, "step": 695 }, { "epoch": 0.47581609981199796, "grad_norm": 0.0008293703431263566, "learning_rate": 4.737682066017604e-05, "logits/chosen": -8.832090377807617, "logits/rejected": -8.823433876037598, "logps/chosen": -5.619585037231445, "logps/rejected": -106.0971450805664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6038243770599365, "rewards/margins": 9.996563911437988, "rewards/rejected": -7.3927388191223145, "step": 696 }, { "epoch": 0.4764997436335669, "grad_norm": 0.0010156340431421995, "learning_rate": 4.7387212968300166e-05, "logits/chosen": -8.252408027648926, "logits/rejected": -8.243721961975098, "logps/chosen": -3.326138734817505, "logps/rejected": -106.80862426757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7435688972473145, "rewards/margins": 10.261789321899414, "rewards/rejected": -7.5182204246521, "step": 697 }, { "epoch": 0.4771833874551359, "grad_norm": 0.0007091203588061035, "learning_rate": 4.7397590377052686e-05, "logits/chosen": -8.310778617858887, "logits/rejected": -8.299304962158203, "logps/chosen": -2.0421626567840576, "logps/rejected": -108.77801513671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0722439289093018, "rewards/margins": 10.696084022521973, "rewards/rejected": -7.62384033203125, "step": 698 }, { "epoch": 0.4778670312767048, "grad_norm": 1.7730236053466797, "learning_rate": 4.74079529290947e-05, "logits/chosen": -8.634693145751953, "logits/rejected": -8.62221622467041, "logps/chosen": -7.521895885467529, "logps/rejected": -105.12432098388672, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 2.502657413482666, "rewards/margins": 9.772953033447266, "rewards/rejected": -7.270296096801758, "step": 699 }, { "epoch": 0.4785506750982738, "grad_norm": 0.0009403625736013055, "learning_rate": 4.741830066690428e-05, "logits/chosen": -8.855135917663574, "logits/rejected": -8.84527587890625, "logps/chosen": -2.502150058746338, "logps/rejected": -106.1936264038086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.83278751373291, "rewards/margins": 10.337223052978516, "rewards/rejected": -7.5044355392456055, "step": 700 }, { "epoch": 0.47923431891984275, "grad_norm": 0.0008950048359110951, "learning_rate": 4.742863363277765e-05, "logits/chosen": -8.406011581420898, "logits/rejected": -8.396879196166992, "logps/chosen": -3.627880334854126, "logps/rejected": -105.12092590332031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8564515113830566, "rewards/margins": 10.074037551879883, "rewards/rejected": -7.217585563659668, "step": 701 }, { "epoch": 0.47991796274141174, "grad_norm": 0.0010409082751721144, "learning_rate": 4.743895186883009e-05, "logits/chosen": -8.751259803771973, "logits/rejected": -8.741830825805664, "logps/chosen": -5.312592506408691, "logps/rejected": -105.65164947509766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.685451030731201, "rewards/margins": 9.997998237609863, "rewards/rejected": -7.312547206878662, "step": 702 }, { "epoch": 0.4806016065629807, "grad_norm": 0.003689934266731143, "learning_rate": 4.7449255416997075e-05, "logits/chosen": -7.733644485473633, "logits/rejected": -7.72327995300293, "logps/chosen": -5.138180732727051, "logps/rejected": -105.59648132324219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6786296367645264, "rewards/margins": 10.021940231323242, "rewards/rejected": -7.343311309814453, "step": 703 }, { "epoch": 0.48128525038454967, "grad_norm": 0.0011095311492681503, "learning_rate": 4.7459544319035206e-05, "logits/chosen": -8.826608657836914, "logits/rejected": -8.815845489501953, "logps/chosen": -0.45945560932159424, "logps/rejected": -109.89413452148438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0525074005126953, "rewards/margins": 10.870027542114258, "rewards/rejected": -7.817519187927246, "step": 704 }, { "epoch": 0.4819688942061186, "grad_norm": 0.0009829369373619556, "learning_rate": 4.746981861652332e-05, "logits/chosen": -8.709552764892578, "logits/rejected": -8.698140144348145, "logps/chosen": -0.46685805916786194, "logps/rejected": -110.32579803466797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9881463050842285, "rewards/margins": 10.84997272491455, "rewards/rejected": -7.861827850341797, "step": 705 }, { "epoch": 0.4826525380276876, "grad_norm": 0.0006494956905953586, "learning_rate": 4.74800783508634e-05, "logits/chosen": -8.550646781921387, "logits/rejected": -8.53934097290039, "logps/chosen": -7.3324809074401855, "logps/rejected": -104.47164916992188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5261754989624023, "rewards/margins": 9.695860862731934, "rewards/rejected": -7.169685363769531, "step": 706 }, { "epoch": 0.48333618184925653, "grad_norm": 0.0016154369805008173, "learning_rate": 4.7490323563281665e-05, "logits/chosen": -8.420034408569336, "logits/rejected": -8.406375885009766, "logps/chosen": -2.5324602127075195, "logps/rejected": -109.60820770263672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8896429538726807, "rewards/margins": 10.633834838867188, "rewards/rejected": -7.744192600250244, "step": 707 }, { "epoch": 0.4840198256708255, "grad_norm": 0.0012668170966207981, "learning_rate": 4.750055429482949e-05, "logits/chosen": -8.370856285095215, "logits/rejected": -8.36069107055664, "logps/chosen": -2.2860987186431885, "logps/rejected": -108.82623291015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0011403560638428, "rewards/margins": 10.643234252929688, "rewards/rejected": -7.642093658447266, "step": 708 }, { "epoch": 0.48470346949239446, "grad_norm": 0.00118519167881459, "learning_rate": 4.751077058638445e-05, "logits/chosen": -7.894417762756348, "logits/rejected": -7.881048202514648, "logps/chosen": -1.809781789779663, "logps/rejected": -108.7864990234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.082995891571045, "rewards/margins": 10.686603546142578, "rewards/rejected": -7.603607654571533, "step": 709 }, { "epoch": 0.48538711331396345, "grad_norm": 0.03049357235431671, "learning_rate": 4.752097247865126e-05, "logits/chosen": -7.853043079376221, "logits/rejected": -7.843297481536865, "logps/chosen": -2.1017544269561768, "logps/rejected": -109.2361831665039, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9453885555267334, "rewards/margins": 10.631464004516602, "rewards/rejected": -7.686075210571289, "step": 710 }, { "epoch": 0.4860707571355324, "grad_norm": 0.0008613677346147597, "learning_rate": 4.753116001216277e-05, "logits/chosen": -8.265771865844727, "logits/rejected": -8.252348899841309, "logps/chosen": -0.7632164359092712, "logps/rejected": -109.94906616210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0775856971740723, "rewards/margins": 10.880810737609863, "rewards/rejected": -7.803224563598633, "step": 711 }, { "epoch": 0.4867544009571014, "grad_norm": 0.001586316735483706, "learning_rate": 4.7541333227280944e-05, "logits/chosen": -7.995811462402344, "logits/rejected": -7.984045028686523, "logps/chosen": -6.099093437194824, "logps/rejected": -106.32244110107422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6736273765563965, "rewards/margins": 9.972606658935547, "rewards/rejected": -7.298980712890625, "step": 712 }, { "epoch": 0.4874380447786703, "grad_norm": 0.0014692615950480103, "learning_rate": 4.755149216419776e-05, "logits/chosen": -8.194463729858398, "logits/rejected": -8.182847023010254, "logps/chosen": -2.3108959197998047, "logps/rejected": -109.51744079589844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.052036762237549, "rewards/margins": 10.711737632751465, "rewards/rejected": -7.659700870513916, "step": 713 }, { "epoch": 0.4881216886002393, "grad_norm": 0.009031210094690323, "learning_rate": 4.756163686293624e-05, "logits/chosen": -8.048599243164062, "logits/rejected": -8.039588928222656, "logps/chosen": -4.24690055847168, "logps/rejected": -108.12060546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7090249061584473, "rewards/margins": 10.322822570800781, "rewards/rejected": -7.613797664642334, "step": 714 }, { "epoch": 0.48880533242180824, "grad_norm": 1.265960931777954, "learning_rate": 4.7571767363351344e-05, "logits/chosen": -7.72518253326416, "logits/rejected": -7.7122602462768555, "logps/chosen": -3.694174289703369, "logps/rejected": -107.8482437133789, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 2.896104335784912, "rewards/margins": 10.40372085571289, "rewards/rejected": -7.507617950439453, "step": 715 }, { "epoch": 0.4894889762433772, "grad_norm": 0.002358867786824703, "learning_rate": 4.758188370513093e-05, "logits/chosen": -8.535787582397461, "logits/rejected": -8.52325439453125, "logps/chosen": -2.5660338401794434, "logps/rejected": -108.94853210449219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9347989559173584, "rewards/margins": 10.634222030639648, "rewards/rejected": -7.699422359466553, "step": 716 }, { "epoch": 0.49017262006494616, "grad_norm": 0.0019943518564105034, "learning_rate": 4.759198592779667e-05, "logits/chosen": -8.545592308044434, "logits/rejected": -8.534497261047363, "logps/chosen": -5.543930530548096, "logps/rejected": -106.10367584228516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.649695634841919, "rewards/margins": 10.03976821899414, "rewards/rejected": -7.390072822570801, "step": 717 }, { "epoch": 0.4908562638865151, "grad_norm": 0.07378987967967987, "learning_rate": 4.760207407070501e-05, "logits/chosen": -8.10565185546875, "logits/rejected": -8.092208862304688, "logps/chosen": -1.052488088607788, "logps/rejected": -109.68089294433594, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 3.146778106689453, "rewards/margins": 10.868486404418945, "rewards/rejected": -7.721707820892334, "step": 718 }, { "epoch": 0.4915399077080841, "grad_norm": 0.002344505861401558, "learning_rate": 4.761214817304805e-05, "logits/chosen": -7.966355323791504, "logits/rejected": -7.956185340881348, "logps/chosen": -5.351738929748535, "logps/rejected": -106.30805969238281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7276601791381836, "rewards/margins": 10.044713973999023, "rewards/rejected": -7.31705379486084, "step": 719 }, { "epoch": 0.492223551529653, "grad_norm": 0.04246656596660614, "learning_rate": 4.762220827385448e-05, "logits/chosen": -8.163488388061523, "logits/rejected": -8.151750564575195, "logps/chosen": -2.31327223777771, "logps/rejected": -109.02241516113281, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.903151273727417, "rewards/margins": 10.611071586608887, "rewards/rejected": -7.707920074462891, "step": 720 }, { "epoch": 0.492907195351222, "grad_norm": 0.002859015017747879, "learning_rate": 4.763225441199049e-05, "logits/chosen": -7.915321350097656, "logits/rejected": -7.901073932647705, "logps/chosen": -4.1378936767578125, "logps/rejected": -108.15422821044922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.85918927192688, "rewards/margins": 10.387924194335938, "rewards/rejected": -7.528735160827637, "step": 721 }, { "epoch": 0.49359083917279095, "grad_norm": 0.0011839127400889993, "learning_rate": 4.7642286626160654e-05, "logits/chosen": -8.064905166625977, "logits/rejected": -8.051424026489258, "logps/chosen": -2.697786331176758, "logps/rejected": -109.00448608398438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9395151138305664, "rewards/margins": 10.615860939025879, "rewards/rejected": -7.6763458251953125, "step": 722 }, { "epoch": 0.49427448299435994, "grad_norm": 0.0013968818821012974, "learning_rate": 4.765230495490885e-05, "logits/chosen": -8.096624374389648, "logits/rejected": -8.08483600616455, "logps/chosen": -0.47234466671943665, "logps/rejected": -109.77161407470703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.086317539215088, "rewards/margins": 10.844104766845703, "rewards/rejected": -7.757786750793457, "step": 723 }, { "epoch": 0.4949581268159289, "grad_norm": 0.0027395475190132856, "learning_rate": 4.7662309436619115e-05, "logits/chosen": -8.233879089355469, "logits/rejected": -8.222707748413086, "logps/chosen": -4.501645088195801, "logps/rejected": -108.56465148925781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.862844228744507, "rewards/margins": 10.40389633178711, "rewards/rejected": -7.541051864624023, "step": 724 }, { "epoch": 0.49564177063749787, "grad_norm": 0.0422692596912384, "learning_rate": 4.7672300109516563e-05, "logits/chosen": -8.565290451049805, "logits/rejected": -8.556174278259277, "logps/chosen": -2.4043996334075928, "logps/rejected": -108.47987365722656, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 2.877352714538574, "rewards/margins": 10.535493850708008, "rewards/rejected": -7.658140182495117, "step": 725 }, { "epoch": 0.4963254144590668, "grad_norm": 0.1285325437784195, "learning_rate": 4.768227701166823e-05, "logits/chosen": -7.4915008544921875, "logits/rejected": -7.480079650878906, "logps/chosen": -4.103942394256592, "logps/rejected": -108.16111755371094, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.870838165283203, "rewards/margins": 10.36353874206543, "rewards/rejected": -7.492700099945068, "step": 726 }, { "epoch": 0.4970090582806358, "grad_norm": 0.004235296510159969, "learning_rate": 4.7692240180983964e-05, "logits/chosen": -8.553783416748047, "logits/rejected": -8.541882514953613, "logps/chosen": -5.138129234313965, "logps/rejected": -105.82496643066406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6777119636535645, "rewards/margins": 10.020267486572266, "rewards/rejected": -7.342555522918701, "step": 727 }, { "epoch": 0.49769270210220473, "grad_norm": 0.01710386946797371, "learning_rate": 4.770218965521729e-05, "logits/chosen": -8.636209487915039, "logits/rejected": -8.623175621032715, "logps/chosen": -9.206189155578613, "logps/rejected": -103.78195190429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4035208225250244, "rewards/margins": 9.393898963928223, "rewards/rejected": -6.990378379821777, "step": 728 }, { "epoch": 0.4983763459237737, "grad_norm": 0.0012214165180921555, "learning_rate": 4.7712125471966245e-05, "logits/chosen": -8.381967544555664, "logits/rejected": -8.368612289428711, "logps/chosen": -3.880763292312622, "logps/rejected": -107.90703582763672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.774421453475952, "rewards/margins": 10.352872848510742, "rewards/rejected": -7.578451156616211, "step": 729 }, { "epoch": 0.49905998974534266, "grad_norm": 0.0014726887457072735, "learning_rate": 4.7722047668674267e-05, "logits/chosen": -7.968984603881836, "logits/rejected": -7.959848880767822, "logps/chosen": -7.379117965698242, "logps/rejected": -105.40544891357422, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5431289672851562, "rewards/margins": 9.773601531982422, "rewards/rejected": -7.230472087860107, "step": 730 }, { "epoch": 0.49974363356691165, "grad_norm": 0.0009911510860547423, "learning_rate": 4.7731956282631004e-05, "logits/chosen": -8.28152847290039, "logits/rejected": -8.271310806274414, "logps/chosen": -0.4440341889858246, "logps/rejected": -109.8488540649414, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.126007556915283, "rewards/margins": 10.894365310668945, "rewards/rejected": -7.76835823059082, "step": 731 }, { "epoch": 0.5004272773884806, "grad_norm": 0.004491661209613085, "learning_rate": 4.77418513509732e-05, "logits/chosen": -8.325092315673828, "logits/rejected": -8.314556121826172, "logps/chosen": -4.388209819793701, "logps/rejected": -107.82733154296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.838263988494873, "rewards/margins": 10.300527572631836, "rewards/rejected": -7.462264060974121, "step": 732 }, { "epoch": 0.5011109212100495, "grad_norm": 0.0009425997268408537, "learning_rate": 4.775173291068547e-05, "logits/chosen": -7.833487033843994, "logits/rejected": -7.821265697479248, "logps/chosen": -3.7929391860961914, "logps/rejected": -107.7928695678711, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8409178256988525, "rewards/margins": 10.39396858215332, "rewards/rejected": -7.553051471710205, "step": 733 }, { "epoch": 0.5017945650316186, "grad_norm": 0.0007842215127311647, "learning_rate": 4.776160099860117e-05, "logits/chosen": -8.606929779052734, "logits/rejected": -8.594782829284668, "logps/chosen": -2.0238230228424072, "logps/rejected": -108.19305419921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9672136306762695, "rewards/margins": 10.585858345031738, "rewards/rejected": -7.6186442375183105, "step": 734 }, { "epoch": 0.5024782088531875, "grad_norm": 1.0853101015090942, "learning_rate": 4.777145565140325e-05, "logits/chosen": -8.67665958404541, "logits/rejected": -8.665224075317383, "logps/chosen": -2.0308070182800293, "logps/rejected": -108.9786605834961, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 2.9844799041748047, "rewards/margins": 10.649297714233398, "rewards/rejected": -7.66481876373291, "step": 735 }, { "epoch": 0.5031618526747564, "grad_norm": 0.0010984860127791762, "learning_rate": 4.7781296905624986e-05, "logits/chosen": -8.341728210449219, "logits/rejected": -8.32986068725586, "logps/chosen": -0.5729345083236694, "logps/rejected": -109.74815368652344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0688986778259277, "rewards/margins": 10.852709770202637, "rewards/rejected": -7.783811569213867, "step": 736 }, { "epoch": 0.5038454964963254, "grad_norm": 1.674928069114685, "learning_rate": 4.779112479765086e-05, "logits/chosen": -9.14012622833252, "logits/rejected": -9.127029418945312, "logps/chosen": -2.0240695476531982, "logps/rejected": -108.33042907714844, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 2.9230663776397705, "rewards/margins": 10.598368644714355, "rewards/rejected": -7.675302028656006, "step": 737 }, { "epoch": 0.5045291403178944, "grad_norm": 0.0011035872157663107, "learning_rate": 4.780093936371736e-05, "logits/chosen": -7.741412162780762, "logits/rejected": -7.732145309448242, "logps/chosen": -4.743345737457275, "logps/rejected": -108.56619262695312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9111382961273193, "rewards/margins": 10.380746841430664, "rewards/rejected": -7.469609260559082, "step": 738 }, { "epoch": 0.5052127841394634, "grad_norm": 0.0006353717180900276, "learning_rate": 4.781074063991376e-05, "logits/chosen": -7.925971984863281, "logits/rejected": -7.915360450744629, "logps/chosen": -6.2536420822143555, "logps/rejected": -106.06416320800781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6800456047058105, "rewards/margins": 9.989089965820312, "rewards/rejected": -7.309043884277344, "step": 739 }, { "epoch": 0.5058964279610323, "grad_norm": 0.0628226175904274, "learning_rate": 4.782052866218294e-05, "logits/chosen": -8.065020561218262, "logits/rejected": -8.053184509277344, "logps/chosen": -2.407402753829956, "logps/rejected": -108.23011779785156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.066648483276367, "rewards/margins": 10.57589054107666, "rewards/rejected": -7.509242057800293, "step": 740 }, { "epoch": 0.5065800717826012, "grad_norm": 0.19930994510650635, "learning_rate": 4.783030346632214e-05, "logits/chosen": -8.68425178527832, "logits/rejected": -8.675002098083496, "logps/chosen": -5.053059101104736, "logps/rejected": -107.68724060058594, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.672126293182373, "rewards/margins": 10.159862518310547, "rewards/rejected": -7.487735748291016, "step": 741 }, { "epoch": 0.5072637156041703, "grad_norm": 0.2505651116371155, "learning_rate": 4.7840065087983786e-05, "logits/chosen": -8.35506820678711, "logits/rejected": -8.347417831420898, "logps/chosen": -9.759700775146484, "logps/rejected": -105.57514953613281, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.2848825454711914, "rewards/margins": 9.558662414550781, "rewards/rejected": -7.27377986907959, "step": 742 }, { "epoch": 0.5079473594257392, "grad_norm": 0.14458736777305603, "learning_rate": 4.784981356267626e-05, "logits/chosen": -8.435648918151855, "logits/rejected": -8.427689552307129, "logps/chosen": -7.0282440185546875, "logps/rejected": -105.1323013305664, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.5298664569854736, "rewards/margins": 9.769062042236328, "rewards/rejected": -7.239195346832275, "step": 743 }, { "epoch": 0.5086310032473081, "grad_norm": 1.8410125970840454, "learning_rate": 4.785954892576465e-05, "logits/chosen": -8.717840194702148, "logits/rejected": -8.710444450378418, "logps/chosen": -7.713871955871582, "logps/rejected": -106.39540100097656, "loss": 0.0285, "rewards/accuracies": 0.9375, "rewards/chosen": 2.3150923252105713, "rewards/margins": 9.806320190429688, "rewards/rejected": -7.491226673126221, "step": 744 }, { "epoch": 0.5093146470688771, "grad_norm": 0.21500040590763092, "learning_rate": 4.7869271212471554e-05, "logits/chosen": -8.237555503845215, "logits/rejected": -8.230384826660156, "logps/chosen": -3.668006658554077, "logps/rejected": -107.35643768310547, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.7655904293060303, "rewards/margins": 10.259777069091797, "rewards/rejected": -7.494187355041504, "step": 745 }, { "epoch": 0.5099982908904461, "grad_norm": 0.41796404123306274, "learning_rate": 4.7878980457877814e-05, "logits/chosen": -9.022404670715332, "logits/rejected": -9.014780044555664, "logps/chosen": -9.580872535705566, "logps/rejected": -105.68891906738281, "loss": 0.0064, "rewards/accuracies": 0.9375, "rewards/chosen": 2.153825044631958, "rewards/margins": 9.552549362182617, "rewards/rejected": -7.3987250328063965, "step": 746 }, { "epoch": 0.5106819347120151, "grad_norm": 0.11947747319936752, "learning_rate": 4.7888676696923315e-05, "logits/chosen": -8.961423873901367, "logits/rejected": -8.949812889099121, "logps/chosen": -3.630056381225586, "logps/rejected": -106.65235900878906, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7962777614593506, "rewards/margins": 10.198479652404785, "rewards/rejected": -7.402202606201172, "step": 747 }, { "epoch": 0.511365578533584, "grad_norm": 0.01173047162592411, "learning_rate": 4.7898359964407695e-05, "logits/chosen": -9.329082489013672, "logits/rejected": -9.320259094238281, "logps/chosen": -4.618907451629639, "logps/rejected": -108.70803833007812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.658454656600952, "rewards/margins": 10.32633113861084, "rewards/rejected": -7.667876243591309, "step": 748 }, { "epoch": 0.5120492223551529, "grad_norm": 0.002339734695851803, "learning_rate": 4.790803029499111e-05, "logits/chosen": -9.218048095703125, "logits/rejected": -9.208362579345703, "logps/chosen": -2.6083993911743164, "logps/rejected": -108.2761001586914, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8880817890167236, "rewards/margins": 10.505168914794922, "rewards/rejected": -7.617086887359619, "step": 749 }, { "epoch": 0.512732866176722, "grad_norm": 0.17820684611797333, "learning_rate": 4.7917687723195004e-05, "logits/chosen": -8.937501907348633, "logits/rejected": -8.928044319152832, "logps/chosen": -4.450533390045166, "logps/rejected": -107.41029357910156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.760371685028076, "rewards/margins": 10.288396835327148, "rewards/rejected": -7.528024673461914, "step": 750 }, { "epoch": 0.5134165099982909, "grad_norm": 0.008534167893230915, "learning_rate": 4.792733228340281e-05, "logits/chosen": -9.224613189697266, "logits/rejected": -9.213038444519043, "logps/chosen": -4.339767932891846, "logps/rejected": -106.74658203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7783398628234863, "rewards/margins": 10.175296783447266, "rewards/rejected": -7.396956443786621, "step": 751 }, { "epoch": 0.5141001538198599, "grad_norm": 0.008391938172280788, "learning_rate": 4.793696400986071e-05, "logits/chosen": -8.985795021057129, "logits/rejected": -8.972804069519043, "logps/chosen": -1.7269386053085327, "logps/rejected": -108.47373962402344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.075429677963257, "rewards/margins": 10.689558029174805, "rewards/rejected": -7.614128112792969, "step": 752 }, { "epoch": 0.5147837976414288, "grad_norm": 11.414566993713379, "learning_rate": 4.7946582936678344e-05, "logits/chosen": -8.696015357971191, "logits/rejected": -8.686498641967773, "logps/chosen": -3.94171404838562, "logps/rejected": -107.3568344116211, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 2.810776948928833, "rewards/margins": 10.343069076538086, "rewards/rejected": -7.532292366027832, "step": 753 }, { "epoch": 0.5154674414629978, "grad_norm": 0.002493275096639991, "learning_rate": 4.795618909782957e-05, "logits/chosen": -9.526618957519531, "logits/rejected": -9.516611099243164, "logps/chosen": -3.1063969135284424, "logps/rejected": -106.91735076904297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8443918228149414, "rewards/margins": 10.33669662475586, "rewards/rejected": -7.492304801940918, "step": 754 }, { "epoch": 0.5161510852845668, "grad_norm": 0.013696888461709023, "learning_rate": 4.796578252715314e-05, "logits/chosen": -9.165050506591797, "logits/rejected": -9.154101371765137, "logps/chosen": -4.651892185211182, "logps/rejected": -106.12405395507812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.735438823699951, "rewards/margins": 10.167692184448242, "rewards/rejected": -7.432252883911133, "step": 755 }, { "epoch": 0.5168347291061357, "grad_norm": 3.4256951808929443, "learning_rate": 4.797536325835345e-05, "logits/chosen": -9.251741409301758, "logits/rejected": -9.243463516235352, "logps/chosen": -10.481992721557617, "logps/rejected": -103.33259582519531, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 2.1489758491516113, "rewards/margins": 9.28059196472168, "rewards/rejected": -7.131616592407227, "step": 756 }, { "epoch": 0.5175183729277046, "grad_norm": 0.004824300762265921, "learning_rate": 4.7984931325001216e-05, "logits/chosen": -8.130032539367676, "logits/rejected": -8.121503829956055, "logps/chosen": -5.416064739227295, "logps/rejected": -105.90327453613281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6500864028930664, "rewards/margins": 9.984929084777832, "rewards/rejected": -7.334843635559082, "step": 757 }, { "epoch": 0.5182020167492736, "grad_norm": 0.010455186478793621, "learning_rate": 4.799448676053423e-05, "logits/chosen": -9.109296798706055, "logits/rejected": -9.100788116455078, "logps/chosen": -10.948762893676758, "logps/rejected": -104.58244323730469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.201826810836792, "rewards/margins": 9.359445571899414, "rewards/rejected": -7.157617568969727, "step": 758 }, { "epoch": 0.5188856605708426, "grad_norm": 0.029686270281672478, "learning_rate": 4.800402959825802e-05, "logits/chosen": -9.310729026794434, "logits/rejected": -9.299785614013672, "logps/chosen": -8.241718292236328, "logps/rejected": -106.1810302734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4585611820220947, "rewards/margins": 9.785444259643555, "rewards/rejected": -7.326882839202881, "step": 759 }, { "epoch": 0.5195693043924116, "grad_norm": 0.01291381474584341, "learning_rate": 4.801355987134653e-05, "logits/chosen": -8.287029266357422, "logits/rejected": -8.276752471923828, "logps/chosen": -2.0654823780059814, "logps/rejected": -107.76654052734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9831340312957764, "rewards/margins": 10.543913841247559, "rewards/rejected": -7.560779571533203, "step": 760 }, { "epoch": 0.5202529482139805, "grad_norm": 0.0051713986322283745, "learning_rate": 4.802307761284289e-05, "logits/chosen": -9.034029960632324, "logits/rejected": -9.026206016540527, "logps/chosen": -8.122821807861328, "logps/rejected": -103.7130126953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2518208026885986, "rewards/margins": 9.432682991027832, "rewards/rejected": -7.180861949920654, "step": 761 }, { "epoch": 0.5209365920355494, "grad_norm": 0.005372358951717615, "learning_rate": 4.8032582855660014e-05, "logits/chosen": -9.290934562683105, "logits/rejected": -9.282051086425781, "logps/chosen": -8.099786758422852, "logps/rejected": -104.21316528320312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.406525135040283, "rewards/margins": 9.553189277648926, "rewards/rejected": -7.146664619445801, "step": 762 }, { "epoch": 0.5216202358571185, "grad_norm": 0.007335455622524023, "learning_rate": 4.8042075632581346e-05, "logits/chosen": -9.11307430267334, "logits/rejected": -9.102954864501953, "logps/chosen": -5.524725914001465, "logps/rejected": -105.82417297363281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.638881206512451, "rewards/margins": 9.992063522338867, "rewards/rejected": -7.353181838989258, "step": 763 }, { "epoch": 0.5223038796786874, "grad_norm": 0.020392410457134247, "learning_rate": 4.80515559762615e-05, "logits/chosen": -8.874475479125977, "logits/rejected": -8.864171028137207, "logps/chosen": -7.999664306640625, "logps/rejected": -104.95994567871094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.459719657897949, "rewards/margins": 9.6554594039917, "rewards/rejected": -7.195740222930908, "step": 764 }, { "epoch": 0.5229875235002563, "grad_norm": 0.03940771892666817, "learning_rate": 4.8061023919226964e-05, "logits/chosen": -8.579340934753418, "logits/rejected": -8.571548461914062, "logps/chosen": -2.060110569000244, "logps/rejected": -107.5482406616211, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.971261978149414, "rewards/margins": 10.420124053955078, "rewards/rejected": -7.448861122131348, "step": 765 }, { "epoch": 0.5236711673218253, "grad_norm": 0.004403365775942802, "learning_rate": 4.807047949387674e-05, "logits/chosen": -9.034116744995117, "logits/rejected": -9.025457382202148, "logps/chosen": -3.279025077819824, "logps/rejected": -106.798095703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9414968490600586, "rewards/margins": 10.320786476135254, "rewards/rejected": -7.3792901039123535, "step": 766 }, { "epoch": 0.5243548111433943, "grad_norm": 0.0032752850092947483, "learning_rate": 4.807992273248302e-05, "logits/chosen": -8.806706428527832, "logits/rejected": -8.795724868774414, "logps/chosen": -2.9177744388580322, "logps/rejected": -106.27584075927734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8453783988952637, "rewards/margins": 10.276800155639648, "rewards/rejected": -7.431421756744385, "step": 767 }, { "epoch": 0.5250384549649633, "grad_norm": 0.017109790816903114, "learning_rate": 4.808935366719187e-05, "logits/chosen": -8.819512367248535, "logits/rejected": -8.808951377868652, "logps/chosen": -2.512585163116455, "logps/rejected": -105.26658630371094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8421473503112793, "rewards/margins": 10.220498085021973, "rewards/rejected": -7.378351211547852, "step": 768 }, { "epoch": 0.5257220987865322, "grad_norm": 0.0011919845128431916, "learning_rate": 4.8098772330023855e-05, "logits/chosen": -8.30352783203125, "logits/rejected": -8.293909072875977, "logps/chosen": -3.107116937637329, "logps/rejected": -105.83116149902344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9155311584472656, "rewards/margins": 10.266412734985352, "rewards/rejected": -7.350881576538086, "step": 769 }, { "epoch": 0.5264057426081011, "grad_norm": 0.0039408463053405285, "learning_rate": 4.81081787528747e-05, "logits/chosen": -8.67070198059082, "logits/rejected": -8.660886764526367, "logps/chosen": -4.808472156524658, "logps/rejected": -105.3343276977539, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7526564598083496, "rewards/margins": 10.091597557067871, "rewards/rejected": -7.33894157409668, "step": 770 }, { "epoch": 0.5270893864296702, "grad_norm": 0.007131851278245449, "learning_rate": 4.811757296751595e-05, "logits/chosen": -9.37773609161377, "logits/rejected": -9.364879608154297, "logps/chosen": -0.6705060601234436, "logps/rejected": -107.48263549804688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.08815860748291, "rewards/margins": 10.648643493652344, "rewards/rejected": -7.560484886169434, "step": 771 }, { "epoch": 0.5277730302512391, "grad_norm": 0.024614591151475906, "learning_rate": 4.812695500559561e-05, "logits/chosen": -8.407444953918457, "logits/rejected": -8.39866828918457, "logps/chosen": -3.9638724327087402, "logps/rejected": -104.424560546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9526963233947754, "rewards/margins": 10.007464408874512, "rewards/rejected": -7.054767608642578, "step": 772 }, { "epoch": 0.528456674072808, "grad_norm": 0.0033601282630115747, "learning_rate": 4.8136324898638756e-05, "logits/chosen": -9.198286056518555, "logits/rejected": -9.185053825378418, "logps/chosen": -2.501364231109619, "logps/rejected": -105.354248046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.945863723754883, "rewards/margins": 10.354353904724121, "rewards/rejected": -7.40848970413208, "step": 773 }, { "epoch": 0.529140317894377, "grad_norm": 0.0013103863457217813, "learning_rate": 4.8145682678048214e-05, "logits/chosen": -8.97663688659668, "logits/rejected": -8.966251373291016, "logps/chosen": -2.399407148361206, "logps/rejected": -106.20095825195312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.910797595977783, "rewards/margins": 10.264013290405273, "rewards/rejected": -7.353216171264648, "step": 774 }, { "epoch": 0.529823961715946, "grad_norm": 0.000930626003537327, "learning_rate": 4.815502837510518e-05, "logits/chosen": -9.658573150634766, "logits/rejected": -9.64823055267334, "logps/chosen": -3.1582064628601074, "logps/rejected": -106.86614990234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8451004028320312, "rewards/margins": 10.308100700378418, "rewards/rejected": -7.463000297546387, "step": 775 }, { "epoch": 0.530507605537515, "grad_norm": 0.0024848405737429857, "learning_rate": 4.816436202096981e-05, "logits/chosen": -8.86169147491455, "logits/rejected": -8.850533485412598, "logps/chosen": -0.4775656461715698, "logps/rejected": -106.97842407226562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1738266944885254, "rewards/margins": 10.647747039794922, "rewards/rejected": -7.4739203453063965, "step": 776 }, { "epoch": 0.5311912493590839, "grad_norm": 0.0016189615707844496, "learning_rate": 4.81736836466819e-05, "logits/chosen": -8.660746574401855, "logits/rejected": -8.648465156555176, "logps/chosen": -2.7429654598236084, "logps/rejected": -105.91068267822266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.910076141357422, "rewards/margins": 10.295504570007324, "rewards/rejected": -7.385427951812744, "step": 777 }, { "epoch": 0.5318748931806528, "grad_norm": 0.0031622315291315317, "learning_rate": 4.8182993283161485e-05, "logits/chosen": -8.152990341186523, "logits/rejected": -8.141247749328613, "logps/chosen": -2.227194309234619, "logps/rejected": -106.02484130859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.947484016418457, "rewards/margins": 10.31656551361084, "rewards/rejected": -7.369081497192383, "step": 778 }, { "epoch": 0.5325585370022219, "grad_norm": 0.0011640526354312897, "learning_rate": 4.819229096120941e-05, "logits/chosen": -8.55341625213623, "logits/rejected": -8.543582916259766, "logps/chosen": -6.860029220581055, "logps/rejected": -104.99632263183594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5770392417907715, "rewards/margins": 9.820119857788086, "rewards/rejected": -7.243080139160156, "step": 779 }, { "epoch": 0.5332421808237908, "grad_norm": 0.0017718439921736717, "learning_rate": 4.820157671150801e-05, "logits/chosen": -9.03231430053711, "logits/rejected": -9.02137279510498, "logps/chosen": -0.5707932114601135, "logps/rejected": -106.8825912475586, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1421303749084473, "rewards/margins": 10.603202819824219, "rewards/rejected": -7.461071968078613, "step": 780 }, { "epoch": 0.5339258246453598, "grad_norm": 0.0030634189024567604, "learning_rate": 4.821085056462168e-05, "logits/chosen": -8.897279739379883, "logits/rejected": -8.887443542480469, "logps/chosen": -4.452642440795898, "logps/rejected": -105.54930877685547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7443461418151855, "rewards/margins": 10.027786254882812, "rewards/rejected": -7.283441066741943, "step": 781 }, { "epoch": 0.5346094684669287, "grad_norm": 0.0018481542356312275, "learning_rate": 4.822011255099747e-05, "logits/chosen": -9.7228364944458, "logits/rejected": -9.712964057922363, "logps/chosen": -1.6563634872436523, "logps/rejected": -106.06796264648438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9549953937530518, "rewards/margins": 10.421481132507324, "rewards/rejected": -7.466485977172852, "step": 782 }, { "epoch": 0.5352931122884977, "grad_norm": 0.003446885384619236, "learning_rate": 4.8229362700965726e-05, "logits/chosen": -9.4985990524292, "logits/rejected": -9.490641593933105, "logps/chosen": -3.3762319087982178, "logps/rejected": -105.28634643554688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.640120267868042, "rewards/margins": 10.058067321777344, "rewards/rejected": -7.417947292327881, "step": 783 }, { "epoch": 0.5359767561100667, "grad_norm": 0.0011916556395590305, "learning_rate": 4.8238601044740645e-05, "logits/chosen": -9.054402351379395, "logits/rejected": -9.046116828918457, "logps/chosen": -4.135790824890137, "logps/rejected": -105.05474853515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.745279550552368, "rewards/margins": 10.013723373413086, "rewards/rejected": -7.268444061279297, "step": 784 }, { "epoch": 0.5366603999316356, "grad_norm": 0.0015317384386435151, "learning_rate": 4.824782761242088e-05, "logits/chosen": -8.997936248779297, "logits/rejected": -8.986723899841309, "logps/chosen": -4.468085289001465, "logps/rejected": -106.02771759033203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.768218755722046, "rewards/margins": 10.101882934570312, "rewards/rejected": -7.333664894104004, "step": 785 }, { "epoch": 0.5373440437532045, "grad_norm": 0.001560779637657106, "learning_rate": 4.8257042433990135e-05, "logits/chosen": -8.496073722839355, "logits/rejected": -8.48560905456543, "logps/chosen": -1.7716690301895142, "logps/rejected": -106.17747497558594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0535593032836914, "rewards/margins": 10.42155647277832, "rewards/rejected": -7.367997169494629, "step": 786 }, { "epoch": 0.5380276875747736, "grad_norm": 0.00246842741034925, "learning_rate": 4.826624553931775e-05, "logits/chosen": -9.546893119812012, "logits/rejected": -9.537055969238281, "logps/chosen": -4.5737433433532715, "logps/rejected": -104.51683807373047, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.64176082611084, "rewards/margins": 9.984384536743164, "rewards/rejected": -7.342623710632324, "step": 787 }, { "epoch": 0.5387113313963425, "grad_norm": 0.0018781605176627636, "learning_rate": 4.827543695815926e-05, "logits/chosen": -9.003469467163086, "logits/rejected": -8.992547988891602, "logps/chosen": -2.6186249256134033, "logps/rejected": -106.69450378417969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.938373565673828, "rewards/margins": 10.398786544799805, "rewards/rejected": -7.460413932800293, "step": 788 }, { "epoch": 0.5393949752179115, "grad_norm": 0.0020011502783745527, "learning_rate": 4.8284616720157006e-05, "logits/chosen": -9.48122787475586, "logits/rejected": -9.472023010253906, "logps/chosen": -8.326510429382324, "logps/rejected": -106.5345230102539, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.411715269088745, "rewards/margins": 9.776873588562012, "rewards/rejected": -7.3651580810546875, "step": 789 }, { "epoch": 0.5400786190394804, "grad_norm": 0.0016137039056047797, "learning_rate": 4.82937848548407e-05, "logits/chosen": -8.857290267944336, "logits/rejected": -8.8453369140625, "logps/chosen": -3.4849143028259277, "logps/rejected": -105.38690185546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.791872978210449, "rewards/margins": 10.197467803955078, "rewards/rejected": -7.405594825744629, "step": 790 }, { "epoch": 0.5407622628610494, "grad_norm": 0.0018592324340716004, "learning_rate": 4.8302941391627947e-05, "logits/chosen": -8.508340835571289, "logits/rejected": -8.496508598327637, "logps/chosen": -2.8011178970336914, "logps/rejected": -107.08613586425781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9344592094421387, "rewards/margins": 10.402288436889648, "rewards/rejected": -7.467828750610352, "step": 791 }, { "epoch": 0.5414459066826184, "grad_norm": 0.0015265436377376318, "learning_rate": 4.83120863598249e-05, "logits/chosen": -9.28333854675293, "logits/rejected": -9.272224426269531, "logps/chosen": -2.748241901397705, "logps/rejected": -104.84828186035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.762310266494751, "rewards/margins": 10.17378044128418, "rewards/rejected": -7.411469459533691, "step": 792 }, { "epoch": 0.5421295505041873, "grad_norm": 0.001331367064267397, "learning_rate": 4.832121978862673e-05, "logits/chosen": -8.787428855895996, "logits/rejected": -8.778520584106445, "logps/chosen": -3.5350098609924316, "logps/rejected": -106.45716857910156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.865309476852417, "rewards/margins": 10.205279350280762, "rewards/rejected": -7.339970111846924, "step": 793 }, { "epoch": 0.5428131943257563, "grad_norm": 0.003385082585737109, "learning_rate": 4.8330341707118276e-05, "logits/chosen": -8.777369499206543, "logits/rejected": -8.766924858093262, "logps/chosen": -1.5776218175888062, "logps/rejected": -106.02197265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0779225826263428, "rewards/margins": 10.444441795349121, "rewards/rejected": -7.366518974304199, "step": 794 }, { "epoch": 0.5434968381473252, "grad_norm": 0.001699388725683093, "learning_rate": 4.833945214427451e-05, "logits/chosen": -8.549946784973145, "logits/rejected": -8.538536071777344, "logps/chosen": -2.650611400604248, "logps/rejected": -107.41838073730469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.037973403930664, "rewards/margins": 10.458346366882324, "rewards/rejected": -7.420372009277344, "step": 795 }, { "epoch": 0.5441804819688942, "grad_norm": 0.0022188269067555666, "learning_rate": 4.834855112896116e-05, "logits/chosen": -9.629668235778809, "logits/rejected": -9.619180679321289, "logps/chosen": -1.9213364124298096, "logps/rejected": -106.39653778076172, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8981950283050537, "rewards/margins": 10.344661712646484, "rewards/rejected": -7.446466445922852, "step": 796 }, { "epoch": 0.5448641257904632, "grad_norm": 0.001163344830274582, "learning_rate": 4.835763868993521e-05, "logits/chosen": -9.18748664855957, "logits/rejected": -9.178380966186523, "logps/chosen": -9.292892456054688, "logps/rejected": -103.35055541992188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.250108242034912, "rewards/margins": 9.277664184570312, "rewards/rejected": -7.0275559425354, "step": 797 }, { "epoch": 0.5455477696120321, "grad_norm": 0.0010576486820355058, "learning_rate": 4.8366714855845496e-05, "logits/chosen": -8.722304344177246, "logits/rejected": -8.712569236755371, "logps/chosen": -0.5775929689407349, "logps/rejected": -107.63465118408203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0894103050231934, "rewards/margins": 10.642189025878906, "rewards/rejected": -7.552778244018555, "step": 798 }, { "epoch": 0.546231413433601, "grad_norm": 0.0009824035223573446, "learning_rate": 4.837577965523319e-05, "logits/chosen": -9.314172744750977, "logits/rejected": -9.30272388458252, "logps/chosen": -0.27050837874412537, "logps/rejected": -107.91744995117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.132232666015625, "rewards/margins": 10.74184799194336, "rewards/rejected": -7.609615325927734, "step": 799 }, { "epoch": 0.5469150572551701, "grad_norm": 0.0009672934538684785, "learning_rate": 4.8384833116532396e-05, "logits/chosen": -9.255950927734375, "logits/rejected": -9.242000579833984, "logps/chosen": -1.064619541168213, "logps/rejected": -107.72248840332031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1061859130859375, "rewards/margins": 10.659305572509766, "rewards/rejected": -7.553120136260986, "step": 800 }, { "epoch": 0.547598701076739, "grad_norm": 0.0009379958501085639, "learning_rate": 4.8393875268070636e-05, "logits/chosen": -8.961837768554688, "logits/rejected": -8.953292846679688, "logps/chosen": -0.3139047622680664, "logps/rejected": -107.82954406738281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.067283868789673, "rewards/margins": 10.652536392211914, "rewards/rejected": -7.585251808166504, "step": 801 }, { "epoch": 0.548282344898308, "grad_norm": 0.003889232873916626, "learning_rate": 4.84029061380694e-05, "logits/chosen": -9.25599479675293, "logits/rejected": -9.243898391723633, "logps/chosen": -5.036285400390625, "logps/rejected": -105.86213684082031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.720874309539795, "rewards/margins": 10.08654499053955, "rewards/rejected": -7.365670204162598, "step": 802 }, { "epoch": 0.5489659887198769, "grad_norm": 0.0016888697864487767, "learning_rate": 4.841192575464469e-05, "logits/chosen": -8.582124710083008, "logits/rejected": -8.568196296691895, "logps/chosen": -2.720304250717163, "logps/rejected": -106.48416900634766, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.004563808441162, "rewards/margins": 10.380496978759766, "rewards/rejected": -7.375932693481445, "step": 803 }, { "epoch": 0.5496496325414459, "grad_norm": 0.0025704000145196915, "learning_rate": 4.842093414580753e-05, "logits/chosen": -9.675749778747559, "logits/rejected": -9.662240982055664, "logps/chosen": -4.822721004486084, "logps/rejected": -106.67796325683594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.737574338912964, "rewards/margins": 10.185797691345215, "rewards/rejected": -7.448222637176514, "step": 804 }, { "epoch": 0.5503332763630149, "grad_norm": 0.0008364578825421631, "learning_rate": 4.842993133946448e-05, "logits/chosen": -8.951029777526855, "logits/rejected": -8.940197944641113, "logps/chosen": -5.358844757080078, "logps/rejected": -104.79487609863281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.764416217803955, "rewards/margins": 9.950594902038574, "rewards/rejected": -7.186178684234619, "step": 805 }, { "epoch": 0.5510169201845838, "grad_norm": 0.0007913350709713995, "learning_rate": 4.843891736341818e-05, "logits/chosen": -9.378564834594727, "logits/rejected": -9.366779327392578, "logps/chosen": -6.283114433288574, "logps/rejected": -105.24431610107422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6897571086883545, "rewards/margins": 9.838534355163574, "rewards/rejected": -7.148777961730957, "step": 806 }, { "epoch": 0.5517005640061527, "grad_norm": 0.0014447914436459541, "learning_rate": 4.8447892245367846e-05, "logits/chosen": -8.96488094329834, "logits/rejected": -8.955078125, "logps/chosen": -2.1861910820007324, "logps/rejected": -107.43557739257812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.907226324081421, "rewards/margins": 10.482337951660156, "rewards/rejected": -7.575112342834473, "step": 807 }, { "epoch": 0.5523842078277218, "grad_norm": 0.0011227658251300454, "learning_rate": 4.845685601290977e-05, "logits/chosen": -8.366279602050781, "logits/rejected": -8.356929779052734, "logps/chosen": -1.9131747484207153, "logps/rejected": -106.81945037841797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.90958833694458, "rewards/margins": 10.432117462158203, "rewards/rejected": -7.522529125213623, "step": 808 }, { "epoch": 0.5530678516492907, "grad_norm": 2.022292137145996, "learning_rate": 4.846580869353787e-05, "logits/chosen": -9.607446670532227, "logits/rejected": -9.597162246704102, "logps/chosen": -6.564858913421631, "logps/rejected": -106.23641967773438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.534895420074463, "rewards/margins": 9.83066463470459, "rewards/rejected": -7.295768737792969, "step": 809 }, { "epoch": 0.5537514954708597, "grad_norm": 0.0040961322374641895, "learning_rate": 4.847475031464416e-05, "logits/chosen": -9.892683029174805, "logits/rejected": -9.87844181060791, "logps/chosen": -0.19322240352630615, "logps/rejected": -107.98269653320312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.135653495788574, "rewards/margins": 10.719234466552734, "rewards/rejected": -7.583580017089844, "step": 810 }, { "epoch": 0.5544351392924286, "grad_norm": 0.0008967083995230496, "learning_rate": 4.8483680903519274e-05, "logits/chosen": -9.170083999633789, "logits/rejected": -9.159222602844238, "logps/chosen": -0.2898944616317749, "logps/rejected": -108.25603485107422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1141176223754883, "rewards/margins": 10.768651962280273, "rewards/rejected": -7.654534339904785, "step": 811 }, { "epoch": 0.5551187831139976, "grad_norm": 0.0009844796732068062, "learning_rate": 4.8492600487352926e-05, "logits/chosen": -8.713384628295898, "logits/rejected": -8.700687408447266, "logps/chosen": -5.733950138092041, "logps/rejected": -104.93241882324219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.785637378692627, "rewards/margins": 9.940799713134766, "rewards/rejected": -7.155162811279297, "step": 812 }, { "epoch": 0.5558024269355666, "grad_norm": 0.0010318891145288944, "learning_rate": 4.850150909323447e-05, "logits/chosen": -9.09451675415039, "logits/rejected": -9.081742286682129, "logps/chosen": -1.8747305870056152, "logps/rejected": -106.9559555053711, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.970749616622925, "rewards/margins": 10.482893943786621, "rewards/rejected": -7.512145042419434, "step": 813 }, { "epoch": 0.5564860707571355, "grad_norm": 0.0010485502425581217, "learning_rate": 4.8510406748153355e-05, "logits/chosen": -9.207686424255371, "logits/rejected": -9.196195602416992, "logps/chosen": -1.9893051385879517, "logps/rejected": -106.98690032958984, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.034259796142578, "rewards/margins": 10.427892684936523, "rewards/rejected": -7.3936333656311035, "step": 814 }, { "epoch": 0.5571697145787045, "grad_norm": 0.0012036035768687725, "learning_rate": 4.8519293478999614e-05, "logits/chosen": -9.170076370239258, "logits/rejected": -9.157506942749023, "logps/chosen": -4.056570529937744, "logps/rejected": -106.064697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.83510160446167, "rewards/margins": 10.161787033081055, "rewards/rejected": -7.326685905456543, "step": 815 }, { "epoch": 0.5578533584002735, "grad_norm": 0.0008995429379865527, "learning_rate": 4.8528169312564355e-05, "logits/chosen": -9.098052024841309, "logits/rejected": -9.085901260375977, "logps/chosen": -2.0052132606506348, "logps/rejected": -107.00572204589844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.967607021331787, "rewards/margins": 10.463644027709961, "rewards/rejected": -7.496037006378174, "step": 816 }, { "epoch": 0.5585370022218424, "grad_norm": 0.0017718507442623377, "learning_rate": 4.8537034275540264e-05, "logits/chosen": -9.347105026245117, "logits/rejected": -9.33591079711914, "logps/chosen": -7.088460922241211, "logps/rejected": -105.02359771728516, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.573558807373047, "rewards/margins": 9.70557689666748, "rewards/rejected": -7.132018089294434, "step": 817 }, { "epoch": 0.5592206460434114, "grad_norm": 0.0007968979771248996, "learning_rate": 4.854588839452205e-05, "logits/chosen": -8.763313293457031, "logits/rejected": -8.750354766845703, "logps/chosen": -2.325188636779785, "logps/rejected": -107.60749816894531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.980172634124756, "rewards/margins": 10.563252449035645, "rewards/rejected": -7.583079814910889, "step": 818 }, { "epoch": 0.5599042898649803, "grad_norm": 0.001000613789074123, "learning_rate": 4.855473169600698e-05, "logits/chosen": -9.046760559082031, "logits/rejected": -9.033550262451172, "logps/chosen": -2.0523149967193604, "logps/rejected": -107.04121398925781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0647404193878174, "rewards/margins": 10.522361755371094, "rewards/rejected": -7.4576215744018555, "step": 819 }, { "epoch": 0.5605879336865494, "grad_norm": 0.0009813508950173855, "learning_rate": 4.856356420639528e-05, "logits/chosen": -9.120138168334961, "logits/rejected": -9.106664657592773, "logps/chosen": -8.507831573486328, "logps/rejected": -104.4205093383789, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.447463035583496, "rewards/margins": 9.61520767211914, "rewards/rejected": -7.1677446365356445, "step": 820 }, { "epoch": 0.5612715775081183, "grad_norm": 0.0009627667022868991, "learning_rate": 4.857238595199068e-05, "logits/chosen": -8.996660232543945, "logits/rejected": -8.982248306274414, "logps/chosen": -0.16059431433677673, "logps/rejected": -108.1725845336914, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1105165481567383, "rewards/margins": 10.729573249816895, "rewards/rejected": -7.61905574798584, "step": 821 }, { "epoch": 0.5619552213296872, "grad_norm": 0.00123152369633317, "learning_rate": 4.858119695900084e-05, "logits/chosen": -8.943631172180176, "logits/rejected": -8.929767608642578, "logps/chosen": -3.9173130989074707, "logps/rejected": -106.66258239746094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.781510829925537, "rewards/margins": 10.226255416870117, "rewards/rejected": -7.44474458694458, "step": 822 }, { "epoch": 0.5626388651512562, "grad_norm": 0.001574164954945445, "learning_rate": 4.858999725353783e-05, "logits/chosen": -8.096368789672852, "logits/rejected": -8.087187767028809, "logps/chosen": -5.832483291625977, "logps/rejected": -104.30200958251953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6617166996002197, "rewards/margins": 9.774314880371094, "rewards/rejected": -7.112598419189453, "step": 823 }, { "epoch": 0.5633225089728252, "grad_norm": 0.0011717285960912704, "learning_rate": 4.8598786861618605e-05, "logits/chosen": -8.995145797729492, "logits/rejected": -8.983595848083496, "logps/chosen": -2.242673635482788, "logps/rejected": -107.18565368652344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8963329792022705, "rewards/margins": 10.469146728515625, "rewards/rejected": -7.572813987731934, "step": 824 }, { "epoch": 0.5640061527943941, "grad_norm": 0.0015995175344869494, "learning_rate": 4.860756580916542e-05, "logits/chosen": -8.38247013092041, "logits/rejected": -8.369697570800781, "logps/chosen": -2.0025246143341064, "logps/rejected": -108.1111831665039, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0516769886016846, "rewards/margins": 10.584585189819336, "rewards/rejected": -7.532907485961914, "step": 825 }, { "epoch": 0.5646897966159631, "grad_norm": 0.002977329771965742, "learning_rate": 4.861633412200637e-05, "logits/chosen": -9.265241622924805, "logits/rejected": -9.253007888793945, "logps/chosen": -2.803997755050659, "logps/rejected": -107.49052429199219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.867748260498047, "rewards/margins": 10.459047317504883, "rewards/rejected": -7.591299057006836, "step": 826 }, { "epoch": 0.565373440437532, "grad_norm": 0.0014601795701310039, "learning_rate": 4.862509182587578e-05, "logits/chosen": -9.133916854858398, "logits/rejected": -9.120689392089844, "logps/chosen": -2.9206948280334473, "logps/rejected": -105.48797607421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9147088527679443, "rewards/margins": 10.213600158691406, "rewards/rejected": -7.298890590667725, "step": 827 }, { "epoch": 0.5660570842591011, "grad_norm": 0.0015190003905445337, "learning_rate": 4.863383894641467e-05, "logits/chosen": -8.27790355682373, "logits/rejected": -8.270221710205078, "logps/chosen": -3.5125787258148193, "logps/rejected": -105.9957504272461, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.825726270675659, "rewards/margins": 10.13892936706543, "rewards/rejected": -7.313203811645508, "step": 828 }, { "epoch": 0.56674072808067, "grad_norm": 0.0014953408390283585, "learning_rate": 4.864257550917123e-05, "logits/chosen": -8.840639114379883, "logits/rejected": -8.833141326904297, "logps/chosen": -4.178999900817871, "logps/rejected": -105.8759536743164, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6988601684570312, "rewards/margins": 10.033941268920898, "rewards/rejected": -7.335081100463867, "step": 829 }, { "epoch": 0.5674243719022389, "grad_norm": 0.0008127799374051392, "learning_rate": 4.865130153960124e-05, "logits/chosen": -9.078328132629395, "logits/rejected": -9.06698226928711, "logps/chosen": -2.1028103828430176, "logps/rejected": -107.44087219238281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9078662395477295, "rewards/margins": 10.454914093017578, "rewards/rejected": -7.547047138214111, "step": 830 }, { "epoch": 0.5681080157238079, "grad_norm": 0.001359988353215158, "learning_rate": 4.8660017063068526e-05, "logits/chosen": -8.586751937866211, "logits/rejected": -8.572832107543945, "logps/chosen": -1.9419028759002686, "logps/rejected": -107.03657531738281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9806880950927734, "rewards/margins": 10.458718299865723, "rewards/rejected": -7.478029251098633, "step": 831 }, { "epoch": 0.5687916595453768, "grad_norm": 0.0013002261985093355, "learning_rate": 4.8668722104845403e-05, "logits/chosen": -8.766027450561523, "logits/rejected": -8.752252578735352, "logps/chosen": -1.9603271484375, "logps/rejected": -107.32257080078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.045368194580078, "rewards/margins": 10.569068908691406, "rewards/rejected": -7.523700714111328, "step": 832 }, { "epoch": 0.5694753033669459, "grad_norm": 0.0009204475209116936, "learning_rate": 4.8677416690113134e-05, "logits/chosen": -8.889400482177734, "logits/rejected": -8.878293991088867, "logps/chosen": -7.191109657287598, "logps/rejected": -104.94508361816406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.468621015548706, "rewards/margins": 9.667960166931152, "rewards/rejected": -7.199339389801025, "step": 833 }, { "epoch": 0.5701589471885148, "grad_norm": 0.001754283206537366, "learning_rate": 4.868610084396232e-05, "logits/chosen": -8.604387283325195, "logits/rejected": -8.594934463500977, "logps/chosen": -6.1422576904296875, "logps/rejected": -106.0280532836914, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5920207500457764, "rewards/margins": 9.872902870178223, "rewards/rejected": -7.280881881713867, "step": 834 }, { "epoch": 0.5708425910100837, "grad_norm": 0.0007637017988599837, "learning_rate": 4.869477459139337e-05, "logits/chosen": -8.446769714355469, "logits/rejected": -8.434442520141602, "logps/chosen": -3.531569242477417, "logps/rejected": -106.1651382446289, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.938547134399414, "rewards/margins": 10.250688552856445, "rewards/rejected": -7.312140941619873, "step": 835 }, { "epoch": 0.5715262348316527, "grad_norm": 0.0011212369427084923, "learning_rate": 4.870343795731694e-05, "logits/chosen": -8.650309562683105, "logits/rejected": -8.635127067565918, "logps/chosen": -0.19749023020267487, "logps/rejected": -108.76956176757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1673853397369385, "rewards/margins": 10.82502555847168, "rewards/rejected": -7.65764045715332, "step": 836 }, { "epoch": 0.5722098786532217, "grad_norm": 0.0009314444032497704, "learning_rate": 4.8712090966554334e-05, "logits/chosen": -8.65327262878418, "logits/rejected": -8.642313003540039, "logps/chosen": -6.791860103607178, "logps/rejected": -107.11813354492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4393367767333984, "rewards/margins": 9.988226890563965, "rewards/rejected": -7.548890113830566, "step": 837 }, { "epoch": 0.5728935224747906, "grad_norm": 0.0009790982585400343, "learning_rate": 4.872073364383795e-05, "logits/chosen": -8.872349739074707, "logits/rejected": -8.859280586242676, "logps/chosen": -2.6891181468963623, "logps/rejected": -108.08426666259766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0118255615234375, "rewards/margins": 10.574373245239258, "rewards/rejected": -7.562548637390137, "step": 838 }, { "epoch": 0.5735771662963596, "grad_norm": 0.0016522224759683013, "learning_rate": 4.8729366013811674e-05, "logits/chosen": -8.529397010803223, "logits/rejected": -8.518332481384277, "logps/chosen": -5.293834209442139, "logps/rejected": -105.35260009765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.670947551727295, "rewards/margins": 9.899978637695312, "rewards/rejected": -7.229031562805176, "step": 839 }, { "epoch": 0.5742608101179285, "grad_norm": 0.02520020864903927, "learning_rate": 4.8737988101031366e-05, "logits/chosen": -8.532392501831055, "logits/rejected": -8.520427703857422, "logps/chosen": -0.16107018291950226, "logps/rejected": -108.83419036865234, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.11252498626709, "rewards/margins": 10.787261962890625, "rewards/rejected": -7.674737453460693, "step": 840 }, { "epoch": 0.5749444539394976, "grad_norm": 0.0014226485509425402, "learning_rate": 4.874659992996521e-05, "logits/chosen": -7.966912269592285, "logits/rejected": -7.954341888427734, "logps/chosen": -1.7800439596176147, "logps/rejected": -107.47384643554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0897140502929688, "rewards/margins": 10.585848808288574, "rewards/rejected": -7.496133804321289, "step": 841 }, { "epoch": 0.5756280977610665, "grad_norm": 0.001820364035665989, "learning_rate": 4.875520152499416e-05, "logits/chosen": -8.051290512084961, "logits/rejected": -8.039520263671875, "logps/chosen": -4.961377143859863, "logps/rejected": -105.47537231445312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6829142570495605, "rewards/margins": 9.991338729858398, "rewards/rejected": -7.3084259033203125, "step": 842 }, { "epoch": 0.5763117415826354, "grad_norm": 0.0009659876814112067, "learning_rate": 4.876379291041238e-05, "logits/chosen": -9.045863151550293, "logits/rejected": -9.032997131347656, "logps/chosen": -1.6741887331008911, "logps/rejected": -107.53939819335938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9928150177001953, "rewards/margins": 10.520874977111816, "rewards/rejected": -7.528059482574463, "step": 843 }, { "epoch": 0.5769953854042044, "grad_norm": 0.000756710534915328, "learning_rate": 4.8772374110427594e-05, "logits/chosen": -8.779281616210938, "logits/rejected": -8.763463020324707, "logps/chosen": -0.577763557434082, "logps/rejected": -107.52088165283203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.180652379989624, "rewards/margins": 10.709561347961426, "rewards/rejected": -7.528909683227539, "step": 844 }, { "epoch": 0.5776790292257734, "grad_norm": 0.0020036373753100634, "learning_rate": 4.878094514916154e-05, "logits/chosen": -8.901328086853027, "logits/rejected": -8.889524459838867, "logps/chosen": -2.6727652549743652, "logps/rejected": -107.16747283935547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8762781620025635, "rewards/margins": 10.334615707397461, "rewards/rejected": -7.458337783813477, "step": 845 }, { "epoch": 0.5783626730473423, "grad_norm": 0.00150903663598001, "learning_rate": 4.8789506050650396e-05, "logits/chosen": -8.488471031188965, "logits/rejected": -8.476822853088379, "logps/chosen": -1.9726049900054932, "logps/rejected": -108.10281372070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0048654079437256, "rewards/margins": 10.548591613769531, "rewards/rejected": -7.543725967407227, "step": 846 }, { "epoch": 0.5790463168689113, "grad_norm": 0.0008858887595124543, "learning_rate": 4.879805683884512e-05, "logits/chosen": -8.690950393676758, "logits/rejected": -8.67811107635498, "logps/chosen": -4.710376739501953, "logps/rejected": -105.438720703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.690246105194092, "rewards/margins": 10.042668342590332, "rewards/rejected": -7.352421283721924, "step": 847 }, { "epoch": 0.5797299606904802, "grad_norm": 0.0010642919223755598, "learning_rate": 4.8806597537611906e-05, "logits/chosen": -9.079977035522461, "logits/rejected": -9.068734169006348, "logps/chosen": -3.3016464710235596, "logps/rejected": -105.89403533935547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.832724094390869, "rewards/margins": 10.193729400634766, "rewards/rejected": -7.361005783081055, "step": 848 }, { "epoch": 0.5804136045120493, "grad_norm": 0.0012188830878585577, "learning_rate": 4.881512817073255e-05, "logits/chosen": -8.048458099365234, "logits/rejected": -8.03650951385498, "logps/chosen": -4.437312602996826, "logps/rejected": -105.83868408203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9250071048736572, "rewards/margins": 10.187154769897461, "rewards/rejected": -7.262147426605225, "step": 849 }, { "epoch": 0.5810972483336182, "grad_norm": 0.0009205377427861094, "learning_rate": 4.882364876190489e-05, "logits/chosen": -8.870341300964355, "logits/rejected": -8.856063842773438, "logps/chosen": -0.1789657175540924, "logps/rejected": -109.24922180175781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1914658546447754, "rewards/margins": 10.82913875579834, "rewards/rejected": -7.6376729011535645, "step": 850 }, { "epoch": 0.5817808921551871, "grad_norm": 0.0012709468137472868, "learning_rate": 4.8832159334743136e-05, "logits/chosen": -8.95004653930664, "logits/rejected": -8.939218521118164, "logps/chosen": -6.327701091766357, "logps/rejected": -106.12860870361328, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5969085693359375, "rewards/margins": 9.963525772094727, "rewards/rejected": -7.366616249084473, "step": 851 }, { "epoch": 0.5824645359767561, "grad_norm": 0.0007027096580713987, "learning_rate": 4.884065991277833e-05, "logits/chosen": -8.552252769470215, "logits/rejected": -8.542703628540039, "logps/chosen": -5.573062896728516, "logps/rejected": -106.2645263671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.655749797821045, "rewards/margins": 9.974966049194336, "rewards/rejected": -7.319215774536133, "step": 852 }, { "epoch": 0.5831481797983251, "grad_norm": 0.013132927007973194, "learning_rate": 4.8849150519458726e-05, "logits/chosen": -9.422569274902344, "logits/rejected": -9.407750129699707, "logps/chosen": -1.6474640369415283, "logps/rejected": -108.02268981933594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.928171157836914, "rewards/margins": 10.573081970214844, "rewards/rejected": -7.644908905029297, "step": 853 }, { "epoch": 0.583831823619894, "grad_norm": 0.0009187680552713573, "learning_rate": 4.885763117815009e-05, "logits/chosen": -8.693428039550781, "logits/rejected": -8.679885864257812, "logps/chosen": -5.013706684112549, "logps/rejected": -105.42494201660156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7598907947540283, "rewards/margins": 10.018733978271484, "rewards/rejected": -7.258842945098877, "step": 854 }, { "epoch": 0.584515467441463, "grad_norm": 0.0027828323654830456, "learning_rate": 4.886610191213622e-05, "logits/chosen": -8.400545120239258, "logits/rejected": -8.384855270385742, "logps/chosen": -1.9845664501190186, "logps/rejected": -108.50321960449219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.993105411529541, "rewards/margins": 10.583975791931152, "rewards/rejected": -7.590869903564453, "step": 855 }, { "epoch": 0.5851991112630319, "grad_norm": 0.0009296465432271361, "learning_rate": 4.887456274461922e-05, "logits/chosen": -8.526435852050781, "logits/rejected": -8.511405944824219, "logps/chosen": -0.14297965168952942, "logps/rejected": -109.43601989746094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.118213653564453, "rewards/margins": 10.906146049499512, "rewards/rejected": -7.7879319190979, "step": 856 }, { "epoch": 0.585882755084601, "grad_norm": 0.001235453994013369, "learning_rate": 4.8883013698719973e-05, "logits/chosen": -9.227710723876953, "logits/rejected": -9.211897850036621, "logps/chosen": -4.254585266113281, "logps/rejected": -106.89192199707031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8123793601989746, "rewards/margins": 10.221610069274902, "rewards/rejected": -7.409230709075928, "step": 857 }, { "epoch": 0.5865663989061699, "grad_norm": 0.0006839464185759425, "learning_rate": 4.889145479747843e-05, "logits/chosen": -8.334935188293457, "logits/rejected": -8.323939323425293, "logps/chosen": -2.277338743209839, "logps/rejected": -107.76860809326172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8639392852783203, "rewards/margins": 10.540955543518066, "rewards/rejected": -7.677015781402588, "step": 858 }, { "epoch": 0.5872500427277388, "grad_norm": 0.0008398873033002019, "learning_rate": 4.889988606385404e-05, "logits/chosen": -8.622016906738281, "logits/rejected": -8.608774185180664, "logps/chosen": -4.231171131134033, "logps/rejected": -107.97203063964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.752927303314209, "rewards/margins": 10.293793678283691, "rewards/rejected": -7.540866374969482, "step": 859 }, { "epoch": 0.5879336865493078, "grad_norm": 0.0010857522720471025, "learning_rate": 4.8908307520726135e-05, "logits/chosen": -8.26314926147461, "logits/rejected": -8.251432418823242, "logps/chosen": -6.495090484619141, "logps/rejected": -105.52587127685547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6565513610839844, "rewards/margins": 9.945944786071777, "rewards/rejected": -7.289393901824951, "step": 860 }, { "epoch": 0.5886173303708768, "grad_norm": 0.0010462123900651932, "learning_rate": 4.891671919089425e-05, "logits/chosen": -8.87281322479248, "logits/rejected": -8.858566284179688, "logps/chosen": -1.9402683973312378, "logps/rejected": -106.7776870727539, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.932738780975342, "rewards/margins": 10.427818298339844, "rewards/rejected": -7.495079517364502, "step": 861 }, { "epoch": 0.5893009741924458, "grad_norm": 0.0010260014096274972, "learning_rate": 4.892512109707855e-05, "logits/chosen": -8.819645881652832, "logits/rejected": -8.804654121398926, "logps/chosen": -0.1365620493888855, "logps/rejected": -109.17929077148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.161818265914917, "rewards/margins": 10.891536712646484, "rewards/rejected": -7.72971773147583, "step": 862 }, { "epoch": 0.5899846180140147, "grad_norm": 0.0009000278660096228, "learning_rate": 4.893351326192016e-05, "logits/chosen": -8.624185562133789, "logits/rejected": -8.611123085021973, "logps/chosen": -2.5815725326538086, "logps/rejected": -107.78102111816406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.927499294281006, "rewards/margins": 10.48701286315918, "rewards/rejected": -7.559513568878174, "step": 863 }, { "epoch": 0.5906682618355836, "grad_norm": 0.0006806828314438462, "learning_rate": 4.894189570798156e-05, "logits/chosen": -8.804571151733398, "logits/rejected": -8.794509887695312, "logps/chosen": -0.15099740028381348, "logps/rejected": -109.7205810546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.10532283782959, "rewards/margins": 10.867586135864258, "rewards/rejected": -7.762263298034668, "step": 864 }, { "epoch": 0.5913519056571527, "grad_norm": 0.0010690035996958613, "learning_rate": 4.895026845774691e-05, "logits/chosen": -9.291401863098145, "logits/rejected": -9.277257919311523, "logps/chosen": -0.21176457405090332, "logps/rejected": -109.53131866455078, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1095502376556396, "rewards/margins": 10.906033515930176, "rewards/rejected": -7.796483039855957, "step": 865 }, { "epoch": 0.5920355494787216, "grad_norm": 0.0009427561890333891, "learning_rate": 4.895863153362244e-05, "logits/chosen": -8.37058162689209, "logits/rejected": -8.36046028137207, "logps/chosen": -1.3786929845809937, "logps/rejected": -107.19212341308594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0502023696899414, "rewards/margins": 10.5453519821167, "rewards/rejected": -7.495149612426758, "step": 866 }, { "epoch": 0.5927191933002905, "grad_norm": 0.0010906008537858725, "learning_rate": 4.896698495793684e-05, "logits/chosen": -9.685840606689453, "logits/rejected": -9.665777206420898, "logps/chosen": -3.7171781063079834, "logps/rejected": -107.23562622070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.825676202774048, "rewards/margins": 10.358489036560059, "rewards/rejected": -7.532812595367432, "step": 867 }, { "epoch": 0.5934028371218595, "grad_norm": 0.003471185453236103, "learning_rate": 4.897532875294154e-05, "logits/chosen": -9.190020561218262, "logits/rejected": -9.177166938781738, "logps/chosen": -4.305331707000732, "logps/rejected": -107.89705657958984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7521891593933105, "rewards/margins": 10.274447441101074, "rewards/rejected": -7.522258281707764, "step": 868 }, { "epoch": 0.5940864809434284, "grad_norm": 0.0008185533224605024, "learning_rate": 4.8983662940811115e-05, "logits/chosen": -9.069205284118652, "logits/rejected": -9.057533264160156, "logps/chosen": -1.8760472536087036, "logps/rejected": -107.90416717529297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9556050300598145, "rewards/margins": 10.514318466186523, "rewards/rejected": -7.558713912963867, "step": 869 }, { "epoch": 0.5947701247649975, "grad_norm": 0.001321829273365438, "learning_rate": 4.899198754364365e-05, "logits/chosen": -9.517535209655762, "logits/rejected": -9.501310348510742, "logps/chosen": -1.920157551765442, "logps/rejected": -108.91116333007812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9509215354919434, "rewards/margins": 10.66656494140625, "rewards/rejected": -7.715642929077148, "step": 870 }, { "epoch": 0.5954537685865664, "grad_norm": 0.0010625177528709173, "learning_rate": 4.900030258346106e-05, "logits/chosen": -8.521032333374023, "logits/rejected": -8.50784683227539, "logps/chosen": -5.377965927124023, "logps/rejected": -106.58120727539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7212040424346924, "rewards/margins": 10.06932258605957, "rewards/rejected": -7.348118782043457, "step": 871 }, { "epoch": 0.5961374124081353, "grad_norm": 0.0006495018606074154, "learning_rate": 4.900860808220946e-05, "logits/chosen": -8.794100761413574, "logits/rejected": -8.780953407287598, "logps/chosen": -0.1257757544517517, "logps/rejected": -109.71985626220703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.148578405380249, "rewards/margins": 10.876506805419922, "rewards/rejected": -7.727928161621094, "step": 872 }, { "epoch": 0.5968210562297043, "grad_norm": 0.0007179192616604269, "learning_rate": 4.90169040617595e-05, "logits/chosen": -8.43160343170166, "logits/rejected": -8.41800594329834, "logps/chosen": -5.279450416564941, "logps/rejected": -105.90161895751953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6812503337860107, "rewards/margins": 9.996578216552734, "rewards/rejected": -7.315328598022461, "step": 873 }, { "epoch": 0.5975047000512733, "grad_norm": 0.010327671654522419, "learning_rate": 4.9025190543906715e-05, "logits/chosen": -8.324527740478516, "logits/rejected": -8.312158584594727, "logps/chosen": -4.052178859710693, "logps/rejected": -107.87213134765625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.7998688220977783, "rewards/margins": 10.300848007202148, "rewards/rejected": -7.500978469848633, "step": 874 }, { "epoch": 0.5981883438728423, "grad_norm": 0.0008702539489604533, "learning_rate": 4.903346755037189e-05, "logits/chosen": -8.536144256591797, "logits/rejected": -8.522905349731445, "logps/chosen": -2.8509674072265625, "logps/rejected": -106.87882995605469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9018704891204834, "rewards/margins": 10.346318244934082, "rewards/rejected": -7.4444475173950195, "step": 875 }, { "epoch": 0.5988719876944112, "grad_norm": 0.007416768930852413, "learning_rate": 4.904173510280135e-05, "logits/chosen": -9.681302070617676, "logits/rejected": -9.668766975402832, "logps/chosen": -9.944673538208008, "logps/rejected": -104.90299987792969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1727545261383057, "rewards/margins": 9.492712020874023, "rewards/rejected": -7.3199567794799805, "step": 876 }, { "epoch": 0.5995556315159801, "grad_norm": 0.0009610601118765771, "learning_rate": 4.904999322276735e-05, "logits/chosen": -9.020308494567871, "logits/rejected": -9.004383087158203, "logps/chosen": -3.234571695327759, "logps/rejected": -107.79324340820312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.966583251953125, "rewards/margins": 10.478170394897461, "rewards/rejected": -7.511587142944336, "step": 877 }, { "epoch": 0.6002392753375492, "grad_norm": 0.0006969086243771017, "learning_rate": 4.9058241931768385e-05, "logits/chosen": -8.835762023925781, "logits/rejected": -8.818755149841309, "logps/chosen": -0.1465277224779129, "logps/rejected": -110.20256042480469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.2033824920654297, "rewards/margins": 10.971906661987305, "rewards/rejected": -7.768523216247559, "step": 878 }, { "epoch": 0.6009229191591181, "grad_norm": 0.0007225262233987451, "learning_rate": 4.9066481251229535e-05, "logits/chosen": -8.967888832092285, "logits/rejected": -8.956037521362305, "logps/chosen": -0.1042642742395401, "logps/rejected": -109.91057586669922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0097122192382812, "rewards/margins": 10.912251472473145, "rewards/rejected": -7.90254020690918, "step": 879 }, { "epoch": 0.601606562980687, "grad_norm": 0.0007925578393042088, "learning_rate": 4.907471120250281e-05, "logits/chosen": -8.134000778198242, "logits/rejected": -8.122922897338867, "logps/chosen": -2.7656784057617188, "logps/rejected": -109.11323547363281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9805517196655273, "rewards/margins": 10.566871643066406, "rewards/rejected": -7.586319446563721, "step": 880 }, { "epoch": 0.602290206802256, "grad_norm": 1.6680328845977783, "learning_rate": 4.9082931806867474e-05, "logits/chosen": -8.877436637878418, "logits/rejected": -8.86228084564209, "logps/chosen": -2.510493755340576, "logps/rejected": -109.25128173828125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 3.023285388946533, "rewards/margins": 10.755630493164062, "rewards/rejected": -7.732345104217529, "step": 881 }, { "epoch": 0.602973850623825, "grad_norm": 0.0011265173088759184, "learning_rate": 4.909114308553033e-05, "logits/chosen": -8.989694595336914, "logits/rejected": -8.976420402526855, "logps/chosen": -0.12623770534992218, "logps/rejected": -109.89363861083984, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0879769325256348, "rewards/margins": 10.926535606384277, "rewards/rejected": -7.838558673858643, "step": 882 }, { "epoch": 0.603657494445394, "grad_norm": 0.0014461091486737132, "learning_rate": 4.909934505962615e-05, "logits/chosen": -8.896915435791016, "logits/rejected": -8.884345054626465, "logps/chosen": -0.13654294610023499, "logps/rejected": -109.62754821777344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.078856945037842, "rewards/margins": 10.871575355529785, "rewards/rejected": -7.792718410491943, "step": 883 }, { "epoch": 0.6043411382669629, "grad_norm": 0.0018850337946787477, "learning_rate": 4.9107537750217886e-05, "logits/chosen": -9.689690589904785, "logits/rejected": -9.67172622680664, "logps/chosen": -4.053910255432129, "logps/rejected": -107.17417907714844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7835135459899902, "rewards/margins": 10.375564575195312, "rewards/rejected": -7.592050552368164, "step": 884 }, { "epoch": 0.6050247820885318, "grad_norm": 0.00109342101495713, "learning_rate": 4.9115721178297093e-05, "logits/chosen": -8.752756118774414, "logits/rejected": -8.74128532409668, "logps/chosen": -4.306169033050537, "logps/rejected": -107.53015899658203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8125789165496826, "rewards/margins": 10.289190292358398, "rewards/rejected": -7.476612091064453, "step": 885 }, { "epoch": 0.6057084259101009, "grad_norm": 0.0014121506828814745, "learning_rate": 4.9123895364784184e-05, "logits/chosen": -8.43940258026123, "logits/rejected": -8.426204681396484, "logps/chosen": -6.854777812957764, "logps/rejected": -104.19532775878906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.62418794631958, "rewards/margins": 9.75788688659668, "rewards/rejected": -7.133697986602783, "step": 886 }, { "epoch": 0.6063920697316698, "grad_norm": 0.0009326404542662203, "learning_rate": 4.913206033052877e-05, "logits/chosen": -8.170903205871582, "logits/rejected": -8.154525756835938, "logps/chosen": -1.173406720161438, "logps/rejected": -107.27832794189453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0896482467651367, "rewards/margins": 10.650222778320312, "rewards/rejected": -7.560573577880859, "step": 887 }, { "epoch": 0.6070757135532387, "grad_norm": 0.0008487348677590489, "learning_rate": 4.914021609631002e-05, "logits/chosen": -8.750838279724121, "logits/rejected": -8.737818717956543, "logps/chosen": -0.12878084182739258, "logps/rejected": -108.7122802734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1260406970977783, "rewards/margins": 10.7860689163208, "rewards/rejected": -7.660028457641602, "step": 888 }, { "epoch": 0.6077593573748077, "grad_norm": 0.0011693151900544763, "learning_rate": 4.91483626828369e-05, "logits/chosen": -8.609637260437012, "logits/rejected": -8.595433235168457, "logps/chosen": -4.45286750793457, "logps/rejected": -106.41963195800781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7934796810150146, "rewards/margins": 10.160555839538574, "rewards/rejected": -7.367076396942139, "step": 889 }, { "epoch": 0.6084430011963767, "grad_norm": 0.011789080686867237, "learning_rate": 4.915650011074855e-05, "logits/chosen": -9.118420600891113, "logits/rejected": -9.102618217468262, "logps/chosen": -0.153539776802063, "logps/rejected": -108.38118743896484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.181918144226074, "rewards/margins": 10.840649604797363, "rewards/rejected": -7.658731460571289, "step": 890 }, { "epoch": 0.6091266450179457, "grad_norm": 0.0013138022040948272, "learning_rate": 4.916462840061458e-05, "logits/chosen": -8.450427055358887, "logits/rejected": -8.438304901123047, "logps/chosen": -4.120811462402344, "logps/rejected": -106.91087341308594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7877466678619385, "rewards/margins": 10.22199821472168, "rewards/rejected": -7.434251308441162, "step": 891 }, { "epoch": 0.6098102888395146, "grad_norm": 0.0018978294683620334, "learning_rate": 4.917274757293539e-05, "logits/chosen": -8.775741577148438, "logits/rejected": -8.76285457611084, "logps/chosen": -0.11817965656518936, "logps/rejected": -107.90899658203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.2120604515075684, "rewards/margins": 10.727790832519531, "rewards/rejected": -7.515730381011963, "step": 892 }, { "epoch": 0.6104939326610835, "grad_norm": 0.004610407631844282, "learning_rate": 4.918085764814244e-05, "logits/chosen": -8.001786231994629, "logits/rejected": -7.9905195236206055, "logps/chosen": -3.319763422012329, "logps/rejected": -104.89214324951172, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.849987268447876, "rewards/margins": 10.162004470825195, "rewards/rejected": -7.312017440795898, "step": 893 }, { "epoch": 0.6111775764826526, "grad_norm": 0.016612622886896133, "learning_rate": 4.9188958646598624e-05, "logits/chosen": -8.088839530944824, "logits/rejected": -8.076178550720215, "logps/chosen": -0.22183872759342194, "logps/rejected": -106.77017974853516, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1819233894348145, "rewards/margins": 10.619709014892578, "rewards/rejected": -7.4377851486206055, "step": 894 }, { "epoch": 0.6118612203042215, "grad_norm": 0.0014300011098384857, "learning_rate": 4.919705058859854e-05, "logits/chosen": -9.141032218933105, "logits/rejected": -9.128105163574219, "logps/chosen": -3.0245048999786377, "logps/rejected": -103.7455825805664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8718068599700928, "rewards/margins": 10.077162742614746, "rewards/rejected": -7.205356121063232, "step": 895 }, { "epoch": 0.6125448641257905, "grad_norm": 0.00186909141484648, "learning_rate": 4.920513349436875e-05, "logits/chosen": -8.724922180175781, "logits/rejected": -8.713294982910156, "logps/chosen": -0.18031267821788788, "logps/rejected": -106.73710632324219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1982765197753906, "rewards/margins": 10.613025665283203, "rewards/rejected": -7.4147491455078125, "step": 896 }, { "epoch": 0.6132285079473594, "grad_norm": 0.004795832559466362, "learning_rate": 4.92132073840682e-05, "logits/chosen": -8.528414726257324, "logits/rejected": -8.51816463470459, "logps/chosen": -3.4486584663391113, "logps/rejected": -104.8431625366211, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.77518892288208, "rewards/margins": 10.009510040283203, "rewards/rejected": -7.234321594238281, "step": 897 }, { "epoch": 0.6139121517689284, "grad_norm": 0.020169898867607117, "learning_rate": 4.922127227778841e-05, "logits/chosen": -9.226594924926758, "logits/rejected": -9.21304702758789, "logps/chosen": -2.6874687671661377, "logps/rejected": -106.70719909667969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.950869560241699, "rewards/margins": 10.352184295654297, "rewards/rejected": -7.401313781738281, "step": 898 }, { "epoch": 0.6145957955904974, "grad_norm": 0.006603466346859932, "learning_rate": 4.9229328195553815e-05, "logits/chosen": -8.581225395202637, "logits/rejected": -8.567604064941406, "logps/chosen": -0.1399608999490738, "logps/rejected": -107.84843444824219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.112703800201416, "rewards/margins": 10.66120433807373, "rewards/rejected": -7.5485005378723145, "step": 899 }, { "epoch": 0.6152794394120663, "grad_norm": 0.015969902276992798, "learning_rate": 4.923737515732209e-05, "logits/chosen": -8.92689323425293, "logits/rejected": -8.913134574890137, "logps/chosen": -0.8604783415794373, "logps/rejected": -106.38348388671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.02053165435791, "rewards/margins": 10.482069969177246, "rewards/rejected": -7.461538314819336, "step": 900 }, { "epoch": 0.6159630832336352, "grad_norm": 0.012108869850635529, "learning_rate": 4.924541318298438e-05, "logits/chosen": -8.875288963317871, "logits/rejected": -8.864500045776367, "logps/chosen": -2.367023468017578, "logps/rejected": -106.61441802978516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.920581817626953, "rewards/margins": 10.374983787536621, "rewards/rejected": -7.454401969909668, "step": 901 }, { "epoch": 0.6166467270552043, "grad_norm": 0.001833709655329585, "learning_rate": 4.92534422923657e-05, "logits/chosen": -8.86943531036377, "logits/rejected": -8.857858657836914, "logps/chosen": -7.220534324645996, "logps/rejected": -102.99203491210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.471494197845459, "rewards/margins": 9.515544891357422, "rewards/rejected": -7.044050216674805, "step": 902 }, { "epoch": 0.6173303708767732, "grad_norm": 0.003694317303597927, "learning_rate": 4.9261462505225106e-05, "logits/chosen": -8.967816352844238, "logits/rejected": -8.955134391784668, "logps/chosen": -3.5925252437591553, "logps/rejected": -105.81301879882812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8226430416107178, "rewards/margins": 10.13817024230957, "rewards/rejected": -7.315526962280273, "step": 903 }, { "epoch": 0.6180140146983422, "grad_norm": 0.0045763845555484295, "learning_rate": 4.926947384125606e-05, "logits/chosen": -7.705752372741699, "logits/rejected": -7.69375467300415, "logps/chosen": -6.759002685546875, "logps/rejected": -104.01143646240234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6494688987731934, "rewards/margins": 9.8026704788208, "rewards/rejected": -7.153202056884766, "step": 904 }, { "epoch": 0.6186976585199111, "grad_norm": 0.06388817727565765, "learning_rate": 4.927747632008672e-05, "logits/chosen": -7.991916656494141, "logits/rejected": -7.9797258377075195, "logps/chosen": -4.740283012390137, "logps/rejected": -105.1083755493164, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.8678221702575684, "rewards/margins": 10.057828903198242, "rewards/rejected": -7.190006732940674, "step": 905 }, { "epoch": 0.61938130234148, "grad_norm": 0.012957584112882614, "learning_rate": 4.9285469961280226e-05, "logits/chosen": -8.267226219177246, "logits/rejected": -8.255399703979492, "logps/chosen": -0.22195389866828918, "logps/rejected": -107.81879425048828, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.142163038253784, "rewards/margins": 10.699746131896973, "rewards/rejected": -7.557582855224609, "step": 906 }, { "epoch": 0.6200649461630491, "grad_norm": 0.01766708865761757, "learning_rate": 4.9293454784334924e-05, "logits/chosen": -8.723714828491211, "logits/rejected": -8.708290100097656, "logps/chosen": -4.727567672729492, "logps/rejected": -104.84048461914062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7747931480407715, "rewards/margins": 9.95781421661377, "rewards/rejected": -7.183021068572998, "step": 907 }, { "epoch": 0.620748589984618, "grad_norm": 0.0022933161817491055, "learning_rate": 4.9301430808684754e-05, "logits/chosen": -8.306326866149902, "logits/rejected": -8.292444229125977, "logps/chosen": -0.22621607780456543, "logps/rejected": -106.486083984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.182260274887085, "rewards/margins": 10.624244689941406, "rewards/rejected": -7.441985130310059, "step": 908 }, { "epoch": 0.621432233806187, "grad_norm": 0.02358756586909294, "learning_rate": 4.930939805369946e-05, "logits/chosen": -8.066385269165039, "logits/rejected": -8.054060935974121, "logps/chosen": -3.4819753170013428, "logps/rejected": -102.78453063964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.953616142272949, "rewards/margins": 9.885087966918945, "rewards/rejected": -6.931471824645996, "step": 909 }, { "epoch": 0.6221158776277559, "grad_norm": 0.00239131529815495, "learning_rate": 4.93173565386849e-05, "logits/chosen": -9.185497283935547, "logits/rejected": -9.173722267150879, "logps/chosen": -5.107606887817383, "logps/rejected": -105.68162536621094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6846437454223633, "rewards/margins": 10.018570899963379, "rewards/rejected": -7.333927154541016, "step": 910 }, { "epoch": 0.6227995214493249, "grad_norm": 0.003936356399208307, "learning_rate": 4.932530628288331e-05, "logits/chosen": -8.556028366088867, "logits/rejected": -8.545654296875, "logps/chosen": -3.627518892288208, "logps/rejected": -104.83385467529297, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7339673042297363, "rewards/margins": 9.961771965026855, "rewards/rejected": -7.227804183959961, "step": 911 }, { "epoch": 0.6234831652708939, "grad_norm": 0.002200191607698798, "learning_rate": 4.933324730547361e-05, "logits/chosen": -8.92053508758545, "logits/rejected": -8.906463623046875, "logps/chosen": -6.131665229797363, "logps/rejected": -105.67060089111328, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6258225440979004, "rewards/margins": 9.88397216796875, "rewards/rejected": -7.258148670196533, "step": 912 }, { "epoch": 0.6241668090924628, "grad_norm": 0.0032321778126060963, "learning_rate": 4.934117962557165e-05, "logits/chosen": -7.802901744842529, "logits/rejected": -7.79019021987915, "logps/chosen": -4.171523094177246, "logps/rejected": -105.81120300292969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.766150951385498, "rewards/margins": 9.98621654510498, "rewards/rejected": -7.220066070556641, "step": 913 }, { "epoch": 0.6248504529140317, "grad_norm": 0.0011355356546118855, "learning_rate": 4.9349103262230524e-05, "logits/chosen": -8.256885528564453, "logits/rejected": -8.24554443359375, "logps/chosen": -3.9400153160095215, "logps/rejected": -105.52847290039062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7916674613952637, "rewards/margins": 10.108915328979492, "rewards/rejected": -7.317248344421387, "step": 914 }, { "epoch": 0.6255340967356008, "grad_norm": 0.0022821619641035795, "learning_rate": 4.935701823444081e-05, "logits/chosen": -8.1102294921875, "logits/rejected": -8.096424102783203, "logps/chosen": -1.3691091537475586, "logps/rejected": -106.61139678955078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1241230964660645, "rewards/margins": 10.568265914916992, "rewards/rejected": -7.4441423416137695, "step": 915 }, { "epoch": 0.6262177405571697, "grad_norm": 0.010369568131864071, "learning_rate": 4.9364924561130845e-05, "logits/chosen": -7.912507057189941, "logits/rejected": -7.901450157165527, "logps/chosen": -2.3958773612976074, "logps/rejected": -104.40128326416016, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.964108943939209, "rewards/margins": 10.164047241210938, "rewards/rejected": -7.1999382972717285, "step": 916 }, { "epoch": 0.6269013843787387, "grad_norm": 0.0012599406763911247, "learning_rate": 4.937282226116702e-05, "logits/chosen": -8.84067440032959, "logits/rejected": -8.826576232910156, "logps/chosen": -0.25386208295822144, "logps/rejected": -108.13279724121094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.09940242767334, "rewards/margins": 10.746719360351562, "rewards/rejected": -7.647316932678223, "step": 917 }, { "epoch": 0.6275850282003076, "grad_norm": 0.0014278158778324723, "learning_rate": 4.938071135335405e-05, "logits/chosen": -8.283681869506836, "logits/rejected": -8.267885208129883, "logps/chosen": -0.8542852997779846, "logps/rejected": -107.44723510742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1080470085144043, "rewards/margins": 10.664164543151855, "rewards/rejected": -7.556118011474609, "step": 918 }, { "epoch": 0.6282686720218766, "grad_norm": 0.0018742283573374152, "learning_rate": 4.938859185643519e-05, "logits/chosen": -8.9636812210083, "logits/rejected": -8.949657440185547, "logps/chosen": -2.385467529296875, "logps/rejected": -107.54209899902344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9204161167144775, "rewards/margins": 10.50749397277832, "rewards/rejected": -7.58707857131958, "step": 919 }, { "epoch": 0.6289523158434456, "grad_norm": 0.0015132639091461897, "learning_rate": 4.939646378909259e-05, "logits/chosen": -8.07682991027832, "logits/rejected": -8.061960220336914, "logps/chosen": -1.4246777296066284, "logps/rejected": -107.24046325683594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0415735244750977, "rewards/margins": 10.563251495361328, "rewards/rejected": -7.521678924560547, "step": 920 }, { "epoch": 0.6296359596650145, "grad_norm": 0.001330008846707642, "learning_rate": 4.940432716994748e-05, "logits/chosen": -8.196417808532715, "logits/rejected": -8.183274269104004, "logps/chosen": -3.9989850521087646, "logps/rejected": -106.65235900878906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8345119953155518, "rewards/margins": 10.247663497924805, "rewards/rejected": -7.413151741027832, "step": 921 }, { "epoch": 0.6303196034865834, "grad_norm": 0.0028585598338395357, "learning_rate": 4.9412182017560496e-05, "logits/chosen": -9.030448913574219, "logits/rejected": -9.018414497375488, "logps/chosen": -5.633444786071777, "logps/rejected": -106.98115539550781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.622413158416748, "rewards/margins": 10.028142929077148, "rewards/rejected": -7.405729293823242, "step": 922 }, { "epoch": 0.6310032473081525, "grad_norm": 0.000953816226683557, "learning_rate": 4.942002835043187e-05, "logits/chosen": -8.003302574157715, "logits/rejected": -7.98963737487793, "logps/chosen": -1.839194893836975, "logps/rejected": -107.10871887207031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.018199920654297, "rewards/margins": 10.551641464233398, "rewards/rejected": -7.533441543579102, "step": 923 }, { "epoch": 0.6316868911297214, "grad_norm": 0.0015626325039193034, "learning_rate": 4.942786618700178e-05, "logits/chosen": -8.354629516601562, "logits/rejected": -8.342412948608398, "logps/chosen": -5.676076889038086, "logps/rejected": -104.99163818359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6647391319274902, "rewards/margins": 9.915826797485352, "rewards/rejected": -7.251087188720703, "step": 924 }, { "epoch": 0.6323705349512904, "grad_norm": 0.00163526670075953, "learning_rate": 4.9435695545650545e-05, "logits/chosen": -9.195151329040527, "logits/rejected": -9.184158325195312, "logps/chosen": -0.22017209231853485, "logps/rejected": -108.84369659423828, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.922757625579834, "rewards/margins": 10.72988510131836, "rewards/rejected": -7.807126522064209, "step": 925 }, { "epoch": 0.6330541787728593, "grad_norm": 0.0013512482400983572, "learning_rate": 4.944351644469891e-05, "logits/chosen": -8.46926498413086, "logits/rejected": -8.455952644348145, "logps/chosen": -2.2124078273773193, "logps/rejected": -107.87774658203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.936232805252075, "rewards/margins": 10.486482620239258, "rewards/rejected": -7.550249099731445, "step": 926 }, { "epoch": 0.6337378225944283, "grad_norm": 0.021314116194844246, "learning_rate": 4.945132890240829e-05, "logits/chosen": -8.542010307312012, "logits/rejected": -8.528703689575195, "logps/chosen": -5.314979553222656, "logps/rejected": -105.78103637695312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.655085325241089, "rewards/margins": 10.05783462524414, "rewards/rejected": -7.402749061584473, "step": 927 }, { "epoch": 0.6344214664159973, "grad_norm": 0.0009217024780809879, "learning_rate": 4.945913293698104e-05, "logits/chosen": -9.376779556274414, "logits/rejected": -9.365079879760742, "logps/chosen": -3.650599956512451, "logps/rejected": -107.47711181640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7559289932250977, "rewards/margins": 10.374422073364258, "rewards/rejected": -7.618493556976318, "step": 928 }, { "epoch": 0.6351051102375662, "grad_norm": 0.0013495092280209064, "learning_rate": 4.9466928566560696e-05, "logits/chosen": -7.941590309143066, "logits/rejected": -7.9306793212890625, "logps/chosen": -4.479122638702393, "logps/rejected": -107.99463653564453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6841375827789307, "rewards/margins": 10.286932945251465, "rewards/rejected": -7.602794647216797, "step": 929 }, { "epoch": 0.6357887540591352, "grad_norm": 0.002012599026784301, "learning_rate": 4.9474715809232256e-05, "logits/chosen": -8.703540802001953, "logits/rejected": -8.687817573547363, "logps/chosen": -1.7281392812728882, "logps/rejected": -107.6332778930664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.021488666534424, "rewards/margins": 10.590947151184082, "rewards/rejected": -7.569458961486816, "step": 930 }, { "epoch": 0.6364723978807042, "grad_norm": 0.0013074551243335009, "learning_rate": 4.948249468302239e-05, "logits/chosen": -8.385579109191895, "logits/rejected": -8.369741439819336, "logps/chosen": -4.006075859069824, "logps/rejected": -106.9578857421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8688080310821533, "rewards/margins": 10.337944984436035, "rewards/rejected": -7.469137668609619, "step": 931 }, { "epoch": 0.6371560417022731, "grad_norm": 0.001222289283759892, "learning_rate": 4.9490265205899697e-05, "logits/chosen": -8.582006454467773, "logits/rejected": -8.570631980895996, "logps/chosen": -4.167882919311523, "logps/rejected": -107.16722869873047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8021326065063477, "rewards/margins": 10.257941246032715, "rewards/rejected": -7.455808162689209, "step": 932 }, { "epoch": 0.6378396855238421, "grad_norm": 0.0010176306823268533, "learning_rate": 4.9498027395775006e-05, "logits/chosen": -7.9100847244262695, "logits/rejected": -7.894102573394775, "logps/chosen": -6.3980937004089355, "logps/rejected": -106.43769836425781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5909831523895264, "rewards/margins": 9.96802043914795, "rewards/rejected": -7.377037048339844, "step": 933 }, { "epoch": 0.638523329345411, "grad_norm": 0.001023213379085064, "learning_rate": 4.950578127050156e-05, "logits/chosen": -8.576423645019531, "logits/rejected": -8.55893325805664, "logps/chosen": -0.2514270544052124, "logps/rejected": -109.42103576660156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.187575101852417, "rewards/margins": 10.906501770019531, "rewards/rejected": -7.718926906585693, "step": 934 }, { "epoch": 0.63920697316698, "grad_norm": 0.0009306669817306101, "learning_rate": 4.95135268478753e-05, "logits/chosen": -8.480508804321289, "logits/rejected": -8.467915534973145, "logps/chosen": -3.3982224464416504, "logps/rejected": -108.32803344726562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.830706834793091, "rewards/margins": 10.472785949707031, "rewards/rejected": -7.642078399658203, "step": 935 }, { "epoch": 0.639890616988549, "grad_norm": 0.0010816790163516998, "learning_rate": 4.952126414563509e-05, "logits/chosen": -8.07599925994873, "logits/rejected": -8.062082290649414, "logps/chosen": -4.50624418258667, "logps/rejected": -107.23699188232422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8215885162353516, "rewards/margins": 10.309192657470703, "rewards/rejected": -7.48760461807251, "step": 936 }, { "epoch": 0.6405742608101179, "grad_norm": 0.0010774651309475303, "learning_rate": 4.952899318146297e-05, "logits/chosen": -8.05767822265625, "logits/rejected": -8.043957710266113, "logps/chosen": -0.29280245304107666, "logps/rejected": -109.18038940429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.114813804626465, "rewards/margins": 10.873587608337402, "rewards/rejected": -7.7587738037109375, "step": 937 }, { "epoch": 0.6412579046316869, "grad_norm": 0.0008345380192622542, "learning_rate": 4.9536713972984414e-05, "logits/chosen": -8.375638008117676, "logits/rejected": -8.362893104553223, "logps/chosen": -2.2183361053466797, "logps/rejected": -108.49813079833984, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8785109519958496, "rewards/margins": 10.641144752502441, "rewards/rejected": -7.762633800506592, "step": 938 }, { "epoch": 0.6419415484532559, "grad_norm": 0.0014753997093066573, "learning_rate": 4.954442653776852e-05, "logits/chosen": -8.47830581665039, "logits/rejected": -8.46566104888916, "logps/chosen": -6.636676788330078, "logps/rejected": -104.47200012207031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4923181533813477, "rewards/margins": 9.752154350280762, "rewards/rejected": -7.259835720062256, "step": 939 }, { "epoch": 0.6426251922748248, "grad_norm": 0.0012697846395894885, "learning_rate": 4.955213089332832e-05, "logits/chosen": -8.055377006530762, "logits/rejected": -8.03874397277832, "logps/chosen": -3.879446268081665, "logps/rejected": -108.47645568847656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7816648483276367, "rewards/margins": 10.421710968017578, "rewards/rejected": -7.640045642852783, "step": 940 }, { "epoch": 0.6433088360963938, "grad_norm": 0.0008957599638961256, "learning_rate": 4.955982705712095e-05, "logits/chosen": -8.921520233154297, "logits/rejected": -8.9035005569458, "logps/chosen": -2.0181849002838135, "logps/rejected": -108.96369171142578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9803202152252197, "rewards/margins": 10.6083345413208, "rewards/rejected": -7.62801456451416, "step": 941 }, { "epoch": 0.6439924799179627, "grad_norm": 0.0011196242412552238, "learning_rate": 4.956751504654796e-05, "logits/chosen": -9.183137893676758, "logits/rejected": -9.167661666870117, "logps/chosen": -1.955863118171692, "logps/rejected": -108.80032348632812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9348888397216797, "rewards/margins": 10.717401504516602, "rewards/rejected": -7.782512664794922, "step": 942 }, { "epoch": 0.6446761237395318, "grad_norm": 0.0009153300779871643, "learning_rate": 4.957519487895548e-05, "logits/chosen": -8.148491859436035, "logits/rejected": -8.13200855255127, "logps/chosen": -1.8137397766113281, "logps/rejected": -108.35507202148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1065948009490967, "rewards/margins": 10.664231300354004, "rewards/rejected": -7.557636737823486, "step": 943 }, { "epoch": 0.6453597675611007, "grad_norm": 0.0010337589774280787, "learning_rate": 4.9582866571634485e-05, "logits/chosen": -8.330501556396484, "logits/rejected": -8.316678047180176, "logps/chosen": -4.096710681915283, "logps/rejected": -107.25914764404297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7052690982818604, "rewards/margins": 10.24148941040039, "rewards/rejected": -7.536220073699951, "step": 944 }, { "epoch": 0.6460434113826696, "grad_norm": 0.0015667981933802366, "learning_rate": 4.959053014182106e-05, "logits/chosen": -8.315736770629883, "logits/rejected": -8.30040454864502, "logps/chosen": -4.188967227935791, "logps/rejected": -107.96475219726562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8557958602905273, "rewards/margins": 10.34184455871582, "rewards/rejected": -7.486048221588135, "step": 945 }, { "epoch": 0.6467270552042386, "grad_norm": 0.001049409038387239, "learning_rate": 4.959818560669655e-05, "logits/chosen": -8.415283203125, "logits/rejected": -8.402676582336426, "logps/chosen": -6.242586135864258, "logps/rejected": -106.64836120605469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6700971126556396, "rewards/margins": 10.0455322265625, "rewards/rejected": -7.375435829162598, "step": 946 }, { "epoch": 0.6474106990258075, "grad_norm": 0.0012634820304811, "learning_rate": 4.96058329833879e-05, "logits/chosen": -8.130233764648438, "logits/rejected": -8.118165969848633, "logps/chosen": -8.492538452148438, "logps/rejected": -106.28717803955078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.483351707458496, "rewards/margins": 9.753561019897461, "rewards/rejected": -7.270209789276123, "step": 947 }, { "epoch": 0.6480943428473765, "grad_norm": 0.25033554434776306, "learning_rate": 4.961347228896777e-05, "logits/chosen": -8.74403190612793, "logits/rejected": -8.729537963867188, "logps/chosen": -1.480446457862854, "logps/rejected": -108.37271118164062, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 3.050980567932129, "rewards/margins": 10.676826477050781, "rewards/rejected": -7.625845909118652, "step": 948 }, { "epoch": 0.6487779866689455, "grad_norm": 0.0008741291821934283, "learning_rate": 4.962110354045488e-05, "logits/chosen": -8.230834007263184, "logits/rejected": -8.215198516845703, "logps/chosen": -2.001708745956421, "logps/rejected": -108.9013900756836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.962613821029663, "rewards/margins": 10.624832153320312, "rewards/rejected": -7.662219047546387, "step": 949 }, { "epoch": 0.6494616304905144, "grad_norm": 0.0009115493157878518, "learning_rate": 4.962872675481414e-05, "logits/chosen": -9.131914138793945, "logits/rejected": -9.117222785949707, "logps/chosen": -5.644988059997559, "logps/rejected": -106.07322692871094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.681662082672119, "rewards/margins": 10.040763854980469, "rewards/rejected": -7.359101295471191, "step": 950 }, { "epoch": 0.6501452743120834, "grad_norm": 0.0014414316974580288, "learning_rate": 4.9636341948956906e-05, "logits/chosen": -8.675741195678711, "logits/rejected": -8.659063339233398, "logps/chosen": -0.28834769129753113, "logps/rejected": -110.58307647705078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1439640522003174, "rewards/margins": 10.972360610961914, "rewards/rejected": -7.828396797180176, "step": 951 }, { "epoch": 0.6508289181336524, "grad_norm": 0.0008260154281742871, "learning_rate": 4.964394913974124e-05, "logits/chosen": -9.471626281738281, "logits/rejected": -9.451094627380371, "logps/chosen": -0.17262595891952515, "logps/rejected": -110.14649200439453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1769864559173584, "rewards/margins": 10.970149993896484, "rewards/rejected": -7.793163299560547, "step": 952 }, { "epoch": 0.6515125619552213, "grad_norm": 0.0008309210534207523, "learning_rate": 4.965154834397211e-05, "logits/chosen": -7.970911979675293, "logits/rejected": -7.957189559936523, "logps/chosen": -3.4174020290374756, "logps/rejected": -108.49563598632812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9118709564208984, "rewards/margins": 10.437298774719238, "rewards/rejected": -7.525426864624023, "step": 953 }, { "epoch": 0.6521962057767903, "grad_norm": 0.0009565710206516087, "learning_rate": 4.965913957840159e-05, "logits/chosen": -8.638667106628418, "logits/rejected": -8.623881340026855, "logps/chosen": -10.873923301696777, "logps/rejected": -106.78463745117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.263345956802368, "rewards/margins": 9.524003028869629, "rewards/rejected": -7.26065731048584, "step": 954 }, { "epoch": 0.6528798495983592, "grad_norm": 0.0009923321194946766, "learning_rate": 4.966672285972911e-05, "logits/chosen": -9.189859390258789, "logits/rejected": -9.173938751220703, "logps/chosen": -6.657454967498779, "logps/rejected": -107.55268859863281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.559422016143799, "rewards/margins": 10.036107063293457, "rewards/rejected": -7.476686477661133, "step": 955 }, { "epoch": 0.6535634934199283, "grad_norm": 0.0017102466663345695, "learning_rate": 4.967429820460167e-05, "logits/chosen": -8.639357566833496, "logits/rejected": -8.623844146728516, "logps/chosen": -2.6973347663879395, "logps/rejected": -107.8591079711914, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.872866630554199, "rewards/margins": 10.515302658081055, "rewards/rejected": -7.642435073852539, "step": 956 }, { "epoch": 0.6542471372414972, "grad_norm": 0.001019915915094316, "learning_rate": 4.9681865629614064e-05, "logits/chosen": -8.64651870727539, "logits/rejected": -8.630879402160645, "logps/chosen": -4.304109573364258, "logps/rejected": -108.16067504882812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7783031463623047, "rewards/margins": 10.357624053955078, "rewards/rejected": -7.579320907592773, "step": 957 }, { "epoch": 0.6549307810630661, "grad_norm": 0.0008854862535372376, "learning_rate": 4.9689425151309074e-05, "logits/chosen": -8.85775375366211, "logits/rejected": -8.84099006652832, "logps/chosen": -4.6245198249816895, "logps/rejected": -107.77269744873047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7181365489959717, "rewards/margins": 10.245267868041992, "rewards/rejected": -7.527131080627441, "step": 958 }, { "epoch": 0.6556144248846351, "grad_norm": 0.0007929550483822823, "learning_rate": 4.969697678617773e-05, "logits/chosen": -9.12083625793457, "logits/rejected": -9.103191375732422, "logps/chosen": -1.9300074577331543, "logps/rejected": -109.8733139038086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9476702213287354, "rewards/margins": 10.798425674438477, "rewards/rejected": -7.8507561683654785, "step": 959 }, { "epoch": 0.6562980687062041, "grad_norm": 0.0008256227010861039, "learning_rate": 4.970452055065948e-05, "logits/chosen": -8.716899871826172, "logits/rejected": -8.702997207641602, "logps/chosen": -6.783629417419434, "logps/rejected": -106.4786376953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4999380111694336, "rewards/margins": 9.819168090820312, "rewards/rejected": -7.319229602813721, "step": 960 }, { "epoch": 0.656981712527773, "grad_norm": 0.0006153188296593726, "learning_rate": 4.9712056461142423e-05, "logits/chosen": -8.078710556030273, "logits/rejected": -8.061729431152344, "logps/chosen": -3.3650903701782227, "logps/rejected": -109.82525634765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9852161407470703, "rewards/margins": 10.672378540039062, "rewards/rejected": -7.68716287612915, "step": 961 }, { "epoch": 0.657665356349342, "grad_norm": 0.0006653005839325488, "learning_rate": 4.971958453396355e-05, "logits/chosen": -8.389832496643066, "logits/rejected": -8.368077278137207, "logps/chosen": -0.6128366589546204, "logps/rejected": -109.76162719726562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.139110565185547, "rewards/margins": 10.889315605163574, "rewards/rejected": -7.750205039978027, "step": 962 }, { "epoch": 0.6583490001709109, "grad_norm": 0.001143513829447329, "learning_rate": 4.972710478540891e-05, "logits/chosen": -8.174921035766602, "logits/rejected": -8.162342071533203, "logps/chosen": -1.7310192584991455, "logps/rejected": -110.2339096069336, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9571242332458496, "rewards/margins": 10.74372386932373, "rewards/rejected": -7.7865986824035645, "step": 963 }, { "epoch": 0.65903264399248, "grad_norm": 0.002111230744048953, "learning_rate": 4.973461723171385e-05, "logits/chosen": -9.080989837646484, "logits/rejected": -9.066656112670898, "logps/chosen": -2.402527332305908, "logps/rejected": -110.01934051513672, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9147729873657227, "rewards/margins": 10.730806350708008, "rewards/rejected": -7.816033840179443, "step": 964 }, { "epoch": 0.6597162878140489, "grad_norm": 0.001141382148489356, "learning_rate": 4.9742121889063213e-05, "logits/chosen": -8.720772743225098, "logits/rejected": -8.704428672790527, "logps/chosen": -2.1124231815338135, "logps/rejected": -109.71905517578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9541988372802734, "rewards/margins": 10.732933044433594, "rewards/rejected": -7.77873420715332, "step": 965 }, { "epoch": 0.6603999316356178, "grad_norm": 0.0008561389404349029, "learning_rate": 4.974961877359156e-05, "logits/chosen": -8.773319244384766, "logits/rejected": -8.756545066833496, "logps/chosen": -1.8403733968734741, "logps/rejected": -110.00011444091797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.023174285888672, "rewards/margins": 10.830212593078613, "rewards/rejected": -7.807039260864258, "step": 966 }, { "epoch": 0.6610835754571868, "grad_norm": 0.0006974704447202384, "learning_rate": 4.975710790138336e-05, "logits/chosen": -8.859539031982422, "logits/rejected": -8.840620040893555, "logps/chosen": -0.14933691918849945, "logps/rejected": -110.74867248535156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1937594413757324, "rewards/margins": 11.069833755493164, "rewards/rejected": -7.876073837280273, "step": 967 }, { "epoch": 0.6617672192787558, "grad_norm": 0.0053726970218122005, "learning_rate": 4.976458928847323e-05, "logits/chosen": -8.824128150939941, "logits/rejected": -8.809192657470703, "logps/chosen": -2.0769989490509033, "logps/rejected": -109.62948608398438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.959930419921875, "rewards/margins": 10.699350357055664, "rewards/rejected": -7.739418983459473, "step": 968 }, { "epoch": 0.6624508631003247, "grad_norm": 0.0007771365926600993, "learning_rate": 4.977206295084609e-05, "logits/chosen": -8.481181144714355, "logits/rejected": -8.468304634094238, "logps/chosen": -8.88937759399414, "logps/rejected": -106.18510437011719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.315793752670288, "rewards/margins": 9.697283744812012, "rewards/rejected": -7.381490707397461, "step": 969 }, { "epoch": 0.6631345069218937, "grad_norm": 0.0012223360827192664, "learning_rate": 4.9779528904437424e-05, "logits/chosen": -9.317586898803711, "logits/rejected": -9.301541328430176, "logps/chosen": -5.295547008514404, "logps/rejected": -108.15823364257812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7898969650268555, "rewards/margins": 10.354264259338379, "rewards/rejected": -7.564367771148682, "step": 970 }, { "epoch": 0.6638181507434626, "grad_norm": 0.0008181874873116612, "learning_rate": 4.978698716513342e-05, "logits/chosen": -9.038347244262695, "logits/rejected": -9.022796630859375, "logps/chosen": -5.655346393585205, "logps/rejected": -107.68607330322266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.624983549118042, "rewards/margins": 10.20676040649414, "rewards/rejected": -7.581777572631836, "step": 971 }, { "epoch": 0.6645017945650317, "grad_norm": 0.0009536230354569852, "learning_rate": 4.9794437748771244e-05, "logits/chosen": -8.604353904724121, "logits/rejected": -8.586516380310059, "logps/chosen": -2.0805184841156006, "logps/rejected": -110.15762329101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9082210063934326, "rewards/margins": 10.690495491027832, "rewards/rejected": -7.782273769378662, "step": 972 }, { "epoch": 0.6651854383866006, "grad_norm": 0.0013871793635189533, "learning_rate": 4.9801880671139204e-05, "logits/chosen": -9.200010299682617, "logits/rejected": -9.18382453918457, "logps/chosen": -6.929057598114014, "logps/rejected": -108.36453247070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4813761711120605, "rewards/margins": 10.158611297607422, "rewards/rejected": -7.677234649658203, "step": 973 }, { "epoch": 0.6658690822081695, "grad_norm": 0.0012019037967547774, "learning_rate": 4.980931594797693e-05, "logits/chosen": -8.351176261901855, "logits/rejected": -8.334583282470703, "logps/chosen": -3.662951946258545, "logps/rejected": -109.6275634765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.925780773162842, "rewards/margins": 10.596139907836914, "rewards/rejected": -7.670358657836914, "step": 974 }, { "epoch": 0.6665527260297385, "grad_norm": 0.0008109601330943406, "learning_rate": 4.981674359497562e-05, "logits/chosen": -7.894742012023926, "logits/rejected": -7.879582405090332, "logps/chosen": -3.968960762023926, "logps/rejected": -108.78520965576172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.864449977874756, "rewards/margins": 10.458895683288574, "rewards/rejected": -7.594446182250977, "step": 975 }, { "epoch": 0.6672363698513075, "grad_norm": 0.0009474704856984317, "learning_rate": 4.98241636277782e-05, "logits/chosen": -8.729464530944824, "logits/rejected": -8.71509075164795, "logps/chosen": -2.5160787105560303, "logps/rejected": -110.94682312011719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9549765586853027, "rewards/margins": 10.78354263305664, "rewards/rejected": -7.82856559753418, "step": 976 }, { "epoch": 0.6679200136728765, "grad_norm": 0.0009225400281138718, "learning_rate": 4.983157606197955e-05, "logits/chosen": -8.64224910736084, "logits/rejected": -8.62706470489502, "logps/chosen": -2.321885108947754, "logps/rejected": -110.11847686767578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.942166328430176, "rewards/margins": 10.74679183959961, "rewards/rejected": -7.804625511169434, "step": 977 }, { "epoch": 0.6686036574944454, "grad_norm": 0.001010917592793703, "learning_rate": 4.98389809131267e-05, "logits/chosen": -9.04008674621582, "logits/rejected": -9.019453048706055, "logps/chosen": -2.1911582946777344, "logps/rejected": -110.54615783691406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9652862548828125, "rewards/margins": 10.804264068603516, "rewards/rejected": -7.838977336883545, "step": 978 }, { "epoch": 0.6692873013160143, "grad_norm": 0.0011651602108031511, "learning_rate": 4.984637819671897e-05, "logits/chosen": -8.729026794433594, "logits/rejected": -8.710644721984863, "logps/chosen": -3.2246580123901367, "logps/rejected": -108.17843627929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.900970458984375, "rewards/margins": 10.40626335144043, "rewards/rejected": -7.505292892456055, "step": 979 }, { "epoch": 0.6699709451375834, "grad_norm": 0.0013146233977749944, "learning_rate": 4.985376792820825e-05, "logits/chosen": -8.57040786743164, "logits/rejected": -8.552022933959961, "logps/chosen": -7.947155475616455, "logps/rejected": -106.88980102539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4400196075439453, "rewards/margins": 9.87524127960205, "rewards/rejected": -7.4352216720581055, "step": 980 }, { "epoch": 0.6706545889591523, "grad_norm": 0.0008905677241273224, "learning_rate": 4.986115012299915e-05, "logits/chosen": -8.20004940032959, "logits/rejected": -8.187466621398926, "logps/chosen": -3.4500460624694824, "logps/rejected": -108.57444763183594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8522467613220215, "rewards/margins": 10.50871753692627, "rewards/rejected": -7.656471252441406, "step": 981 }, { "epoch": 0.6713382327807212, "grad_norm": 0.000625471817329526, "learning_rate": 4.986852479644916e-05, "logits/chosen": -8.283437728881836, "logits/rejected": -8.26471996307373, "logps/chosen": -2.4846856594085693, "logps/rejected": -110.20442199707031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.061382293701172, "rewards/margins": 10.826567649841309, "rewards/rejected": -7.7651848793029785, "step": 982 }, { "epoch": 0.6720218766022902, "grad_norm": 0.0007759027066640556, "learning_rate": 4.987589196386893e-05, "logits/chosen": -8.09473991394043, "logits/rejected": -8.078889846801758, "logps/chosen": -0.19879528880119324, "logps/rejected": -111.11090087890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.172105312347412, "rewards/margins": 11.065040588378906, "rewards/rejected": -7.892935752868652, "step": 983 }, { "epoch": 0.6727055204238591, "grad_norm": 0.0009699153597466648, "learning_rate": 4.988325164052236e-05, "logits/chosen": -9.127694129943848, "logits/rejected": -9.109886169433594, "logps/chosen": -9.538293838500977, "logps/rejected": -105.93087005615234, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.352874279022217, "rewards/margins": 9.655829429626465, "rewards/rejected": -7.30295467376709, "step": 984 }, { "epoch": 0.6733891642454282, "grad_norm": 0.0010061666835099459, "learning_rate": 4.9890603841626866e-05, "logits/chosen": -8.29127025604248, "logits/rejected": -8.277975082397461, "logps/chosen": -5.0622711181640625, "logps/rejected": -108.0377197265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6019389629364014, "rewards/margins": 10.25924301147461, "rewards/rejected": -7.6573052406311035, "step": 985 }, { "epoch": 0.6740728080669971, "grad_norm": 0.0006724161794409156, "learning_rate": 4.989794858235352e-05, "logits/chosen": -9.179121017456055, "logits/rejected": -9.166093826293945, "logps/chosen": -4.656798839569092, "logps/rejected": -109.68359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6888062953948975, "rewards/margins": 10.435050964355469, "rewards/rejected": -7.746245384216309, "step": 986 }, { "epoch": 0.674756451888566, "grad_norm": 0.0006210625288076699, "learning_rate": 4.990528587782729e-05, "logits/chosen": -8.213547706604004, "logits/rejected": -8.1946439743042, "logps/chosen": -1.298168659210205, "logps/rejected": -109.64920043945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1407840251922607, "rewards/margins": 10.836503028869629, "rewards/rejected": -7.695719242095947, "step": 987 }, { "epoch": 0.675440095710135, "grad_norm": 0.0010758081916719675, "learning_rate": 4.9912615743127146e-05, "logits/chosen": -7.811847686767578, "logits/rejected": -7.796146869659424, "logps/chosen": -3.9447903633117676, "logps/rejected": -109.1875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7710378170013428, "rewards/margins": 10.475818634033203, "rewards/rejected": -7.704780578613281, "step": 988 }, { "epoch": 0.676123739531704, "grad_norm": 0.0007962162490002811, "learning_rate": 4.991993819328633e-05, "logits/chosen": -8.237923622131348, "logits/rejected": -8.220452308654785, "logps/chosen": -3.6618947982788086, "logps/rejected": -109.00975036621094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9251255989074707, "rewards/margins": 10.588134765625, "rewards/rejected": -7.663008689880371, "step": 989 }, { "epoch": 0.676807383353273, "grad_norm": 0.0009024665341712534, "learning_rate": 4.9927253243292505e-05, "logits/chosen": -8.595659255981445, "logits/rejected": -8.58090591430664, "logps/chosen": -3.49055814743042, "logps/rejected": -109.44871520996094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8741891384124756, "rewards/margins": 10.541210174560547, "rewards/rejected": -7.667020320892334, "step": 990 }, { "epoch": 0.6774910271748419, "grad_norm": 0.0005386364064179361, "learning_rate": 4.993456090808793e-05, "logits/chosen": -8.87850570678711, "logits/rejected": -8.864123344421387, "logps/chosen": -0.5654153823852539, "logps/rejected": -111.08931732177734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1176905632019043, "rewards/margins": 10.97177505493164, "rewards/rejected": -7.8540849685668945, "step": 991 }, { "epoch": 0.6781746709964108, "grad_norm": 0.0009315362549386919, "learning_rate": 4.994186120256965e-05, "logits/chosen": -8.172540664672852, "logits/rejected": -8.16024398803711, "logps/chosen": -5.918751239776611, "logps/rejected": -107.91651916503906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.658313274383545, "rewards/margins": 10.196496963500977, "rewards/rejected": -7.538183212280273, "step": 992 }, { "epoch": 0.6788583148179799, "grad_norm": 0.0009235217003151774, "learning_rate": 4.9949154141589696e-05, "logits/chosen": -8.877103805541992, "logits/rejected": -8.864442825317383, "logps/chosen": -6.073625564575195, "logps/rejected": -108.2072982788086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.638617992401123, "rewards/margins": 10.173714637756348, "rewards/rejected": -7.535096168518066, "step": 993 }, { "epoch": 0.6795419586395488, "grad_norm": 0.001006744452752173, "learning_rate": 4.995643973995523e-05, "logits/chosen": -8.369074821472168, "logits/rejected": -8.351819038391113, "logps/chosen": -1.7195589542388916, "logps/rejected": -109.61624145507812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0163722038269043, "rewards/margins": 10.825647354125977, "rewards/rejected": -7.809275150299072, "step": 994 }, { "epoch": 0.6802256024611177, "grad_norm": 0.0007132020546123385, "learning_rate": 4.9963718012428765e-05, "logits/chosen": -7.759331703186035, "logits/rejected": -7.74721097946167, "logps/chosen": -1.5813050270080566, "logps/rejected": -109.86337280273438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.973219633102417, "rewards/margins": 10.748369216918945, "rewards/rejected": -7.775148868560791, "step": 995 }, { "epoch": 0.6809092462826867, "grad_norm": 0.0008416090859100223, "learning_rate": 4.9970988973728314e-05, "logits/chosen": -8.155257225036621, "logits/rejected": -8.13577651977539, "logps/chosen": -5.029997825622559, "logps/rejected": -107.31153869628906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7548913955688477, "rewards/margins": 10.185492515563965, "rewards/rejected": -7.430601119995117, "step": 996 }, { "epoch": 0.6815928901042557, "grad_norm": 0.0008758959593251348, "learning_rate": 4.99782526385276e-05, "logits/chosen": -8.830143928527832, "logits/rejected": -8.815970420837402, "logps/chosen": -2.38004469871521, "logps/rejected": -111.00369262695312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9500370025634766, "rewards/margins": 10.834129333496094, "rewards/rejected": -7.884093284606934, "step": 997 }, { "epoch": 0.6822765339258247, "grad_norm": 0.0007930777501314878, "learning_rate": 4.998550902145619e-05, "logits/chosen": -8.971197128295898, "logits/rejected": -8.95403003692627, "logps/chosen": -0.1529669165611267, "logps/rejected": -111.70320129394531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.129173517227173, "rewards/margins": 11.163771629333496, "rewards/rejected": -8.034597396850586, "step": 998 }, { "epoch": 0.6829601777473936, "grad_norm": 0.0008702020859345794, "learning_rate": 4.999275813709971e-05, "logits/chosen": -8.078302383422852, "logits/rejected": -8.06308364868164, "logps/chosen": -1.8469740152359009, "logps/rejected": -110.5898208618164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0003585815429688, "rewards/margins": 10.856971740722656, "rewards/rejected": -7.8566131591796875, "step": 999 }, { "epoch": 0.6836438215689625, "grad_norm": 0.0008491462212987244, "learning_rate": 5e-05, "logits/chosen": -8.406743049621582, "logits/rejected": -8.392217636108398, "logps/chosen": -6.509692192077637, "logps/rejected": -108.45311737060547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6057238578796387, "rewards/margins": 10.190778732299805, "rewards/rejected": -7.585056304931641, "step": 1000 }, { "epoch": 0.6843274653905316, "grad_norm": 0.0006949109956622124, "learning_rate": 4.999998924049261e-05, "logits/chosen": -8.868277549743652, "logits/rejected": -8.852195739746094, "logps/chosen": -3.936406135559082, "logps/rejected": -108.77276611328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.835433006286621, "rewards/margins": 10.469267845153809, "rewards/rejected": -7.633833885192871, "step": 1001 }, { "epoch": 0.6850111092121005, "grad_norm": 0.0008318388718180358, "learning_rate": 4.999995696197972e-05, "logits/chosen": -8.597005844116211, "logits/rejected": -8.577095985412598, "logps/chosen": -4.230869770050049, "logps/rejected": -109.8784408569336, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8877220153808594, "rewards/margins": 10.605791091918945, "rewards/rejected": -7.718068599700928, "step": 1002 }, { "epoch": 0.6856947530336694, "grad_norm": 0.0006976979784667492, "learning_rate": 4.999990316448909e-05, "logits/chosen": -7.653389930725098, "logits/rejected": -7.638319492340088, "logps/chosen": -5.13366174697876, "logps/rejected": -107.93705749511719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6744840145111084, "rewards/margins": 10.236709594726562, "rewards/rejected": -7.562226295471191, "step": 1003 }, { "epoch": 0.6863783968552384, "grad_norm": 0.0011439593508839607, "learning_rate": 4.999982784806705e-05, "logits/chosen": -8.360647201538086, "logits/rejected": -8.344019889831543, "logps/chosen": -1.9801650047302246, "logps/rejected": -109.37751007080078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9216995239257812, "rewards/margins": 10.628328323364258, "rewards/rejected": -7.706628799438477, "step": 1004 }, { "epoch": 0.6870620406768074, "grad_norm": 0.0007612941553816199, "learning_rate": 4.9999731012778434e-05, "logits/chosen": -8.112780570983887, "logits/rejected": -8.100162506103516, "logps/chosen": -3.272064208984375, "logps/rejected": -109.38562774658203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.888758659362793, "rewards/margins": 10.475112915039062, "rewards/rejected": -7.5863542556762695, "step": 1005 }, { "epoch": 0.6877456844983764, "grad_norm": 0.0007928445120342076, "learning_rate": 4.99996126587066e-05, "logits/chosen": -8.337976455688477, "logits/rejected": -8.324987411499023, "logps/chosen": -4.191127300262451, "logps/rejected": -109.49366760253906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7164862155914307, "rewards/margins": 10.523641586303711, "rewards/rejected": -7.807154655456543, "step": 1006 }, { "epoch": 0.6884293283199453, "grad_norm": 0.0014046485302969813, "learning_rate": 4.9999472785953427e-05, "logits/chosen": -8.674690246582031, "logits/rejected": -8.659442901611328, "logps/chosen": -1.5630146265029907, "logps/rejected": -111.28717803955078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.014652967453003, "rewards/margins": 10.913589477539062, "rewards/rejected": -7.898936748504639, "step": 1007 }, { "epoch": 0.6891129721415142, "grad_norm": 0.0006552720442414284, "learning_rate": 4.999931139463933e-05, "logits/chosen": -8.29518985748291, "logits/rejected": -8.27918529510498, "logps/chosen": -0.9735490083694458, "logps/rejected": -110.62979125976562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0372090339660645, "rewards/margins": 10.934858322143555, "rewards/rejected": -7.897650718688965, "step": 1008 }, { "epoch": 0.6897966159630833, "grad_norm": 0.0009814127115532756, "learning_rate": 4.9999128484903245e-05, "logits/chosen": -8.226497650146484, "logits/rejected": -8.208773612976074, "logps/chosen": -1.938506007194519, "logps/rejected": -111.14186096191406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0531740188598633, "rewards/margins": 10.960153579711914, "rewards/rejected": -7.906979084014893, "step": 1009 }, { "epoch": 0.6904802597846522, "grad_norm": 0.0009586851811036468, "learning_rate": 4.999892405690262e-05, "logits/chosen": -8.470987319946289, "logits/rejected": -8.457762718200684, "logps/chosen": -4.357151508331299, "logps/rejected": -110.18334197998047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.714859962463379, "rewards/margins": 10.466821670532227, "rewards/rejected": -7.751962184906006, "step": 1010 }, { "epoch": 0.6911639036062212, "grad_norm": 0.0007548825815320015, "learning_rate": 4.999869811081345e-05, "logits/chosen": -8.734758377075195, "logits/rejected": -8.721084594726562, "logps/chosen": -6.786341190338135, "logps/rejected": -109.01808166503906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6667237281799316, "rewards/margins": 10.207598686218262, "rewards/rejected": -7.540875434875488, "step": 1011 }, { "epoch": 0.6918475474277901, "grad_norm": 0.000652442395221442, "learning_rate": 4.9998450646830234e-05, "logits/chosen": -8.750350952148438, "logits/rejected": -8.732728958129883, "logps/chosen": -0.15729784965515137, "logps/rejected": -111.7157211303711, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1203956604003906, "rewards/margins": 11.099712371826172, "rewards/rejected": -7.979316234588623, "step": 1012 }, { "epoch": 0.6925311912493591, "grad_norm": 0.000957860320340842, "learning_rate": 4.999818166516599e-05, "logits/chosen": -9.018484115600586, "logits/rejected": -9.00090217590332, "logps/chosen": -7.32077169418335, "logps/rejected": -106.77142333984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4866814613342285, "rewards/margins": 9.8814115524292, "rewards/rejected": -7.394730091094971, "step": 1013 }, { "epoch": 0.6932148350709281, "grad_norm": 0.0006405618041753769, "learning_rate": 4.9997891166052285e-05, "logits/chosen": -8.59302806854248, "logits/rejected": -8.576971054077148, "logps/chosen": -1.953786849975586, "logps/rejected": -111.02400207519531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.945645809173584, "rewards/margins": 10.85740852355957, "rewards/rejected": -7.911762714385986, "step": 1014 }, { "epoch": 0.693898478892497, "grad_norm": 0.0008433797629550099, "learning_rate": 4.999757914973919e-05, "logits/chosen": -9.086372375488281, "logits/rejected": -9.073074340820312, "logps/chosen": -10.101940155029297, "logps/rejected": -106.46690368652344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3170924186706543, "rewards/margins": 9.593596458435059, "rewards/rejected": -7.276504039764404, "step": 1015 }, { "epoch": 0.6945821227140659, "grad_norm": 0.0005804076790809631, "learning_rate": 4.999724561649529e-05, "logits/chosen": -7.286895751953125, "logits/rejected": -7.273443222045898, "logps/chosen": -5.452493667602539, "logps/rejected": -107.0264892578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.734997034072876, "rewards/margins": 10.182836532592773, "rewards/rejected": -7.447839260101318, "step": 1016 }, { "epoch": 0.695265766535635, "grad_norm": 0.0008442809339612722, "learning_rate": 4.999689056660772e-05, "logits/chosen": -8.287162780761719, "logits/rejected": -8.270992279052734, "logps/chosen": -3.642329454421997, "logps/rejected": -109.97036743164062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8591160774230957, "rewards/margins": 10.623655319213867, "rewards/rejected": -7.764538764953613, "step": 1017 }, { "epoch": 0.6959494103572039, "grad_norm": 0.0007158033549785614, "learning_rate": 4.999651400038214e-05, "logits/chosen": -8.545109748840332, "logits/rejected": -8.528270721435547, "logps/chosen": -3.0012106895446777, "logps/rejected": -109.32608795166016, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9072680473327637, "rewards/margins": 10.656171798706055, "rewards/rejected": -7.748904228210449, "step": 1018 }, { "epoch": 0.6966330541787729, "grad_norm": 0.0015119342133402824, "learning_rate": 4.999611591814267e-05, "logits/chosen": -8.441207885742188, "logits/rejected": -8.419816970825195, "logps/chosen": -2.0510520935058594, "logps/rejected": -110.50617980957031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0402495861053467, "rewards/margins": 10.877840995788574, "rewards/rejected": -7.837591648101807, "step": 1019 }, { "epoch": 0.6973166980003418, "grad_norm": 0.016537567600607872, "learning_rate": 4.9995696320232036e-05, "logits/chosen": -7.823554992675781, "logits/rejected": -7.808736801147461, "logps/chosen": -2.3044800758361816, "logps/rejected": -110.60588073730469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0073022842407227, "rewards/margins": 10.753091812133789, "rewards/rejected": -7.745790481567383, "step": 1020 }, { "epoch": 0.6980003418219107, "grad_norm": 0.0005090119666419923, "learning_rate": 4.999525520701144e-05, "logits/chosen": -8.90237045288086, "logits/rejected": -8.884513854980469, "logps/chosen": -4.430868148803711, "logps/rejected": -108.49114990234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.810932159423828, "rewards/margins": 10.429546356201172, "rewards/rejected": -7.618614196777344, "step": 1021 }, { "epoch": 0.6986839856434798, "grad_norm": 0.0006100447499193251, "learning_rate": 4.99947925788606e-05, "logits/chosen": -8.455206871032715, "logits/rejected": -8.439732551574707, "logps/chosen": -2.3787879943847656, "logps/rejected": -111.20968627929688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.918519973754883, "rewards/margins": 10.813851356506348, "rewards/rejected": -7.895331382751465, "step": 1022 }, { "epoch": 0.6993676294650487, "grad_norm": 0.0008415021584369242, "learning_rate": 4.999430843617778e-05, "logits/chosen": -8.302001953125, "logits/rejected": -8.285955429077148, "logps/chosen": -5.744622707366943, "logps/rejected": -108.11865997314453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.702784538269043, "rewards/margins": 10.225957870483398, "rewards/rejected": -7.5231733322143555, "step": 1023 }, { "epoch": 0.7000512732866176, "grad_norm": 0.000778159243054688, "learning_rate": 4.999380277937975e-05, "logits/chosen": -8.952106475830078, "logits/rejected": -8.939616203308105, "logps/chosen": -5.571974754333496, "logps/rejected": -109.49087524414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6427507400512695, "rewards/margins": 10.253091812133789, "rewards/rejected": -7.6103410720825195, "step": 1024 }, { "epoch": 0.7007349171081866, "grad_norm": 0.0007161899120546877, "learning_rate": 4.9993275608901804e-05, "logits/chosen": -8.47007942199707, "logits/rejected": -8.458064079284668, "logps/chosen": -5.409353733062744, "logps/rejected": -108.4394302368164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6164462566375732, "rewards/margins": 10.170838356018066, "rewards/rejected": -7.554391860961914, "step": 1025 }, { "epoch": 0.7014185609297556, "grad_norm": 0.0008557179826311767, "learning_rate": 4.999272692519775e-05, "logits/chosen": -8.5775146484375, "logits/rejected": -8.558948516845703, "logps/chosen": -3.318523406982422, "logps/rejected": -108.65218353271484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.892768383026123, "rewards/margins": 10.492971420288086, "rewards/rejected": -7.600203037261963, "step": 1026 }, { "epoch": 0.7021022047513246, "grad_norm": 0.001337604713626206, "learning_rate": 4.999215672873992e-05, "logits/chosen": -8.686923027038574, "logits/rejected": -8.667032241821289, "logps/chosen": -0.14136803150177002, "logps/rejected": -112.11487579345703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1036720275878906, "rewards/margins": 11.181449890136719, "rewards/rejected": -8.077777862548828, "step": 1027 }, { "epoch": 0.7027858485728935, "grad_norm": 0.0005747002433054149, "learning_rate": 4.9991565020019165e-05, "logits/chosen": -8.617064476013184, "logits/rejected": -8.602107048034668, "logps/chosen": -0.1447603553533554, "logps/rejected": -112.00860595703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1396939754486084, "rewards/margins": 11.167320251464844, "rewards/rejected": -8.027626037597656, "step": 1028 }, { "epoch": 0.7034694923944624, "grad_norm": 0.0008159268763847649, "learning_rate": 4.9990951799544874e-05, "logits/chosen": -8.716843605041504, "logits/rejected": -8.697613716125488, "logps/chosen": -0.8938173651695251, "logps/rejected": -111.8184814453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0464160442352295, "rewards/margins": 11.059305191040039, "rewards/rejected": -8.012889862060547, "step": 1029 }, { "epoch": 0.7041531362160315, "grad_norm": 0.0017099074320867658, "learning_rate": 4.9990317067844914e-05, "logits/chosen": -8.60571002960205, "logits/rejected": -8.588508605957031, "logps/chosen": -0.19625899195671082, "logps/rejected": -112.10298156738281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0394904613494873, "rewards/margins": 11.079641342163086, "rewards/rejected": -8.040149688720703, "step": 1030 }, { "epoch": 0.7048367800376004, "grad_norm": 0.0009275242919102311, "learning_rate": 4.9989660825465704e-05, "logits/chosen": -8.89371109008789, "logits/rejected": -8.87940788269043, "logps/chosen": -7.21284818649292, "logps/rejected": -107.45243835449219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4734835624694824, "rewards/margins": 9.970023155212402, "rewards/rejected": -7.49653959274292, "step": 1031 }, { "epoch": 0.7055204238591694, "grad_norm": 0.0010445998050272465, "learning_rate": 4.998898307297215e-05, "logits/chosen": -8.276391983032227, "logits/rejected": -8.26209831237793, "logps/chosen": -0.34579983353614807, "logps/rejected": -111.7236328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.072604179382324, "rewards/margins": 11.086172103881836, "rewards/rejected": -8.013566970825195, "step": 1032 }, { "epoch": 0.7062040676807383, "grad_norm": 0.002356603043153882, "learning_rate": 4.9988283810947715e-05, "logits/chosen": -8.93923568725586, "logits/rejected": -8.926124572753906, "logps/chosen": -4.940008163452148, "logps/rejected": -109.67874908447266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.524718761444092, "rewards/margins": 10.342129707336426, "rewards/rejected": -7.817410945892334, "step": 1033 }, { "epoch": 0.7068877115023073, "grad_norm": 0.0008522227872163057, "learning_rate": 4.998756303999434e-05, "logits/chosen": -8.282636642456055, "logits/rejected": -8.264254570007324, "logps/chosen": -0.13503102958202362, "logps/rejected": -112.33264923095703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.2369956970214844, "rewards/margins": 11.232730865478516, "rewards/rejected": -7.995735168457031, "step": 1034 }, { "epoch": 0.7075713553238763, "grad_norm": 0.0007663163123652339, "learning_rate": 4.9986820760732516e-05, "logits/chosen": -8.693885803222656, "logits/rejected": -8.678711891174316, "logps/chosen": -0.1297779083251953, "logps/rejected": -112.30624389648438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.161747932434082, "rewards/margins": 11.170746803283691, "rewards/rejected": -8.00899887084961, "step": 1035 }, { "epoch": 0.7082549991454452, "grad_norm": 0.0007082514348439872, "learning_rate": 4.998605697380122e-05, "logits/chosen": -8.014117240905762, "logits/rejected": -8.000833511352539, "logps/chosen": -4.980625629425049, "logps/rejected": -110.82087707519531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7415342330932617, "rewards/margins": 10.497321128845215, "rewards/rejected": -7.755785942077637, "step": 1036 }, { "epoch": 0.7089386429670141, "grad_norm": 0.000724417099263519, "learning_rate": 4.9985271679857956e-05, "logits/chosen": -8.709699630737305, "logits/rejected": -8.695452690124512, "logps/chosen": -2.412492513656616, "logps/rejected": -111.60667419433594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.009366989135742, "rewards/margins": 10.902015686035156, "rewards/rejected": -7.8926496505737305, "step": 1037 }, { "epoch": 0.7096222867885832, "grad_norm": 0.001196765573695302, "learning_rate": 4.998446487957875e-05, "logits/chosen": -9.068092346191406, "logits/rejected": -9.051738739013672, "logps/chosen": -2.7958929538726807, "logps/rejected": -109.88292694091797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7915194034576416, "rewards/margins": 10.668732643127441, "rewards/rejected": -7.877213478088379, "step": 1038 }, { "epoch": 0.7103059306101521, "grad_norm": 0.00047212527715601027, "learning_rate": 4.998363657365811e-05, "logits/chosen": -8.801494598388672, "logits/rejected": -8.783819198608398, "logps/chosen": -1.8039668798446655, "logps/rejected": -111.03277587890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9765219688415527, "rewards/margins": 10.914180755615234, "rewards/rejected": -7.937658309936523, "step": 1039 }, { "epoch": 0.7109895744317211, "grad_norm": 0.0008237575530074537, "learning_rate": 4.99827867628091e-05, "logits/chosen": -8.40764045715332, "logits/rejected": -8.391271591186523, "logps/chosen": -1.7271794080734253, "logps/rejected": -110.77224731445312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0593295097351074, "rewards/margins": 10.918013572692871, "rewards/rejected": -7.858684062957764, "step": 1040 }, { "epoch": 0.71167321825329, "grad_norm": 0.0008319821208715439, "learning_rate": 4.998191544776328e-05, "logits/chosen": -8.197690963745117, "logits/rejected": -8.180468559265137, "logps/chosen": -1.893772840499878, "logps/rejected": -110.7824935913086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.016533374786377, "rewards/margins": 10.912294387817383, "rewards/rejected": -7.895761489868164, "step": 1041 }, { "epoch": 0.712356862074859, "grad_norm": 0.0008465280407108366, "learning_rate": 4.9981022629270705e-05, "logits/chosen": -7.205036163330078, "logits/rejected": -7.190962791442871, "logps/chosen": -2.3611721992492676, "logps/rejected": -111.16426086425781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.988170862197876, "rewards/margins": 10.901437759399414, "rewards/rejected": -7.913267135620117, "step": 1042 }, { "epoch": 0.713040505896428, "grad_norm": 0.0010505244135856628, "learning_rate": 4.998010830809997e-05, "logits/chosen": -8.375842094421387, "logits/rejected": -8.361798286437988, "logps/chosen": -4.660637855529785, "logps/rejected": -108.55085754394531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7349395751953125, "rewards/margins": 10.389612197875977, "rewards/rejected": -7.6546735763549805, "step": 1043 }, { "epoch": 0.7137241497179969, "grad_norm": 0.0005369300488382578, "learning_rate": 4.997917248503815e-05, "logits/chosen": -8.613321304321289, "logits/rejected": -8.597747802734375, "logps/chosen": -0.17001056671142578, "logps/rejected": -112.20695495605469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1254444122314453, "rewards/margins": 11.108955383300781, "rewards/rejected": -7.983511924743652, "step": 1044 }, { "epoch": 0.7144077935395658, "grad_norm": 0.003062124829739332, "learning_rate": 4.9978215160890855e-05, "logits/chosen": -8.061929702758789, "logits/rejected": -8.04808235168457, "logps/chosen": -2.1792221069335938, "logps/rejected": -110.07550048828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.941112995147705, "rewards/margins": 10.767910957336426, "rewards/rejected": -7.8267974853515625, "step": 1045 }, { "epoch": 0.7150914373611349, "grad_norm": 0.0008465195423923433, "learning_rate": 4.997723633648219e-05, "logits/chosen": -8.622817993164062, "logits/rejected": -8.605106353759766, "logps/chosen": -0.21955318748950958, "logps/rejected": -112.27278900146484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1327102184295654, "rewards/margins": 11.172135353088379, "rewards/rejected": -8.039424896240234, "step": 1046 }, { "epoch": 0.7157750811827038, "grad_norm": 0.0006831659120507538, "learning_rate": 4.997623601265478e-05, "logits/chosen": -8.49179458618164, "logits/rejected": -8.473390579223633, "logps/chosen": -0.13663595914840698, "logps/rejected": -112.66033935546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1199228763580322, "rewards/margins": 11.18315315246582, "rewards/rejected": -8.063230514526367, "step": 1047 }, { "epoch": 0.7164587250042728, "grad_norm": 0.0006130499532446265, "learning_rate": 4.9975214190269736e-05, "logits/chosen": -8.914253234863281, "logits/rejected": -8.899191856384277, "logps/chosen": -4.029624938964844, "logps/rejected": -111.14883422851562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7663137912750244, "rewards/margins": 10.655022621154785, "rewards/rejected": -7.888708591461182, "step": 1048 }, { "epoch": 0.7171423688258417, "grad_norm": 0.0007279884885065258, "learning_rate": 4.99741708702067e-05, "logits/chosen": -8.1893892288208, "logits/rejected": -8.17112922668457, "logps/chosen": -0.16309469938278198, "logps/rejected": -112.35688018798828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.154078722000122, "rewards/margins": 11.151113510131836, "rewards/rejected": -7.997035026550293, "step": 1049 }, { "epoch": 0.7178260126474107, "grad_norm": 0.0008018824155442417, "learning_rate": 4.997310605336382e-05, "logits/chosen": -8.873603820800781, "logits/rejected": -8.859044075012207, "logps/chosen": -3.7207772731781006, "logps/rejected": -109.84119415283203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8208560943603516, "rewards/margins": 10.566606521606445, "rewards/rejected": -7.745749473571777, "step": 1050 }, { "epoch": 0.7185096564689797, "grad_norm": 0.0011831772280856967, "learning_rate": 4.997201974065772e-05, "logits/chosen": -8.877634048461914, "logits/rejected": -8.864164352416992, "logps/chosen": -1.7254599332809448, "logps/rejected": -111.45722961425781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9492616653442383, "rewards/margins": 10.883201599121094, "rewards/rejected": -7.9339399337768555, "step": 1051 }, { "epoch": 0.7191933002905486, "grad_norm": 0.0006684106774628162, "learning_rate": 4.997091193302356e-05, "logits/chosen": -8.482062339782715, "logits/rejected": -8.466789245605469, "logps/chosen": -5.8117218017578125, "logps/rejected": -108.85743713378906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.694661855697632, "rewards/margins": 10.263628005981445, "rewards/rejected": -7.568966388702393, "step": 1052 }, { "epoch": 0.7198769441121176, "grad_norm": 0.000816987594589591, "learning_rate": 4.9969782631415e-05, "logits/chosen": -8.872570037841797, "logits/rejected": -8.856355667114258, "logps/chosen": -3.6541407108306885, "logps/rejected": -111.0953369140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8112449645996094, "rewards/margins": 10.645157814025879, "rewards/rejected": -7.833913803100586, "step": 1053 }, { "epoch": 0.7205605879336866, "grad_norm": 0.0014159315032884479, "learning_rate": 4.996863183680417e-05, "logits/chosen": -9.194089889526367, "logits/rejected": -9.179333686828613, "logps/chosen": -7.046261787414551, "logps/rejected": -107.50225830078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.468282699584961, "rewards/margins": 10.022546768188477, "rewards/rejected": -7.554263591766357, "step": 1054 }, { "epoch": 0.7212442317552555, "grad_norm": 0.0007384831551462412, "learning_rate": 4.996745955018175e-05, "logits/chosen": -8.562849044799805, "logits/rejected": -8.543305397033691, "logps/chosen": -0.20196351408958435, "logps/rejected": -112.78904724121094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1516757011413574, "rewards/margins": 11.217023849487305, "rewards/rejected": -8.065347671508789, "step": 1055 }, { "epoch": 0.7219278755768245, "grad_norm": 0.000789073237683624, "learning_rate": 4.9966265772556905e-05, "logits/chosen": -8.352559089660645, "logits/rejected": -8.336755752563477, "logps/chosen": -3.581974983215332, "logps/rejected": -109.29263305664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8840866088867188, "rewards/margins": 10.543854713439941, "rewards/rejected": -7.659768581390381, "step": 1056 }, { "epoch": 0.7226115193983934, "grad_norm": 0.0008299625478684902, "learning_rate": 4.9965050504957265e-05, "logits/chosen": -8.163264274597168, "logits/rejected": -8.14759349822998, "logps/chosen": -2.3631505966186523, "logps/rejected": -111.85940551757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9992640018463135, "rewards/margins": 10.906190872192383, "rewards/rejected": -7.906926155090332, "step": 1057 }, { "epoch": 0.7232951632199623, "grad_norm": 0.0008638422004878521, "learning_rate": 4.9963813748429015e-05, "logits/chosen": -9.191766738891602, "logits/rejected": -9.175183296203613, "logps/chosen": -5.923625946044922, "logps/rejected": -107.33343505859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.639619827270508, "rewards/margins": 10.142440795898438, "rewards/rejected": -7.50282096862793, "step": 1058 }, { "epoch": 0.7239788070415314, "grad_norm": 0.0010159153025597334, "learning_rate": 4.9962555504036807e-05, "logits/chosen": -8.981282234191895, "logits/rejected": -8.961156845092773, "logps/chosen": -2.504660129547119, "logps/rejected": -111.94496154785156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.952958106994629, "rewards/margins": 10.932929039001465, "rewards/rejected": -7.979970455169678, "step": 1059 }, { "epoch": 0.7246624508631003, "grad_norm": 0.0008270087419077754, "learning_rate": 4.996127577286379e-05, "logits/chosen": -8.027502059936523, "logits/rejected": -8.012085914611816, "logps/chosen": -1.9581784009933472, "logps/rejected": -111.58218383789062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.97587251663208, "rewards/margins": 10.897382736206055, "rewards/rejected": -7.921510696411133, "step": 1060 }, { "epoch": 0.7253460946846693, "grad_norm": 0.19757632911205292, "learning_rate": 4.9959974556011615e-05, "logits/chosen": -7.963772296905518, "logits/rejected": -7.949508190155029, "logps/chosen": -0.1283021718263626, "logps/rejected": -112.46134185791016, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 3.0826687812805176, "rewards/margins": 11.116896629333496, "rewards/rejected": -8.034228324890137, "step": 1061 }, { "epoch": 0.7260297385062382, "grad_norm": 0.0009297553333453834, "learning_rate": 4.9958651854600454e-05, "logits/chosen": -8.937643051147461, "logits/rejected": -8.920398712158203, "logps/chosen": -1.6619410514831543, "logps/rejected": -111.7757568359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0312891006469727, "rewards/margins": 11.020275115966797, "rewards/rejected": -7.988985061645508, "step": 1062 }, { "epoch": 0.7267133823278072, "grad_norm": 0.0007664435543119907, "learning_rate": 4.995730766976892e-05, "logits/chosen": -9.057268142700195, "logits/rejected": -9.042606353759766, "logps/chosen": -2.9153177738189697, "logps/rejected": -111.31495666503906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8776073455810547, "rewards/margins": 10.754959106445312, "rewards/rejected": -7.877352714538574, "step": 1063 }, { "epoch": 0.7273970261493762, "grad_norm": 0.0019447716185823083, "learning_rate": 4.995594200267415e-05, "logits/chosen": -8.468511581420898, "logits/rejected": -8.452825546264648, "logps/chosen": -1.8581340312957764, "logps/rejected": -111.37432861328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9993133544921875, "rewards/margins": 10.860227584838867, "rewards/rejected": -7.86091423034668, "step": 1064 }, { "epoch": 0.7280806699709451, "grad_norm": 0.0009258486097678542, "learning_rate": 4.995455485449181e-05, "logits/chosen": -8.380021095275879, "logits/rejected": -8.364803314208984, "logps/chosen": -4.582629680633545, "logps/rejected": -110.64257049560547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7983157634735107, "rewards/margins": 10.586654663085938, "rewards/rejected": -7.788339138031006, "step": 1065 }, { "epoch": 0.728764313792514, "grad_norm": 6.475144386291504, "learning_rate": 4.995314622641598e-05, "logits/chosen": -7.80707311630249, "logits/rejected": -7.790374755859375, "logps/chosen": -1.7176398038864136, "logps/rejected": -111.6155014038086, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 3.0270278453826904, "rewards/margins": 10.985770225524902, "rewards/rejected": -7.958741664886475, "step": 1066 }, { "epoch": 0.7294479576140831, "grad_norm": 0.001032130909152329, "learning_rate": 4.9951716119659296e-05, "logits/chosen": -7.697871208190918, "logits/rejected": -7.684598922729492, "logps/chosen": -4.364072322845459, "logps/rejected": -108.52334594726562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.804792881011963, "rewards/margins": 10.32100772857666, "rewards/rejected": -7.516214847564697, "step": 1067 }, { "epoch": 0.730131601435652, "grad_norm": 0.001059272326529026, "learning_rate": 4.995026453545286e-05, "logits/chosen": -8.248019218444824, "logits/rejected": -8.231451988220215, "logps/chosen": -6.754115104675293, "logps/rejected": -110.86921691894531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.570132255554199, "rewards/margins": 10.370529174804688, "rewards/rejected": -7.800396919250488, "step": 1068 }, { "epoch": 0.730815245257221, "grad_norm": 0.0009623728110454977, "learning_rate": 4.994879147504625e-05, "logits/chosen": -7.750703811645508, "logits/rejected": -7.73049259185791, "logps/chosen": -0.13634975254535675, "logps/rejected": -112.90638732910156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.148979902267456, "rewards/margins": 11.201181411743164, "rewards/rejected": -8.052201271057129, "step": 1069 }, { "epoch": 0.7314988890787899, "grad_norm": 0.00077067717211321, "learning_rate": 4.994729693970756e-05, "logits/chosen": -7.3840179443359375, "logits/rejected": -7.367657661437988, "logps/chosen": -2.658147096633911, "logps/rejected": -111.96177673339844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.978896141052246, "rewards/margins": 10.888982772827148, "rewards/rejected": -7.910085678100586, "step": 1070 }, { "epoch": 0.732182532900359, "grad_norm": 0.0008939431863836944, "learning_rate": 4.9945780930723344e-05, "logits/chosen": -7.918720722198486, "logits/rejected": -7.905359745025635, "logps/chosen": -2.9833178520202637, "logps/rejected": -110.27784729003906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9317407608032227, "rewards/margins": 10.62722396850586, "rewards/rejected": -7.69548225402832, "step": 1071 }, { "epoch": 0.7328661767219279, "grad_norm": 0.001007035723887384, "learning_rate": 4.9944243449398656e-05, "logits/chosen": -7.479750633239746, "logits/rejected": -7.461266994476318, "logps/chosen": -0.9079251289367676, "logps/rejected": -112.3290786743164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1091623306274414, "rewards/margins": 11.142770767211914, "rewards/rejected": -8.033608436584473, "step": 1072 }, { "epoch": 0.7335498205434968, "grad_norm": 0.0016106875846162438, "learning_rate": 4.994268449705705e-05, "logits/chosen": -8.25117015838623, "logits/rejected": -8.233026504516602, "logps/chosen": -4.95058012008667, "logps/rejected": -108.82328796386719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8249459266662598, "rewards/margins": 10.383779525756836, "rewards/rejected": -7.558833122253418, "step": 1073 }, { "epoch": 0.7342334643650658, "grad_norm": 0.0010183234699070454, "learning_rate": 4.994110407504051e-05, "logits/chosen": -7.830243110656738, "logits/rejected": -7.814326763153076, "logps/chosen": -7.086246967315674, "logps/rejected": -107.39993286132812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5865750312805176, "rewards/margins": 10.00705337524414, "rewards/rejected": -7.420478820800781, "step": 1074 }, { "epoch": 0.7349171081866348, "grad_norm": 0.004421349614858627, "learning_rate": 4.9939502184709565e-05, "logits/chosen": -7.755003452301025, "logits/rejected": -7.740951061248779, "logps/chosen": -1.8808088302612305, "logps/rejected": -111.74734497070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9561543464660645, "rewards/margins": 10.913448333740234, "rewards/rejected": -7.957294464111328, "step": 1075 }, { "epoch": 0.7356007520082037, "grad_norm": 0.0015125342179089785, "learning_rate": 4.993787882744319e-05, "logits/chosen": -8.279858589172363, "logits/rejected": -8.265750885009766, "logps/chosen": -5.181300640106201, "logps/rejected": -109.80594635009766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6617188453674316, "rewards/margins": 10.367166519165039, "rewards/rejected": -7.705447196960449, "step": 1076 }, { "epoch": 0.7362843958297727, "grad_norm": 0.016192948445677757, "learning_rate": 4.9936234004638845e-05, "logits/chosen": -7.9772047996521, "logits/rejected": -7.961528778076172, "logps/chosen": -2.790811777114868, "logps/rejected": -110.9072265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.873866081237793, "rewards/margins": 10.703887939453125, "rewards/rejected": -7.83002233505249, "step": 1077 }, { "epoch": 0.7369680396513416, "grad_norm": 0.0015320626553148031, "learning_rate": 4.9934567717712474e-05, "logits/chosen": -8.703059196472168, "logits/rejected": -8.686914443969727, "logps/chosen": -4.129756927490234, "logps/rejected": -109.16706848144531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.740189552307129, "rewards/margins": 10.470600128173828, "rewards/rejected": -7.730409622192383, "step": 1078 }, { "epoch": 0.7376516834729107, "grad_norm": 0.011222545988857746, "learning_rate": 4.993287996809847e-05, "logits/chosen": -7.902098178863525, "logits/rejected": -7.882653713226318, "logps/chosen": -4.20308780670166, "logps/rejected": -108.68792724609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8402302265167236, "rewards/margins": 10.406482696533203, "rewards/rejected": -7.566252708435059, "step": 1079 }, { "epoch": 0.7383353272944796, "grad_norm": 0.003941403701901436, "learning_rate": 4.993117075724977e-05, "logits/chosen": -8.308995246887207, "logits/rejected": -8.291254043579102, "logps/chosen": -0.9345564842224121, "logps/rejected": -111.83805847167969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.037409543991089, "rewards/margins": 11.06981086730957, "rewards/rejected": -8.032401084899902, "step": 1080 }, { "epoch": 0.7390189711160485, "grad_norm": 1.4223687648773193, "learning_rate": 4.9929440086637714e-05, "logits/chosen": -8.367592811584473, "logits/rejected": -8.349177360534668, "logps/chosen": -0.7400519847869873, "logps/rejected": -111.90962219238281, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 3.0387001037597656, "rewards/margins": 11.120073318481445, "rewards/rejected": -8.08137321472168, "step": 1081 }, { "epoch": 0.7397026149376175, "grad_norm": 0.002338531194254756, "learning_rate": 4.992768795775215e-05, "logits/chosen": -7.85833215713501, "logits/rejected": -7.842677116394043, "logps/chosen": -7.7731146812438965, "logps/rejected": -107.50740814208984, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5313565731048584, "rewards/margins": 9.976227760314941, "rewards/rejected": -7.444870948791504, "step": 1082 }, { "epoch": 0.7403862587591865, "grad_norm": 0.0007461875211447477, "learning_rate": 4.992591437210139e-05, "logits/chosen": -8.353058815002441, "logits/rejected": -8.336825370788574, "logps/chosen": -1.7964224815368652, "logps/rejected": -110.96373748779297, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0252938270568848, "rewards/margins": 10.887216567993164, "rewards/rejected": -7.861921787261963, "step": 1083 }, { "epoch": 0.7410699025807554, "grad_norm": 0.0009599172044545412, "learning_rate": 4.992411933121222e-05, "logits/chosen": -7.380818843841553, "logits/rejected": -7.365475177764893, "logps/chosen": -7.13161039352417, "logps/rejected": -108.69742584228516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.609854221343994, "rewards/margins": 10.186182022094727, "rewards/rejected": -7.576328277587891, "step": 1084 }, { "epoch": 0.7417535464023244, "grad_norm": 0.0009615740855224431, "learning_rate": 4.9922302836629906e-05, "logits/chosen": -8.470012664794922, "logits/rejected": -8.454597473144531, "logps/chosen": -6.785402774810791, "logps/rejected": -106.82044982910156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.469346046447754, "rewards/margins": 9.996831893920898, "rewards/rejected": -7.527484893798828, "step": 1085 }, { "epoch": 0.7424371902238933, "grad_norm": 0.0021186412777751684, "learning_rate": 4.992046488991816e-05, "logits/chosen": -7.985604286193848, "logits/rejected": -7.971525192260742, "logps/chosen": -5.240016460418701, "logps/rejected": -109.97816467285156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.774900436401367, "rewards/margins": 10.492877006530762, "rewards/rejected": -7.7179765701293945, "step": 1086 }, { "epoch": 0.7431208340454624, "grad_norm": 0.0006711503374390304, "learning_rate": 4.9918605492659164e-05, "logits/chosen": -8.89402961730957, "logits/rejected": -8.875436782836914, "logps/chosen": -0.19229479134082794, "logps/rejected": -112.49107360839844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.165372848510742, "rewards/margins": 11.16392993927002, "rewards/rejected": -7.998558044433594, "step": 1087 }, { "epoch": 0.7438044778670313, "grad_norm": 0.000748236256185919, "learning_rate": 4.991672464645358e-05, "logits/chosen": -8.260205268859863, "logits/rejected": -8.242366790771484, "logps/chosen": -2.030214309692383, "logps/rejected": -111.00151824951172, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0322835445404053, "rewards/margins": 10.950273513793945, "rewards/rejected": -7.917990684509277, "step": 1088 }, { "epoch": 0.7444881216886002, "grad_norm": 0.0007452434510923922, "learning_rate": 4.9914822352920545e-05, "logits/chosen": -8.919660568237305, "logits/rejected": -8.900948524475098, "logps/chosen": -1.8560068607330322, "logps/rejected": -111.82569885253906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8696208000183105, "rewards/margins": 10.913372039794922, "rewards/rejected": -8.043750762939453, "step": 1089 }, { "epoch": 0.7451717655101692, "grad_norm": 0.0038890184368938208, "learning_rate": 4.9912898613697626e-05, "logits/chosen": -8.129106521606445, "logits/rejected": -8.114460945129395, "logps/chosen": -5.27606201171875, "logps/rejected": -108.78508758544922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.67264723777771, "rewards/margins": 10.251311302185059, "rewards/rejected": -7.5786638259887695, "step": 1090 }, { "epoch": 0.7458554093317382, "grad_norm": 0.0009681049850769341, "learning_rate": 4.9910953430440875e-05, "logits/chosen": -8.219632148742676, "logits/rejected": -8.202381134033203, "logps/chosen": -0.2641756534576416, "logps/rejected": -112.44730377197266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1239209175109863, "rewards/margins": 11.152873992919922, "rewards/rejected": -8.028951644897461, "step": 1091 }, { "epoch": 0.7465390531533072, "grad_norm": 0.0008912445046007633, "learning_rate": 4.9908986804824795e-05, "logits/chosen": -8.764409065246582, "logits/rejected": -8.749605178833008, "logps/chosen": -4.317605972290039, "logps/rejected": -110.71070098876953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.783872604370117, "rewards/margins": 10.664854049682617, "rewards/rejected": -7.8809814453125, "step": 1092 }, { "epoch": 0.7472226969748761, "grad_norm": 0.0006586030358448625, "learning_rate": 4.9906998738542334e-05, "logits/chosen": -8.915726661682129, "logits/rejected": -8.896056175231934, "logps/chosen": -0.3263954222202301, "logps/rejected": -111.55603790283203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1408746242523193, "rewards/margins": 11.128137588500977, "rewards/rejected": -7.987262725830078, "step": 1093 }, { "epoch": 0.747906340796445, "grad_norm": 0.0008946466259658337, "learning_rate": 4.990498923330493e-05, "logits/chosen": -8.952948570251465, "logits/rejected": -8.933266639709473, "logps/chosen": -6.512459754943848, "logps/rejected": -108.6025161743164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.693567991256714, "rewards/margins": 10.223604202270508, "rewards/rejected": -7.530035972595215, "step": 1094 }, { "epoch": 0.748589984618014, "grad_norm": 0.0006649985443800688, "learning_rate": 4.9902958290842455e-05, "logits/chosen": -8.440377235412598, "logits/rejected": -8.425219535827637, "logps/chosen": -4.331564903259277, "logps/rejected": -109.18923950195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7676901817321777, "rewards/margins": 10.448774337768555, "rewards/rejected": -7.681084156036377, "step": 1095 }, { "epoch": 0.749273628439583, "grad_norm": 0.0007540961960330606, "learning_rate": 4.9900905912903246e-05, "logits/chosen": -8.647823333740234, "logits/rejected": -8.628308296203613, "logps/chosen": -2.652848958969116, "logps/rejected": -112.22649383544922, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.01216983795166, "rewards/margins": 10.930652618408203, "rewards/rejected": -7.918481826782227, "step": 1096 }, { "epoch": 0.7499572722611519, "grad_norm": 0.0011345212114974856, "learning_rate": 4.9898832101254066e-05, "logits/chosen": -8.503396987915039, "logits/rejected": -8.488027572631836, "logps/chosen": -2.2438771724700928, "logps/rejected": -110.56097412109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.931816577911377, "rewards/margins": 10.827312469482422, "rewards/rejected": -7.895496368408203, "step": 1097 }, { "epoch": 0.7506409160827209, "grad_norm": 0.0009600699995644391, "learning_rate": 4.989673685768016e-05, "logits/chosen": -8.289313316345215, "logits/rejected": -8.267428398132324, "logps/chosen": -0.4828759431838989, "logps/rejected": -111.16236877441406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1738758087158203, "rewards/margins": 11.121540069580078, "rewards/rejected": -7.9476637840271, "step": 1098 }, { "epoch": 0.7513245599042898, "grad_norm": 0.0009384243749082088, "learning_rate": 4.989462018398522e-05, "logits/chosen": -8.67453384399414, "logits/rejected": -8.656204223632812, "logps/chosen": -5.173384189605713, "logps/rejected": -110.30097961425781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6690797805786133, "rewards/margins": 10.407683372497559, "rewards/rejected": -7.7386040687561035, "step": 1099 }, { "epoch": 0.7520082037258589, "grad_norm": 0.0008417131030000746, "learning_rate": 4.989248208199137e-05, "logits/chosen": -8.775930404663086, "logits/rejected": -8.759456634521484, "logps/chosen": -3.8637099266052246, "logps/rejected": -109.79894256591797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8501086235046387, "rewards/margins": 10.571687698364258, "rewards/rejected": -7.721579551696777, "step": 1100 }, { "epoch": 0.7526918475474278, "grad_norm": 0.0024971188977360725, "learning_rate": 4.989032255353918e-05, "logits/chosen": -8.342081069946289, "logits/rejected": -8.32596206665039, "logps/chosen": -1.0962904691696167, "logps/rejected": -111.55238342285156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0332775115966797, "rewards/margins": 10.980896949768066, "rewards/rejected": -7.9476189613342285, "step": 1101 }, { "epoch": 0.7533754913689967, "grad_norm": 0.0008366075926460326, "learning_rate": 4.9888141600487684e-05, "logits/chosen": -7.860661029815674, "logits/rejected": -7.845659255981445, "logps/chosen": -4.510565757751465, "logps/rejected": -110.29314422607422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.822394609451294, "rewards/margins": 10.547682762145996, "rewards/rejected": -7.725288391113281, "step": 1102 }, { "epoch": 0.7540591351905657, "grad_norm": 0.0012889301870018244, "learning_rate": 4.988593922471435e-05, "logits/chosen": -8.49436092376709, "logits/rejected": -8.480432510375977, "logps/chosen": -5.729500770568848, "logps/rejected": -109.67172241210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.706547975540161, "rewards/margins": 10.2684326171875, "rewards/rejected": -7.561884880065918, "step": 1103 }, { "epoch": 0.7547427790121347, "grad_norm": 0.06234162300825119, "learning_rate": 4.988371542811507e-05, "logits/chosen": -8.268550872802734, "logits/rejected": -8.252655029296875, "logps/chosen": -8.286149024963379, "logps/rejected": -106.79644012451172, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.533794403076172, "rewards/margins": 9.820334434509277, "rewards/rejected": -7.286539554595947, "step": 1104 }, { "epoch": 0.7554264228337036, "grad_norm": 0.001607548794709146, "learning_rate": 4.988147021260421e-05, "logits/chosen": -8.284802436828613, "logits/rejected": -8.264469146728516, "logps/chosen": -2.113895893096924, "logps/rejected": -110.37977600097656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.981693983078003, "rewards/margins": 10.831103324890137, "rewards/rejected": -7.849409103393555, "step": 1105 }, { "epoch": 0.7561100666552726, "grad_norm": 0.0012789819156751037, "learning_rate": 4.9879203580114546e-05, "logits/chosen": -8.641870498657227, "logits/rejected": -8.62443733215332, "logps/chosen": -1.0376436710357666, "logps/rejected": -111.14673614501953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0173490047454834, "rewards/margins": 10.994020462036133, "rewards/rejected": -7.976672172546387, "step": 1106 }, { "epoch": 0.7567937104768415, "grad_norm": 0.0008677614387124777, "learning_rate": 4.98769155325973e-05, "logits/chosen": -8.001526832580566, "logits/rejected": -7.984664440155029, "logps/chosen": -1.864880084991455, "logps/rejected": -110.85025787353516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.064345121383667, "rewards/margins": 10.902323722839355, "rewards/rejected": -7.837978839874268, "step": 1107 }, { "epoch": 0.7574773542984106, "grad_norm": 0.0009043680038303137, "learning_rate": 4.987460607202214e-05, "logits/chosen": -8.40762710571289, "logits/rejected": -8.393381118774414, "logps/chosen": -4.1958770751953125, "logps/rejected": -109.91543579101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.798783540725708, "rewards/margins": 10.566311836242676, "rewards/rejected": -7.767528533935547, "step": 1108 }, { "epoch": 0.7581609981199795, "grad_norm": 0.0008095451630651951, "learning_rate": 4.987227520037715e-05, "logits/chosen": -7.946384429931641, "logits/rejected": -7.9336771965026855, "logps/chosen": -3.4383020401000977, "logps/rejected": -109.51334381103516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8204965591430664, "rewards/margins": 10.580801010131836, "rewards/rejected": -7.7603044509887695, "step": 1109 }, { "epoch": 0.7588446419415484, "grad_norm": 0.06325415521860123, "learning_rate": 4.986992291966886e-05, "logits/chosen": -7.861595630645752, "logits/rejected": -7.845653533935547, "logps/chosen": -3.9762911796569824, "logps/rejected": -108.93370056152344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8015105724334717, "rewards/margins": 10.458662033081055, "rewards/rejected": -7.65715217590332, "step": 1110 }, { "epoch": 0.7595282857631174, "grad_norm": 0.0010540602961555123, "learning_rate": 4.98675492319222e-05, "logits/chosen": -8.194446563720703, "logits/rejected": -8.178677558898926, "logps/chosen": -2.46144437789917, "logps/rejected": -109.52543640136719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9897098541259766, "rewards/margins": 10.732410430908203, "rewards/rejected": -7.74269962310791, "step": 1111 }, { "epoch": 0.7602119295846864, "grad_norm": 0.0009731943719089031, "learning_rate": 4.986515413918058e-05, "logits/chosen": -7.833192825317383, "logits/rejected": -7.819314956665039, "logps/chosen": -4.617852210998535, "logps/rejected": -107.68673706054688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.711876153945923, "rewards/margins": 10.283781051635742, "rewards/rejected": -7.571904182434082, "step": 1112 }, { "epoch": 0.7608955734062554, "grad_norm": 0.0010043010115623474, "learning_rate": 4.986273764350579e-05, "logits/chosen": -8.091856002807617, "logits/rejected": -8.076786041259766, "logps/chosen": -5.698742866516113, "logps/rejected": -109.77238464355469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.632349967956543, "rewards/margins": 10.30627727508545, "rewards/rejected": -7.673927307128906, "step": 1113 }, { "epoch": 0.7615792172278243, "grad_norm": 0.000646304979454726, "learning_rate": 4.986029974697808e-05, "logits/chosen": -8.138237953186035, "logits/rejected": -8.123113632202148, "logps/chosen": -5.643448829650879, "logps/rejected": -109.25634765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7099688053131104, "rewards/margins": 10.319506645202637, "rewards/rejected": -7.6095380783081055, "step": 1114 }, { "epoch": 0.7622628610493932, "grad_norm": 0.0009015487739816308, "learning_rate": 4.9857840451696086e-05, "logits/chosen": -8.201376914978027, "logits/rejected": -8.179616928100586, "logps/chosen": -1.6589386463165283, "logps/rejected": -110.96340942382812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0914766788482666, "rewards/margins": 10.827754974365234, "rewards/rejected": -7.7362775802612305, "step": 1115 }, { "epoch": 0.7629465048709623, "grad_norm": 0.002057295059785247, "learning_rate": 4.985535975977689e-05, "logits/chosen": -8.212663650512695, "logits/rejected": -8.194581985473633, "logps/chosen": -2.2884621620178223, "logps/rejected": -112.10091400146484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0012683868408203, "rewards/margins": 10.905841827392578, "rewards/rejected": -7.904572486877441, "step": 1116 }, { "epoch": 0.7636301486925312, "grad_norm": 0.0007032907451502979, "learning_rate": 4.985285767335599e-05, "logits/chosen": -8.363834381103516, "logits/rejected": -8.346635818481445, "logps/chosen": -3.357649564743042, "logps/rejected": -111.04194641113281, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8416566848754883, "rewards/margins": 10.71354866027832, "rewards/rejected": -7.87189245223999, "step": 1117 }, { "epoch": 0.7643137925141001, "grad_norm": 0.0007930827559903264, "learning_rate": 4.98503341945873e-05, "logits/chosen": -8.072054862976074, "logits/rejected": -8.055465698242188, "logps/chosen": -2.253812551498413, "logps/rejected": -111.240478515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.968961715698242, "rewards/margins": 10.91561508178711, "rewards/rejected": -7.946652412414551, "step": 1118 }, { "epoch": 0.7649974363356691, "grad_norm": 0.0008606919436715543, "learning_rate": 4.984778932564314e-05, "logits/chosen": -8.273467063903809, "logits/rejected": -8.257674217224121, "logps/chosen": -3.955955982208252, "logps/rejected": -108.34262084960938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7492213249206543, "rewards/margins": 10.309911727905273, "rewards/rejected": -7.560689926147461, "step": 1119 }, { "epoch": 0.7656810801572381, "grad_norm": 0.0005963958683423698, "learning_rate": 4.984522306871427e-05, "logits/chosen": -8.186838150024414, "logits/rejected": -8.163002014160156, "logps/chosen": -1.3508577346801758, "logps/rejected": -110.69395446777344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.104843854904175, "rewards/margins": 10.918684005737305, "rewards/rejected": -7.813840389251709, "step": 1120 }, { "epoch": 0.7663647239788071, "grad_norm": 0.0015214603627100587, "learning_rate": 4.984263542600982e-05, "logits/chosen": -8.769425392150879, "logits/rejected": -8.75391960144043, "logps/chosen": -3.8411905765533447, "logps/rejected": -110.69990539550781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.753848075866699, "rewards/margins": 10.586862564086914, "rewards/rejected": -7.833014488220215, "step": 1121 }, { "epoch": 0.767048367800376, "grad_norm": 0.0009111723047681153, "learning_rate": 4.984002639975737e-05, "logits/chosen": -7.8607940673828125, "logits/rejected": -7.8474321365356445, "logps/chosen": -1.7971701622009277, "logps/rejected": -111.72709655761719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8358612060546875, "rewards/margins": 10.90471076965332, "rewards/rejected": -8.068849563598633, "step": 1122 }, { "epoch": 0.7677320116219449, "grad_norm": 0.0010226686717942357, "learning_rate": 4.983739599220289e-05, "logits/chosen": -8.493049621582031, "logits/rejected": -8.474708557128906, "logps/chosen": -5.838546276092529, "logps/rejected": -108.98899841308594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6807191371917725, "rewards/margins": 10.381612777709961, "rewards/rejected": -7.700893402099609, "step": 1123 }, { "epoch": 0.768415655443514, "grad_norm": 0.0009862561710178852, "learning_rate": 4.983474420561075e-05, "logits/chosen": -8.33232593536377, "logits/rejected": -8.315472602844238, "logps/chosen": -0.22191964089870453, "logps/rejected": -112.50145721435547, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.060500144958496, "rewards/margins": 11.1221342086792, "rewards/rejected": -8.061634063720703, "step": 1124 }, { "epoch": 0.7690992992650829, "grad_norm": 0.0005539265694096684, "learning_rate": 4.9832071042263747e-05, "logits/chosen": -7.673359394073486, "logits/rejected": -7.657858371734619, "logps/chosen": -4.475478172302246, "logps/rejected": -109.77275085449219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.833014965057373, "rewards/margins": 10.472681045532227, "rewards/rejected": -7.639665603637695, "step": 1125 }, { "epoch": 0.7697829430866518, "grad_norm": 0.0008219249430112541, "learning_rate": 4.9829376504463044e-05, "logits/chosen": -8.40607738494873, "logits/rejected": -8.381452560424805, "logps/chosen": -0.15695549547672272, "logps/rejected": -112.76094055175781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.242457866668701, "rewards/margins": 11.235610961914062, "rewards/rejected": -7.993153095245361, "step": 1126 }, { "epoch": 0.7704665869082208, "grad_norm": 0.000977231771685183, "learning_rate": 4.982666059452824e-05, "logits/chosen": -8.124101638793945, "logits/rejected": -8.104970932006836, "logps/chosen": -2.238191843032837, "logps/rejected": -110.93399047851562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0070455074310303, "rewards/margins": 10.8822603225708, "rewards/rejected": -7.875214576721191, "step": 1127 }, { "epoch": 0.7711502307297898, "grad_norm": 0.000936844851821661, "learning_rate": 4.9823923314797313e-05, "logits/chosen": -8.09340763092041, "logits/rejected": -8.075565338134766, "logps/chosen": -2.664750576019287, "logps/rejected": -111.18820190429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8934969902038574, "rewards/margins": 10.85087776184082, "rewards/rejected": -7.9573798179626465, "step": 1128 }, { "epoch": 0.7718338745513588, "grad_norm": 0.0007874249713495374, "learning_rate": 4.982116466762663e-05, "logits/chosen": -8.266018867492676, "logits/rejected": -8.23678207397461, "logps/chosen": -0.1552121490240097, "logps/rejected": -113.33843994140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.170970916748047, "rewards/margins": 11.269043922424316, "rewards/rejected": -8.098072052001953, "step": 1129 }, { "epoch": 0.7725175183729277, "grad_norm": 0.000703653262462467, "learning_rate": 4.9818384655390984e-05, "logits/chosen": -7.652120113372803, "logits/rejected": -7.630074977874756, "logps/chosen": -2.0327935218811035, "logps/rejected": -111.21971130371094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0133132934570312, "rewards/margins": 10.885920524597168, "rewards/rejected": -7.872607231140137, "step": 1130 }, { "epoch": 0.7732011621944966, "grad_norm": 0.0008985501481220126, "learning_rate": 4.981558328048353e-05, "logits/chosen": -8.11571216583252, "logits/rejected": -8.095705032348633, "logps/chosen": -2.7318227291107178, "logps/rejected": -111.28717041015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0292227268218994, "rewards/margins": 10.883317947387695, "rewards/rejected": -7.854095458984375, "step": 1131 }, { "epoch": 0.7738848060160657, "grad_norm": 0.0009583805804140866, "learning_rate": 4.981276054531581e-05, "logits/chosen": -8.917720794677734, "logits/rejected": -8.900873184204102, "logps/chosen": -2.585416316986084, "logps/rejected": -112.55889892578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8400611877441406, "rewards/margins": 10.87707233428955, "rewards/rejected": -8.037010192871094, "step": 1132 }, { "epoch": 0.7745684498376346, "grad_norm": 0.0008120889542624354, "learning_rate": 4.9809916452317785e-05, "logits/chosen": -8.526808738708496, "logits/rejected": -8.509998321533203, "logps/chosen": -0.7538440823554993, "logps/rejected": -112.00530242919922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0685651302337646, "rewards/margins": 11.125799179077148, "rewards/rejected": -8.057233810424805, "step": 1133 }, { "epoch": 0.7752520936592036, "grad_norm": 0.0008490746840834618, "learning_rate": 4.9807051003937774e-05, "logits/chosen": -7.678044319152832, "logits/rejected": -7.6623759269714355, "logps/chosen": -4.129398345947266, "logps/rejected": -110.98995971679688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.824812650680542, "rewards/margins": 10.66199016571045, "rewards/rejected": -7.837177276611328, "step": 1134 }, { "epoch": 0.7759357374807725, "grad_norm": 0.0007823914638720453, "learning_rate": 4.980416420264249e-05, "logits/chosen": -8.162683486938477, "logits/rejected": -8.138425827026367, "logps/chosen": -2.589164972305298, "logps/rejected": -110.44801330566406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9441754817962646, "rewards/margins": 10.740421295166016, "rewards/rejected": -7.79624605178833, "step": 1135 }, { "epoch": 0.7766193813023414, "grad_norm": 0.0007195056532509625, "learning_rate": 4.980125605091703e-05, "logits/chosen": -8.228278160095215, "logits/rejected": -8.20611572265625, "logps/chosen": -3.5705199241638184, "logps/rejected": -110.72245025634766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.923419952392578, "rewards/margins": 10.714325904846191, "rewards/rejected": -7.790904998779297, "step": 1136 }, { "epoch": 0.7773030251239105, "grad_norm": 0.0008420677622780204, "learning_rate": 4.9798326551264856e-05, "logits/chosen": -7.853780746459961, "logits/rejected": -7.835509300231934, "logps/chosen": -2.036804437637329, "logps/rejected": -112.23832702636719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0575900077819824, "rewards/margins": 10.949307441711426, "rewards/rejected": -7.891717910766602, "step": 1137 }, { "epoch": 0.7779866689454794, "grad_norm": 0.0007733848178759217, "learning_rate": 4.9795375706207824e-05, "logits/chosen": -8.388442039489746, "logits/rejected": -8.372811317443848, "logps/chosen": -1.7056748867034912, "logps/rejected": -112.212158203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9907779693603516, "rewards/margins": 10.950427055358887, "rewards/rejected": -7.959648132324219, "step": 1138 }, { "epoch": 0.7786703127670483, "grad_norm": 0.0006201796350069344, "learning_rate": 4.979240351828617e-05, "logits/chosen": -8.121231079101562, "logits/rejected": -8.104154586791992, "logps/chosen": -1.9579521417617798, "logps/rejected": -112.28550720214844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.021225929260254, "rewards/margins": 11.01926040649414, "rewards/rejected": -7.998034477233887, "step": 1139 }, { "epoch": 0.7793539565886173, "grad_norm": 0.0008645818452350795, "learning_rate": 4.978940999005847e-05, "logits/chosen": -8.2587251663208, "logits/rejected": -8.240399360656738, "logps/chosen": -7.156076908111572, "logps/rejected": -108.23271179199219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.664383888244629, "rewards/margins": 10.09029769897461, "rewards/rejected": -7.4259138107299805, "step": 1140 }, { "epoch": 0.7800376004101863, "grad_norm": 0.0009732017060741782, "learning_rate": 4.978639512410172e-05, "logits/chosen": -7.424717903137207, "logits/rejected": -7.408544063568115, "logps/chosen": -4.780263423919678, "logps/rejected": -109.53777313232422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.860118865966797, "rewards/margins": 10.434871673583984, "rewards/rejected": -7.5747528076171875, "step": 1141 }, { "epoch": 0.7807212442317553, "grad_norm": 0.0007890145643614233, "learning_rate": 4.9783358923011234e-05, "logits/chosen": -8.393688201904297, "logits/rejected": -8.376121520996094, "logps/chosen": -3.9386489391326904, "logps/rejected": -108.96644592285156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8146188259124756, "rewards/margins": 10.481368064880371, "rewards/rejected": -7.666748046875, "step": 1142 }, { "epoch": 0.7814048880533242, "grad_norm": 0.0008266038494184613, "learning_rate": 4.978030138940072e-05, "logits/chosen": -8.515583992004395, "logits/rejected": -8.496322631835938, "logps/chosen": -2.03474497795105, "logps/rejected": -111.48505401611328, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9707751274108887, "rewards/margins": 10.936766624450684, "rewards/rejected": -7.965991497039795, "step": 1143 }, { "epoch": 0.7820885318748931, "grad_norm": 0.0007406659424304962, "learning_rate": 4.9777222525902254e-05, "logits/chosen": -8.993855476379395, "logits/rejected": -8.977298736572266, "logps/chosen": -2.3172898292541504, "logps/rejected": -112.20682525634766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8159022331237793, "rewards/margins": 10.885937690734863, "rewards/rejected": -8.070035934448242, "step": 1144 }, { "epoch": 0.7827721756964622, "grad_norm": 0.0008695184951648116, "learning_rate": 4.9774122335166264e-05, "logits/chosen": -8.432995796203613, "logits/rejected": -8.416881561279297, "logps/chosen": -4.035462379455566, "logps/rejected": -109.66079711914062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.825047016143799, "rewards/margins": 10.531806945800781, "rewards/rejected": -7.706759929656982, "step": 1145 }, { "epoch": 0.7834558195180311, "grad_norm": 0.0006697330973111093, "learning_rate": 4.9771000819861535e-05, "logits/chosen": -8.174334526062012, "logits/rejected": -8.157449722290039, "logps/chosen": -3.0721561908721924, "logps/rejected": -110.70831298828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.824315071105957, "rewards/margins": 10.68940258026123, "rewards/rejected": -7.865087509155273, "step": 1146 }, { "epoch": 0.7841394633396, "grad_norm": 0.0011097349924966693, "learning_rate": 4.976785798267521e-05, "logits/chosen": -7.886288642883301, "logits/rejected": -7.866001129150391, "logps/chosen": -2.5425357818603516, "logps/rejected": -113.00135803222656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.067657947540283, "rewards/margins": 11.022836685180664, "rewards/rejected": -7.955178260803223, "step": 1147 }, { "epoch": 0.784823107161169, "grad_norm": 0.0008254668791778386, "learning_rate": 4.9764693826312796e-05, "logits/chosen": -8.62434196472168, "logits/rejected": -8.606908798217773, "logps/chosen": -3.80734920501709, "logps/rejected": -111.935302734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.825289726257324, "rewards/margins": 10.798395156860352, "rewards/rejected": -7.973104953765869, "step": 1148 }, { "epoch": 0.785506750982738, "grad_norm": 0.0006204472738318145, "learning_rate": 4.976150835349815e-05, "logits/chosen": -7.74623441696167, "logits/rejected": -7.7296929359436035, "logps/chosen": -3.304922103881836, "logps/rejected": -110.98089599609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8581769466400146, "rewards/margins": 10.762269020080566, "rewards/rejected": -7.904091835021973, "step": 1149 }, { "epoch": 0.786190394804307, "grad_norm": 0.0009248463902622461, "learning_rate": 4.975830156697346e-05, "logits/chosen": -8.433709144592285, "logits/rejected": -8.418211936950684, "logps/chosen": -0.7256656885147095, "logps/rejected": -112.44216918945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0035560131073, "rewards/margins": 11.106456756591797, "rewards/rejected": -8.102901458740234, "step": 1150 }, { "epoch": 0.7868740386258759, "grad_norm": 0.0005183502216823399, "learning_rate": 4.97550734694993e-05, "logits/chosen": -8.687105178833008, "logits/rejected": -8.669321060180664, "logps/chosen": -5.5868377685546875, "logps/rejected": -109.17034912109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7315704822540283, "rewards/margins": 10.330766677856445, "rewards/rejected": -7.599195957183838, "step": 1151 }, { "epoch": 0.7875576824474448, "grad_norm": 0.000678028620313853, "learning_rate": 4.975182406385454e-05, "logits/chosen": -8.249621391296387, "logits/rejected": -8.232645034790039, "logps/chosen": -3.533501148223877, "logps/rejected": -111.04241943359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8186001777648926, "rewards/margins": 10.7149658203125, "rewards/rejected": -7.896365165710449, "step": 1152 }, { "epoch": 0.7882413262690139, "grad_norm": 0.0007444715593010187, "learning_rate": 4.9748553352836444e-05, "logits/chosen": -8.016989707946777, "logits/rejected": -7.998147010803223, "logps/chosen": -2.12209415435791, "logps/rejected": -112.15650177001953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.048055648803711, "rewards/margins": 10.980823516845703, "rewards/rejected": -7.932766914367676, "step": 1153 }, { "epoch": 0.7889249700905828, "grad_norm": 0.00228900252841413, "learning_rate": 4.974526133926059e-05, "logits/chosen": -8.53437328338623, "logits/rejected": -8.509421348571777, "logps/chosen": -0.19766849279403687, "logps/rejected": -112.48921966552734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.2184996604919434, "rewards/margins": 11.303323745727539, "rewards/rejected": -8.084823608398438, "step": 1154 }, { "epoch": 0.7896086139121518, "grad_norm": 0.0006752643384970725, "learning_rate": 4.974194802596087e-05, "logits/chosen": -7.700982570648193, "logits/rejected": -7.683475494384766, "logps/chosen": -1.686144471168518, "logps/rejected": -112.17194366455078, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.055119037628174, "rewards/margins": 10.991292953491211, "rewards/rejected": -7.936174392700195, "step": 1155 }, { "epoch": 0.7902922577337207, "grad_norm": 0.0007366701029241085, "learning_rate": 4.9738613415789576e-05, "logits/chosen": -8.48277473449707, "logits/rejected": -8.462072372436523, "logps/chosen": -0.17297601699829102, "logps/rejected": -113.38130950927734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1265573501586914, "rewards/margins": 11.318087577819824, "rewards/rejected": -8.191530227661133, "step": 1156 }, { "epoch": 0.7909759015552897, "grad_norm": 0.0005952909123152494, "learning_rate": 4.973525751161728e-05, "logits/chosen": -8.299115180969238, "logits/rejected": -8.282774925231934, "logps/chosen": -2.1138248443603516, "logps/rejected": -112.17893981933594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.912384510040283, "rewards/margins": 10.88991928100586, "rewards/rejected": -7.977535247802734, "step": 1157 }, { "epoch": 0.7916595453768587, "grad_norm": 0.0008460725075565279, "learning_rate": 4.973188031633289e-05, "logits/chosen": -7.929772853851318, "logits/rejected": -7.911904335021973, "logps/chosen": -0.15589533746242523, "logps/rejected": -113.56359100341797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0819993019104004, "rewards/margins": 11.243892669677734, "rewards/rejected": -8.161893844604492, "step": 1158 }, { "epoch": 0.7923431891984276, "grad_norm": 0.0011136061511933804, "learning_rate": 4.9728481832843666e-05, "logits/chosen": -8.24428653717041, "logits/rejected": -8.225341796875, "logps/chosen": -2.208932638168335, "logps/rejected": -112.45963287353516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9510762691497803, "rewards/margins": 10.983684539794922, "rewards/rejected": -8.032608032226562, "step": 1159 }, { "epoch": 0.7930268330199965, "grad_norm": 0.0007100799703039229, "learning_rate": 4.972506206407518e-05, "logits/chosen": -8.091946601867676, "logits/rejected": -8.070428848266602, "logps/chosen": -4.123018264770508, "logps/rejected": -111.5582275390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9229745864868164, "rewards/margins": 10.69458293914795, "rewards/rejected": -7.771608352661133, "step": 1160 }, { "epoch": 0.7937104768415656, "grad_norm": 0.0006396164535544813, "learning_rate": 4.972162101297133e-05, "logits/chosen": -7.561365604400635, "logits/rejected": -7.546293258666992, "logps/chosen": -4.885831832885742, "logps/rejected": -109.2830581665039, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7600271701812744, "rewards/margins": 10.323387145996094, "rewards/rejected": -7.563360214233398, "step": 1161 }, { "epoch": 0.7943941206631345, "grad_norm": 0.0006525259814225137, "learning_rate": 4.971815868249434e-05, "logits/chosen": -8.288846969604492, "logits/rejected": -8.272186279296875, "logps/chosen": -5.456273555755615, "logps/rejected": -109.21163177490234, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5922584533691406, "rewards/margins": 10.362360000610352, "rewards/rejected": -7.770101547241211, "step": 1162 }, { "epoch": 0.7950777644847035, "grad_norm": 0.0005142599111422896, "learning_rate": 4.971467507562472e-05, "logits/chosen": -7.899874687194824, "logits/rejected": -7.882450580596924, "logps/chosen": -2.5942084789276123, "logps/rejected": -113.37181091308594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.02799391746521, "rewards/margins": 11.054428100585938, "rewards/rejected": -8.026433944702148, "step": 1163 }, { "epoch": 0.7957614083062724, "grad_norm": 0.0006117954035289586, "learning_rate": 4.971117019536134e-05, "logits/chosen": -8.631607055664062, "logits/rejected": -8.614999771118164, "logps/chosen": -5.569869041442871, "logps/rejected": -109.91822052001953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.574782133102417, "rewards/margins": 10.354572296142578, "rewards/rejected": -7.779789924621582, "step": 1164 }, { "epoch": 0.7964450521278414, "grad_norm": 0.0007078397320583463, "learning_rate": 4.9707644044721366e-05, "logits/chosen": -7.9418535232543945, "logits/rejected": -7.9217529296875, "logps/chosen": -2.0668206214904785, "logps/rejected": -112.67436218261719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1187806129455566, "rewards/margins": 11.015024185180664, "rewards/rejected": -7.896243095397949, "step": 1165 }, { "epoch": 0.7971286959494104, "grad_norm": 0.0008061560802161694, "learning_rate": 4.970409662674027e-05, "logits/chosen": -8.99390983581543, "logits/rejected": -8.973089218139648, "logps/chosen": -0.15035903453826904, "logps/rejected": -113.45668029785156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.154521942138672, "rewards/margins": 11.3900146484375, "rewards/rejected": -8.235493659973145, "step": 1166 }, { "epoch": 0.7978123397709793, "grad_norm": 0.0008591293590143323, "learning_rate": 4.970052794447184e-05, "logits/chosen": -8.171126365661621, "logits/rejected": -8.156059265136719, "logps/chosen": -3.2015957832336426, "logps/rejected": -111.12818908691406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8398773670196533, "rewards/margins": 10.720585823059082, "rewards/rejected": -7.880708694458008, "step": 1167 }, { "epoch": 0.7984959835925483, "grad_norm": 0.0005209256778471172, "learning_rate": 4.969693800098815e-05, "logits/chosen": -7.917677879333496, "logits/rejected": -7.9025349617004395, "logps/chosen": -4.13895320892334, "logps/rejected": -110.57864379882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8772144317626953, "rewards/margins": 10.655548095703125, "rewards/rejected": -7.77833366394043, "step": 1168 }, { "epoch": 0.7991796274141173, "grad_norm": 0.0006312341429293156, "learning_rate": 4.96933267993796e-05, "logits/chosen": -8.745597839355469, "logits/rejected": -8.724357604980469, "logps/chosen": -2.1994881629943848, "logps/rejected": -112.45410919189453, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.006833553314209, "rewards/margins": 11.047283172607422, "rewards/rejected": -8.040450096130371, "step": 1169 }, { "epoch": 0.7998632712356862, "grad_norm": 0.0006404270534403622, "learning_rate": 4.968969434275488e-05, "logits/chosen": -7.5829854011535645, "logits/rejected": -7.567063331604004, "logps/chosen": -5.151514053344727, "logps/rejected": -109.82503509521484, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7232589721679688, "rewards/margins": 10.426229476928711, "rewards/rejected": -7.702970504760742, "step": 1170 }, { "epoch": 0.8005469150572552, "grad_norm": 0.0007159658125601709, "learning_rate": 4.968604063424098e-05, "logits/chosen": -7.703643798828125, "logits/rejected": -7.687089920043945, "logps/chosen": -1.867180347442627, "logps/rejected": -113.11154174804688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.015237331390381, "rewards/margins": 11.033163070678711, "rewards/rejected": -8.017925262451172, "step": 1171 }, { "epoch": 0.8012305588788241, "grad_norm": 0.0007472810102626681, "learning_rate": 4.9682365676983183e-05, "logits/chosen": -7.59987735748291, "logits/rejected": -7.580866813659668, "logps/chosen": -4.790372371673584, "logps/rejected": -110.72189331054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.789355993270874, "rewards/margins": 10.594834327697754, "rewards/rejected": -7.805478096008301, "step": 1172 }, { "epoch": 0.801914202700393, "grad_norm": 0.0010271670762449503, "learning_rate": 4.967866947414507e-05, "logits/chosen": -8.500855445861816, "logits/rejected": -8.47948169708252, "logps/chosen": -5.254508018493652, "logps/rejected": -110.04610443115234, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.737743854522705, "rewards/margins": 10.410760879516602, "rewards/rejected": -7.673016548156738, "step": 1173 }, { "epoch": 0.8025978465219621, "grad_norm": 0.0006271208403632045, "learning_rate": 4.967495202890848e-05, "logits/chosen": -8.245676040649414, "logits/rejected": -8.226816177368164, "logps/chosen": -1.915022850036621, "logps/rejected": -113.04977416992188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.031388282775879, "rewards/margins": 11.064082145690918, "rewards/rejected": -8.032693862915039, "step": 1174 }, { "epoch": 0.803281490343531, "grad_norm": 0.0006919695297256112, "learning_rate": 4.967121334447359e-05, "logits/chosen": -8.867534637451172, "logits/rejected": -8.850455284118652, "logps/chosen": -2.125368595123291, "logps/rejected": -111.91034698486328, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8885297775268555, "rewards/margins": 10.93564224243164, "rewards/rejected": -8.047113418579102, "step": 1175 }, { "epoch": 0.8039651341651, "grad_norm": 0.0005768829141743481, "learning_rate": 4.966745342405882e-05, "logits/chosen": -8.287862777709961, "logits/rejected": -8.268888473510742, "logps/chosen": -2.130185127258301, "logps/rejected": -112.86518859863281, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.965620994567871, "rewards/margins": 11.124856948852539, "rewards/rejected": -8.159235000610352, "step": 1176 }, { "epoch": 0.8046487779866689, "grad_norm": 0.0006459085270762444, "learning_rate": 4.96636722709009e-05, "logits/chosen": -7.562222480773926, "logits/rejected": -7.545114994049072, "logps/chosen": -0.15265899896621704, "logps/rejected": -114.10826110839844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.193037509918213, "rewards/margins": 11.383756637573242, "rewards/rejected": -8.190717697143555, "step": 1177 }, { "epoch": 0.8053324218082379, "grad_norm": 0.0005884390557184815, "learning_rate": 4.96598698882548e-05, "logits/chosen": -8.148923873901367, "logits/rejected": -8.130605697631836, "logps/chosen": -0.20624180138111115, "logps/rejected": -113.83255004882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1165199279785156, "rewards/margins": 11.316753387451172, "rewards/rejected": -8.200233459472656, "step": 1178 }, { "epoch": 0.8060160656298069, "grad_norm": 0.0008550174534320831, "learning_rate": 4.965604627939381e-05, "logits/chosen": -8.65376091003418, "logits/rejected": -8.638067245483398, "logps/chosen": -3.4515841007232666, "logps/rejected": -113.2243423461914, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8391761779785156, "rewards/margins": 10.884151458740234, "rewards/rejected": -8.044976234436035, "step": 1179 }, { "epoch": 0.8066997094513758, "grad_norm": 0.0006658603087998927, "learning_rate": 4.965220144760947e-05, "logits/chosen": -8.363338470458984, "logits/rejected": -8.34492015838623, "logps/chosen": -5.099004745483398, "logps/rejected": -110.56986236572266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.728947639465332, "rewards/margins": 10.561285972595215, "rewards/rejected": -7.832338333129883, "step": 1180 }, { "epoch": 0.8073833532729447, "grad_norm": 0.0006947607034817338, "learning_rate": 4.964833539621156e-05, "logits/chosen": -8.81753158569336, "logits/rejected": -8.793227195739746, "logps/chosen": -1.9036459922790527, "logps/rejected": -113.13563537597656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.01253080368042, "rewards/margins": 11.066254615783691, "rewards/rejected": -8.05372428894043, "step": 1181 }, { "epoch": 0.8080669970945138, "grad_norm": 0.0009722586255520582, "learning_rate": 4.964444812852819e-05, "logits/chosen": -7.5344648361206055, "logits/rejected": -7.518010139465332, "logps/chosen": -3.8958075046539307, "logps/rejected": -111.00482940673828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8562705516815186, "rewards/margins": 10.724019050598145, "rewards/rejected": -7.867748260498047, "step": 1182 }, { "epoch": 0.8087506409160827, "grad_norm": 0.0006589463446289301, "learning_rate": 4.96405396479057e-05, "logits/chosen": -9.003989219665527, "logits/rejected": -8.987664222717285, "logps/chosen": -6.448462963104248, "logps/rejected": -110.5608139038086, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.588277816772461, "rewards/margins": 10.441500663757324, "rewards/rejected": -7.853222846984863, "step": 1183 }, { "epoch": 0.8094342847376517, "grad_norm": 0.000571930140722543, "learning_rate": 4.9636609957708674e-05, "logits/chosen": -8.063497543334961, "logits/rejected": -8.045852661132812, "logps/chosen": -0.17331556975841522, "logps/rejected": -114.17969512939453, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.138277292251587, "rewards/margins": 11.287688255310059, "rewards/rejected": -8.149412155151367, "step": 1184 }, { "epoch": 0.8101179285592206, "grad_norm": 0.0007551141898147762, "learning_rate": 4.963265906131999e-05, "logits/chosen": -8.195174217224121, "logits/rejected": -8.17685317993164, "logps/chosen": -1.6589069366455078, "logps/rejected": -112.51323699951172, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0014359951019287, "rewards/margins": 11.009674072265625, "rewards/rejected": -8.008237838745117, "step": 1185 }, { "epoch": 0.8108015723807896, "grad_norm": 0.0007415261934511364, "learning_rate": 4.962868696214075e-05, "logits/chosen": -8.018915176391602, "logits/rejected": -8.001667976379395, "logps/chosen": -3.684481143951416, "logps/rejected": -112.3217544555664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.803988456726074, "rewards/margins": 10.827981948852539, "rewards/rejected": -8.023994445800781, "step": 1186 }, { "epoch": 0.8114852162023586, "grad_norm": 0.0007563703111372888, "learning_rate": 4.962469366359034e-05, "logits/chosen": -7.881796360015869, "logits/rejected": -7.864616394042969, "logps/chosen": -3.314748764038086, "logps/rejected": -111.9588394165039, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8837227821350098, "rewards/margins": 10.768075942993164, "rewards/rejected": -7.8843536376953125, "step": 1187 }, { "epoch": 0.8121688600239275, "grad_norm": 0.0006659762584604323, "learning_rate": 4.962067916910636e-05, "logits/chosen": -8.195154190063477, "logits/rejected": -8.179205894470215, "logps/chosen": -7.3434014320373535, "logps/rejected": -109.7624282836914, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4468700885772705, "rewards/margins": 10.11190414428711, "rewards/rejected": -7.665033340454102, "step": 1188 }, { "epoch": 0.8128525038454965, "grad_norm": 0.0008977177203632891, "learning_rate": 4.961664348214468e-05, "logits/chosen": -8.041084289550781, "logits/rejected": -8.024714469909668, "logps/chosen": -1.7184306383132935, "logps/rejected": -112.4308853149414, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.914452075958252, "rewards/margins": 11.022921562194824, "rewards/rejected": -8.108469009399414, "step": 1189 }, { "epoch": 0.8135361476670655, "grad_norm": 0.0006934909615665674, "learning_rate": 4.9612586606179415e-05, "logits/chosen": -7.520926475524902, "logits/rejected": -7.504404067993164, "logps/chosen": -3.9241089820861816, "logps/rejected": -112.26516723632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.860074520111084, "rewards/margins": 10.748451232910156, "rewards/rejected": -7.8883771896362305, "step": 1190 }, { "epoch": 0.8142197914886344, "grad_norm": 0.0009280996746383607, "learning_rate": 4.96085085447029e-05, "logits/chosen": -8.233652114868164, "logits/rejected": -8.21165657043457, "logps/chosen": -0.14526532590389252, "logps/rejected": -114.19917297363281, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.186519145965576, "rewards/margins": 11.31937026977539, "rewards/rejected": -8.132850646972656, "step": 1191 }, { "epoch": 0.8149034353102034, "grad_norm": 0.0006720155361108482, "learning_rate": 4.9604409301225726e-05, "logits/chosen": -8.625417709350586, "logits/rejected": -8.607072830200195, "logps/chosen": -4.233348369598389, "logps/rejected": -112.04298400878906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7817907333374023, "rewards/margins": 10.692256927490234, "rewards/rejected": -7.910466194152832, "step": 1192 }, { "epoch": 0.8155870791317723, "grad_norm": 0.0005792303709313273, "learning_rate": 4.960028887927673e-05, "logits/chosen": -8.384385108947754, "logits/rejected": -8.359426498413086, "logps/chosen": -3.7849864959716797, "logps/rejected": -111.49121856689453, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.906555414199829, "rewards/margins": 10.774988174438477, "rewards/rejected": -7.868432521820068, "step": 1193 }, { "epoch": 0.8162707229533414, "grad_norm": 0.0006766867591068149, "learning_rate": 4.959614728240293e-05, "logits/chosen": -7.912775993347168, "logits/rejected": -7.89409875869751, "logps/chosen": -0.14338180422782898, "logps/rejected": -114.1563949584961, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0721588134765625, "rewards/margins": 11.350052833557129, "rewards/rejected": -8.277894020080566, "step": 1194 }, { "epoch": 0.8169543667749103, "grad_norm": 0.0008052353514358401, "learning_rate": 4.9591984514169645e-05, "logits/chosen": -7.956886291503906, "logits/rejected": -7.940912246704102, "logps/chosen": -4.691234588623047, "logps/rejected": -110.3521728515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7069501876831055, "rewards/margins": 10.542716026306152, "rewards/rejected": -7.835765838623047, "step": 1195 }, { "epoch": 0.8176380105964792, "grad_norm": 0.0005851072492077947, "learning_rate": 4.958780057816034e-05, "logits/chosen": -8.335789680480957, "logits/rejected": -8.313695907592773, "logps/chosen": -0.835222065448761, "logps/rejected": -113.3221664428711, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0224239826202393, "rewards/margins": 11.181461334228516, "rewards/rejected": -8.159037590026855, "step": 1196 }, { "epoch": 0.8183216544180482, "grad_norm": 0.0005443746340461075, "learning_rate": 4.958359547797677e-05, "logits/chosen": -8.329645156860352, "logits/rejected": -8.311820983886719, "logps/chosen": -4.121917724609375, "logps/rejected": -112.35635375976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8305227756500244, "rewards/margins": 10.865718841552734, "rewards/rejected": -8.035196304321289, "step": 1197 }, { "epoch": 0.8190052982396172, "grad_norm": 0.0009172183345071971, "learning_rate": 4.957936921723888e-05, "logits/chosen": -8.657126426696777, "logits/rejected": -8.637829780578613, "logps/chosen": -6.902716636657715, "logps/rejected": -109.03622436523438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5618438720703125, "rewards/margins": 10.206872940063477, "rewards/rejected": -7.645029067993164, "step": 1198 }, { "epoch": 0.8196889420611861, "grad_norm": 0.000606306828558445, "learning_rate": 4.9575121799584814e-05, "logits/chosen": -8.40029239654541, "logits/rejected": -8.37751579284668, "logps/chosen": -1.9010884761810303, "logps/rejected": -113.1811294555664, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1046972274780273, "rewards/margins": 11.089462280273438, "rewards/rejected": -7.98476505279541, "step": 1199 }, { "epoch": 0.8203725858827551, "grad_norm": 0.0006139544420875609, "learning_rate": 4.957085322867097e-05, "logits/chosen": -7.988683700561523, "logits/rejected": -7.967350482940674, "logps/chosen": -2.449981212615967, "logps/rejected": -113.61734008789062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9695191383361816, "rewards/margins": 11.056432723999023, "rewards/rejected": -8.086913108825684, "step": 1200 }, { "epoch": 0.821056229704324, "grad_norm": 0.0005900460528209805, "learning_rate": 4.956656350817192e-05, "logits/chosen": -7.919556140899658, "logits/rejected": -7.89896297454834, "logps/chosen": -1.833744764328003, "logps/rejected": -113.82748413085938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.952066421508789, "rewards/margins": 11.136879920959473, "rewards/rejected": -8.184812545776367, "step": 1201 }, { "epoch": 0.8217398735258931, "grad_norm": 0.0005902141565456986, "learning_rate": 4.956225264178046e-05, "logits/chosen": -7.707980155944824, "logits/rejected": -7.6842780113220215, "logps/chosen": -0.16057363152503967, "logps/rejected": -114.13323974609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1610031127929688, "rewards/margins": 11.335143089294434, "rewards/rejected": -8.174140930175781, "step": 1202 }, { "epoch": 0.822423517347462, "grad_norm": 0.0006750028114765882, "learning_rate": 4.955792063320758e-05, "logits/chosen": -8.184076309204102, "logits/rejected": -8.16765022277832, "logps/chosen": -2.7446060180664062, "logps/rejected": -112.16067504882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8213424682617188, "rewards/margins": 10.884720802307129, "rewards/rejected": -8.063379287719727, "step": 1203 }, { "epoch": 0.8231071611690309, "grad_norm": 0.0007356255082413554, "learning_rate": 4.955356748618248e-05, "logits/chosen": -8.332717895507812, "logits/rejected": -8.316253662109375, "logps/chosen": -5.4183478355407715, "logps/rejected": -111.35224914550781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.670971393585205, "rewards/margins": 10.592148780822754, "rewards/rejected": -7.921176910400391, "step": 1204 }, { "epoch": 0.8237908049905999, "grad_norm": 0.0005672688712365925, "learning_rate": 4.954919320445255e-05, "logits/chosen": -7.886079788208008, "logits/rejected": -7.866813659667969, "logps/chosen": -3.179332733154297, "logps/rejected": -111.91032409667969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.91412353515625, "rewards/margins": 10.792015075683594, "rewards/rejected": -7.877892017364502, "step": 1205 }, { "epoch": 0.8244744488121689, "grad_norm": 0.000534937484189868, "learning_rate": 4.954479779178338e-05, "logits/chosen": -7.924499034881592, "logits/rejected": -7.904858112335205, "logps/chosen": -2.7668590545654297, "logps/rejected": -113.12846374511719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.889641046524048, "rewards/margins": 10.99250602722168, "rewards/rejected": -8.102865219116211, "step": 1206 }, { "epoch": 0.8251580926337378, "grad_norm": 0.0004764898621942848, "learning_rate": 4.954038125195873e-05, "logits/chosen": -7.823927402496338, "logits/rejected": -7.804122447967529, "logps/chosen": -0.4234998822212219, "logps/rejected": -114.47508239746094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0648584365844727, "rewards/margins": 11.325403213500977, "rewards/rejected": -8.260543823242188, "step": 1207 }, { "epoch": 0.8258417364553068, "grad_norm": 0.000527764146681875, "learning_rate": 4.953594358878059e-05, "logits/chosen": -8.503898620605469, "logits/rejected": -8.484227180480957, "logps/chosen": -1.9270260334014893, "logps/rejected": -113.58489990234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.908782958984375, "rewards/margins": 11.049016952514648, "rewards/rejected": -8.140233993530273, "step": 1208 }, { "epoch": 0.8265253802768757, "grad_norm": 0.0006269430159591138, "learning_rate": 4.953148480606909e-05, "logits/chosen": -8.278435707092285, "logits/rejected": -8.261537551879883, "logps/chosen": -7.217401504516602, "logps/rejected": -110.57716369628906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5672860145568848, "rewards/margins": 10.330687522888184, "rewards/rejected": -7.763401031494141, "step": 1209 }, { "epoch": 0.8272090240984447, "grad_norm": 0.0006939362501725554, "learning_rate": 4.952700490766256e-05, "logits/chosen": -8.732927322387695, "logits/rejected": -8.713960647583008, "logps/chosen": -5.951504707336426, "logps/rejected": -111.20225524902344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6618492603302, "rewards/margins": 10.473443984985352, "rewards/rejected": -7.8115949630737305, "step": 1210 }, { "epoch": 0.8278926679200137, "grad_norm": 0.0024882862344384193, "learning_rate": 4.952250389741751e-05, "logits/chosen": -8.730537414550781, "logits/rejected": -8.707657814025879, "logps/chosen": -0.1632496863603592, "logps/rejected": -114.53629302978516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1380813121795654, "rewards/margins": 11.41801643371582, "rewards/rejected": -8.27993392944336, "step": 1211 }, { "epoch": 0.8285763117415826, "grad_norm": 0.0009700296213850379, "learning_rate": 4.951798177920862e-05, "logits/chosen": -8.124537467956543, "logits/rejected": -8.102188110351562, "logps/chosen": -0.1443794071674347, "logps/rejected": -114.70748138427734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.134425163269043, "rewards/margins": 11.438995361328125, "rewards/rejected": -8.304569244384766, "step": 1212 }, { "epoch": 0.8292599555631516, "grad_norm": 0.0005934564978815615, "learning_rate": 4.951343855692874e-05, "logits/chosen": -8.313222885131836, "logits/rejected": -8.287544250488281, "logps/chosen": -1.721821665763855, "logps/rejected": -113.49004364013672, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0880351066589355, "rewards/margins": 11.158092498779297, "rewards/rejected": -8.070056915283203, "step": 1213 }, { "epoch": 0.8299435993847205, "grad_norm": 0.0006908857612870634, "learning_rate": 4.950887423448887e-05, "logits/chosen": -8.309296607971191, "logits/rejected": -8.289344787597656, "logps/chosen": -4.416295051574707, "logps/rejected": -113.05986785888672, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.713615894317627, "rewards/margins": 10.783537864685059, "rewards/rejected": -8.06992244720459, "step": 1214 }, { "epoch": 0.8306272432062896, "grad_norm": 0.0005752891884185374, "learning_rate": 4.950428881581823e-05, "logits/chosen": -7.565371513366699, "logits/rejected": -7.5486321449279785, "logps/chosen": -3.8558292388916016, "logps/rejected": -111.53903198242188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.917041540145874, "rewards/margins": 10.764856338500977, "rewards/rejected": -7.847814559936523, "step": 1215 }, { "epoch": 0.8313108870278585, "grad_norm": 0.000502847891766578, "learning_rate": 4.9499682304864126e-05, "logits/chosen": -9.139242172241211, "logits/rejected": -9.119821548461914, "logps/chosen": -2.838911294937134, "logps/rejected": -112.69962310791016, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8214311599731445, "rewards/margins": 10.863027572631836, "rewards/rejected": -8.041595458984375, "step": 1216 }, { "epoch": 0.8319945308494274, "grad_norm": 0.0006841435097157955, "learning_rate": 4.949505470559207e-05, "logits/chosen": -7.738075256347656, "logits/rejected": -7.721550941467285, "logps/chosen": -6.142876148223877, "logps/rejected": -110.464111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.652317762374878, "rewards/margins": 10.462002754211426, "rewards/rejected": -7.809685230255127, "step": 1217 }, { "epoch": 0.8326781746709964, "grad_norm": 0.0006065450143069029, "learning_rate": 4.949040602198572e-05, "logits/chosen": -8.4166259765625, "logits/rejected": -8.396690368652344, "logps/chosen": -1.2591123580932617, "logps/rejected": -113.57186889648438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0266146659851074, "rewards/margins": 11.14592170715332, "rewards/rejected": -8.119305610656738, "step": 1218 }, { "epoch": 0.8333618184925654, "grad_norm": 0.0005869042943231761, "learning_rate": 4.948573625804688e-05, "logits/chosen": -8.238158226013184, "logits/rejected": -8.215494155883789, "logps/chosen": -0.18370503187179565, "logps/rejected": -114.90168762207031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.195387125015259, "rewards/margins": 11.472929000854492, "rewards/rejected": -8.277541160583496, "step": 1219 }, { "epoch": 0.8340454623141343, "grad_norm": 0.0006508206715807319, "learning_rate": 4.94810454177955e-05, "logits/chosen": -8.5887451171875, "logits/rejected": -8.569552421569824, "logps/chosen": -1.9886670112609863, "logps/rejected": -114.26014709472656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.932607650756836, "rewards/margins": 11.176216125488281, "rewards/rejected": -8.243608474731445, "step": 1220 }, { "epoch": 0.8347291061357033, "grad_norm": 0.0004677207034546882, "learning_rate": 4.947633350526967e-05, "logits/chosen": -9.103724479675293, "logits/rejected": -9.074614524841309, "logps/chosen": -0.1479129046201706, "logps/rejected": -114.67546081542969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.124541759490967, "rewards/margins": 11.387505531311035, "rewards/rejected": -8.262962341308594, "step": 1221 }, { "epoch": 0.8354127499572722, "grad_norm": 0.0028508699033409357, "learning_rate": 4.947160052452562e-05, "logits/chosen": -8.282388687133789, "logits/rejected": -8.264362335205078, "logps/chosen": -5.256239414215088, "logps/rejected": -110.79890441894531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7335710525512695, "rewards/margins": 10.468779563903809, "rewards/rejected": -7.735208988189697, "step": 1222 }, { "epoch": 0.8360963937788413, "grad_norm": 0.0006470998632721603, "learning_rate": 4.9466846479637744e-05, "logits/chosen": -8.317811012268066, "logits/rejected": -8.291593551635742, "logps/chosen": -1.567407488822937, "logps/rejected": -113.35743713378906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0962796211242676, "rewards/margins": 11.211376190185547, "rewards/rejected": -8.115097045898438, "step": 1223 }, { "epoch": 0.8367800376004102, "grad_norm": 0.0005603982717730105, "learning_rate": 4.946207137469851e-05, "logits/chosen": -7.857441425323486, "logits/rejected": -7.839021682739258, "logps/chosen": -1.2254403829574585, "logps/rejected": -113.2159423828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.027174949645996, "rewards/margins": 11.129188537597656, "rewards/rejected": -8.10201358795166, "step": 1224 }, { "epoch": 0.8374636814219791, "grad_norm": 0.0008305773371830583, "learning_rate": 4.945727521381858e-05, "logits/chosen": -8.50610637664795, "logits/rejected": -8.486761093139648, "logps/chosen": -2.0204079151153564, "logps/rejected": -113.84549713134766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9627180099487305, "rewards/margins": 11.141881942749023, "rewards/rejected": -8.179163932800293, "step": 1225 }, { "epoch": 0.8381473252435481, "grad_norm": 0.0006862754817120731, "learning_rate": 4.94524580011267e-05, "logits/chosen": -8.504505157470703, "logits/rejected": -8.486038208007812, "logps/chosen": -10.60849380493164, "logps/rejected": -106.93086242675781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.176731586456299, "rewards/margins": 9.661847114562988, "rewards/rejected": -7.485116004943848, "step": 1226 }, { "epoch": 0.8388309690651171, "grad_norm": 0.0006323836278170347, "learning_rate": 4.944761974076976e-05, "logits/chosen": -8.47514533996582, "logits/rejected": -8.457709312438965, "logps/chosen": -7.223193168640137, "logps/rejected": -111.13629150390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5199742317199707, "rewards/margins": 10.369916915893555, "rewards/rejected": -7.849943161010742, "step": 1227 }, { "epoch": 0.839514612886686, "grad_norm": 0.0005243188934400678, "learning_rate": 4.9442760436912744e-05, "logits/chosen": -8.007908821105957, "logits/rejected": -7.9920125007629395, "logps/chosen": -3.749858856201172, "logps/rejected": -112.75267791748047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8461971282958984, "rewards/margins": 10.841883659362793, "rewards/rejected": -7.9956865310668945, "step": 1228 }, { "epoch": 0.840198256708255, "grad_norm": 0.0006050104275345802, "learning_rate": 4.943788009373879e-05, "logits/chosen": -8.255731582641602, "logits/rejected": -8.23332691192627, "logps/chosen": -0.1354525089263916, "logps/rejected": -115.15309143066406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1698644161224365, "rewards/margins": 11.428434371948242, "rewards/rejected": -8.258569717407227, "step": 1229 }, { "epoch": 0.8408819005298239, "grad_norm": 0.0005623162724077702, "learning_rate": 4.943297871544911e-05, "logits/chosen": -7.929165840148926, "logits/rejected": -7.911923885345459, "logps/chosen": -6.377062797546387, "logps/rejected": -112.46891784667969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7454447746276855, "rewards/margins": 10.5889253616333, "rewards/rejected": -7.843480110168457, "step": 1230 }, { "epoch": 0.841565544351393, "grad_norm": 0.0009271553135477006, "learning_rate": 4.942805630626303e-05, "logits/chosen": -8.281848907470703, "logits/rejected": -8.259263038635254, "logps/chosen": -1.6858006715774536, "logps/rejected": -113.57749938964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.084831714630127, "rewards/margins": 11.237106323242188, "rewards/rejected": -8.152274131774902, "step": 1231 }, { "epoch": 0.8422491881729619, "grad_norm": 0.0006762244156561792, "learning_rate": 4.942311287041803e-05, "logits/chosen": -8.247283935546875, "logits/rejected": -8.227173805236816, "logps/chosen": -2.3839547634124756, "logps/rejected": -114.05241394042969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9563467502593994, "rewards/margins": 11.04936408996582, "rewards/rejected": -8.093017578125, "step": 1232 }, { "epoch": 0.8429328319945308, "grad_norm": 0.0005558975972235203, "learning_rate": 4.94181484121696e-05, "logits/chosen": -7.7728118896484375, "logits/rejected": -7.753591537475586, "logps/chosen": -2.244175672531128, "logps/rejected": -114.14140319824219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.081512689590454, "rewards/margins": 11.232933044433594, "rewards/rejected": -8.151419639587402, "step": 1233 }, { "epoch": 0.8436164758160998, "grad_norm": 0.0006706966087222099, "learning_rate": 4.94131629357914e-05, "logits/chosen": -8.24308967590332, "logits/rejected": -8.22458553314209, "logps/chosen": -0.167179673910141, "logps/rejected": -114.96202850341797, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1150293350219727, "rewards/margins": 11.343171119689941, "rewards/rejected": -8.228141784667969, "step": 1234 }, { "epoch": 0.8443001196376688, "grad_norm": 0.0005146064795553684, "learning_rate": 4.940815644557517e-05, "logits/chosen": -7.943838119506836, "logits/rejected": -7.925068378448486, "logps/chosen": -0.1642010509967804, "logps/rejected": -114.97342681884766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1248154640197754, "rewards/margins": 11.425371170043945, "rewards/rejected": -8.300555229187012, "step": 1235 }, { "epoch": 0.8449837634592378, "grad_norm": 0.0006163181969895959, "learning_rate": 4.940312894583072e-05, "logits/chosen": -7.660965442657471, "logits/rejected": -7.642329216003418, "logps/chosen": -1.4120845794677734, "logps/rejected": -113.88493347167969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.049956798553467, "rewards/margins": 11.174915313720703, "rewards/rejected": -8.124958038330078, "step": 1236 }, { "epoch": 0.8456674072808067, "grad_norm": 0.0034489573445171118, "learning_rate": 4.9398080440885964e-05, "logits/chosen": -7.801029205322266, "logits/rejected": -7.782156944274902, "logps/chosen": -1.8179246187210083, "logps/rejected": -113.7861557006836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.099850654602051, "rewards/margins": 11.171092987060547, "rewards/rejected": -8.071242332458496, "step": 1237 }, { "epoch": 0.8463510511023756, "grad_norm": 0.0005123725277371705, "learning_rate": 4.9393010935086874e-05, "logits/chosen": -7.814849376678467, "logits/rejected": -7.792697429656982, "logps/chosen": -2.2223246097564697, "logps/rejected": -114.2573013305664, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0654335021972656, "rewards/margins": 11.141687393188477, "rewards/rejected": -8.076253890991211, "step": 1238 }, { "epoch": 0.8470346949239447, "grad_norm": 0.0004518417699728161, "learning_rate": 4.938792043279753e-05, "logits/chosen": -7.908929824829102, "logits/rejected": -7.889340877532959, "logps/chosen": -3.764005661010742, "logps/rejected": -112.38227081298828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8469042778015137, "rewards/margins": 10.815362930297852, "rewards/rejected": -7.96845817565918, "step": 1239 }, { "epoch": 0.8477183387455136, "grad_norm": 0.0006038299761712551, "learning_rate": 4.938280893840008e-05, "logits/chosen": -8.257820129394531, "logits/rejected": -8.238153457641602, "logps/chosen": -7.359813213348389, "logps/rejected": -110.81442260742188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.551547050476074, "rewards/margins": 10.286452293395996, "rewards/rejected": -7.734905242919922, "step": 1240 }, { "epoch": 0.8484019825670825, "grad_norm": 0.0005184194887988269, "learning_rate": 4.9377676456294725e-05, "logits/chosen": -7.397045135498047, "logits/rejected": -7.375854015350342, "logps/chosen": -4.867737770080566, "logps/rejected": -110.78791809082031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7994203567504883, "rewards/margins": 10.62861156463623, "rewards/rejected": -7.829190254211426, "step": 1241 }, { "epoch": 0.8490856263886515, "grad_norm": 0.0006192799191921949, "learning_rate": 4.937252299089975e-05, "logits/chosen": -7.713428020477295, "logits/rejected": -7.693450927734375, "logps/chosen": -3.7323148250579834, "logps/rejected": -113.17221069335938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8987174034118652, "rewards/margins": 10.88562297821045, "rewards/rejected": -7.986905574798584, "step": 1242 }, { "epoch": 0.8497692702102205, "grad_norm": 0.0005109641933813691, "learning_rate": 4.9367348546651485e-05, "logits/chosen": -8.70235538482666, "logits/rejected": -8.680349349975586, "logps/chosen": -1.7604434490203857, "logps/rejected": -113.55638122558594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0218353271484375, "rewards/margins": 11.108366966247559, "rewards/rejected": -8.086531639099121, "step": 1243 }, { "epoch": 0.8504529140317895, "grad_norm": 0.0004904319648630917, "learning_rate": 4.9362153128004355e-05, "logits/chosen": -8.00931167602539, "logits/rejected": -7.993890762329102, "logps/chosen": -2.208360195159912, "logps/rejected": -113.81145477294922, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.883439064025879, "rewards/margins": 11.04401969909668, "rewards/rejected": -8.1605806350708, "step": 1244 }, { "epoch": 0.8511365578533584, "grad_norm": 0.0005657679866999388, "learning_rate": 4.935693673943081e-05, "logits/chosen": -8.245235443115234, "logits/rejected": -8.22443675994873, "logps/chosen": -2.037851572036743, "logps/rejected": -114.86793518066406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9843921661376953, "rewards/margins": 11.240252494812012, "rewards/rejected": -8.255860328674316, "step": 1245 }, { "epoch": 0.8518202016749273, "grad_norm": 0.000627933070063591, "learning_rate": 4.935169938542136e-05, "logits/chosen": -8.985086441040039, "logits/rejected": -8.965142250061035, "logps/chosen": -2.0929675102233887, "logps/rejected": -112.99497985839844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.919029474258423, "rewards/margins": 11.031039237976074, "rewards/rejected": -8.112009048461914, "step": 1246 }, { "epoch": 0.8525038454964963, "grad_norm": 0.0006092725088819861, "learning_rate": 4.9346441070484564e-05, "logits/chosen": -8.501593589782715, "logits/rejected": -8.481426239013672, "logps/chosen": -7.688446998596191, "logps/rejected": -109.92245483398438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.405057907104492, "rewards/margins": 10.161067962646484, "rewards/rejected": -7.756010055541992, "step": 1247 }, { "epoch": 0.8531874893180653, "grad_norm": 0.0006843569572083652, "learning_rate": 4.934116179914703e-05, "logits/chosen": -7.850571155548096, "logits/rejected": -7.826450347900391, "logps/chosen": -6.170347213745117, "logps/rejected": -112.71502685546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8173346519470215, "rewards/margins": 10.745681762695312, "rewards/rejected": -7.928346633911133, "step": 1248 }, { "epoch": 0.8538711331396343, "grad_norm": 0.0006830402999185026, "learning_rate": 4.9335861575953384e-05, "logits/chosen": -8.44783878326416, "logits/rejected": -8.427347183227539, "logps/chosen": -1.558029055595398, "logps/rejected": -114.02224731445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.992769718170166, "rewards/margins": 11.20981502532959, "rewards/rejected": -8.217044830322266, "step": 1249 }, { "epoch": 0.8545547769612032, "grad_norm": 0.0005006775027140975, "learning_rate": 4.933054040546633e-05, "logits/chosen": -8.424718856811523, "logits/rejected": -8.404313087463379, "logps/chosen": -4.519201755523682, "logps/rejected": -112.16920471191406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.740079402923584, "rewards/margins": 10.718168258666992, "rewards/rejected": -7.97808837890625, "step": 1250 }, { "epoch": 0.8552384207827721, "grad_norm": 0.0005770435673184693, "learning_rate": 4.932519829226656e-05, "logits/chosen": -8.434792518615723, "logits/rejected": -8.408037185668945, "logps/chosen": -0.15337511897087097, "logps/rejected": -115.42408752441406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1387758255004883, "rewards/margins": 11.475265502929688, "rewards/rejected": -8.336490631103516, "step": 1251 }, { "epoch": 0.8559220646043412, "grad_norm": 0.0004740066942758858, "learning_rate": 4.9319835240952824e-05, "logits/chosen": -7.84962797164917, "logits/rejected": -7.830591678619385, "logps/chosen": -2.3708317279815674, "logps/rejected": -114.21202087402344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.966187000274658, "rewards/margins": 11.143098831176758, "rewards/rejected": -8.176912307739258, "step": 1252 }, { "epoch": 0.8566057084259101, "grad_norm": 0.0007375037530437112, "learning_rate": 4.9314451256141885e-05, "logits/chosen": -8.780656814575195, "logits/rejected": -8.75809097290039, "logps/chosen": -2.243116617202759, "logps/rejected": -114.52267456054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0700769424438477, "rewards/margins": 11.241584777832031, "rewards/rejected": -8.171506881713867, "step": 1253 }, { "epoch": 0.857289352247479, "grad_norm": 0.0006027886993251741, "learning_rate": 4.930904634246852e-05, "logits/chosen": -8.25257682800293, "logits/rejected": -8.23060131072998, "logps/chosen": -1.9399229288101196, "logps/rejected": -114.55390930175781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0044631958007812, "rewards/margins": 11.277036666870117, "rewards/rejected": -8.27257251739502, "step": 1254 }, { "epoch": 0.857972996069048, "grad_norm": 0.0007093885214999318, "learning_rate": 4.930362050458555e-05, "logits/chosen": -8.232022285461426, "logits/rejected": -8.212961196899414, "logps/chosen": -4.553935527801514, "logps/rejected": -113.70040893554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6914148330688477, "rewards/margins": 10.826236724853516, "rewards/rejected": -8.134820938110352, "step": 1255 }, { "epoch": 0.858656639890617, "grad_norm": 0.00044925312977284193, "learning_rate": 4.9298173747163786e-05, "logits/chosen": -8.02331829071045, "logits/rejected": -8.0014066696167, "logps/chosen": -0.17023539543151855, "logps/rejected": -115.57052612304688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.086923837661743, "rewards/margins": 11.483104705810547, "rewards/rejected": -8.3961820602417, "step": 1256 }, { "epoch": 0.859340283712186, "grad_norm": 0.000610952905844897, "learning_rate": 4.929270607489203e-05, "logits/chosen": -7.869575500488281, "logits/rejected": -7.847542762756348, "logps/chosen": -0.2952064275741577, "logps/rejected": -114.825927734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1162772178649902, "rewards/margins": 11.396347045898438, "rewards/rejected": -8.280069351196289, "step": 1257 }, { "epoch": 0.8600239275337549, "grad_norm": 0.0005752466968260705, "learning_rate": 4.9287217492477124e-05, "logits/chosen": -8.710481643676758, "logits/rejected": -8.687271118164062, "logps/chosen": -7.568804740905762, "logps/rejected": -110.99730682373047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.506165027618408, "rewards/margins": 10.316009521484375, "rewards/rejected": -7.809844970703125, "step": 1258 }, { "epoch": 0.8607075713553238, "grad_norm": 0.0005823310930281878, "learning_rate": 4.9281708004643904e-05, "logits/chosen": -7.737423896789551, "logits/rejected": -7.715079307556152, "logps/chosen": -1.8615514039993286, "logps/rejected": -114.37775421142578, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.056401491165161, "rewards/margins": 11.311664581298828, "rewards/rejected": -8.255263328552246, "step": 1259 }, { "epoch": 0.8613912151768929, "grad_norm": 0.0006001914152875543, "learning_rate": 4.927617761613518e-05, "logits/chosen": -8.773185729980469, "logits/rejected": -8.75091552734375, "logps/chosen": -4.110446929931641, "logps/rejected": -113.21407318115234, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7993979454040527, "rewards/margins": 10.820428848266602, "rewards/rejected": -8.021031379699707, "step": 1260 }, { "epoch": 0.8620748589984618, "grad_norm": 0.0005537347751669586, "learning_rate": 4.927062633171177e-05, "logits/chosen": -8.340046882629395, "logits/rejected": -8.319598197937012, "logps/chosen": -0.1544744372367859, "logps/rejected": -115.40716552734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1240429878234863, "rewards/margins": 11.447400093078613, "rewards/rejected": -8.323357582092285, "step": 1261 }, { "epoch": 0.8627585028200307, "grad_norm": 0.0006767537561245263, "learning_rate": 4.9265054156152494e-05, "logits/chosen": -8.562687873840332, "logits/rejected": -8.540875434875488, "logps/chosen": -0.14749614894390106, "logps/rejected": -115.56515502929688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1663529872894287, "rewards/margins": 11.502697944641113, "rewards/rejected": -8.336345672607422, "step": 1262 }, { "epoch": 0.8634421466415997, "grad_norm": 0.0008105220040306449, "learning_rate": 4.9259461094254125e-05, "logits/chosen": -7.530216217041016, "logits/rejected": -7.505654811859131, "logps/chosen": -1.2680381536483765, "logps/rejected": -114.78236389160156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.2367379665374756, "rewards/margins": 11.430408477783203, "rewards/rejected": -8.193670272827148, "step": 1263 }, { "epoch": 0.8641257904631687, "grad_norm": 0.0008145326864905655, "learning_rate": 4.925384715083143e-05, "logits/chosen": -7.650557041168213, "logits/rejected": -7.627913951873779, "logps/chosen": -3.3353755474090576, "logps/rejected": -114.00942993164062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.91557240486145, "rewards/margins": 11.126551628112793, "rewards/rejected": -8.210979461669922, "step": 1264 }, { "epoch": 0.8648094342847377, "grad_norm": 0.0006569104152731597, "learning_rate": 4.9248212330717163e-05, "logits/chosen": -8.136165618896484, "logits/rejected": -8.115912437438965, "logps/chosen": -2.85697603225708, "logps/rejected": -113.13629150390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.777042865753174, "rewards/margins": 10.932523727416992, "rewards/rejected": -8.155481338500977, "step": 1265 }, { "epoch": 0.8654930781063066, "grad_norm": 0.0005818833597004414, "learning_rate": 4.924255663876204e-05, "logits/chosen": -8.050131797790527, "logits/rejected": -8.03145694732666, "logps/chosen": -6.455282688140869, "logps/rejected": -112.27494812011719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6723084449768066, "rewards/margins": 10.623069763183594, "rewards/rejected": -7.950761795043945, "step": 1266 }, { "epoch": 0.8661767219278755, "grad_norm": 0.000642834638711065, "learning_rate": 4.923688007983474e-05, "logits/chosen": -7.915164470672607, "logits/rejected": -7.894064903259277, "logps/chosen": -1.8995047807693481, "logps/rejected": -114.67353057861328, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9463069438934326, "rewards/margins": 11.239864349365234, "rewards/rejected": -8.293556213378906, "step": 1267 }, { "epoch": 0.8668603657494446, "grad_norm": 0.0005825157859362662, "learning_rate": 4.923118265882191e-05, "logits/chosen": -7.524744987487793, "logits/rejected": -7.5046491622924805, "logps/chosen": -1.949639081954956, "logps/rejected": -114.18929290771484, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.998976707458496, "rewards/margins": 11.135740280151367, "rewards/rejected": -8.136764526367188, "step": 1268 }, { "epoch": 0.8675440095710135, "grad_norm": 0.0007120324298739433, "learning_rate": 4.922546438062815e-05, "logits/chosen": -7.319874286651611, "logits/rejected": -7.295016288757324, "logps/chosen": -6.644918441772461, "logps/rejected": -111.25728607177734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6215553283691406, "rewards/margins": 10.42349624633789, "rewards/rejected": -7.801941394805908, "step": 1269 }, { "epoch": 0.8682276533925825, "grad_norm": 0.0006793127977289259, "learning_rate": 4.921972525017605e-05, "logits/chosen": -7.9449567794799805, "logits/rejected": -7.923887252807617, "logps/chosen": -2.5643470287323, "logps/rejected": -111.99732971191406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.871701240539551, "rewards/margins": 10.914691925048828, "rewards/rejected": -8.042990684509277, "step": 1270 }, { "epoch": 0.8689112972141514, "grad_norm": 0.00042689210386015475, "learning_rate": 4.921396527240608e-05, "logits/chosen": -8.263653755187988, "logits/rejected": -8.242462158203125, "logps/chosen": -4.950646877288818, "logps/rejected": -111.71704864501953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.723341464996338, "rewards/margins": 10.65360164642334, "rewards/rejected": -7.93026065826416, "step": 1271 }, { "epoch": 0.8695949410357204, "grad_norm": 0.000519051740411669, "learning_rate": 4.920818445227672e-05, "logits/chosen": -8.232175827026367, "logits/rejected": -8.210538864135742, "logps/chosen": -1.5887634754180908, "logps/rejected": -114.61331176757812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0574822425842285, "rewards/margins": 11.26270866394043, "rewards/rejected": -8.20522689819336, "step": 1272 }, { "epoch": 0.8702785848572894, "grad_norm": 0.0029769965913146734, "learning_rate": 4.920238279476437e-05, "logits/chosen": -8.49654769897461, "logits/rejected": -8.475912094116211, "logps/chosen": -6.4372100830078125, "logps/rejected": -110.1059799194336, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6486430168151855, "rewards/margins": 10.30994987487793, "rewards/rejected": -7.661306381225586, "step": 1273 }, { "epoch": 0.8709622286788583, "grad_norm": 0.0007115076878108084, "learning_rate": 4.919656030486337e-05, "logits/chosen": -7.773332595825195, "logits/rejected": -7.75448751449585, "logps/chosen": -3.789618492126465, "logps/rejected": -113.966064453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8177804946899414, "rewards/margins": 10.978104591369629, "rewards/rejected": -8.160324096679688, "step": 1274 }, { "epoch": 0.8716458725004272, "grad_norm": 0.0005640623276121914, "learning_rate": 4.919071698758598e-05, "logits/chosen": -8.828556060791016, "logits/rejected": -8.806107521057129, "logps/chosen": -4.297393798828125, "logps/rejected": -113.44781494140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7466607093811035, "rewards/margins": 10.914684295654297, "rewards/rejected": -8.168024063110352, "step": 1275 }, { "epoch": 0.8723295163219963, "grad_norm": 0.00044229888590052724, "learning_rate": 4.9184852847962406e-05, "logits/chosen": -7.677814483642578, "logits/rejected": -7.657397270202637, "logps/chosen": -1.7688368558883667, "logps/rejected": -114.67583465576172, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0517616271972656, "rewards/margins": 11.172613143920898, "rewards/rejected": -8.120851516723633, "step": 1276 }, { "epoch": 0.8730131601435652, "grad_norm": 0.0007314018439501524, "learning_rate": 4.917896789104078e-05, "logits/chosen": -8.41592025756836, "logits/rejected": -8.392648696899414, "logps/chosen": -2.1277458667755127, "logps/rejected": -114.935546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9693143367767334, "rewards/margins": 11.322549819946289, "rewards/rejected": -8.353235244750977, "step": 1277 }, { "epoch": 0.8736968039651342, "grad_norm": 0.0005178714636713266, "learning_rate": 4.9173062121887134e-05, "logits/chosen": -7.954891204833984, "logits/rejected": -7.930927276611328, "logps/chosen": -1.3203095197677612, "logps/rejected": -114.42713165283203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0703930854797363, "rewards/margins": 11.251216888427734, "rewards/rejected": -8.180822372436523, "step": 1278 }, { "epoch": 0.8743804477867031, "grad_norm": 0.0005165988113731146, "learning_rate": 4.9167135545585436e-05, "logits/chosen": -8.357234954833984, "logits/rejected": -8.336231231689453, "logps/chosen": -3.223888874053955, "logps/rejected": -112.3775634765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8101272583007812, "rewards/margins": 10.888920783996582, "rewards/rejected": -8.0787935256958, "step": 1279 }, { "epoch": 0.8750640916082721, "grad_norm": 0.0006691672024317086, "learning_rate": 4.916118816723757e-05, "logits/chosen": -8.375186920166016, "logits/rejected": -8.352052688598633, "logps/chosen": -1.9748567342758179, "logps/rejected": -115.46754455566406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.974888801574707, "rewards/margins": 11.377182006835938, "rewards/rejected": -8.40229320526123, "step": 1280 }, { "epoch": 0.8757477354298411, "grad_norm": 0.0004957020282745361, "learning_rate": 4.91552199919633e-05, "logits/chosen": -7.973511695861816, "logits/rejected": -7.9551100730896, "logps/chosen": -4.907182216644287, "logps/rejected": -112.78334045410156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.735678195953369, "rewards/margins": 10.702685356140137, "rewards/rejected": -7.967006206512451, "step": 1281 }, { "epoch": 0.87643137925141, "grad_norm": 0.0005708396784029901, "learning_rate": 4.914923102490031e-05, "logits/chosen": -8.421977996826172, "logits/rejected": -8.39936351776123, "logps/chosen": -3.3118996620178223, "logps/rejected": -113.8213119506836, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.935520887374878, "rewards/margins": 11.005904197692871, "rewards/rejected": -8.07038402557373, "step": 1282 }, { "epoch": 0.877115023072979, "grad_norm": 0.000559844309464097, "learning_rate": 4.91432212712042e-05, "logits/chosen": -7.972768306732178, "logits/rejected": -7.955721855163574, "logps/chosen": -1.8280442953109741, "logps/rejected": -114.05247497558594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.889925718307495, "rewards/margins": 11.188274383544922, "rewards/rejected": -8.298348426818848, "step": 1283 }, { "epoch": 0.8777986668945479, "grad_norm": 0.0004787096695508808, "learning_rate": 4.913719073604843e-05, "logits/chosen": -8.11536693572998, "logits/rejected": -8.093680381774902, "logps/chosen": -0.1624186486005783, "logps/rejected": -115.79156494140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1460843086242676, "rewards/margins": 11.52728271484375, "rewards/rejected": -8.381197929382324, "step": 1284 }, { "epoch": 0.8784823107161169, "grad_norm": 0.0005403980612754822, "learning_rate": 4.913113942462437e-05, "logits/chosen": -8.659079551696777, "logits/rejected": -8.63176155090332, "logps/chosen": -3.6899008750915527, "logps/rejected": -113.05397033691406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8993444442749023, "rewards/margins": 10.954293251037598, "rewards/rejected": -8.054947853088379, "step": 1285 }, { "epoch": 0.8791659545376859, "grad_norm": 0.0005907126469537616, "learning_rate": 4.912506734214127e-05, "logits/chosen": -8.803993225097656, "logits/rejected": -8.782013893127441, "logps/chosen": -4.093164920806885, "logps/rejected": -114.28517150878906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.796879768371582, "rewards/margins": 10.933103561401367, "rewards/rejected": -8.136224746704102, "step": 1286 }, { "epoch": 0.8798495983592548, "grad_norm": 0.0005575796822085977, "learning_rate": 4.911897449382628e-05, "logits/chosen": -8.017484664916992, "logits/rejected": -7.9947099685668945, "logps/chosen": -5.463737964630127, "logps/rejected": -112.824951171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.732452869415283, "rewards/margins": 10.677231788635254, "rewards/rejected": -7.944778919219971, "step": 1287 }, { "epoch": 0.8805332421808237, "grad_norm": 0.0005994606181047857, "learning_rate": 4.911286088492438e-05, "logits/chosen": -8.39724349975586, "logits/rejected": -8.373395919799805, "logps/chosen": -3.905158519744873, "logps/rejected": -113.92082214355469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8536698818206787, "rewards/margins": 11.009649276733398, "rewards/rejected": -8.15597915649414, "step": 1288 }, { "epoch": 0.8812168860023928, "grad_norm": 0.000507068180013448, "learning_rate": 4.910672652069846e-05, "logits/chosen": -8.781460762023926, "logits/rejected": -8.75878620147705, "logps/chosen": -0.14557626843452454, "logps/rejected": -116.16384887695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.09547758102417, "rewards/margins": 11.58346939086914, "rewards/rejected": -8.487991333007812, "step": 1289 }, { "epoch": 0.8819005298239617, "grad_norm": 0.0005646398640237749, "learning_rate": 4.910057140642929e-05, "logits/chosen": -8.252235412597656, "logits/rejected": -8.228139877319336, "logps/chosen": -1.6543357372283936, "logps/rejected": -114.440673828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0007951259613037, "rewards/margins": 11.2068510055542, "rewards/rejected": -8.206055641174316, "step": 1290 }, { "epoch": 0.8825841736455307, "grad_norm": 0.0005292770802043378, "learning_rate": 4.909439554741544e-05, "logits/chosen": -8.990294456481934, "logits/rejected": -8.969416618347168, "logps/chosen": -4.8392486572265625, "logps/rejected": -111.42013549804688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.62918758392334, "rewards/margins": 10.586786270141602, "rewards/rejected": -7.957597732543945, "step": 1291 }, { "epoch": 0.8832678174670996, "grad_norm": 0.0006154276197776198, "learning_rate": 4.9088198948973406e-05, "logits/chosen": -7.8486175537109375, "logits/rejected": -7.823202133178711, "logps/chosen": -1.9809532165527344, "logps/rejected": -113.83676147460938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0699360370635986, "rewards/margins": 11.136457443237305, "rewards/rejected": -8.066521644592285, "step": 1292 }, { "epoch": 0.8839514612886686, "grad_norm": 0.0007373990956693888, "learning_rate": 4.90819816164375e-05, "logits/chosen": -8.443300247192383, "logits/rejected": -8.422754287719727, "logps/chosen": -5.856985092163086, "logps/rejected": -112.70198059082031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.646803379058838, "rewards/margins": 10.599590301513672, "rewards/rejected": -7.952787399291992, "step": 1293 }, { "epoch": 0.8846351051102376, "grad_norm": 0.00047872462891973555, "learning_rate": 4.907574355515989e-05, "logits/chosen": -7.892865180969238, "logits/rejected": -7.871011734008789, "logps/chosen": -3.115171194076538, "logps/rejected": -114.70647430419922, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.904299259185791, "rewards/margins": 11.085822105407715, "rewards/rejected": -8.181522369384766, "step": 1294 }, { "epoch": 0.8853187489318065, "grad_norm": 0.0046040513552725315, "learning_rate": 4.90694847705106e-05, "logits/chosen": -8.524163246154785, "logits/rejected": -8.503459930419922, "logps/chosen": -3.763479709625244, "logps/rejected": -113.68641662597656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7464590072631836, "rewards/margins": 10.89356803894043, "rewards/rejected": -8.147109031677246, "step": 1295 }, { "epoch": 0.8860023927533754, "grad_norm": 0.0005381960654631257, "learning_rate": 4.9063205267877465e-05, "logits/chosen": -8.119284629821777, "logits/rejected": -8.09666633605957, "logps/chosen": -3.9085071086883545, "logps/rejected": -113.52045440673828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.86961030960083, "rewards/margins": 10.96953010559082, "rewards/rejected": -8.099920272827148, "step": 1296 }, { "epoch": 0.8866860365749445, "grad_norm": 0.0004975406336598098, "learning_rate": 4.90569050526662e-05, "logits/chosen": -7.760429859161377, "logits/rejected": -7.742204666137695, "logps/chosen": -2.3586905002593994, "logps/rejected": -115.52030181884766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.967223644256592, "rewards/margins": 11.285390853881836, "rewards/rejected": -8.318167686462402, "step": 1297 }, { "epoch": 0.8873696803965134, "grad_norm": 0.0005976604297757149, "learning_rate": 4.905058413030031e-05, "logits/chosen": -7.959812641143799, "logits/rejected": -7.936854839324951, "logps/chosen": -3.082974433898926, "logps/rejected": -113.61300659179688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9275646209716797, "rewards/margins": 11.007621765136719, "rewards/rejected": -8.080057144165039, "step": 1298 }, { "epoch": 0.8880533242180824, "grad_norm": 0.0007479600026272237, "learning_rate": 4.9044242506221125e-05, "logits/chosen": -8.333331108093262, "logits/rejected": -8.308578491210938, "logps/chosen": -4.733559608459473, "logps/rejected": -112.84622192382812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.754075765609741, "rewards/margins": 10.829242706298828, "rewards/rejected": -8.075166702270508, "step": 1299 }, { "epoch": 0.8887369680396513, "grad_norm": 0.0006220974028110504, "learning_rate": 4.9037880185887844e-05, "logits/chosen": -7.782049655914307, "logits/rejected": -7.764733791351318, "logps/chosen": -3.316815137863159, "logps/rejected": -112.13162231445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.79115629196167, "rewards/margins": 10.85200023651123, "rewards/rejected": -8.060844421386719, "step": 1300 }, { "epoch": 0.8894206118612203, "grad_norm": 0.0005936693050898612, "learning_rate": 4.903149717477742e-05, "logits/chosen": -7.070740222930908, "logits/rejected": -7.050826549530029, "logps/chosen": -3.486471652984619, "logps/rejected": -113.73075866699219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.888882637023926, "rewards/margins": 11.037808418273926, "rewards/rejected": -8.14892578125, "step": 1301 }, { "epoch": 0.8901042556827893, "grad_norm": 0.000885510875377804, "learning_rate": 4.902509347838466e-05, "logits/chosen": -8.7493896484375, "logits/rejected": -8.728609085083008, "logps/chosen": -4.217710018157959, "logps/rejected": -112.23155212402344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.727915048599243, "rewards/margins": 10.749460220336914, "rewards/rejected": -8.02154541015625, "step": 1302 }, { "epoch": 0.8907878995043582, "grad_norm": 0.0005900425603613257, "learning_rate": 4.901866910222217e-05, "logits/chosen": -8.167232513427734, "logits/rejected": -8.144591331481934, "logps/chosen": -1.6431063413619995, "logps/rejected": -114.58708190917969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.016387462615967, "rewards/margins": 11.183223724365234, "rewards/rejected": -8.16683578491211, "step": 1303 }, { "epoch": 0.8914715433259272, "grad_norm": 0.000767684425227344, "learning_rate": 4.901222405182034e-05, "logits/chosen": -8.029717445373535, "logits/rejected": -8.011651039123535, "logps/chosen": -3.9061503410339355, "logps/rejected": -113.99578857421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7784571647644043, "rewards/margins": 10.91628646850586, "rewards/rejected": -8.137828826904297, "step": 1304 }, { "epoch": 0.8921551871474962, "grad_norm": 0.000512937200255692, "learning_rate": 4.900575833272737e-05, "logits/chosen": -8.604711532592773, "logits/rejected": -8.576794624328613, "logps/chosen": -1.8906971216201782, "logps/rejected": -115.52899932861328, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9609062671661377, "rewards/margins": 11.374796867370605, "rewards/rejected": -8.413890838623047, "step": 1305 }, { "epoch": 0.8928388309690651, "grad_norm": 0.0005529130576178432, "learning_rate": 4.899927195050928e-05, "logits/chosen": -7.816373825073242, "logits/rejected": -7.796581268310547, "logps/chosen": -3.9822773933410645, "logps/rejected": -112.92556762695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.865154504776001, "rewards/margins": 10.908418655395508, "rewards/rejected": -8.043264389038086, "step": 1306 }, { "epoch": 0.8935224747906341, "grad_norm": 0.0005378571222536266, "learning_rate": 4.899276491074981e-05, "logits/chosen": -8.428974151611328, "logits/rejected": -8.405150413513184, "logps/chosen": -0.6606793999671936, "logps/rejected": -115.2469482421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1558890342712402, "rewards/margins": 11.444387435913086, "rewards/rejected": -8.288497924804688, "step": 1307 }, { "epoch": 0.894206118612203, "grad_norm": 0.0007318595889955759, "learning_rate": 4.898623721905055e-05, "logits/chosen": -8.183073043823242, "logits/rejected": -8.15865707397461, "logps/chosen": -4.9529924392700195, "logps/rejected": -112.57051849365234, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.741922616958618, "rewards/margins": 10.722126007080078, "rewards/rejected": -7.9802021980285645, "step": 1308 }, { "epoch": 0.894889762433772, "grad_norm": 0.0005892960471101105, "learning_rate": 4.897968888103084e-05, "logits/chosen": -8.057971954345703, "logits/rejected": -8.037229537963867, "logps/chosen": -5.5182037353515625, "logps/rejected": -113.41139221191406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.817211151123047, "rewards/margins": 10.807336807250977, "rewards/rejected": -7.990126609802246, "step": 1309 }, { "epoch": 0.895573406255341, "grad_norm": 0.0010695185046643019, "learning_rate": 4.8973119902327786e-05, "logits/chosen": -8.360199928283691, "logits/rejected": -8.339292526245117, "logps/chosen": -2.5023202896118164, "logps/rejected": -115.22511291503906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9695911407470703, "rewards/margins": 11.230194091796875, "rewards/rejected": -8.260602951049805, "step": 1310 }, { "epoch": 0.8962570500769099, "grad_norm": 0.0006245879922062159, "learning_rate": 4.896653028859627e-05, "logits/chosen": -8.468717575073242, "logits/rejected": -8.442266464233398, "logps/chosen": -2.078690767288208, "logps/rejected": -115.38556671142578, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9975411891937256, "rewards/margins": 11.345611572265625, "rewards/rejected": -8.34807014465332, "step": 1311 }, { "epoch": 0.8969406938984789, "grad_norm": 0.00041314985719509423, "learning_rate": 4.895992004550895e-05, "logits/chosen": -8.284098625183105, "logits/rejected": -8.257415771484375, "logps/chosen": -0.1557444930076599, "logps/rejected": -116.80097961425781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.171171188354492, "rewards/margins": 11.617324829101562, "rewards/rejected": -8.44615364074707, "step": 1312 }, { "epoch": 0.8976243377200479, "grad_norm": 0.0005734345759265125, "learning_rate": 4.895328917875623e-05, "logits/chosen": -8.546256065368652, "logits/rejected": -8.524333000183105, "logps/chosen": -4.426817893981934, "logps/rejected": -112.03157806396484, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.685713291168213, "rewards/margins": 10.70237922668457, "rewards/rejected": -8.0166654586792, "step": 1313 }, { "epoch": 0.8983079815416168, "grad_norm": 0.0004714125825557858, "learning_rate": 4.8946637694046265e-05, "logits/chosen": -7.653057098388672, "logits/rejected": -7.633351802825928, "logps/chosen": -2.4808120727539062, "logps/rejected": -114.73751831054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9192070960998535, "rewards/margins": 11.146976470947266, "rewards/rejected": -8.22776985168457, "step": 1314 }, { "epoch": 0.8989916253631858, "grad_norm": 0.0005269072717055678, "learning_rate": 4.893996559710496e-05, "logits/chosen": -8.348112106323242, "logits/rejected": -8.328374862670898, "logps/chosen": -3.6413910388946533, "logps/rejected": -113.89461517333984, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7394371032714844, "rewards/margins": 10.93100357055664, "rewards/rejected": -8.191568374633789, "step": 1315 }, { "epoch": 0.8996752691847547, "grad_norm": 0.0004001336346846074, "learning_rate": 4.893327289367597e-05, "logits/chosen": -8.428942680358887, "logits/rejected": -8.408123970031738, "logps/chosen": -1.9133970737457275, "logps/rejected": -115.90266418457031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9716708660125732, "rewards/margins": 11.363513946533203, "rewards/rejected": -8.391844749450684, "step": 1316 }, { "epoch": 0.9003589130063238, "grad_norm": 0.0006943729240447283, "learning_rate": 4.8926559589520696e-05, "logits/chosen": -8.473026275634766, "logits/rejected": -8.45578384399414, "logps/chosen": -7.254324913024902, "logps/rejected": -111.37974548339844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5270323753356934, "rewards/margins": 10.36971664428711, "rewards/rejected": -7.842683792114258, "step": 1317 }, { "epoch": 0.9010425568278927, "grad_norm": 0.0004455661110114306, "learning_rate": 4.891982569041825e-05, "logits/chosen": -8.398849487304688, "logits/rejected": -8.369771003723145, "logps/chosen": -3.239201307296753, "logps/rejected": -115.25605773925781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.026093006134033, "rewards/margins": 11.286410331726074, "rewards/rejected": -8.2603178024292, "step": 1318 }, { "epoch": 0.9017262006494616, "grad_norm": 0.0005928257596679032, "learning_rate": 4.891307120216549e-05, "logits/chosen": -8.222771644592285, "logits/rejected": -8.194831848144531, "logps/chosen": -0.214618980884552, "logps/rejected": -116.90621948242188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1823763847351074, "rewards/margins": 11.655593872070312, "rewards/rejected": -8.47321605682373, "step": 1319 }, { "epoch": 0.9024098444710306, "grad_norm": 0.0006920764571987092, "learning_rate": 4.890629613057701e-05, "logits/chosen": -8.575854301452637, "logits/rejected": -8.557096481323242, "logps/chosen": -2.8590354919433594, "logps/rejected": -115.29244232177734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8270483016967773, "rewards/margins": 11.195899963378906, "rewards/rejected": -8.368851661682129, "step": 1320 }, { "epoch": 0.9030934882925996, "grad_norm": 0.0006213370943441987, "learning_rate": 4.8899500481485086e-05, "logits/chosen": -7.982054710388184, "logits/rejected": -7.959553241729736, "logps/chosen": -4.717156887054443, "logps/rejected": -113.11445617675781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.831207752227783, "rewards/margins": 10.824117660522461, "rewards/rejected": -7.9929094314575195, "step": 1321 }, { "epoch": 0.9037771321141685, "grad_norm": 0.0006213531014509499, "learning_rate": 4.889268426073974e-05, "logits/chosen": -8.293875694274902, "logits/rejected": -8.27142333984375, "logps/chosen": -4.392594814300537, "logps/rejected": -114.9649887084961, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7759082317352295, "rewards/margins": 10.965380668640137, "rewards/rejected": -8.189472198486328, "step": 1322 }, { "epoch": 0.9044607759357375, "grad_norm": 0.0005370675935409963, "learning_rate": 4.888584747420869e-05, "logits/chosen": -8.030661582946777, "logits/rejected": -8.009448051452637, "logps/chosen": -4.9673895835876465, "logps/rejected": -114.171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.697291374206543, "rewards/margins": 10.8652925491333, "rewards/rejected": -8.168001174926758, "step": 1323 }, { "epoch": 0.9051444197573064, "grad_norm": 0.0004564623231999576, "learning_rate": 4.8878990127777377e-05, "logits/chosen": -8.226360321044922, "logits/rejected": -8.201809883117676, "logps/chosen": -1.9691554307937622, "logps/rejected": -115.74192810058594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0721545219421387, "rewards/margins": 11.377300262451172, "rewards/rejected": -8.305146217346191, "step": 1324 }, { "epoch": 0.9058280635788754, "grad_norm": 0.0005303762736730278, "learning_rate": 4.88721122273489e-05, "logits/chosen": -8.690357208251953, "logits/rejected": -8.668079376220703, "logps/chosen": -0.18677666783332825, "logps/rejected": -117.01420593261719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1109957695007324, "rewards/margins": 11.67314338684082, "rewards/rejected": -8.56214714050293, "step": 1325 }, { "epoch": 0.9065117074004444, "grad_norm": 0.0006780924159102142, "learning_rate": 4.886521377884409e-05, "logits/chosen": -8.212800025939941, "logits/rejected": -8.194812774658203, "logps/chosen": -5.13828182220459, "logps/rejected": -112.62764739990234, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.667534828186035, "rewards/margins": 10.724964141845703, "rewards/rejected": -8.057428359985352, "step": 1326 }, { "epoch": 0.9071953512220133, "grad_norm": 0.0004116122145205736, "learning_rate": 4.885829478820145e-05, "logits/chosen": -8.445707321166992, "logits/rejected": -8.425862312316895, "logps/chosen": -2.4007294178009033, "logps/rejected": -115.97860717773438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.955179214477539, "rewards/margins": 11.32058334350586, "rewards/rejected": -8.36540412902832, "step": 1327 }, { "epoch": 0.9078789950435823, "grad_norm": 0.00045638534356839955, "learning_rate": 4.885135526137717e-05, "logits/chosen": -8.55713176727295, "logits/rejected": -8.535628318786621, "logps/chosen": -0.20650914311408997, "logps/rejected": -117.18233489990234, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.127885580062866, "rewards/margins": 11.623756408691406, "rewards/rejected": -8.495870590209961, "step": 1328 }, { "epoch": 0.9085626388651512, "grad_norm": 0.0005234471173025668, "learning_rate": 4.8844395204345115e-05, "logits/chosen": -8.836727142333984, "logits/rejected": -8.809305191040039, "logps/chosen": -1.7966103553771973, "logps/rejected": -116.17935180664062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0293948650360107, "rewards/margins": 11.42989730834961, "rewards/rejected": -8.400503158569336, "step": 1329 }, { "epoch": 0.9092462826867203, "grad_norm": 0.0004959556390531361, "learning_rate": 4.883741462309684e-05, "logits/chosen": -8.917153358459473, "logits/rejected": -8.892622947692871, "logps/chosen": -4.003321647644043, "logps/rejected": -115.56999206542969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.834707736968994, "rewards/margins": 11.127923965454102, "rewards/rejected": -8.293214797973633, "step": 1330 }, { "epoch": 0.9099299265082892, "grad_norm": 0.0005384612013585865, "learning_rate": 4.883041352364154e-05, "logits/chosen": -8.084296226501465, "logits/rejected": -8.065245628356934, "logps/chosen": -1.9276525974273682, "logps/rejected": -115.83574676513672, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0133891105651855, "rewards/margins": 11.328876495361328, "rewards/rejected": -8.315486907958984, "step": 1331 }, { "epoch": 0.9106135703298581, "grad_norm": 0.000501891307067126, "learning_rate": 4.8823391912006107e-05, "logits/chosen": -7.7465009689331055, "logits/rejected": -7.721682071685791, "logps/chosen": -2.14758563041687, "logps/rejected": -115.48542785644531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0417988300323486, "rewards/margins": 11.314428329467773, "rewards/rejected": -8.272629737854004, "step": 1332 }, { "epoch": 0.9112972141514271, "grad_norm": 0.000559278589207679, "learning_rate": 4.881634979423505e-05, "logits/chosen": -7.37161922454834, "logits/rejected": -7.34989070892334, "logps/chosen": -2.4849820137023926, "logps/rejected": -116.74497985839844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9906017780303955, "rewards/margins": 11.391334533691406, "rewards/rejected": -8.400732040405273, "step": 1333 }, { "epoch": 0.9119808579729961, "grad_norm": 0.0006151261040940881, "learning_rate": 4.8809287176390564e-05, "logits/chosen": -7.815141677856445, "logits/rejected": -7.792733192443848, "logps/chosen": -3.6223373413085938, "logps/rejected": -114.27821350097656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8642499446868896, "rewards/margins": 10.958267211914062, "rewards/rejected": -8.094017028808594, "step": 1334 }, { "epoch": 0.912664501794565, "grad_norm": 0.0005080833216197789, "learning_rate": 4.880220406455248e-05, "logits/chosen": -8.644883155822754, "logits/rejected": -8.622757911682129, "logps/chosen": -6.607322692871094, "logps/rejected": -113.8384017944336, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5326216220855713, "rewards/margins": 10.637035369873047, "rewards/rejected": -8.104413986206055, "step": 1335 }, { "epoch": 0.913348145616134, "grad_norm": 0.0004774544795509428, "learning_rate": 4.879510046481828e-05, "logits/chosen": -8.044975280761719, "logits/rejected": -8.017511367797852, "logps/chosen": -0.23795247077941895, "logps/rejected": -117.14924621582031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.142428398132324, "rewards/margins": 11.651657104492188, "rewards/rejected": -8.509227752685547, "step": 1336 }, { "epoch": 0.9140317894377029, "grad_norm": 0.000578556617256254, "learning_rate": 4.878797638330305e-05, "logits/chosen": -8.231748580932617, "logits/rejected": -8.202646255493164, "logps/chosen": -1.6759777069091797, "logps/rejected": -115.98043060302734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.075528144836426, "rewards/margins": 11.386569023132324, "rewards/rejected": -8.311041831970215, "step": 1337 }, { "epoch": 0.914715433259272, "grad_norm": 0.000398417585529387, "learning_rate": 4.8780831826139555e-05, "logits/chosen": -8.723695755004883, "logits/rejected": -8.699073791503906, "logps/chosen": -1.7942407131195068, "logps/rejected": -116.51331329345703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9843199253082275, "rewards/margins": 11.423019409179688, "rewards/rejected": -8.438699722290039, "step": 1338 }, { "epoch": 0.9153990770808409, "grad_norm": 0.0006359878461807966, "learning_rate": 4.877366679947815e-05, "logits/chosen": -8.806035995483398, "logits/rejected": -8.776243209838867, "logps/chosen": -0.5086678266525269, "logps/rejected": -116.45532989501953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.133138656616211, "rewards/margins": 11.625051498413086, "rewards/rejected": -8.491911888122559, "step": 1339 }, { "epoch": 0.9160827209024098, "grad_norm": 0.00048663699999451637, "learning_rate": 4.8766481309486834e-05, "logits/chosen": -7.567422866821289, "logits/rejected": -7.540811061859131, "logps/chosen": -4.709779739379883, "logps/rejected": -112.5954818725586, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8010807037353516, "rewards/margins": 10.764535903930664, "rewards/rejected": -7.963454723358154, "step": 1340 }, { "epoch": 0.9167663647239788, "grad_norm": 0.0005341379437595606, "learning_rate": 4.8759275362351205e-05, "logits/chosen": -7.891430377960205, "logits/rejected": -7.87213134765625, "logps/chosen": -3.8796088695526123, "logps/rejected": -113.62026977539062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.892749071121216, "rewards/margins": 10.97209644317627, "rewards/rejected": -8.079346656799316, "step": 1341 }, { "epoch": 0.9174500085455478, "grad_norm": 0.0005945701850578189, "learning_rate": 4.875204896427447e-05, "logits/chosen": -8.850961685180664, "logits/rejected": -8.828448295593262, "logps/chosen": -8.270353317260742, "logps/rejected": -114.05036926269531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4760665893554688, "rewards/margins": 10.509450912475586, "rewards/rejected": -8.033384323120117, "step": 1342 }, { "epoch": 0.9181336523671167, "grad_norm": 0.00032674428075551987, "learning_rate": 4.874480212147748e-05, "logits/chosen": -8.2994384765625, "logits/rejected": -8.278823852539062, "logps/chosen": -2.039985179901123, "logps/rejected": -116.52011108398438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.909541130065918, "rewards/margins": 11.39782428741455, "rewards/rejected": -8.48828411102295, "step": 1343 }, { "epoch": 0.9188172961886857, "grad_norm": 0.0005535947275348008, "learning_rate": 4.873753484019862e-05, "logits/chosen": -7.888293743133545, "logits/rejected": -7.864645957946777, "logps/chosen": -5.058781623840332, "logps/rejected": -114.3436050415039, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.72896671295166, "rewards/margins": 10.877769470214844, "rewards/rejected": -8.148801803588867, "step": 1344 }, { "epoch": 0.9195009400102546, "grad_norm": 0.0004171151667833328, "learning_rate": 4.873024712669393e-05, "logits/chosen": -7.655845642089844, "logits/rejected": -7.632759094238281, "logps/chosen": -0.14195363223552704, "logps/rejected": -117.32079315185547, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1694555282592773, "rewards/margins": 11.711803436279297, "rewards/rejected": -8.542346954345703, "step": 1345 }, { "epoch": 0.9201845838318237, "grad_norm": 0.0005296951276250184, "learning_rate": 4.872293898723701e-05, "logits/chosen": -7.826446533203125, "logits/rejected": -7.8058271408081055, "logps/chosen": -5.892349720001221, "logps/rejected": -114.326904296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5928895473480225, "rewards/margins": 10.788755416870117, "rewards/rejected": -8.195865631103516, "step": 1346 }, { "epoch": 0.9208682276533926, "grad_norm": 0.000415553105995059, "learning_rate": 4.871561042811903e-05, "logits/chosen": -8.0061616897583, "logits/rejected": -7.983870029449463, "logps/chosen": -2.5032880306243896, "logps/rejected": -117.00294494628906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0553267002105713, "rewards/margins": 11.473699569702148, "rewards/rejected": -8.41837215423584, "step": 1347 }, { "epoch": 0.9215518714749615, "grad_norm": 0.0006293836049735546, "learning_rate": 4.870826145564877e-05, "logits/chosen": -8.22445297241211, "logits/rejected": -8.20481014251709, "logps/chosen": -4.643301010131836, "logps/rejected": -114.74417114257812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.724792242050171, "rewards/margins": 10.89331340789795, "rewards/rejected": -8.168519973754883, "step": 1348 }, { "epoch": 0.9222355152965305, "grad_norm": 0.00035602564457803965, "learning_rate": 4.870089207615258e-05, "logits/chosen": -8.824495315551758, "logits/rejected": -8.800960540771484, "logps/chosen": -0.18674880266189575, "logps/rejected": -117.41680908203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1469976902008057, "rewards/margins": 11.745735168457031, "rewards/rejected": -8.598737716674805, "step": 1349 }, { "epoch": 0.9229191591180995, "grad_norm": 0.00042989110806956887, "learning_rate": 4.8693502295974335e-05, "logits/chosen": -8.126361846923828, "logits/rejected": -8.103464126586914, "logps/chosen": -1.6826183795928955, "logps/rejected": -116.56729888916016, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9938416481018066, "rewards/margins": 11.451423645019531, "rewards/rejected": -8.457581520080566, "step": 1350 }, { "epoch": 0.9236028029396685, "grad_norm": 0.0006015824619680643, "learning_rate": 4.8686092121475535e-05, "logits/chosen": -8.272480964660645, "logits/rejected": -8.250883102416992, "logps/chosen": -1.1686426401138306, "logps/rejected": -116.90414428710938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0331082344055176, "rewards/margins": 11.448055267333984, "rewards/rejected": -8.414946556091309, "step": 1351 }, { "epoch": 0.9242864467612374, "grad_norm": 0.0011177838314324617, "learning_rate": 4.8678661559035184e-05, "logits/chosen": -8.991227149963379, "logits/rejected": -8.96784782409668, "logps/chosen": -1.6470413208007812, "logps/rejected": -117.06903076171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0103845596313477, "rewards/margins": 11.558479309082031, "rewards/rejected": -8.548093795776367, "step": 1352 }, { "epoch": 0.9249700905828063, "grad_norm": 0.00042372927418909967, "learning_rate": 4.8671210615049864e-05, "logits/chosen": -8.592618942260742, "logits/rejected": -8.56606388092041, "logps/chosen": -6.055691719055176, "logps/rejected": -114.53379821777344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.724430799484253, "rewards/margins": 10.86462688446045, "rewards/rejected": -8.140195846557617, "step": 1353 }, { "epoch": 0.9256537344043754, "grad_norm": 0.0004756336857099086, "learning_rate": 4.8663739295933694e-05, "logits/chosen": -7.756525993347168, "logits/rejected": -7.735775470733643, "logps/chosen": -4.295398712158203, "logps/rejected": -116.2455062866211, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7955121994018555, "rewards/margins": 11.098651885986328, "rewards/rejected": -8.303139686584473, "step": 1354 }, { "epoch": 0.9263373782259443, "grad_norm": 0.0006146066589280963, "learning_rate": 4.8656247608118325e-05, "logits/chosen": -8.854621887207031, "logits/rejected": -8.827945709228516, "logps/chosen": -0.19281736016273499, "logps/rejected": -117.23796081542969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.080394744873047, "rewards/margins": 11.671487808227539, "rewards/rejected": -8.59109115600586, "step": 1355 }, { "epoch": 0.9270210220475132, "grad_norm": 0.0006937271100468934, "learning_rate": 4.864873555805297e-05, "logits/chosen": -8.839616775512695, "logits/rejected": -8.817099571228027, "logps/chosen": -0.19747930765151978, "logps/rejected": -117.36088562011719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0543477535247803, "rewards/margins": 11.694469451904297, "rewards/rejected": -8.640122413635254, "step": 1356 }, { "epoch": 0.9277046658690822, "grad_norm": 0.0005431644967757165, "learning_rate": 4.8641203152204346e-05, "logits/chosen": -8.120278358459473, "logits/rejected": -8.095603942871094, "logps/chosen": -3.3322362899780273, "logps/rejected": -114.09609985351562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.861203670501709, "rewards/margins": 10.981342315673828, "rewards/rejected": -8.120140075683594, "step": 1357 }, { "epoch": 0.9283883096906512, "grad_norm": 0.0005968004697933793, "learning_rate": 4.863365039705669e-05, "logits/chosen": -8.612981796264648, "logits/rejected": -8.586739540100098, "logps/chosen": -1.915972113609314, "logps/rejected": -116.40162658691406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9814815521240234, "rewards/margins": 11.371685981750488, "rewards/rejected": -8.390203475952148, "step": 1358 }, { "epoch": 0.9290719535122202, "grad_norm": 0.0005276694428175688, "learning_rate": 4.862607729911177e-05, "logits/chosen": -9.499650955200195, "logits/rejected": -9.474762916564941, "logps/chosen": -4.095235824584961, "logps/rejected": -115.32275390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.74817156791687, "rewards/margins": 11.07026195526123, "rewards/rejected": -8.322090148925781, "step": 1359 }, { "epoch": 0.9297555973337891, "grad_norm": 0.00047311215894296765, "learning_rate": 4.861848386488887e-05, "logits/chosen": -8.555950164794922, "logits/rejected": -8.524944305419922, "logps/chosen": -0.18573547899723053, "logps/rejected": -117.64253234863281, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.120940685272217, "rewards/margins": 11.681354522705078, "rewards/rejected": -8.560413360595703, "step": 1360 }, { "epoch": 0.930439241155358, "grad_norm": 0.0006299018277786672, "learning_rate": 4.8610870100924765e-05, "logits/chosen": -7.615543365478516, "logits/rejected": -7.591771602630615, "logps/chosen": -0.4204029440879822, "logps/rejected": -117.13356018066406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.129108190536499, "rewards/margins": 11.695427894592285, "rewards/rejected": -8.566320419311523, "step": 1361 }, { "epoch": 0.931122884976927, "grad_norm": 0.000394977570977062, "learning_rate": 4.8603236013773746e-05, "logits/chosen": -8.748939514160156, "logits/rejected": -8.726104736328125, "logps/chosen": -3.491726875305176, "logps/rejected": -115.43185424804688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.847993850708008, "rewards/margins": 11.17113971710205, "rewards/rejected": -8.323144912719727, "step": 1362 }, { "epoch": 0.931806528798496, "grad_norm": 0.0007562381797470152, "learning_rate": 4.8595581610007576e-05, "logits/chosen": -8.419139862060547, "logits/rejected": -8.390316009521484, "logps/chosen": -0.21231743693351746, "logps/rejected": -117.88868713378906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.197659730911255, "rewards/margins": 11.74970817565918, "rewards/rejected": -8.552047729492188, "step": 1363 }, { "epoch": 0.932490172620065, "grad_norm": 0.000565223628655076, "learning_rate": 4.858790689621555e-05, "logits/chosen": -8.523120880126953, "logits/rejected": -8.500862121582031, "logps/chosen": -1.982521891593933, "logps/rejected": -116.68153381347656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9622533321380615, "rewards/margins": 11.440300941467285, "rewards/rejected": -8.478048324584961, "step": 1364 }, { "epoch": 0.9331738164416339, "grad_norm": 0.0007267623441293836, "learning_rate": 4.8580211879004385e-05, "logits/chosen": -8.274660110473633, "logits/rejected": -8.2500638961792, "logps/chosen": -5.48942756652832, "logps/rejected": -113.4080810546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6686837673187256, "rewards/margins": 10.751890182495117, "rewards/rejected": -8.083206176757812, "step": 1365 }, { "epoch": 0.9338574602632028, "grad_norm": 0.0005080733681097627, "learning_rate": 4.8572496564998344e-05, "logits/chosen": -8.383846282958984, "logits/rejected": -8.357275009155273, "logps/chosen": -0.38293981552124023, "logps/rejected": -116.92562866210938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.088531970977783, "rewards/margins": 11.63701343536377, "rewards/rejected": -8.548480987548828, "step": 1366 }, { "epoch": 0.9345411040847719, "grad_norm": 0.0005424765404313803, "learning_rate": 4.85647609608391e-05, "logits/chosen": -8.429422378540039, "logits/rejected": -8.403341293334961, "logps/chosen": -3.4491398334503174, "logps/rejected": -116.00332641601562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.924600124359131, "rewards/margins": 11.314508438110352, "rewards/rejected": -8.389907836914062, "step": 1367 }, { "epoch": 0.9352247479063408, "grad_norm": 0.00033653047285042703, "learning_rate": 4.8557005073185845e-05, "logits/chosen": -8.07836627960205, "logits/rejected": -8.054880142211914, "logps/chosen": -1.2888234853744507, "logps/rejected": -115.94181060791016, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0046043395996094, "rewards/margins": 11.455333709716797, "rewards/rejected": -8.450729370117188, "step": 1368 }, { "epoch": 0.9359083917279097, "grad_norm": 0.00041436817264184356, "learning_rate": 4.8549228908715204e-05, "logits/chosen": -7.891766548156738, "logits/rejected": -7.871780872344971, "logps/chosen": -3.5179734230041504, "logps/rejected": -114.97483825683594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7797274589538574, "rewards/margins": 11.040879249572754, "rewards/rejected": -8.261152267456055, "step": 1369 }, { "epoch": 0.9365920355494787, "grad_norm": 0.00429169274866581, "learning_rate": 4.8541432474121254e-05, "logits/chosen": -7.967691421508789, "logits/rejected": -7.944651126861572, "logps/chosen": -2.0105526447296143, "logps/rejected": -116.05776977539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9612464904785156, "rewards/margins": 11.382682800292969, "rewards/rejected": -8.421435356140137, "step": 1370 }, { "epoch": 0.9372756793710477, "grad_norm": 0.0005328432307578623, "learning_rate": 4.853361577611554e-05, "logits/chosen": -8.244266510009766, "logits/rejected": -8.219311714172363, "logps/chosen": -1.0155525207519531, "logps/rejected": -116.33100128173828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1589574813842773, "rewards/margins": 11.555105209350586, "rewards/rejected": -8.396147727966309, "step": 1371 }, { "epoch": 0.9379593231926167, "grad_norm": 0.0007286674808710814, "learning_rate": 4.852577882142703e-05, "logits/chosen": -8.307685852050781, "logits/rejected": -8.285090446472168, "logps/chosen": -1.9958772659301758, "logps/rejected": -116.36128997802734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.909414052963257, "rewards/margins": 11.332907676696777, "rewards/rejected": -8.423493385314941, "step": 1372 }, { "epoch": 0.9386429670141856, "grad_norm": 0.0006141374469734728, "learning_rate": 4.851792161680215e-05, "logits/chosen": -8.775558471679688, "logits/rejected": -8.755669593811035, "logps/chosen": -1.659204363822937, "logps/rejected": -116.26294708251953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.963493824005127, "rewards/margins": 11.38501262664795, "rewards/rejected": -8.421520233154297, "step": 1373 }, { "epoch": 0.9393266108357545, "grad_norm": 0.0005572926020249724, "learning_rate": 4.851004416900474e-05, "logits/chosen": -8.727728843688965, "logits/rejected": -8.707996368408203, "logps/chosen": -1.3518218994140625, "logps/rejected": -116.99647521972656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.043910264968872, "rewards/margins": 11.511940956115723, "rewards/rejected": -8.46803092956543, "step": 1374 }, { "epoch": 0.9400102546573236, "grad_norm": 0.00044053143938072026, "learning_rate": 4.850214648481608e-05, "logits/chosen": -8.582932472229004, "logits/rejected": -8.562679290771484, "logps/chosen": -1.5444707870483398, "logps/rejected": -117.1534652709961, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.03812837600708, "rewards/margins": 11.53558349609375, "rewards/rejected": -8.497454643249512, "step": 1375 }, { "epoch": 0.9406938984788925, "grad_norm": 0.0004901179345324636, "learning_rate": 4.8494228571034875e-05, "logits/chosen": -8.653060913085938, "logits/rejected": -8.627992630004883, "logps/chosen": -2.658639669418335, "logps/rejected": -116.84700012207031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9932608604431152, "rewards/margins": 11.378546714782715, "rewards/rejected": -8.385286331176758, "step": 1376 }, { "epoch": 0.9413775423004614, "grad_norm": 0.0006438642740249634, "learning_rate": 4.848629043447721e-05, "logits/chosen": -9.214624404907227, "logits/rejected": -9.194870948791504, "logps/chosen": -8.625653266906738, "logps/rejected": -113.3849868774414, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3428521156311035, "rewards/margins": 10.392338752746582, "rewards/rejected": -8.04948616027832, "step": 1377 }, { "epoch": 0.9420611861220304, "grad_norm": 0.0005159930442459881, "learning_rate": 4.847833208197662e-05, "logits/chosen": -7.895098686218262, "logits/rejected": -7.872616291046143, "logps/chosen": -3.3573005199432373, "logps/rejected": -115.33606719970703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8276445865631104, "rewards/margins": 11.107690811157227, "rewards/rejected": -8.280045509338379, "step": 1378 }, { "epoch": 0.9427448299435994, "grad_norm": 0.0003904659242834896, "learning_rate": 4.847035352038403e-05, "logits/chosen": -8.538772583007812, "logits/rejected": -8.517583847045898, "logps/chosen": -2.662367582321167, "logps/rejected": -115.623779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.855466365814209, "rewards/margins": 11.184150695800781, "rewards/rejected": -8.328683853149414, "step": 1379 }, { "epoch": 0.9434284737651684, "grad_norm": 0.00041983721894212067, "learning_rate": 4.8462354756567754e-05, "logits/chosen": -7.879304885864258, "logits/rejected": -7.854798316955566, "logps/chosen": -1.9229120016098022, "logps/rejected": -116.69529724121094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9810903072357178, "rewards/margins": 11.43946361541748, "rewards/rejected": -8.458372116088867, "step": 1380 }, { "epoch": 0.9441121175867373, "grad_norm": 0.000565393187571317, "learning_rate": 4.845433579741349e-05, "logits/chosen": -7.943650245666504, "logits/rejected": -7.917245864868164, "logps/chosen": -2.0313668251037598, "logps/rejected": -116.67759704589844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0099363327026367, "rewards/margins": 11.432461738586426, "rewards/rejected": -8.422525405883789, "step": 1381 }, { "epoch": 0.9447957614083062, "grad_norm": 0.00043648621067404747, "learning_rate": 4.8446296649824344e-05, "logits/chosen": -8.073904991149902, "logits/rejected": -8.04998779296875, "logps/chosen": -5.13289213180542, "logps/rejected": -116.21859741210938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6210358142852783, "rewards/margins": 11.001455307006836, "rewards/rejected": -8.380420684814453, "step": 1382 }, { "epoch": 0.9454794052298753, "grad_norm": 0.00116639188490808, "learning_rate": 4.843823732072079e-05, "logits/chosen": -8.166728973388672, "logits/rejected": -8.137659072875977, "logps/chosen": -0.19127212464809418, "logps/rejected": -117.9575424194336, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1777777671813965, "rewards/margins": 11.731006622314453, "rewards/rejected": -8.553228378295898, "step": 1383 }, { "epoch": 0.9461630490514442, "grad_norm": 0.000573838478885591, "learning_rate": 4.843015781704067e-05, "logits/chosen": -7.579144477844238, "logits/rejected": -7.555595397949219, "logps/chosen": -5.690086364746094, "logps/rejected": -114.40242004394531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.781632900238037, "rewards/margins": 10.81186580657959, "rewards/rejected": -8.030232429504395, "step": 1384 }, { "epoch": 0.9468466928730132, "grad_norm": 0.0006624241359531879, "learning_rate": 4.84220581457392e-05, "logits/chosen": -8.219002723693848, "logits/rejected": -8.196022987365723, "logps/chosen": -6.53692626953125, "logps/rejected": -112.78324890136719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.537144422531128, "rewards/margins": 10.560660362243652, "rewards/rejected": -8.023515701293945, "step": 1385 }, { "epoch": 0.9475303366945821, "grad_norm": 0.0005861364770680666, "learning_rate": 4.841393831378895e-05, "logits/chosen": -8.125373840332031, "logits/rejected": -8.100363731384277, "logps/chosen": -2.231760263442993, "logps/rejected": -117.36026000976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0122196674346924, "rewards/margins": 11.463655471801758, "rewards/rejected": -8.451435089111328, "step": 1386 }, { "epoch": 0.9482139805161511, "grad_norm": 0.0004913816228508949, "learning_rate": 4.8405798328179864e-05, "logits/chosen": -8.095359802246094, "logits/rejected": -8.076863288879395, "logps/chosen": -7.741765022277832, "logps/rejected": -114.84419250488281, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4352164268493652, "rewards/margins": 10.658445358276367, "rewards/rejected": -8.223230361938477, "step": 1387 }, { "epoch": 0.9488976243377201, "grad_norm": 0.0006517760921269655, "learning_rate": 4.839763819591921e-05, "logits/chosen": -8.10205078125, "logits/rejected": -8.07867431640625, "logps/chosen": -0.5235865712165833, "logps/rejected": -117.29182434082031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0105884075164795, "rewards/margins": 11.67108154296875, "rewards/rejected": -8.660493850708008, "step": 1388 }, { "epoch": 0.949581268159289, "grad_norm": 0.0005382320960052311, "learning_rate": 4.8389457924031614e-05, "logits/chosen": -7.677652835845947, "logits/rejected": -7.65809440612793, "logps/chosen": -3.6367592811584473, "logps/rejected": -115.0833740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8337416648864746, "rewards/margins": 11.117104530334473, "rewards/rejected": -8.28336238861084, "step": 1389 }, { "epoch": 0.9502649119808579, "grad_norm": 0.0004272439982742071, "learning_rate": 4.838125751955903e-05, "logits/chosen": -8.409236907958984, "logits/rejected": -8.377901077270508, "logps/chosen": -2.4635260105133057, "logps/rejected": -117.06526184082031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.041532516479492, "rewards/margins": 11.425323486328125, "rewards/rejected": -8.383790969848633, "step": 1390 }, { "epoch": 0.950948555802427, "grad_norm": 0.0003898987197317183, "learning_rate": 4.837303698956075e-05, "logits/chosen": -7.941970348358154, "logits/rejected": -7.920611381530762, "logps/chosen": -3.7499003410339355, "logps/rejected": -116.14183807373047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.936063051223755, "rewards/margins": 11.253057479858398, "rewards/rejected": -8.316994667053223, "step": 1391 }, { "epoch": 0.9516321996239959, "grad_norm": 0.0004197190282866359, "learning_rate": 4.836479634111341e-05, "logits/chosen": -8.48448371887207, "logits/rejected": -8.464929580688477, "logps/chosen": -4.480459213256836, "logps/rejected": -114.71605682373047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.764291286468506, "rewards/margins": 10.962587356567383, "rewards/rejected": -8.198296546936035, "step": 1392 }, { "epoch": 0.9523158434455649, "grad_norm": 0.0003807885223068297, "learning_rate": 4.835653558131092e-05, "logits/chosen": -7.611315727233887, "logits/rejected": -7.5866498947143555, "logps/chosen": -5.516650199890137, "logps/rejected": -114.442138671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.685655117034912, "rewards/margins": 10.7919921875, "rewards/rejected": -8.106337547302246, "step": 1393 }, { "epoch": 0.9529994872671338, "grad_norm": 0.0006341390544548631, "learning_rate": 4.834825471726454e-05, "logits/chosen": -8.102954864501953, "logits/rejected": -8.080848693847656, "logps/chosen": -2.1808557510375977, "logps/rejected": -116.50008392333984, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9594616889953613, "rewards/margins": 11.348252296447754, "rewards/rejected": -8.388790130615234, "step": 1394 }, { "epoch": 0.9536831310887028, "grad_norm": 0.0007642251439392567, "learning_rate": 4.833995375610282e-05, "logits/chosen": -8.356866836547852, "logits/rejected": -8.334753036499023, "logps/chosen": -0.26422587037086487, "logps/rejected": -118.0142822265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1534929275512695, "rewards/margins": 11.738799095153809, "rewards/rejected": -8.585306167602539, "step": 1395 }, { "epoch": 0.9543667749102718, "grad_norm": 0.0008094199583865702, "learning_rate": 4.8331632704971614e-05, "logits/chosen": -8.55427360534668, "logits/rejected": -8.530599594116211, "logps/chosen": -11.643908500671387, "logps/rejected": -110.99398040771484, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.2424559593200684, "rewards/margins": 9.89140510559082, "rewards/rejected": -7.648948669433594, "step": 1396 }, { "epoch": 0.9550504187318407, "grad_norm": 0.0004663401923608035, "learning_rate": 4.8323291571034065e-05, "logits/chosen": -7.869191646575928, "logits/rejected": -7.848363399505615, "logps/chosen": -5.597321510314941, "logps/rejected": -114.54290771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6791110038757324, "rewards/margins": 10.904487609863281, "rewards/rejected": -8.22537612915039, "step": 1397 }, { "epoch": 0.9557340625534096, "grad_norm": 0.0005139994318597019, "learning_rate": 4.8314930361470635e-05, "logits/chosen": -7.816288471221924, "logits/rejected": -7.793849468231201, "logps/chosen": -0.22853703796863556, "logps/rejected": -118.24604034423828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.196937084197998, "rewards/margins": 11.755728721618652, "rewards/rejected": -8.558791160583496, "step": 1398 }, { "epoch": 0.9564177063749786, "grad_norm": 0.0005316516617313027, "learning_rate": 4.830654908347902e-05, "logits/chosen": -8.33247184753418, "logits/rejected": -8.30762767791748, "logps/chosen": -0.16655346751213074, "logps/rejected": -118.09281921386719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.2261292934417725, "rewards/margins": 11.795842170715332, "rewards/rejected": -8.569711685180664, "step": 1399 }, { "epoch": 0.9571013501965476, "grad_norm": 0.0005811552982777357, "learning_rate": 4.8298147744274216e-05, "logits/chosen": -8.27707290649414, "logits/rejected": -8.255062103271484, "logps/chosen": -6.089713096618652, "logps/rejected": -115.22003173828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7380285263061523, "rewards/margins": 10.847663879394531, "rewards/rejected": -8.109636306762695, "step": 1400 }, { "epoch": 0.9577849940181166, "grad_norm": 0.00043712640763260424, "learning_rate": 4.828972635108849e-05, "logits/chosen": -8.303277969360352, "logits/rejected": -8.27928638458252, "logps/chosen": -2.5140721797943115, "logps/rejected": -117.39136505126953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.853104829788208, "rewards/margins": 11.410945892333984, "rewards/rejected": -8.557841300964355, "step": 1401 }, { "epoch": 0.9584686378396855, "grad_norm": 0.0005170649965293705, "learning_rate": 4.8281284911171384e-05, "logits/chosen": -8.299674987792969, "logits/rejected": -8.27251148223877, "logps/chosen": -0.20517128705978394, "logps/rejected": -118.29212188720703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.184283971786499, "rewards/margins": 11.803132057189941, "rewards/rejected": -8.618846893310547, "step": 1402 }, { "epoch": 0.9591522816612544, "grad_norm": 0.000512321712449193, "learning_rate": 4.8272823431789674e-05, "logits/chosen": -8.576984405517578, "logits/rejected": -8.55514144897461, "logps/chosen": -2.7659881114959717, "logps/rejected": -114.68167877197266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.853231906890869, "rewards/margins": 11.165945053100586, "rewards/rejected": -8.312714576721191, "step": 1403 }, { "epoch": 0.9598359254828235, "grad_norm": 0.0005951822968199849, "learning_rate": 4.82643419202274e-05, "logits/chosen": -8.043466567993164, "logits/rejected": -8.021005630493164, "logps/chosen": -1.478387475013733, "logps/rejected": -117.04061889648438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0844388008117676, "rewards/margins": 11.540082931518555, "rewards/rejected": -8.455644607543945, "step": 1404 }, { "epoch": 0.9605195693043924, "grad_norm": 0.0005995897809043527, "learning_rate": 4.8255840383785827e-05, "logits/chosen": -7.746166706085205, "logits/rejected": -7.722971439361572, "logps/chosen": -0.6959683299064636, "logps/rejected": -117.67323303222656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0706398487091064, "rewards/margins": 11.645857810974121, "rewards/rejected": -8.575218200683594, "step": 1405 }, { "epoch": 0.9612032131259614, "grad_norm": 0.0004454932059161365, "learning_rate": 4.824731882978349e-05, "logits/chosen": -8.601668357849121, "logits/rejected": -8.584023475646973, "logps/chosen": -5.380753993988037, "logps/rejected": -114.8964614868164, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.505690097808838, "rewards/margins": 10.790119171142578, "rewards/rejected": -8.284428596496582, "step": 1406 }, { "epoch": 0.9618868569475303, "grad_norm": 0.00033019427792169154, "learning_rate": 4.823877726555614e-05, "logits/chosen": -8.61629867553711, "logits/rejected": -8.585796356201172, "logps/chosen": -0.23390451073646545, "logps/rejected": -118.31068420410156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.156240940093994, "rewards/margins": 11.761575698852539, "rewards/rejected": -8.605335235595703, "step": 1407 }, { "epoch": 0.9625705007690993, "grad_norm": 0.0005713361315429211, "learning_rate": 4.8230215698456753e-05, "logits/chosen": -8.462449073791504, "logits/rejected": -8.439990043640137, "logps/chosen": -1.3081446886062622, "logps/rejected": -116.56942749023438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0986952781677246, "rewards/margins": 11.53378963470459, "rewards/rejected": -8.435094833374023, "step": 1408 }, { "epoch": 0.9632541445906683, "grad_norm": 0.0005530971684493124, "learning_rate": 4.822163413585552e-05, "logits/chosen": -8.032795906066895, "logits/rejected": -8.00717830657959, "logps/chosen": -0.21700797975063324, "logps/rejected": -117.94866180419922, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.2177093029022217, "rewards/margins": 11.822132110595703, "rewards/rejected": -8.604421615600586, "step": 1409 }, { "epoch": 0.9639377884122372, "grad_norm": 0.000569117721170187, "learning_rate": 4.8213032585139855e-05, "logits/chosen": -7.669466018676758, "logits/rejected": -7.647403717041016, "logps/chosen": -1.4433751106262207, "logps/rejected": -116.80502319335938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0191519260406494, "rewards/margins": 11.45138931274414, "rewards/rejected": -8.43223762512207, "step": 1410 }, { "epoch": 0.9646214322338061, "grad_norm": 0.0004602937842719257, "learning_rate": 4.8204411053714376e-05, "logits/chosen": -8.390803337097168, "logits/rejected": -8.36653995513916, "logps/chosen": -2.9890265464782715, "logps/rejected": -115.19563293457031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8288321495056152, "rewards/margins": 11.185148239135742, "rewards/rejected": -8.356315612792969, "step": 1411 }, { "epoch": 0.9653050760553752, "grad_norm": 0.0005532748182304204, "learning_rate": 4.8195769549000893e-05, "logits/chosen": -7.991272449493408, "logits/rejected": -7.969335556030273, "logps/chosen": -3.2943458557128906, "logps/rejected": -115.3948974609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8910093307495117, "rewards/margins": 11.194236755371094, "rewards/rejected": -8.303228378295898, "step": 1412 }, { "epoch": 0.9659887198769441, "grad_norm": 0.0005719604669138789, "learning_rate": 4.818710807843843e-05, "logits/chosen": -7.796582221984863, "logits/rejected": -7.776040077209473, "logps/chosen": -5.647776126861572, "logps/rejected": -115.45085144042969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.680481195449829, "rewards/margins": 10.891077995300293, "rewards/rejected": -8.210597038269043, "step": 1413 }, { "epoch": 0.9666723636985131, "grad_norm": 0.0005037700175307691, "learning_rate": 4.817842664948317e-05, "logits/chosen": -7.634538173675537, "logits/rejected": -7.610711097717285, "logps/chosen": -0.18060587346553802, "logps/rejected": -118.29163360595703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.18113112449646, "rewards/margins": 11.750670433044434, "rewards/rejected": -8.569540023803711, "step": 1414 }, { "epoch": 0.967356007520082, "grad_norm": 0.00043980381451547146, "learning_rate": 4.81697252696085e-05, "logits/chosen": -7.939533233642578, "logits/rejected": -7.917428016662598, "logps/chosen": -2.374316692352295, "logps/rejected": -116.73197174072266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.914066791534424, "rewards/margins": 11.416428565979004, "rewards/rejected": -8.502362251281738, "step": 1415 }, { "epoch": 0.968039651341651, "grad_norm": 0.0004260108107700944, "learning_rate": 4.8161003946304975e-05, "logits/chosen": -7.689847946166992, "logits/rejected": -7.663125991821289, "logps/chosen": -3.7714695930480957, "logps/rejected": -115.02584838867188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.916079521179199, "rewards/margins": 11.115045547485352, "rewards/rejected": -8.198966979980469, "step": 1416 }, { "epoch": 0.96872329516322, "grad_norm": 0.0004984989645890892, "learning_rate": 4.815226268708031e-05, "logits/chosen": -8.479853630065918, "logits/rejected": -8.457287788391113, "logps/chosen": -3.4325180053710938, "logps/rejected": -115.22177124023438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.889421224594116, "rewards/margins": 11.222147941589355, "rewards/rejected": -8.332725524902344, "step": 1417 }, { "epoch": 0.9694069389847889, "grad_norm": 0.0005462641711346805, "learning_rate": 4.8143501499459395e-05, "logits/chosen": -7.9668803215026855, "logits/rejected": -7.942050933837891, "logps/chosen": -2.27215838432312, "logps/rejected": -117.09918212890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0157227516174316, "rewards/margins": 11.435811996459961, "rewards/rejected": -8.420087814331055, "step": 1418 }, { "epoch": 0.9700905828063578, "grad_norm": 0.00511953653767705, "learning_rate": 4.813472039098426e-05, "logits/chosen": -8.432422637939453, "logits/rejected": -8.408624649047852, "logps/chosen": -1.5505726337432861, "logps/rejected": -117.7197265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.027127504348755, "rewards/margins": 11.574596405029297, "rewards/rejected": -8.547469139099121, "step": 1419 }, { "epoch": 0.9707742266279269, "grad_norm": 0.0005379038630053401, "learning_rate": 4.8125919369214104e-05, "logits/chosen": -8.043804168701172, "logits/rejected": -8.021134376525879, "logps/chosen": -4.744858741760254, "logps/rejected": -116.956298828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.815603733062744, "rewards/margins": 11.191668510437012, "rewards/rejected": -8.37606430053711, "step": 1420 }, { "epoch": 0.9714578704494958, "grad_norm": 0.00042401516111567616, "learning_rate": 4.8117098441725265e-05, "logits/chosen": -8.816908836364746, "logits/rejected": -8.784547805786133, "logps/chosen": -0.18950432538986206, "logps/rejected": -118.36054992675781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1731836795806885, "rewards/margins": 11.796764373779297, "rewards/rejected": -8.623580932617188, "step": 1421 }, { "epoch": 0.9721415142710648, "grad_norm": 0.00042810162995010614, "learning_rate": 4.81082576161112e-05, "logits/chosen": -9.073381423950195, "logits/rejected": -9.047935485839844, "logps/chosen": -2.7773618698120117, "logps/rejected": -117.57137298583984, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9482674598693848, "rewards/margins": 11.462085723876953, "rewards/rejected": -8.51381778717041, "step": 1422 }, { "epoch": 0.9728251580926337, "grad_norm": 0.0006133444840088487, "learning_rate": 4.8099396899982486e-05, "logits/chosen": -8.028738021850586, "logits/rejected": -8.004366874694824, "logps/chosen": -1.8666718006134033, "logps/rejected": -117.43258666992188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.034514904022217, "rewards/margins": 11.512983322143555, "rewards/rejected": -8.478469848632812, "step": 1423 }, { "epoch": 0.9735088019142027, "grad_norm": 0.0005520040867850184, "learning_rate": 4.8090516300966866e-05, "logits/chosen": -8.290926933288574, "logits/rejected": -8.268217086791992, "logps/chosen": -8.84949016571045, "logps/rejected": -111.79730987548828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.429643392562866, "rewards/margins": 10.211151123046875, "rewards/rejected": -7.78150749206543, "step": 1424 }, { "epoch": 0.9741924457357717, "grad_norm": 0.0004835394211113453, "learning_rate": 4.808161582670916e-05, "logits/chosen": -8.019587516784668, "logits/rejected": -7.997065544128418, "logps/chosen": -0.23506119847297668, "logps/rejected": -118.72769165039062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1133594512939453, "rewards/margins": 11.732552528381348, "rewards/rejected": -8.619193077087402, "step": 1425 }, { "epoch": 0.9748760895573406, "grad_norm": 0.00048173347022384405, "learning_rate": 4.807269548487133e-05, "logits/chosen": -8.053374290466309, "logits/rejected": -8.033391952514648, "logps/chosen": -2.432157278060913, "logps/rejected": -116.31742858886719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.898451805114746, "rewards/margins": 11.357723236083984, "rewards/rejected": -8.459270477294922, "step": 1426 }, { "epoch": 0.9755597333789096, "grad_norm": 0.0007386979414150119, "learning_rate": 4.80637552831324e-05, "logits/chosen": -8.039505958557129, "logits/rejected": -8.017441749572754, "logps/chosen": -2.691483974456787, "logps/rejected": -117.66790771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.012970209121704, "rewards/margins": 11.515134811401367, "rewards/rejected": -8.50216293334961, "step": 1427 }, { "epoch": 0.9762433772004786, "grad_norm": 0.00048769620480015874, "learning_rate": 4.805479522918852e-05, "logits/chosen": -8.644789695739746, "logits/rejected": -8.617761611938477, "logps/chosen": -0.23156708478927612, "logps/rejected": -118.53211975097656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1266751289367676, "rewards/margins": 11.813725471496582, "rewards/rejected": -8.687049865722656, "step": 1428 }, { "epoch": 0.9769270210220475, "grad_norm": 0.0006973544950596988, "learning_rate": 4.804581533075293e-05, "logits/chosen": -8.554539680480957, "logits/rejected": -8.52792739868164, "logps/chosen": -4.937350273132324, "logps/rejected": -114.72758483886719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.702589988708496, "rewards/margins": 10.96237564086914, "rewards/rejected": -8.259785652160645, "step": 1429 }, { "epoch": 0.9776106648436165, "grad_norm": 0.000592358410358429, "learning_rate": 4.803681559555595e-05, "logits/chosen": -7.7288737297058105, "logits/rejected": -7.703502655029297, "logps/chosen": -3.6853678226470947, "logps/rejected": -115.78812408447266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9036383628845215, "rewards/margins": 11.212698936462402, "rewards/rejected": -8.309060096740723, "step": 1430 }, { "epoch": 0.9782943086651854, "grad_norm": 0.0007388530648313463, "learning_rate": 4.8027796031344946e-05, "logits/chosen": -8.201752662658691, "logits/rejected": -8.176280975341797, "logps/chosen": -1.9134626388549805, "logps/rejected": -117.1952896118164, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9954473972320557, "rewards/margins": 11.48451042175293, "rewards/rejected": -8.489063262939453, "step": 1431 }, { "epoch": 0.9789779524867545, "grad_norm": 0.0006177971954457462, "learning_rate": 4.801875664588441e-05, "logits/chosen": -7.644628524780273, "logits/rejected": -7.619627475738525, "logps/chosen": -0.25170761346817017, "logps/rejected": -118.41864013671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1203665733337402, "rewards/margins": 11.780537605285645, "rewards/rejected": -8.660170555114746, "step": 1432 }, { "epoch": 0.9796615963083234, "grad_norm": 0.0004586310242302716, "learning_rate": 4.800969744695585e-05, "logits/chosen": -8.04806137084961, "logits/rejected": -8.023771286010742, "logps/chosen": -3.86983323097229, "logps/rejected": -114.05731964111328, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9095499515533447, "rewards/margins": 11.014827728271484, "rewards/rejected": -8.105277061462402, "step": 1433 }, { "epoch": 0.9803452401298923, "grad_norm": 0.0005234279669821262, "learning_rate": 4.800061844235786e-05, "logits/chosen": -8.399038314819336, "logits/rejected": -8.370157241821289, "logps/chosen": -3.473271369934082, "logps/rejected": -116.35971069335938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9492788314819336, "rewards/margins": 11.300786972045898, "rewards/rejected": -8.351507186889648, "step": 1434 }, { "epoch": 0.9810288839514613, "grad_norm": 0.00044891799916513264, "learning_rate": 4.799151963990605e-05, "logits/chosen": -8.00802993774414, "logits/rejected": -7.984992027282715, "logps/chosen": -2.8768181800842285, "logps/rejected": -116.22029113769531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8807530403137207, "rewards/margins": 11.285555839538574, "rewards/rejected": -8.404803276062012, "step": 1435 }, { "epoch": 0.9817125277730302, "grad_norm": 0.0005147407646290958, "learning_rate": 4.798240104743311e-05, "logits/chosen": -8.486943244934082, "logits/rejected": -8.464432716369629, "logps/chosen": -4.466490745544434, "logps/rejected": -114.97575378417969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.665339231491089, "rewards/margins": 10.975156784057617, "rewards/rejected": -8.309818267822266, "step": 1436 }, { "epoch": 0.9823961715945992, "grad_norm": 0.00041984260315075517, "learning_rate": 4.797326267278875e-05, "logits/chosen": -8.67398452758789, "logits/rejected": -8.647644996643066, "logps/chosen": -4.105910301208496, "logps/rejected": -116.39286041259766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8372769355773926, "rewards/margins": 11.200801849365234, "rewards/rejected": -8.363524436950684, "step": 1437 }, { "epoch": 0.9830798154161682, "grad_norm": 0.0004290193028282374, "learning_rate": 4.79641045238397e-05, "logits/chosen": -8.483174324035645, "logits/rejected": -8.452775955200195, "logps/chosen": -0.28382495045661926, "logps/rejected": -118.69342041015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.15877103805542, "rewards/margins": 11.802495002746582, "rewards/rejected": -8.64372444152832, "step": 1438 }, { "epoch": 0.9837634592377371, "grad_norm": 0.0004940185463055968, "learning_rate": 4.795492660846973e-05, "logits/chosen": -8.073529243469238, "logits/rejected": -8.050058364868164, "logps/chosen": -3.32718825340271, "logps/rejected": -116.28855895996094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8568615913391113, "rewards/margins": 11.23708724975586, "rewards/rejected": -8.380227088928223, "step": 1439 }, { "epoch": 0.984447103059306, "grad_norm": 0.0006068128859624267, "learning_rate": 4.794572893457962e-05, "logits/chosen": -8.110042572021484, "logits/rejected": -8.08810806274414, "logps/chosen": -2.428036689758301, "logps/rejected": -115.89505004882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9082212448120117, "rewards/margins": 11.256814956665039, "rewards/rejected": -8.348592758178711, "step": 1440 }, { "epoch": 0.9851307468808751, "grad_norm": 0.000788584933616221, "learning_rate": 4.793651151008715e-05, "logits/chosen": -8.185977935791016, "logits/rejected": -8.161170959472656, "logps/chosen": -4.214919090270996, "logps/rejected": -116.62789154052734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.854810953140259, "rewards/margins": 11.256418228149414, "rewards/rejected": -8.401607513427734, "step": 1441 }, { "epoch": 0.985814390702444, "grad_norm": 0.000484639109345153, "learning_rate": 4.79272743429271e-05, "logits/chosen": -8.239517211914062, "logits/rejected": -8.215975761413574, "logps/chosen": -7.387721061706543, "logps/rejected": -114.85902404785156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.59907865524292, "rewards/margins": 10.667196273803711, "rewards/rejected": -8.068117141723633, "step": 1442 }, { "epoch": 0.986498034524013, "grad_norm": 0.0005757987382821739, "learning_rate": 4.7918017441051286e-05, "logits/chosen": -7.937175750732422, "logits/rejected": -7.9156107902526855, "logps/chosen": -6.300258636474609, "logps/rejected": -115.17433166503906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5870132446289062, "rewards/margins": 10.866737365722656, "rewards/rejected": -8.279722213745117, "step": 1443 }, { "epoch": 0.9871816783455819, "grad_norm": 0.0007247687899507582, "learning_rate": 4.790874081242845e-05, "logits/chosen": -7.880533695220947, "logits/rejected": -7.856389999389648, "logps/chosen": -3.4068026542663574, "logps/rejected": -116.11295318603516, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9037094116210938, "rewards/margins": 11.230412483215332, "rewards/rejected": -8.326704025268555, "step": 1444 }, { "epoch": 0.987865322167151, "grad_norm": 0.00039695901796221733, "learning_rate": 4.789944446504437e-05, "logits/chosen": -8.693038940429688, "logits/rejected": -8.66805362701416, "logps/chosen": -1.9073673486709595, "logps/rejected": -117.22352600097656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.938608169555664, "rewards/margins": 11.460874557495117, "rewards/rejected": -8.52226734161377, "step": 1445 }, { "epoch": 0.9885489659887199, "grad_norm": 0.0006510947132483125, "learning_rate": 4.7890128406901754e-05, "logits/chosen": -8.015436172485352, "logits/rejected": -7.993863582611084, "logps/chosen": -4.218257427215576, "logps/rejected": -116.44905853271484, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8273720741271973, "rewards/margins": 11.190637588500977, "rewards/rejected": -8.363265037536621, "step": 1446 }, { "epoch": 0.9892326098102888, "grad_norm": 0.00041739403968676925, "learning_rate": 4.7880792646020315e-05, "logits/chosen": -8.573802947998047, "logits/rejected": -8.549030303955078, "logps/chosen": -1.1026946306228638, "logps/rejected": -117.06588745117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0662271976470947, "rewards/margins": 11.569608688354492, "rewards/rejected": -8.50338077545166, "step": 1447 }, { "epoch": 0.9899162536318578, "grad_norm": 0.0004720393044408411, "learning_rate": 4.78714371904367e-05, "logits/chosen": -8.704023361206055, "logits/rejected": -8.674433708190918, "logps/chosen": -0.21279987692832947, "logps/rejected": -118.22233581542969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1246347427368164, "rewards/margins": 11.7197847366333, "rewards/rejected": -8.595149993896484, "step": 1448 }, { "epoch": 0.9905998974534268, "grad_norm": 0.0005632918328046799, "learning_rate": 4.7862062048204536e-05, "logits/chosen": -8.879329681396484, "logits/rejected": -8.851112365722656, "logps/chosen": -2.0288898944854736, "logps/rejected": -117.26383972167969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.953697681427002, "rewards/margins": 11.479681968688965, "rewards/rejected": -8.525983810424805, "step": 1449 }, { "epoch": 0.9912835412749957, "grad_norm": 0.0004930102732032537, "learning_rate": 4.785266722739438e-05, "logits/chosen": -8.434072494506836, "logits/rejected": -8.411018371582031, "logps/chosen": -4.547789573669434, "logps/rejected": -115.09446716308594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7171459197998047, "rewards/margins": 11.014267921447754, "rewards/rejected": -8.297121047973633, "step": 1450 }, { "epoch": 0.9919671850965647, "grad_norm": 0.0004581655084621161, "learning_rate": 4.7843252736093714e-05, "logits/chosen": -9.090365409851074, "logits/rejected": -9.061616897583008, "logps/chosen": -7.312652111053467, "logps/rejected": -112.66590118408203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.400780200958252, "rewards/margins": 10.474326133728027, "rewards/rejected": -8.073545455932617, "step": 1451 }, { "epoch": 0.9926508289181336, "grad_norm": 0.0005654146079905331, "learning_rate": 4.7833818582407e-05, "logits/chosen": -8.299306869506836, "logits/rejected": -8.275521278381348, "logps/chosen": -3.5221025943756104, "logps/rejected": -115.45541381835938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8158743381500244, "rewards/margins": 11.110577583312988, "rewards/rejected": -8.294703483581543, "step": 1452 }, { "epoch": 0.9933344727397027, "grad_norm": 0.0007387820514850318, "learning_rate": 4.782436477445557e-05, "logits/chosen": -8.367149353027344, "logits/rejected": -8.345664978027344, "logps/chosen": -5.343130588531494, "logps/rejected": -115.27506256103516, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6468186378479004, "rewards/margins": 11.019107818603516, "rewards/rejected": -8.37228775024414, "step": 1453 }, { "epoch": 0.9940181165612716, "grad_norm": 0.0005441796383820474, "learning_rate": 4.781489132037773e-05, "logits/chosen": -8.709607124328613, "logits/rejected": -8.684736251831055, "logps/chosen": -7.873733997344971, "logps/rejected": -114.47486877441406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.558863639831543, "rewards/margins": 10.660877227783203, "rewards/rejected": -8.10201358795166, "step": 1454 }, { "epoch": 0.9947017603828405, "grad_norm": 0.0007720529683865607, "learning_rate": 4.780539822832864e-05, "logits/chosen": -8.116147994995117, "logits/rejected": -8.087377548217773, "logps/chosen": -1.8547946214675903, "logps/rejected": -117.76957702636719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0001306533813477, "rewards/margins": 11.54566478729248, "rewards/rejected": -8.545534133911133, "step": 1455 }, { "epoch": 0.9953854042044095, "grad_norm": 0.0004110531008336693, "learning_rate": 4.779588550648043e-05, "logits/chosen": -7.473636150360107, "logits/rejected": -7.450601577758789, "logps/chosen": -2.8893003463745117, "logps/rejected": -116.07787322998047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9777588844299316, "rewards/margins": 11.306123733520508, "rewards/rejected": -8.328365325927734, "step": 1456 }, { "epoch": 0.9960690480259785, "grad_norm": 0.0008718686876818538, "learning_rate": 4.7786353163022074e-05, "logits/chosen": -8.785465240478516, "logits/rejected": -8.759981155395508, "logps/chosen": -0.2828516960144043, "logps/rejected": -118.59123229980469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.076323986053467, "rewards/margins": 11.82604694366455, "rewards/rejected": -8.749722480773926, "step": 1457 }, { "epoch": 0.9967526918475474, "grad_norm": 0.0004912349977530539, "learning_rate": 4.777680120615947e-05, "logits/chosen": -8.391707420349121, "logits/rejected": -8.360785484313965, "logps/chosen": -0.2592249810695648, "logps/rejected": -118.85089111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1404199600219727, "rewards/margins": 11.839757919311523, "rewards/rejected": -8.699337005615234, "step": 1458 }, { "epoch": 0.9974363356691164, "grad_norm": 0.00040348953916691244, "learning_rate": 4.7767229644115375e-05, "logits/chosen": -8.733474731445312, "logits/rejected": -8.706860542297363, "logps/chosen": -2.569934844970703, "logps/rejected": -117.3207015991211, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.920285224914551, "rewards/margins": 11.430595397949219, "rewards/rejected": -8.510310173034668, "step": 1459 }, { "epoch": 0.9981199794906853, "grad_norm": 0.0005161604494787753, "learning_rate": 4.775763848512945e-05, "logits/chosen": -8.631210327148438, "logits/rejected": -8.604156494140625, "logps/chosen": -1.4468181133270264, "logps/rejected": -117.93389892578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0527281761169434, "rewards/margins": 11.595368385314941, "rewards/rejected": -8.54263973236084, "step": 1460 }, { "epoch": 0.9988036233122544, "grad_norm": 0.000560856715310365, "learning_rate": 4.77480277374582e-05, "logits/chosen": -7.7861433029174805, "logits/rejected": -7.764842987060547, "logps/chosen": -0.27965080738067627, "logps/rejected": -117.87433624267578, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0720224380493164, "rewards/margins": 11.715263366699219, "rewards/rejected": -8.643241882324219, "step": 1461 }, { "epoch": 0.9994872671338233, "grad_norm": 0.00031527827377431095, "learning_rate": 4.773839740937501e-05, "logits/chosen": -8.95820140838623, "logits/rejected": -8.930770874023438, "logps/chosen": -3.4509217739105225, "logps/rejected": -116.88359069824219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.819087028503418, "rewards/margins": 11.224546432495117, "rewards/rejected": -8.405460357666016, "step": 1462 }, { "epoch": 1.0001709109553922, "grad_norm": 0.0005215179407969117, "learning_rate": 4.772874750917012e-05, "logits/chosen": -8.516026496887207, "logits/rejected": -8.495267868041992, "logps/chosen": -2.2863221168518066, "logps/rejected": -117.29500579833984, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.906411647796631, "rewards/margins": 11.426310539245605, "rewards/rejected": -8.519899368286133, "step": 1463 }, { "epoch": 1.0008545547769612, "grad_norm": 0.0005931539344601333, "learning_rate": 4.771907804515061e-05, "logits/chosen": -8.440147399902344, "logits/rejected": -8.412713050842285, "logps/chosen": -1.857930302619934, "logps/rejected": -117.55647277832031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9339723587036133, "rewards/margins": 11.553009986877441, "rewards/rejected": -8.619036674499512, "step": 1464 }, { "epoch": 1.00153819859853, "grad_norm": 0.0015417997492477298, "learning_rate": 4.770938902564041e-05, "logits/chosen": -8.836585998535156, "logits/rejected": -8.814688682556152, "logps/chosen": -0.32511454820632935, "logps/rejected": -119.04838562011719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9650702476501465, "rewards/margins": 11.75595760345459, "rewards/rejected": -8.790887832641602, "step": 1465 }, { "epoch": 1.002221842420099, "grad_norm": 0.0005005328566767275, "learning_rate": 4.769968045898027e-05, "logits/chosen": -8.364471435546875, "logits/rejected": -8.342220306396484, "logps/chosen": -7.329944610595703, "logps/rejected": -114.78330993652344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5787010192871094, "rewards/margins": 10.700876235961914, "rewards/rejected": -8.122176170349121, "step": 1466 }, { "epoch": 1.0029054862416682, "grad_norm": 0.0007159854285418987, "learning_rate": 4.76899523535278e-05, "logits/chosen": -8.270731925964355, "logits/rejected": -8.246550559997559, "logps/chosen": -0.25531327724456787, "logps/rejected": -118.4389877319336, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1699533462524414, "rewards/margins": 11.794919967651367, "rewards/rejected": -8.62496566772461, "step": 1467 }, { "epoch": 1.0035891300632371, "grad_norm": 0.0005245906650088727, "learning_rate": 4.768020471765738e-05, "logits/chosen": -8.369646072387695, "logits/rejected": -8.342447280883789, "logps/chosen": -2.911684513092041, "logps/rejected": -117.12777709960938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9707517623901367, "rewards/margins": 11.353440284729004, "rewards/rejected": -8.382688522338867, "step": 1468 }, { "epoch": 1.004272773884806, "grad_norm": 0.00046009791549295187, "learning_rate": 4.767043755976025e-05, "logits/chosen": -8.350549697875977, "logits/rejected": -8.325033187866211, "logps/chosen": -1.297900915145874, "logps/rejected": -117.48629760742188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.998358726501465, "rewards/margins": 11.547554016113281, "rewards/rejected": -8.549196243286133, "step": 1469 }, { "epoch": 1.004956417706375, "grad_norm": 0.00037312047788873315, "learning_rate": 4.766065088824442e-05, "logits/chosen": -8.485559463500977, "logits/rejected": -8.453821182250977, "logps/chosen": -0.2620188891887665, "logps/rejected": -118.83689880371094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1846275329589844, "rewards/margins": 11.866382598876953, "rewards/rejected": -8.681755065917969, "step": 1470 }, { "epoch": 1.005640061527944, "grad_norm": 0.00044114436605013907, "learning_rate": 4.765084471153472e-05, "logits/chosen": -8.524364471435547, "logits/rejected": -8.501016616821289, "logps/chosen": -0.2298356294631958, "logps/rejected": -118.82377624511719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1373729705810547, "rewards/margins": 11.838203430175781, "rewards/rejected": -8.700830459594727, "step": 1471 }, { "epoch": 1.0063237053495129, "grad_norm": 0.0004037067119497806, "learning_rate": 4.7641019038072774e-05, "logits/chosen": -8.213691711425781, "logits/rejected": -8.18779182434082, "logps/chosen": -3.7222700119018555, "logps/rejected": -117.22877502441406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8261172771453857, "rewards/margins": 11.343841552734375, "rewards/rejected": -8.517724990844727, "step": 1472 }, { "epoch": 1.0070073491710818, "grad_norm": 0.0005738798063248396, "learning_rate": 4.763117387631696e-05, "logits/chosen": -8.431140899658203, "logits/rejected": -8.410362243652344, "logps/chosen": -7.599185466766357, "logps/rejected": -116.80765533447266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.5594944953918457, "rewards/margins": 10.858413696289062, "rewards/rejected": -8.298919677734375, "step": 1473 }, { "epoch": 1.0076909929926507, "grad_norm": 0.0004859022446908057, "learning_rate": 4.762130923474248e-05, "logits/chosen": -8.141043663024902, "logits/rejected": -8.120769500732422, "logps/chosen": -2.361053466796875, "logps/rejected": -118.39130401611328, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.903681516647339, "rewards/margins": 11.489386558532715, "rewards/rejected": -8.585705757141113, "step": 1474 }, { "epoch": 1.0083746368142197, "grad_norm": 0.0004205108853057027, "learning_rate": 4.761142512184125e-05, "logits/chosen": -8.309585571289062, "logits/rejected": -8.284080505371094, "logps/chosen": -3.8623275756835938, "logps/rejected": -117.197998046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.842977285385132, "rewards/margins": 11.33416748046875, "rewards/rejected": -8.491189956665039, "step": 1475 }, { "epoch": 1.0090582806357888, "grad_norm": 0.00046727884910069406, "learning_rate": 4.7601521546122005e-05, "logits/chosen": -8.042664527893066, "logits/rejected": -8.019739151000977, "logps/chosen": -4.2062907218933105, "logps/rejected": -115.53308868408203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8695244789123535, "rewards/margins": 11.128353118896484, "rewards/rejected": -8.258828163146973, "step": 1476 }, { "epoch": 1.0097419244573578, "grad_norm": 0.0004721475997939706, "learning_rate": 4.759159851611017e-05, "logits/chosen": -7.80107307434082, "logits/rejected": -7.778738975524902, "logps/chosen": -1.8813132047653198, "logps/rejected": -117.95721435546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.005103349685669, "rewards/margins": 11.557762145996094, "rewards/rejected": -8.552659034729004, "step": 1477 }, { "epoch": 1.0104255682789267, "grad_norm": 0.0004165636783000082, "learning_rate": 4.7581656040347986e-05, "logits/chosen": -8.466707229614258, "logits/rejected": -8.442545890808105, "logps/chosen": -4.372413635253906, "logps/rejected": -117.01057434082031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7963051795959473, "rewards/margins": 11.223401069641113, "rewards/rejected": -8.427095413208008, "step": 1478 }, { "epoch": 1.0111092121004956, "grad_norm": 0.002518878784030676, "learning_rate": 4.757169412739437e-05, "logits/chosen": -8.607147216796875, "logits/rejected": -8.583011627197266, "logps/chosen": -4.339472770690918, "logps/rejected": -117.15882873535156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8152856826782227, "rewards/margins": 11.238371849060059, "rewards/rejected": -8.423086166381836, "step": 1479 }, { "epoch": 1.0117928559220646, "grad_norm": 0.00041566972504369915, "learning_rate": 4.7561712785825034e-05, "logits/chosen": -8.190460205078125, "logits/rejected": -8.161078453063965, "logps/chosen": -2.239342212677002, "logps/rejected": -117.49491882324219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.023620128631592, "rewards/margins": 11.52928352355957, "rewards/rejected": -8.50566291809082, "step": 1480 }, { "epoch": 1.0124764997436335, "grad_norm": 0.0005672018742188811, "learning_rate": 4.755171202423236e-05, "logits/chosen": -8.557475090026855, "logits/rejected": -8.534822463989258, "logps/chosen": -3.8927183151245117, "logps/rejected": -116.9569091796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7456374168395996, "rewards/margins": 11.338239669799805, "rewards/rejected": -8.592602729797363, "step": 1481 }, { "epoch": 1.0131601435652025, "grad_norm": 0.00038510316517204046, "learning_rate": 4.754169185122547e-05, "logits/chosen": -7.966883182525635, "logits/rejected": -7.940473556518555, "logps/chosen": -1.7622233629226685, "logps/rejected": -117.69621276855469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.058802366256714, "rewards/margins": 11.584504127502441, "rewards/rejected": -8.525701522827148, "step": 1482 }, { "epoch": 1.0138437873867714, "grad_norm": 0.0006461461307480931, "learning_rate": 4.753165227543021e-05, "logits/chosen": -8.507442474365234, "logits/rejected": -8.47325611114502, "logps/chosen": -2.447314739227295, "logps/rejected": -117.74191284179688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.007080078125, "rewards/margins": 11.545415878295898, "rewards/rejected": -8.538335800170898, "step": 1483 }, { "epoch": 1.0145274312083405, "grad_norm": 0.0005450506578199565, "learning_rate": 4.7521593305489104e-05, "logits/chosen": -8.710807800292969, "logits/rejected": -8.68427848815918, "logps/chosen": -5.912083148956299, "logps/rejected": -116.5137710571289, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.629728078842163, "rewards/margins": 10.944690704345703, "rewards/rejected": -8.314962387084961, "step": 1484 }, { "epoch": 1.0152110750299095, "grad_norm": 0.0004972880706191063, "learning_rate": 4.751151495006139e-05, "logits/chosen": -8.463788986206055, "logits/rejected": -8.431534767150879, "logps/chosen": -0.22363126277923584, "logps/rejected": -118.97086334228516, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.2024989128112793, "rewards/margins": 11.90960693359375, "rewards/rejected": -8.707107543945312, "step": 1485 }, { "epoch": 1.0158947188514784, "grad_norm": 0.0004097105411347002, "learning_rate": 4.7501417217822985e-05, "logits/chosen": -8.283233642578125, "logits/rejected": -8.254633903503418, "logps/chosen": -1.9917285442352295, "logps/rejected": -117.4720458984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.007736921310425, "rewards/margins": 11.5696439743042, "rewards/rejected": -8.561907768249512, "step": 1486 }, { "epoch": 1.0165783626730474, "grad_norm": 0.0007344750338234007, "learning_rate": 4.749130011746648e-05, "logits/chosen": -7.914640426635742, "logits/rejected": -7.890373229980469, "logps/chosen": -4.152311325073242, "logps/rejected": -117.15229034423828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7473325729370117, "rewards/margins": 11.173181533813477, "rewards/rejected": -8.425848960876465, "step": 1487 }, { "epoch": 1.0172620064946163, "grad_norm": 0.000523555965628475, "learning_rate": 4.7481163657701155e-05, "logits/chosen": -8.020648956298828, "logits/rejected": -7.999275207519531, "logps/chosen": -3.526318311691284, "logps/rejected": -115.75598907470703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.875678777694702, "rewards/margins": 11.192302703857422, "rewards/rejected": -8.31662368774414, "step": 1488 }, { "epoch": 1.0179456503161852, "grad_norm": 0.00043412804370746017, "learning_rate": 4.747100784725296e-05, "logits/chosen": -8.760818481445312, "logits/rejected": -8.731951713562012, "logps/chosen": -2.6654937267303467, "logps/rejected": -118.17082214355469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.937941551208496, "rewards/margins": 11.52265739440918, "rewards/rejected": -8.584716796875, "step": 1489 }, { "epoch": 1.0186292941377542, "grad_norm": 0.000597175385337323, "learning_rate": 4.7460832694864466e-05, "logits/chosen": -7.885517120361328, "logits/rejected": -7.863033294677734, "logps/chosen": -2.9215991497039795, "logps/rejected": -115.67057800292969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.860801935195923, "rewards/margins": 11.218114852905273, "rewards/rejected": -8.35731315612793, "step": 1490 }, { "epoch": 1.019312937959323, "grad_norm": 0.000782771734520793, "learning_rate": 4.745063820929493e-05, "logits/chosen": -7.221593856811523, "logits/rejected": -7.1973676681518555, "logps/chosen": -4.149637222290039, "logps/rejected": -116.37774658203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8707005977630615, "rewards/margins": 11.213022232055664, "rewards/rejected": -8.342321395874023, "step": 1491 }, { "epoch": 1.0199965817808923, "grad_norm": 0.00047814141726121306, "learning_rate": 4.744042439932024e-05, "logits/chosen": -7.879638671875, "logits/rejected": -7.852182388305664, "logps/chosen": -0.794178307056427, "logps/rejected": -117.8206787109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0918965339660645, "rewards/margins": 11.655786514282227, "rewards/rejected": -8.563889503479004, "step": 1492 }, { "epoch": 1.0206802256024612, "grad_norm": 0.0004867608950007707, "learning_rate": 4.7430191273732935e-05, "logits/chosen": -8.509159088134766, "logits/rejected": -8.479840278625488, "logps/chosen": -2.626254081726074, "logps/rejected": -118.51113891601562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1041595935821533, "rewards/margins": 11.597925186157227, "rewards/rejected": -8.493765830993652, "step": 1493 }, { "epoch": 1.0213638694240301, "grad_norm": 0.0004360819002613425, "learning_rate": 4.741993884134214e-05, "logits/chosen": -8.213179588317871, "logits/rejected": -8.184392929077148, "logps/chosen": -2.2647149562835693, "logps/rejected": -118.62176513671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.946074962615967, "rewards/margins": 11.566644668579102, "rewards/rejected": -8.620569229125977, "step": 1494 }, { "epoch": 1.022047513245599, "grad_norm": 0.0008716142037883401, "learning_rate": 4.7409667110973646e-05, "logits/chosen": -8.354615211486816, "logits/rejected": -8.321649551391602, "logps/chosen": -0.7334609031677246, "logps/rejected": -118.0571060180664, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0948379039764404, "rewards/margins": 11.72536563873291, "rewards/rejected": -8.63052749633789, "step": 1495 }, { "epoch": 1.022731157067168, "grad_norm": 0.0005964471492916346, "learning_rate": 4.739937609146984e-05, "logits/chosen": -8.242355346679688, "logits/rejected": -8.221118927001953, "logps/chosen": -3.6977078914642334, "logps/rejected": -116.05718994140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7692203521728516, "rewards/margins": 11.182385444641113, "rewards/rejected": -8.413164138793945, "step": 1496 }, { "epoch": 1.023414800888737, "grad_norm": 0.0004897978506051004, "learning_rate": 4.73890657916897e-05, "logits/chosen": -7.9427056312561035, "logits/rejected": -7.919280052185059, "logps/chosen": -1.874027132987976, "logps/rejected": -118.0886459350586, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.03080153465271, "rewards/margins": 11.606219291687012, "rewards/rejected": -8.575417518615723, "step": 1497 }, { "epoch": 1.0240984447103059, "grad_norm": 0.0006497633294202387, "learning_rate": 4.7378736220508825e-05, "logits/chosen": -7.80632209777832, "logits/rejected": -7.781477928161621, "logps/chosen": -3.0972952842712402, "logps/rejected": -116.2794189453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9141745567321777, "rewards/margins": 11.231149673461914, "rewards/rejected": -8.316975593566895, "step": 1498 }, { "epoch": 1.0247820885318748, "grad_norm": 0.00043167086550965905, "learning_rate": 4.736838738681937e-05, "logits/chosen": -7.529211044311523, "logits/rejected": -7.504091262817383, "logps/chosen": -3.336428642272949, "logps/rejected": -116.50300598144531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9357006549835205, "rewards/margins": 11.305810928344727, "rewards/rejected": -8.370110511779785, "step": 1499 }, { "epoch": 1.025465732353444, "grad_norm": 0.00044662135769613087, "learning_rate": 4.7358019299530123e-05, "logits/chosen": -7.870656967163086, "logits/rejected": -7.8482985496521, "logps/chosen": -3.2179346084594727, "logps/rejected": -117.09274291992188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8841352462768555, "rewards/margins": 11.32730770111084, "rewards/rejected": -8.443172454833984, "step": 1500 } ], "logging_steps": 1.0, "max_steps": 4386, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.086995024908124e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }