{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996489996489997, "eval_steps": 500, "global_step": 1424, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000702000702000702, "grad_norm": 0.7362509965896606, "learning_rate": 0.0, "logits/chosen": -6.327065467834473, "logits/rejected": -6.3214111328125, "logps/chosen": -31.38215446472168, "logps/rejected": -31.593799591064453, "loss": 0.6911, "rewards/accuracies": 0.125, "rewards/chosen": 0.0008592843660153449, "rewards/margins": -0.011922812089323997, "rewards/rejected": 0.012782096862792969, "step": 1 }, { "epoch": 0.001404001404001404, "grad_norm": 0.5403744578361511, "learning_rate": 5.017166594399687e-06, "logits/chosen": -6.465273857116699, "logits/rejected": -6.457963943481445, "logps/chosen": -31.11476707458496, "logps/rejected": -31.728424072265625, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.01498501282185316, "rewards/margins": 0.031891629099845886, "rewards/rejected": -0.01690661907196045, "step": 2 }, { "epoch": 0.002106002106002106, "grad_norm": 0.5609850287437439, "learning_rate": 7.952020911994375e-06, "logits/chosen": -7.335034370422363, "logits/rejected": -7.3306050300598145, "logps/chosen": -31.1169376373291, "logps/rejected": -31.970130920410156, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.02497897297143936, "rewards/margins": 0.03217017650604248, "rewards/rejected": -0.007191205397248268, "step": 3 }, { "epoch": 0.002808002808002808, "grad_norm": 0.6437790393829346, "learning_rate": 1.0034333188799373e-05, "logits/chosen": -6.426914215087891, "logits/rejected": -6.419122219085693, "logps/chosen": -31.469467163085938, "logps/rejected": -32.89900207519531, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": 0.035521674901247025, "rewards/margins": 0.06036510691046715, "rewards/rejected": -0.024843430146574974, "step": 4 }, { "epoch": 0.00351000351000351, "grad_norm": 0.5063899159431458, "learning_rate": 1.164950007226698e-05, "logits/chosen": -6.346234321594238, "logits/rejected": -6.345472812652588, "logps/chosen": -31.976961135864258, "logps/rejected": -33.20209503173828, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": 0.03988344594836235, "rewards/margins": 0.06311867386102676, "rewards/rejected": -0.023235226050019264, "step": 5 }, { "epoch": 0.004212004212004212, "grad_norm": 0.8671891093254089, "learning_rate": 1.2969187506394062e-05, "logits/chosen": -7.327124118804932, "logits/rejected": -7.327218055725098, "logps/chosen": -33.43262481689453, "logps/rejected": -32.30723571777344, "loss": 0.6928, "rewards/accuracies": 0.25, "rewards/chosen": -0.0005933530628681183, "rewards/margins": -0.032119229435920715, "rewards/rejected": 0.0315258763730526, "step": 6 }, { "epoch": 0.004914004914004914, "grad_norm": 0.6007040143013, "learning_rate": 1.4084967333570947e-05, "logits/chosen": -6.732720375061035, "logits/rejected": -6.738742828369141, "logps/chosen": -31.847654342651367, "logps/rejected": -31.445751190185547, "loss": 0.6741, "rewards/accuracies": 0.375, "rewards/chosen": 0.03493847697973251, "rewards/margins": -0.00411059707403183, "rewards/rejected": 0.03904907405376434, "step": 7 }, { "epoch": 0.005616005616005616, "grad_norm": 0.598247230052948, "learning_rate": 1.505149978319906e-05, "logits/chosen": -6.997643947601318, "logits/rejected": -6.9969940185546875, "logps/chosen": -29.658597946166992, "logps/rejected": -31.438892364501953, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": 0.10314944386482239, "rewards/margins": 0.11496177315711975, "rewards/rejected": -0.011812330223619938, "step": 8 }, { "epoch": 0.006318006318006318, "grad_norm": 0.6316475868225098, "learning_rate": 1.590404182398875e-05, "logits/chosen": -7.152309894561768, "logits/rejected": -7.13640832901001, "logps/chosen": -31.014575958251953, "logps/rejected": -30.484161376953125, "loss": 0.6699, "rewards/accuracies": 0.375, "rewards/chosen": 0.05054879188537598, "rewards/margins": -0.03699038177728653, "rewards/rejected": 0.0875391736626625, "step": 9 }, { "epoch": 0.00702000702000702, "grad_norm": 0.7718402147293091, "learning_rate": 1.666666666666667e-05, "logits/chosen": -6.588470458984375, "logits/rejected": -6.567473411560059, "logps/chosen": -31.99350357055664, "logps/rejected": -32.00482940673828, "loss": 0.655, "rewards/accuracies": 0.5, "rewards/chosen": 0.114017553627491, "rewards/margins": 0.06192741543054581, "rewards/rejected": 0.05209014192223549, "step": 10 }, { "epoch": 0.007722007722007722, "grad_norm": 0.6582656502723694, "learning_rate": 1.7356544752637084e-05, "logits/chosen": -7.080864429473877, "logits/rejected": -7.047537803649902, "logps/chosen": -30.979829788208008, "logps/rejected": -31.83462142944336, "loss": 0.6514, "rewards/accuracies": 0.75, "rewards/chosen": 0.16073352098464966, "rewards/margins": 0.10353808850049973, "rewards/rejected": 0.057195425033569336, "step": 11 }, { "epoch": 0.008424008424008424, "grad_norm": 1.034062147140503, "learning_rate": 1.7986354100793748e-05, "logits/chosen": -6.488955497741699, "logits/rejected": -6.4926652908325195, "logps/chosen": -30.68929672241211, "logps/rejected": -31.537372589111328, "loss": 0.6501, "rewards/accuracies": 0.75, "rewards/chosen": 0.2007775604724884, "rewards/margins": 0.10109500586986542, "rewards/rejected": 0.09968254715204239, "step": 12 }, { "epoch": 0.009126009126009126, "grad_norm": 0.7473065257072449, "learning_rate": 1.8565722538447282e-05, "logits/chosen": -7.174704551696777, "logits/rejected": -7.174220085144043, "logps/chosen": -28.073570251464844, "logps/rejected": -31.040287017822266, "loss": 0.6277, "rewards/accuracies": 0.75, "rewards/chosen": 0.33573704957962036, "rewards/margins": 0.24402739107608795, "rewards/rejected": 0.09170965850353241, "step": 13 }, { "epoch": 0.009828009828009828, "grad_norm": 0.9580808877944946, "learning_rate": 1.9102133927970633e-05, "logits/chosen": -6.443475723266602, "logits/rejected": -6.452967643737793, "logps/chosen": -28.645126342773438, "logps/rejected": -30.86092758178711, "loss": 0.609, "rewards/accuracies": 1.0, "rewards/chosen": 0.30662083625793457, "rewards/margins": 0.227390855550766, "rewards/rejected": 0.07922998070716858, "step": 14 }, { "epoch": 0.01053001053001053, "grad_norm": 0.945046067237854, "learning_rate": 1.9601520984261358e-05, "logits/chosen": -7.185513496398926, "logits/rejected": -7.18564510345459, "logps/chosen": -28.89418601989746, "logps/rejected": -31.196807861328125, "loss": 0.596, "rewards/accuracies": 0.5, "rewards/chosen": 0.3306455612182617, "rewards/margins": 0.15761223435401917, "rewards/rejected": 0.17303331196308136, "step": 15 }, { "epoch": 0.011232011232011231, "grad_norm": 1.099999189376831, "learning_rate": 2.0068666377598747e-05, "logits/chosen": -6.949054718017578, "logits/rejected": -6.9503889083862305, "logps/chosen": -28.481788635253906, "logps/rejected": -33.57887268066406, "loss": 0.5662, "rewards/accuracies": 0.875, "rewards/chosen": 0.4179115891456604, "rewards/margins": 0.3992195427417755, "rewards/rejected": 0.01869208738207817, "step": 16 }, { "epoch": 0.011934011934011933, "grad_norm": 1.455985426902771, "learning_rate": 2.0507482022971233e-05, "logits/chosen": -6.771892547607422, "logits/rejected": -6.771533012390137, "logps/chosen": -29.03791046142578, "logps/rejected": -32.80541229248047, "loss": 0.5483, "rewards/accuracies": 0.75, "rewards/chosen": 0.3084973096847534, "rewards/margins": 0.30892568826675415, "rewards/rejected": -0.00042839162051677704, "step": 17 }, { "epoch": 0.012636012636012635, "grad_norm": 2.72833514213562, "learning_rate": 2.0921208418388435e-05, "logits/chosen": -7.352644443511963, "logits/rejected": -7.356955528259277, "logps/chosen": -27.039522171020508, "logps/rejected": -30.887033462524414, "loss": 0.5408, "rewards/accuracies": 0.875, "rewards/chosen": 0.37791353464126587, "rewards/margins": 0.3164868652820587, "rewards/rejected": 0.061426639556884766, "step": 18 }, { "epoch": 0.013338013338013339, "grad_norm": 1.5357730388641357, "learning_rate": 2.1312560015880482e-05, "logits/chosen": -6.873319149017334, "logits/rejected": -6.861461162567139, "logps/chosen": -28.443361282348633, "logps/rejected": -30.863727569580078, "loss": 0.5102, "rewards/accuracies": 0.75, "rewards/chosen": 0.5070673823356628, "rewards/margins": 0.35419386625289917, "rewards/rejected": 0.15287350118160248, "step": 19 }, { "epoch": 0.01404001404001404, "grad_norm": 1.6758450269699097, "learning_rate": 2.1683833261066357e-05, "logits/chosen": -7.481657028198242, "logits/rejected": -7.485768795013428, "logps/chosen": -26.070436477661133, "logps/rejected": -33.05839538574219, "loss": 0.4495, "rewards/accuracies": 0.875, "rewards/chosen": 0.5084783434867859, "rewards/margins": 0.6822522878646851, "rewards/rejected": -0.17377394437789917, "step": 20 }, { "epoch": 0.014742014742014743, "grad_norm": 1.977996826171875, "learning_rate": 2.2036988245565324e-05, "logits/chosen": -7.021907329559326, "logits/rejected": -7.008756160736084, "logps/chosen": -25.68326187133789, "logps/rejected": -33.42749786376953, "loss": 0.427, "rewards/accuracies": 1.0, "rewards/chosen": 0.6725587844848633, "rewards/margins": 0.8097036480903625, "rewards/rejected": -0.13714483380317688, "step": 21 }, { "epoch": 0.015444015444015444, "grad_norm": 3.4576056003570557, "learning_rate": 2.2373711347036773e-05, "logits/chosen": -6.888313293457031, "logits/rejected": -6.887960433959961, "logps/chosen": -26.02092742919922, "logps/rejected": -34.642940521240234, "loss": 0.3929, "rewards/accuracies": 0.875, "rewards/chosen": 0.6215429306030273, "rewards/margins": 0.8293198347091675, "rewards/rejected": -0.20777687430381775, "step": 22 }, { "epoch": 0.016146016146016146, "grad_norm": 2.234536647796631, "learning_rate": 2.269546393362655e-05, "logits/chosen": -7.197415351867676, "logits/rejected": -7.200768947601318, "logps/chosen": -25.943140029907227, "logps/rejected": -37.489341735839844, "loss": 0.3416, "rewards/accuracies": 1.0, "rewards/chosen": 0.6059697866439819, "rewards/margins": 1.0779063701629639, "rewards/rejected": -0.47193658351898193, "step": 23 }, { "epoch": 0.016848016848016848, "grad_norm": 1.7282993793487549, "learning_rate": 2.3003520695193437e-05, "logits/chosen": -6.665156364440918, "logits/rejected": -6.65745735168457, "logps/chosen": -28.22674560546875, "logps/rejected": -37.07148742675781, "loss": 0.3121, "rewards/accuracies": 0.875, "rewards/chosen": 0.5782899856567383, "rewards/margins": 0.9194584488868713, "rewards/rejected": -0.3411684036254883, "step": 24 }, { "epoch": 0.01755001755001755, "grad_norm": 2.418588399887085, "learning_rate": 2.329900014453396e-05, "logits/chosen": -7.054847717285156, "logits/rejected": -7.048114776611328, "logps/chosen": -25.915428161621094, "logps/rejected": -40.034183502197266, "loss": 0.2714, "rewards/accuracies": 0.875, "rewards/chosen": 0.5742180943489075, "rewards/margins": 1.3773601055145264, "rewards/rejected": -0.8031419515609741, "step": 25 }, { "epoch": 0.018252018252018252, "grad_norm": 1.49710214138031, "learning_rate": 2.3582889132846968e-05, "logits/chosen": -6.947665214538574, "logits/rejected": -6.940887451171875, "logps/chosen": -24.65955352783203, "logps/rejected": -45.891944885253906, "loss": 0.2044, "rewards/accuracies": 1.0, "rewards/chosen": 0.6726310849189758, "rewards/margins": 2.049088954925537, "rewards/rejected": -1.3764580488204956, "step": 26 }, { "epoch": 0.018954018954018954, "grad_norm": 2.990037441253662, "learning_rate": 2.3856062735983123e-05, "logits/chosen": -7.451685905456543, "logits/rejected": -7.45396614074707, "logps/chosen": -25.604557037353516, "logps/rejected": -45.548316955566406, "loss": 0.2197, "rewards/accuracies": 1.0, "rewards/chosen": 0.6487122774124146, "rewards/margins": 2.0438947677612305, "rewards/rejected": -1.395182728767395, "step": 27 }, { "epoch": 0.019656019656019656, "grad_norm": 1.3130444288253784, "learning_rate": 2.4119300522370322e-05, "logits/chosen": -6.748967170715332, "logits/rejected": -6.742885589599609, "logps/chosen": -26.126693725585938, "logps/rejected": -48.78252410888672, "loss": 0.1908, "rewards/accuracies": 1.0, "rewards/chosen": 0.6142779588699341, "rewards/margins": 2.2859020233154297, "rewards/rejected": -1.671623945236206, "step": 28 }, { "epoch": 0.020358020358020357, "grad_norm": 1.781205177307129, "learning_rate": 2.4373299964982603e-05, "logits/chosen": -7.403308391571045, "logits/rejected": -7.405962944030762, "logps/chosen": -23.803842544555664, "logps/rejected": -54.39312744140625, "loss": 0.1762, "rewards/accuracies": 1.0, "rewards/chosen": 0.7152693271636963, "rewards/margins": 3.0240540504455566, "rewards/rejected": -2.3087847232818604, "step": 29 }, { "epoch": 0.02106002106002106, "grad_norm": 2.3816843032836914, "learning_rate": 2.4618687578661044e-05, "logits/chosen": -7.127253532409668, "logits/rejected": -7.1271867752075195, "logps/chosen": -23.851673126220703, "logps/rejected": -54.523494720458984, "loss": 0.1351, "rewards/accuracies": 1.0, "rewards/chosen": 0.756695568561554, "rewards/margins": 3.1444711685180664, "rewards/rejected": -2.387775421142578, "step": 30 }, { "epoch": 0.02176202176202176, "grad_norm": 1.956632375717163, "learning_rate": 2.4856028230571212e-05, "logits/chosen": -7.299242973327637, "logits/rejected": -7.280484199523926, "logps/chosen": -24.1784610748291, "logps/rejected": -57.83273696899414, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 0.7200201153755188, "rewards/margins": 3.4142708778381348, "rewards/rejected": -2.6942505836486816, "step": 31 }, { "epoch": 0.022464022464022463, "grad_norm": 1.7002724409103394, "learning_rate": 2.5085832971998436e-05, "logits/chosen": -7.152646064758301, "logits/rejected": -7.168011665344238, "logps/chosen": -27.682212829589844, "logps/rejected": -63.43915939331055, "loss": 0.0924, "rewards/accuracies": 0.875, "rewards/chosen": 0.5433571338653564, "rewards/margins": 3.4348068237304688, "rewards/rejected": -2.8914496898651123, "step": 32 }, { "epoch": 0.023166023166023165, "grad_norm": 0.35595279932022095, "learning_rate": 2.530856566463146e-05, "logits/chosen": -7.919728755950928, "logits/rejected": -7.9174909591674805, "logps/chosen": -28.308427810668945, "logps/rejected": -58.79429626464844, "loss": 0.086, "rewards/accuracies": 0.75, "rewards/chosen": 0.4114716947078705, "rewards/margins": 3.059133529663086, "rewards/rejected": -2.6476619243621826, "step": 33 }, { "epoch": 0.023868023868023867, "grad_norm": 3.2185330390930176, "learning_rate": 2.552464861737092e-05, "logits/chosen": -6.9826154708862305, "logits/rejected": -6.978462219238281, "logps/chosen": -27.858413696289062, "logps/rejected": -58.40654754638672, "loss": 0.1606, "rewards/accuracies": 0.75, "rewards/chosen": 0.4308580458164215, "rewards/margins": 3.031878709793091, "rewards/rejected": -2.601020336151123, "step": 34 }, { "epoch": 0.02457002457002457, "grad_norm": 0.5814086198806763, "learning_rate": 2.5734467405837933e-05, "logits/chosen": -6.692983150482178, "logits/rejected": -6.699528217315674, "logps/chosen": -25.594343185424805, "logps/rejected": -70.49267578125, "loss": 0.12, "rewards/accuracies": 1.0, "rewards/chosen": 0.6806867122650146, "rewards/margins": 4.52584171295166, "rewards/rejected": -3.8451545238494873, "step": 35 }, { "epoch": 0.02527202527202527, "grad_norm": 9.620914459228516, "learning_rate": 2.5938375012788124e-05, "logits/chosen": -7.345959663391113, "logits/rejected": -7.336450576782227, "logps/chosen": -26.077024459838867, "logps/rejected": -75.65653991699219, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 0.5862878561019897, "rewards/margins": 4.8591156005859375, "rewards/rejected": -4.272828102111816, "step": 36 }, { "epoch": 0.025974025974025976, "grad_norm": 1.157991647720337, "learning_rate": 2.6136695401116585e-05, "logits/chosen": -7.682406902313232, "logits/rejected": -7.668662071228027, "logps/chosen": -26.913570404052734, "logps/rejected": -69.53828430175781, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 0.472379207611084, "rewards/margins": 4.2450761795043945, "rewards/rejected": -3.7726964950561523, "step": 37 }, { "epoch": 0.026676026676026678, "grad_norm": 0.7764812111854553, "learning_rate": 2.6329726610280168e-05, "logits/chosen": -7.061161041259766, "logits/rejected": -7.076615333557129, "logps/chosen": -27.80942153930664, "logps/rejected": -72.387939453125, "loss": 0.1467, "rewards/accuracies": 1.0, "rewards/chosen": 0.45654648542404175, "rewards/margins": 4.366056442260742, "rewards/rejected": -3.9095096588134766, "step": 38 }, { "epoch": 0.02737802737802738, "grad_norm": 2.377315044403076, "learning_rate": 2.651774345044166e-05, "logits/chosen": -7.305835723876953, "logits/rejected": -7.306196212768555, "logps/chosen": -24.079666137695312, "logps/rejected": -75.50300598144531, "loss": 0.156, "rewards/accuracies": 1.0, "rewards/chosen": 0.6834268569946289, "rewards/margins": 5.159507751464844, "rewards/rejected": -4.476080894470215, "step": 39 }, { "epoch": 0.02808002808002808, "grad_norm": 0.14778615534305573, "learning_rate": 2.6700999855466042e-05, "logits/chosen": -7.065064907073975, "logits/rejected": -7.076864242553711, "logps/chosen": -27.560585021972656, "logps/rejected": -80.85428619384766, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 0.3454429507255554, "rewards/margins": 5.217992782592773, "rewards/rejected": -4.872549533843994, "step": 40 }, { "epoch": 0.028782028782028783, "grad_norm": 0.21911178529262543, "learning_rate": 2.687973094532893e-05, "logits/chosen": -7.154488563537598, "logits/rejected": -7.1684465408325195, "logps/chosen": -27.364961624145508, "logps/rejected": -69.74729919433594, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 0.4174615740776062, "rewards/margins": 4.1540398597717285, "rewards/rejected": -3.7365784645080566, "step": 41 }, { "epoch": 0.029484029484029485, "grad_norm": 8.211211204528809, "learning_rate": 2.7054154839965013e-05, "logits/chosen": -6.935814380645752, "logits/rejected": -6.937582015991211, "logps/chosen": -29.3238525390625, "logps/rejected": -69.93264770507812, "loss": 0.1587, "rewards/accuracies": 0.875, "rewards/chosen": 0.3536485731601715, "rewards/margins": 4.030691146850586, "rewards/rejected": -3.6770429611206055, "step": 42 }, { "epoch": 0.030186030186030187, "grad_norm": 3.8625638484954834, "learning_rate": 2.722447425965978e-05, "logits/chosen": -7.42475700378418, "logits/rejected": -7.428657531738281, "logps/chosen": -26.61966896057129, "logps/rejected": -76.5493392944336, "loss": 0.0855, "rewards/accuracies": 0.875, "rewards/chosen": 0.4687429964542389, "rewards/margins": 4.912079811096191, "rewards/rejected": -4.443337440490723, "step": 43 }, { "epoch": 0.03088803088803089, "grad_norm": 0.17460407316684723, "learning_rate": 2.739087794143646e-05, "logits/chosen": -7.42650032043457, "logits/rejected": -7.427938461303711, "logps/chosen": -25.35947608947754, "logps/rejected": -84.289306640625, "loss": 0.0926, "rewards/accuracies": 1.0, "rewards/chosen": 0.5432809591293335, "rewards/margins": 5.876151084899902, "rewards/rejected": -5.3328704833984375, "step": 44 }, { "epoch": 0.03159003159003159, "grad_norm": 14.944300651550293, "learning_rate": 2.755354189625573e-05, "logits/chosen": -6.93408203125, "logits/rejected": -6.939152240753174, "logps/chosen": -28.066770553588867, "logps/rejected": -70.78054809570312, "loss": 0.1445, "rewards/accuracies": 0.875, "rewards/chosen": 0.5390792489051819, "rewards/margins": 4.2954792976379395, "rewards/rejected": -3.7563998699188232, "step": 45 }, { "epoch": 0.03229203229203229, "grad_norm": 3.5002503395080566, "learning_rate": 2.771263052802624e-05, "logits/chosen": -7.353217601776123, "logits/rejected": -7.340875625610352, "logps/chosen": -24.206520080566406, "logps/rejected": -81.5072021484375, "loss": 0.1697, "rewards/accuracies": 1.0, "rewards/chosen": 0.7133325338363647, "rewards/margins": 5.757226943969727, "rewards/rejected": -5.0438947677612305, "step": 46 }, { "epoch": 0.032994032994032994, "grad_norm": 1.689164638519287, "learning_rate": 2.7868297632261957e-05, "logits/chosen": -7.080233573913574, "logits/rejected": -7.088419437408447, "logps/chosen": -25.423442840576172, "logps/rejected": -82.2406005859375, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": 0.7442028522491455, "rewards/margins": 5.743407249450684, "rewards/rejected": -4.999204158782959, "step": 47 }, { "epoch": 0.033696033696033696, "grad_norm": 1.6787567138671875, "learning_rate": 2.8020687289593123e-05, "logits/chosen": -7.661046981811523, "logits/rejected": -7.653748512268066, "logps/chosen": -31.291196823120117, "logps/rejected": -68.75875091552734, "loss": 0.0943, "rewards/accuracies": 0.625, "rewards/chosen": 0.3098016083240509, "rewards/margins": 3.6318600177764893, "rewards/rejected": -3.3220584392547607, "step": 48 }, { "epoch": 0.0343980343980344, "grad_norm": 1.7416592836380005, "learning_rate": 2.8169934667141895e-05, "logits/chosen": -7.4187469482421875, "logits/rejected": -7.4205827713012695, "logps/chosen": -26.30982208251953, "logps/rejected": -78.28893280029297, "loss": 0.0948, "rewards/accuracies": 0.875, "rewards/chosen": 0.48395073413848877, "rewards/margins": 5.171774864196777, "rewards/rejected": -4.687824249267578, "step": 49 }, { "epoch": 0.0351000351000351, "grad_norm": 0.1485549956560135, "learning_rate": 2.8316166738933646e-05, "logits/chosen": -8.050887107849121, "logits/rejected": -8.071555137634277, "logps/chosen": -24.947830200195312, "logps/rejected": -83.34628295898438, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 0.5378541946411133, "rewards/margins": 5.779118537902832, "rewards/rejected": -5.241264343261719, "step": 50 }, { "epoch": 0.0358020358020358, "grad_norm": 0.22805428504943848, "learning_rate": 2.845950293496561e-05, "logits/chosen": -7.3327131271362305, "logits/rejected": -7.323657035827637, "logps/chosen": -25.47657012939453, "logps/rejected": -85.75929260253906, "loss": 0.0881, "rewards/accuracies": 1.0, "rewards/chosen": 0.5426871180534363, "rewards/margins": 5.961248397827148, "rewards/rejected": -5.4185614585876465, "step": 51 }, { "epoch": 0.036504036504036504, "grad_norm": 0.24115796387195587, "learning_rate": 2.8600055727246657e-05, "logits/chosen": -7.2126054763793945, "logits/rejected": -7.210597515106201, "logps/chosen": -25.881084442138672, "logps/rejected": -84.55435180664062, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": 0.5113131403923035, "rewards/margins": 5.8197455406188965, "rewards/rejected": -5.308432579040527, "step": 52 }, { "epoch": 0.037206037206037205, "grad_norm": 0.5711573958396912, "learning_rate": 2.8737931160013153e-05, "logits/chosen": -7.611987113952637, "logits/rejected": -7.627943992614746, "logps/chosen": -27.376998901367188, "logps/rejected": -74.73385620117188, "loss": 0.0969, "rewards/accuracies": 0.875, "rewards/chosen": 0.38435521721839905, "rewards/margins": 4.695669174194336, "rewards/rejected": -4.311313629150391, "step": 53 }, { "epoch": 0.03790803790803791, "grad_norm": 3.94449520111084, "learning_rate": 2.8873229330382812e-05, "logits/chosen": -7.410796642303467, "logits/rejected": -7.419951438903809, "logps/chosen": -25.359132766723633, "logps/rejected": -83.06319427490234, "loss": 0.103, "rewards/accuracies": 0.875, "rewards/chosen": 0.6156649589538574, "rewards/margins": 5.618312835693359, "rewards/rejected": -5.002647876739502, "step": 54 }, { "epoch": 0.03861003861003861, "grad_norm": 0.2533474564552307, "learning_rate": 2.9006044824904066e-05, "logits/chosen": -7.426368713378906, "logits/rejected": -7.416305065155029, "logps/chosen": -31.591720581054688, "logps/rejected": -67.71700286865234, "loss": 0.0756, "rewards/accuracies": 0.875, "rewards/chosen": 0.20455510914325714, "rewards/margins": 3.60770845413208, "rewards/rejected": -3.40315318107605, "step": 55 }, { "epoch": 0.03931203931203931, "grad_norm": 0.34706911444664, "learning_rate": 2.913646711677001e-05, "logits/chosen": -6.756829261779785, "logits/rejected": -6.764748573303223, "logps/chosen": -27.542354583740234, "logps/rejected": -75.05244445800781, "loss": 0.0474, "rewards/accuracies": 0.875, "rewards/chosen": 0.4998428225517273, "rewards/margins": 4.809538841247559, "rewards/rejected": -4.309696197509766, "step": 56 }, { "epoch": 0.04001404001404001, "grad_norm": 8.474733352661133, "learning_rate": 2.926458092787486e-05, "logits/chosen": -7.148025989532471, "logits/rejected": -7.151997089385986, "logps/chosen": -23.255695343017578, "logps/rejected": -88.71858215332031, "loss": 0.1579, "rewards/accuracies": 1.0, "rewards/chosen": 0.8335578441619873, "rewards/margins": 6.4554314613342285, "rewards/rejected": -5.621873378753662, "step": 57 }, { "epoch": 0.040716040716040715, "grad_norm": 0.20417949557304382, "learning_rate": 2.939046655938229e-05, "logits/chosen": -6.950979232788086, "logits/rejected": -6.983558654785156, "logps/chosen": -28.690874099731445, "logps/rejected": -69.21992492675781, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": 0.5719759464263916, "rewards/margins": 4.1248674392700195, "rewards/rejected": -3.552891731262207, "step": 58 }, { "epoch": 0.04141804141804142, "grad_norm": 0.320267915725708, "learning_rate": 2.951420019403574e-05, "logits/chosen": -7.2105207443237305, "logits/rejected": -7.226320266723633, "logps/chosen": -25.22861099243164, "logps/rejected": -81.38958740234375, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 0.7187371253967285, "rewards/margins": 5.66035270690918, "rewards/rejected": -4.941615104675293, "step": 59 }, { "epoch": 0.04212004212004212, "grad_norm": 0.12221161276102066, "learning_rate": 2.963585417306073e-05, "logits/chosen": -7.773287773132324, "logits/rejected": -7.774829864501953, "logps/chosen": -22.38068389892578, "logps/rejected": -90.90509033203125, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": 0.8984471559524536, "rewards/margins": 6.7972002029418945, "rewards/rejected": -5.898752689361572, "step": 60 }, { "epoch": 0.04282204282204282, "grad_norm": 0.09575088322162628, "learning_rate": 2.9755497250179453e-05, "logits/chosen": -7.143295764923096, "logits/rejected": -7.158502578735352, "logps/chosen": -22.370710372924805, "logps/rejected": -90.53697967529297, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": 0.955229640007019, "rewards/margins": 6.814311504364014, "rewards/rejected": -5.859081268310547, "step": 61 }, { "epoch": 0.04352404352404352, "grad_norm": 2.7291769981384277, "learning_rate": 2.98731948249709e-05, "logits/chosen": -7.932046413421631, "logits/rejected": -7.9301605224609375, "logps/chosen": -25.160741806030273, "logps/rejected": -77.22018432617188, "loss": 0.0731, "rewards/accuracies": 0.875, "rewards/chosen": 0.6401044130325317, "rewards/margins": 5.139678955078125, "rewards/rejected": -4.499574184417725, "step": 62 }, { "epoch": 0.044226044226044224, "grad_norm": 0.15039488673210144, "learning_rate": 2.9989009157559694e-05, "logits/chosen": -7.488473892211914, "logits/rejected": -7.505375862121582, "logps/chosen": -25.160144805908203, "logps/rejected": -79.36732482910156, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 0.7045561075210571, "rewards/margins": 5.397197246551514, "rewards/rejected": -4.692641258239746, "step": 63 }, { "epoch": 0.044928044928044926, "grad_norm": 0.1088738813996315, "learning_rate": 3.010299956639812e-05, "logits/chosen": -7.172284126281738, "logits/rejected": -7.164319038391113, "logps/chosen": -22.961069107055664, "logps/rejected": -85.93089294433594, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 0.8685182929039001, "rewards/margins": 6.194622993469238, "rewards/rejected": -5.326104640960693, "step": 64 }, { "epoch": 0.04563004563004563, "grad_norm": 0.1300433725118637, "learning_rate": 3.021522261071426e-05, "logits/chosen": -6.737222671508789, "logits/rejected": -6.736132621765137, "logps/chosen": -24.32071304321289, "logps/rejected": -83.16117095947266, "loss": 0.0888, "rewards/accuracies": 0.875, "rewards/chosen": 0.9272854328155518, "rewards/margins": 5.937081336975098, "rewards/rejected": -5.009796142578125, "step": 65 }, { "epoch": 0.04633204633204633, "grad_norm": 0.1365581452846527, "learning_rate": 3.0325732259031143e-05, "logits/chosen": -7.092480182647705, "logits/rejected": -7.074071407318115, "logps/chosen": -20.918304443359375, "logps/rejected": -93.18510437011719, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 1.1129056215286255, "rewards/margins": 7.217442035675049, "rewards/rejected": -6.104536533355713, "step": 66 }, { "epoch": 0.04703404703404703, "grad_norm": 0.13219770789146423, "learning_rate": 3.043458004501377e-05, "logits/chosen": -7.176571846008301, "logits/rejected": -7.17874813079834, "logps/chosen": -21.26753044128418, "logps/rejected": -93.65159606933594, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": 1.0357904434204102, "rewards/margins": 7.13702392578125, "rewards/rejected": -6.10123348236084, "step": 67 }, { "epoch": 0.04773604773604773, "grad_norm": 0.10713231563568115, "learning_rate": 3.054181521177061e-05, "logits/chosen": -7.4527740478515625, "logits/rejected": -7.450488567352295, "logps/chosen": -24.155200958251953, "logps/rejected": -88.61746215820312, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 0.9059360027313232, "rewards/margins": 6.389184951782227, "rewards/rejected": -5.483248710632324, "step": 68 }, { "epoch": 0.048438048438048435, "grad_norm": 0.46614763140678406, "learning_rate": 3.064748484562093e-05, "logits/chosen": -7.309267997741699, "logits/rejected": -7.30295467376709, "logps/chosen": -23.5020751953125, "logps/rejected": -79.38430786132812, "loss": 0.1407, "rewards/accuracies": 0.875, "rewards/chosen": 0.788501501083374, "rewards/margins": 5.433658599853516, "rewards/rejected": -4.6451568603515625, "step": 69 }, { "epoch": 0.04914004914004914, "grad_norm": 0.11028512567281723, "learning_rate": 3.0751634000237615e-05, "logits/chosen": -7.266515254974365, "logits/rejected": -7.28378963470459, "logps/chosen": -22.454370498657227, "logps/rejected": -88.74626159667969, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 0.9464644193649292, "rewards/margins": 6.548521995544434, "rewards/rejected": -5.602057456970215, "step": 70 }, { "epoch": 0.04984204984204984, "grad_norm": 0.11105122417211533, "learning_rate": 3.085430581198459e-05, "logits/chosen": -7.240549087524414, "logits/rejected": -7.231055736541748, "logps/chosen": -21.323692321777344, "logps/rejected": -86.63516235351562, "loss": 0.0668, "rewards/accuracies": 0.875, "rewards/chosen": 0.9426981210708618, "rewards/margins": 6.441919326782227, "rewards/rejected": -5.499220848083496, "step": 71 }, { "epoch": 0.05054405054405054, "grad_norm": 0.13463686406612396, "learning_rate": 3.095554160718781e-05, "logits/chosen": -7.8926239013671875, "logits/rejected": -7.904955863952637, "logps/chosen": -20.626028060913086, "logps/rejected": -85.18846893310547, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 0.9416337013244629, "rewards/margins": 6.435574054718018, "rewards/rejected": -5.493940353393555, "step": 72 }, { "epoch": 0.05124605124605124, "grad_norm": 0.09524887055158615, "learning_rate": 3.10553810020076e-05, "logits/chosen": -7.259347915649414, "logits/rejected": -7.257107734680176, "logps/chosen": -21.92057228088379, "logps/rejected": -87.46835327148438, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.072593092918396, "rewards/margins": 6.563956260681152, "rewards/rejected": -5.491363525390625, "step": 73 }, { "epoch": 0.05194805194805195, "grad_norm": 0.11606042087078094, "learning_rate": 3.115386199551628e-05, "logits/chosen": -7.714974403381348, "logits/rejected": -7.7409515380859375, "logps/chosen": -19.770488739013672, "logps/rejected": -92.20685577392578, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 1.1219253540039062, "rewards/margins": 7.249269962310791, "rewards/rejected": -6.127344608306885, "step": 74 }, { "epoch": 0.05265005265005265, "grad_norm": 2.510187864303589, "learning_rate": 3.1251021056528336e-05, "logits/chosen": -7.325782775878906, "logits/rejected": -7.339624881744385, "logps/chosen": -26.0257568359375, "logps/rejected": -88.05560302734375, "loss": 0.0667, "rewards/accuracies": 0.875, "rewards/chosen": 0.5321630239486694, "rewards/margins": 6.1714348793029785, "rewards/rejected": -5.639272212982178, "step": 75 }, { "epoch": 0.053352053352053355, "grad_norm": 0.11650217324495316, "learning_rate": 3.134689320467986e-05, "logits/chosen": -7.478418350219727, "logits/rejected": -7.472134113311768, "logps/chosen": -22.32622528076172, "logps/rejected": -79.58806610107422, "loss": 0.0922, "rewards/accuracies": 0.875, "rewards/chosen": 0.780845046043396, "rewards/margins": 5.645715713500977, "rewards/rejected": -4.864870548248291, "step": 76 }, { "epoch": 0.05405405405405406, "grad_norm": 0.1307910978794098, "learning_rate": 3.144151208620804e-05, "logits/chosen": -7.313758850097656, "logits/rejected": -7.296645164489746, "logps/chosen": -20.339046478271484, "logps/rejected": -90.91523742675781, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 1.1828434467315674, "rewards/margins": 7.106139659881592, "rewards/rejected": -5.9232964515686035, "step": 77 }, { "epoch": 0.05475605475605476, "grad_norm": 0.14854516088962555, "learning_rate": 3.1534910044841344e-05, "logits/chosen": -7.422662734985352, "logits/rejected": -7.402349948883057, "logps/chosen": -20.149436950683594, "logps/rejected": -92.42562866210938, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": 1.2358131408691406, "rewards/margins": 7.241641044616699, "rewards/rejected": -6.0058274269104, "step": 78 }, { "epoch": 0.05545805545805546, "grad_norm": 0.11487939208745956, "learning_rate": 3.1627118188174024e-05, "logits/chosen": -7.342196941375732, "logits/rejected": -7.328601837158203, "logps/chosen": -20.616432189941406, "logps/rejected": -92.01414489746094, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473076343536377, "rewards/margins": 7.158587455749512, "rewards/rejected": -6.011280059814453, "step": 79 }, { "epoch": 0.05616005616005616, "grad_norm": 0.13341820240020752, "learning_rate": 3.171816644986573e-05, "logits/chosen": -7.50250768661499, "logits/rejected": -7.493607521057129, "logps/chosen": -20.23822593688965, "logps/rejected": -93.1771011352539, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 1.079056978225708, "rewards/margins": 7.22094202041626, "rewards/rejected": -6.141885280609131, "step": 80 }, { "epoch": 0.056862056862056864, "grad_norm": 0.11182121187448502, "learning_rate": 3.18080836479775e-05, "logits/chosen": -6.927457332611084, "logits/rejected": -6.927408218383789, "logps/chosen": -22.10438346862793, "logps/rejected": -85.97686767578125, "loss": 0.0612, "rewards/accuracies": 0.875, "rewards/chosen": 0.9387284517288208, "rewards/margins": 6.278690338134766, "rewards/rejected": -5.339962005615234, "step": 81 }, { "epoch": 0.057564057564057566, "grad_norm": 2.973522663116455, "learning_rate": 3.1896897539728616e-05, "logits/chosen": -7.848840713500977, "logits/rejected": -7.835525035858154, "logps/chosen": -23.771812438964844, "logps/rejected": -75.15774536132812, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 0.8201979398727417, "rewards/margins": 5.198735237121582, "rewards/rejected": -4.378537178039551, "step": 82 }, { "epoch": 0.05826605826605827, "grad_norm": 0.1551780104637146, "learning_rate": 3.198463487293457e-05, "logits/chosen": -7.630067825317383, "logits/rejected": -7.640313148498535, "logps/chosen": -22.142623901367188, "logps/rejected": -84.80403137207031, "loss": 0.1153, "rewards/accuracies": 1.0, "rewards/chosen": 0.9954205751419067, "rewards/margins": 6.287491321563721, "rewards/rejected": -5.2920708656311035, "step": 83 }, { "epoch": 0.05896805896805897, "grad_norm": 0.14048708975315094, "learning_rate": 3.207132143436469e-05, "logits/chosen": -7.565755844116211, "logits/rejected": -7.558052062988281, "logps/chosen": -20.2623291015625, "logps/rejected": -95.4808349609375, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 1.0885125398635864, "rewards/margins": 7.402470588684082, "rewards/rejected": -6.313958168029785, "step": 84 }, { "epoch": 0.05967005967005967, "grad_norm": 0.14304043352603912, "learning_rate": 3.215698209523821e-05, "logits/chosen": -6.744610786437988, "logits/rejected": -6.718930244445801, "logps/chosen": -22.878793716430664, "logps/rejected": -86.48863220214844, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": 1.0470266342163086, "rewards/margins": 6.3511457443237305, "rewards/rejected": -5.304119110107422, "step": 85 }, { "epoch": 0.060372060372060374, "grad_norm": 0.09290531277656555, "learning_rate": 3.224164085405946e-05, "logits/chosen": -7.312817573547363, "logits/rejected": -7.315550804138184, "logps/chosen": -23.60333824157715, "logps/rejected": -78.12030029296875, "loss": 0.049, "rewards/accuracies": 0.875, "rewards/chosen": 0.8752925395965576, "rewards/margins": 5.44439172744751, "rewards/rejected": -4.569099426269531, "step": 86 }, { "epoch": 0.061074061074061076, "grad_norm": 0.1411767601966858, "learning_rate": 3.232532087697698e-05, "logits/chosen": -7.287440299987793, "logits/rejected": -7.282231330871582, "logps/chosen": -19.32866096496582, "logps/rejected": -96.47938537597656, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 1.1330996751785278, "rewards/margins": 7.616439342498779, "rewards/rejected": -6.483339309692383, "step": 87 }, { "epoch": 0.06177606177606178, "grad_norm": 2.5083703994750977, "learning_rate": 3.240804453583615e-05, "logits/chosen": -7.349209308624268, "logits/rejected": -7.340395927429199, "logps/chosen": -21.633548736572266, "logps/rejected": -87.37542724609375, "loss": 0.0992, "rewards/accuracies": 0.875, "rewards/chosen": 1.0882898569107056, "rewards/margins": 6.621559143066406, "rewards/rejected": -5.533268928527832, "step": 88 }, { "epoch": 0.06247806247806248, "grad_norm": 0.19671890139579773, "learning_rate": 3.248983344408188e-05, "logits/chosen": -7.899417877197266, "logits/rejected": -7.8947038650512695, "logps/chosen": -19.665199279785156, "logps/rejected": -95.02740478515625, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 1.264890432357788, "rewards/margins": 7.5587921142578125, "rewards/rejected": -6.293901443481445, "step": 89 }, { "epoch": 0.06318006318006318, "grad_norm": 0.14440222084522247, "learning_rate": 3.2570708490655414e-05, "logits/chosen": -7.398755073547363, "logits/rejected": -7.391363620758057, "logps/chosen": -19.529495239257812, "logps/rejected": -94.62332916259766, "loss": 0.072, "rewards/accuracies": 1.0, "rewards/chosen": 1.2236747741699219, "rewards/margins": 7.552162170410156, "rewards/rejected": -6.328487396240234, "step": 90 }, { "epoch": 0.06388206388206388, "grad_norm": 0.15851452946662903, "learning_rate": 3.265068987201822e-05, "logits/chosen": -7.311764717102051, "logits/rejected": -7.311919212341309, "logps/chosen": -21.905750274658203, "logps/rejected": -90.237060546875, "loss": 0.0572, "rewards/accuracies": 0.875, "rewards/chosen": 0.9369328022003174, "rewards/margins": 6.714702129364014, "rewards/rejected": -5.777769088745117, "step": 91 }, { "epoch": 0.06458406458406458, "grad_norm": 0.12608979642391205, "learning_rate": 3.2729797122425925e-05, "logits/chosen": -7.032383918762207, "logits/rejected": -7.054978370666504, "logps/chosen": -21.960769653320312, "logps/rejected": -88.3003158569336, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 1.1056840419769287, "rewards/margins": 6.628111839294434, "rewards/rejected": -5.522428035736084, "step": 92 }, { "epoch": 0.06528606528606529, "grad_norm": 0.11493901908397675, "learning_rate": 3.280804914256559e-05, "logits/chosen": -7.594240665435791, "logits/rejected": -7.595704078674316, "logps/chosen": -20.88797950744629, "logps/rejected": -88.06593322753906, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 0.9787160754203796, "rewards/margins": 6.73248291015625, "rewards/rejected": -5.7537665367126465, "step": 93 }, { "epoch": 0.06598806598806599, "grad_norm": 0.14122043550014496, "learning_rate": 3.288546422666164e-05, "logits/chosen": -6.880650043487549, "logits/rejected": -6.885013580322266, "logps/chosen": -20.61052703857422, "logps/rejected": -88.28717803955078, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 1.0804901123046875, "rewards/margins": 6.712750434875488, "rewards/rejected": -5.632260322570801, "step": 94 }, { "epoch": 0.06669006669006669, "grad_norm": 5.474856376647949, "learning_rate": 3.2962060088147464e-05, "logits/chosen": -7.424438953399658, "logits/rejected": -7.428860664367676, "logps/chosen": -19.28183364868164, "logps/rejected": -94.30338287353516, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": 1.0059568881988525, "rewards/margins": 7.478621006011963, "rewards/rejected": -6.472663879394531, "step": 95 }, { "epoch": 0.06739206739206739, "grad_norm": 0.18736739456653595, "learning_rate": 3.3037853883992805e-05, "logits/chosen": -7.571017742156982, "logits/rejected": -7.5589399337768555, "logps/chosen": -22.078136444091797, "logps/rejected": -87.3583984375, "loss": 0.0984, "rewards/accuracies": 0.875, "rewards/chosen": 1.0174239873886108, "rewards/margins": 6.628276348114014, "rewards/rejected": -5.610852241516113, "step": 96 }, { "epoch": 0.0680940680940681, "grad_norm": 1.2062569856643677, "learning_rate": 3.3112862237770756e-05, "logits/chosen": -7.115910530090332, "logits/rejected": -7.119783401489258, "logps/chosen": -19.48395538330078, "logps/rejected": -94.38021850585938, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/chosen": 1.3266751766204834, "rewards/margins": 7.57487678527832, "rewards/rejected": -6.248202323913574, "step": 97 }, { "epoch": 0.0687960687960688, "grad_norm": 0.09014569967985153, "learning_rate": 3.3187101261541584e-05, "logits/chosen": -7.035510063171387, "logits/rejected": -7.03132438659668, "logps/chosen": -22.59494972229004, "logps/rejected": -90.5684814453125, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 1.1550054550170898, "rewards/margins": 6.772560119628906, "rewards/rejected": -5.617555141448975, "step": 98 }, { "epoch": 0.0694980694980695, "grad_norm": 0.18076029419898987, "learning_rate": 3.326058657662584e-05, "logits/chosen": -7.308511734008789, "logits/rejected": -7.312093734741211, "logps/chosen": -18.93863868713379, "logps/rejected": -97.39258575439453, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": 1.1822571754455566, "rewards/margins": 7.8002519607543945, "rewards/rejected": -6.617995262145996, "step": 99 }, { "epoch": 0.0702000702000702, "grad_norm": 0.1353362649679184, "learning_rate": 3.333333333333334e-05, "logits/chosen": -7.0915117263793945, "logits/rejected": -7.1125054359436035, "logps/chosen": -19.029293060302734, "logps/rejected": -97.88648986816406, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 1.2009437084197998, "rewards/margins": 7.835084915161133, "rewards/rejected": -6.634140968322754, "step": 100 }, { "epoch": 0.0709020709020709, "grad_norm": 0.10884874314069748, "learning_rate": 3.340535622971072e-05, "logits/chosen": -7.051809310913086, "logits/rejected": -7.049546241760254, "logps/chosen": -23.600322723388672, "logps/rejected": -84.68515014648438, "loss": 0.0466, "rewards/accuracies": 0.875, "rewards/chosen": 0.7847188711166382, "rewards/margins": 6.016472816467285, "rewards/rejected": -5.231754302978516, "step": 101 }, { "epoch": 0.0716040716040716, "grad_norm": 0.1307877153158188, "learning_rate": 3.3476669529365295e-05, "logits/chosen": -7.762927055358887, "logits/rejected": -7.759670257568359, "logps/chosen": -18.584918975830078, "logps/rejected": -95.52191162109375, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 1.1910876035690308, "rewards/margins": 7.754782676696777, "rewards/rejected": -6.563695430755615, "step": 102 }, { "epoch": 0.0723060723060723, "grad_norm": 0.17887228727340698, "learning_rate": 3.3547287078419544e-05, "logits/chosen": -7.89346981048584, "logits/rejected": -7.898442268371582, "logps/chosen": -24.879505157470703, "logps/rejected": -82.63700866699219, "loss": 0.0996, "rewards/accuracies": 0.875, "rewards/chosen": 0.8099548816680908, "rewards/margins": 5.707764625549316, "rewards/rejected": -4.897809982299805, "step": 103 }, { "epoch": 0.07300807300807301, "grad_norm": 0.14247027039527893, "learning_rate": 3.361722232164634e-05, "logits/chosen": -6.870982646942139, "logits/rejected": -6.8588547706604, "logps/chosen": -25.25574493408203, "logps/rejected": -83.30470275878906, "loss": 0.064, "rewards/accuracies": 1.0, "rewards/chosen": 0.9735981822013855, "rewards/margins": 5.864006042480469, "rewards/rejected": -4.890408515930176, "step": 104 }, { "epoch": 0.07371007371007371, "grad_norm": 0.1408097743988037, "learning_rate": 3.3686488317832306e-05, "logits/chosen": -7.0463151931762695, "logits/rejected": -7.071116924285889, "logps/chosen": -19.267126083374023, "logps/rejected": -97.51585388183594, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 1.276366114616394, "rewards/margins": 7.8446574211120605, "rewards/rejected": -6.568291187286377, "step": 105 }, { "epoch": 0.07441207441207441, "grad_norm": 1.6950085163116455, "learning_rate": 3.375509775441284e-05, "logits/chosen": -7.035571098327637, "logits/rejected": -7.039609909057617, "logps/chosen": -23.7667236328125, "logps/rejected": -80.45639038085938, "loss": 0.0991, "rewards/accuracies": 0.875, "rewards/chosen": 0.870686948299408, "rewards/margins": 5.663837909698486, "rewards/rejected": -4.793150901794434, "step": 106 }, { "epoch": 0.07511407511407511, "grad_norm": 0.1621626913547516, "learning_rate": 3.382306296142016e-05, "logits/chosen": -6.961594581604004, "logits/rejected": -6.978695392608643, "logps/chosen": -20.41448974609375, "logps/rejected": -95.28137969970703, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": 1.0202090740203857, "rewards/margins": 7.397640228271484, "rewards/rejected": -6.377431392669678, "step": 107 }, { "epoch": 0.07581607581607581, "grad_norm": 0.63109290599823, "learning_rate": 3.38903959247825e-05, "logits/chosen": -7.2141008377075195, "logits/rejected": -7.217859745025635, "logps/chosen": -22.792984008789062, "logps/rejected": -94.95234680175781, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": 0.9233251214027405, "rewards/margins": 7.105767726898193, "rewards/rejected": -6.182442665100098, "step": 108 }, { "epoch": 0.07651807651807652, "grad_norm": 3.293724536895752, "learning_rate": 3.395710829901039e-05, "logits/chosen": -7.273015022277832, "logits/rejected": -7.2633771896362305, "logps/chosen": -19.515830993652344, "logps/rejected": -99.09077453613281, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 1.2761492729187012, "rewards/margins": 7.969019889831543, "rewards/rejected": -6.692870140075684, "step": 109 }, { "epoch": 0.07722007722007722, "grad_norm": 0.13784830272197723, "learning_rate": 3.402321141930376e-05, "logits/chosen": -6.914823532104492, "logits/rejected": -6.897804260253906, "logps/chosen": -18.91080093383789, "logps/rejected": -95.8990707397461, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 1.3322492837905884, "rewards/margins": 7.8156304359436035, "rewards/rejected": -6.4833807945251465, "step": 110 }, { "epoch": 0.07792207792207792, "grad_norm": 0.14752840995788574, "learning_rate": 3.4088716313110955e-05, "logits/chosen": -7.355859756469727, "logits/rejected": -7.371285915374756, "logps/chosen": -20.431747436523438, "logps/rejected": -90.39700317382812, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 1.0086824893951416, "rewards/margins": 7.036027908325195, "rewards/rejected": -6.027345657348633, "step": 111 }, { "epoch": 0.07862407862407862, "grad_norm": 0.3433920443058014, "learning_rate": 3.415363371116969e-05, "logits/chosen": -7.760258674621582, "logits/rejected": -7.748672008514404, "logps/chosen": -19.227121353149414, "logps/rejected": -99.0928955078125, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": 1.2863168716430664, "rewards/margins": 7.931951522827148, "rewards/rejected": -6.645634651184082, "step": 112 }, { "epoch": 0.07932607932607932, "grad_norm": 0.18298867344856262, "learning_rate": 3.4217974058057e-05, "logits/chosen": -7.444199562072754, "logits/rejected": -7.4443511962890625, "logps/chosen": -18.730552673339844, "logps/rejected": -100.07673645019531, "loss": 0.0654, "rewards/accuracies": 1.0, "rewards/chosen": 1.3059790134429932, "rewards/margins": 8.065505027770996, "rewards/rejected": -6.759526252746582, "step": 113 }, { "epoch": 0.08002808002808003, "grad_norm": 0.15094107389450073, "learning_rate": 3.428174752227455e-05, "logits/chosen": -7.053144454956055, "logits/rejected": -7.046832084655762, "logps/chosen": -23.227218627929688, "logps/rejected": -83.53099060058594, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": 1.0797193050384521, "rewards/margins": 6.122915267944336, "rewards/rejected": -5.043196201324463, "step": 114 }, { "epoch": 0.08073008073008073, "grad_norm": 0.14125965535640717, "learning_rate": 3.434496400589353e-05, "logits/chosen": -7.7370829582214355, "logits/rejected": -7.7322540283203125, "logps/chosen": -18.29018783569336, "logps/rejected": -99.07318878173828, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 1.2387666702270508, "rewards/margins": 8.041101455688477, "rewards/rejected": -6.802334785461426, "step": 115 }, { "epoch": 0.08143208143208143, "grad_norm": 3.0066347122192383, "learning_rate": 3.440763315378198e-05, "logits/chosen": -7.8140974044799805, "logits/rejected": -7.820437431335449, "logps/chosen": -18.87710952758789, "logps/rejected": -100.24430847167969, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 1.2097234725952148, "rewards/margins": 8.079198837280273, "rewards/rejected": -6.869475364685059, "step": 116 }, { "epoch": 0.08213408213408213, "grad_norm": 1.726335048675537, "learning_rate": 3.446976436243603e-05, "logits/chosen": -8.08055591583252, "logits/rejected": -8.086912155151367, "logps/chosen": -20.792678833007812, "logps/rejected": -92.8045425415039, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 1.100651741027832, "rewards/margins": 7.184983253479004, "rewards/rejected": -6.084331035614014, "step": 117 }, { "epoch": 0.08283608283608283, "grad_norm": 0.27900710701942444, "learning_rate": 3.4531366788435425e-05, "logits/chosen": -7.4473066329956055, "logits/rejected": -7.457216262817383, "logps/chosen": -21.46112823486328, "logps/rejected": -92.31007385253906, "loss": 0.0619, "rewards/accuracies": 0.875, "rewards/chosen": 1.0382219552993774, "rewards/margins": 7.072957992553711, "rewards/rejected": -6.034735679626465, "step": 118 }, { "epoch": 0.08353808353808354, "grad_norm": 0.14539343118667603, "learning_rate": 3.459244935654219e-05, "logits/chosen": -7.592484474182129, "logits/rejected": -7.604640960693359, "logps/chosen": -17.887418746948242, "logps/rejected": -99.43290710449219, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 1.3287017345428467, "rewards/margins": 8.19625473022461, "rewards/rejected": -6.867552757263184, "step": 119 }, { "epoch": 0.08424008424008424, "grad_norm": 0.1583695113658905, "learning_rate": 3.465302076746041e-05, "logits/chosen": -7.282991409301758, "logits/rejected": -7.264293193817139, "logps/chosen": -19.65121078491211, "logps/rejected": -90.81167602539062, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.2195422649383545, "rewards/margins": 7.065338134765625, "rewards/rejected": -5.84579610824585, "step": 120 }, { "epoch": 0.08494208494208494, "grad_norm": 0.14518851041793823, "learning_rate": 3.471308950527417e-05, "logits/chosen": -7.383155822753906, "logits/rejected": -7.398057460784912, "logps/chosen": -17.632295608520508, "logps/rejected": -98.55841064453125, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 1.371635913848877, "rewards/margins": 8.098966598510742, "rewards/rejected": -6.727330684661865, "step": 121 }, { "epoch": 0.08564408564408564, "grad_norm": 0.17409978806972504, "learning_rate": 3.477266384457914e-05, "logits/chosen": -7.473102569580078, "logits/rejected": -7.470751762390137, "logps/chosen": -18.048171997070312, "logps/rejected": -101.27825164794922, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 1.285508155822754, "rewards/margins": 8.220542907714844, "rewards/rejected": -6.93503475189209, "step": 122 }, { "epoch": 0.08634608634608634, "grad_norm": 0.22828377783298492, "learning_rate": 3.48317518573233e-05, "logits/chosen": -6.783804893493652, "logits/rejected": -6.795731544494629, "logps/chosen": -20.50562286376953, "logps/rejected": -92.42366027832031, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 1.0895661115646362, "rewards/margins": 7.059884548187256, "rewards/rejected": -5.970318794250488, "step": 123 }, { "epoch": 0.08704808704808704, "grad_norm": 0.17955878376960754, "learning_rate": 3.489036141937059e-05, "logits/chosen": -7.851428985595703, "logits/rejected": -7.855470180511475, "logps/chosen": -17.496849060058594, "logps/rejected": -99.49530029296875, "loss": 0.0714, "rewards/accuracies": 1.0, "rewards/chosen": 1.4314199686050415, "rewards/margins": 8.238540649414062, "rewards/rejected": -6.807120323181152, "step": 124 }, { "epoch": 0.08775008775008775, "grad_norm": 2.043734312057495, "learning_rate": 3.494850021680094e-05, "logits/chosen": -7.130043983459473, "logits/rejected": -7.1042351722717285, "logps/chosen": -18.7564754486084, "logps/rejected": -92.924560546875, "loss": 0.1893, "rewards/accuracies": 1.0, "rewards/chosen": 1.301405429840088, "rewards/margins": 7.36538028717041, "rewards/rejected": -6.0639753341674805, "step": 125 }, { "epoch": 0.08845208845208845, "grad_norm": 0.20197635889053345, "learning_rate": 3.500617575195938e-05, "logits/chosen": -8.037834167480469, "logits/rejected": -8.016092300415039, "logps/chosen": -18.83987045288086, "logps/rejected": -101.12525939941406, "loss": 0.0909, "rewards/accuracies": 1.0, "rewards/chosen": 1.2448923587799072, "rewards/margins": 8.081230163574219, "rewards/rejected": -6.836338043212891, "step": 126 }, { "epoch": 0.08915408915408915, "grad_norm": 0.17912715673446655, "learning_rate": 3.5063395349265945e-05, "logits/chosen": -7.990342140197754, "logits/rejected": -7.996137619018555, "logps/chosen": -17.838237762451172, "logps/rejected": -99.74215698242188, "loss": 0.0657, "rewards/accuracies": 1.0, "rewards/chosen": 1.4162174463272095, "rewards/margins": 8.198055267333984, "rewards/rejected": -6.781838417053223, "step": 127 }, { "epoch": 0.08985608985608985, "grad_norm": 0.17709124088287354, "learning_rate": 3.5120166160797804e-05, "logits/chosen": -7.2281694412231445, "logits/rejected": -7.240697860717773, "logps/chosen": -20.002666473388672, "logps/rejected": -90.99267578125, "loss": 0.0617, "rewards/accuracies": 0.875, "rewards/chosen": 1.15201997756958, "rewards/margins": 7.092888355255127, "rewards/rejected": -5.940868377685547, "step": 128 }, { "epoch": 0.09055809055809055, "grad_norm": 7.339588165283203, "learning_rate": 3.517649517165415e-05, "logits/chosen": -7.51271390914917, "logits/rejected": -7.539922714233398, "logps/chosen": -19.041221618652344, "logps/rejected": -101.83401489257812, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": 1.2011319398880005, "rewards/margins": 8.155099868774414, "rewards/rejected": -6.953968524932861, "step": 129 }, { "epoch": 0.09126009126009126, "grad_norm": 0.4086860120296478, "learning_rate": 3.523238920511395e-05, "logits/chosen": -7.6638336181640625, "logits/rejected": -7.672208786010742, "logps/chosen": -18.02283477783203, "logps/rejected": -98.79280853271484, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 1.3385823965072632, "rewards/margins": 8.089651107788086, "rewards/rejected": -6.75106954574585, "step": 130 }, { "epoch": 0.09196209196209196, "grad_norm": 0.21513953804969788, "learning_rate": 3.528785492759607e-05, "logits/chosen": -7.944557189941406, "logits/rejected": -7.945068359375, "logps/chosen": -23.1630859375, "logps/rejected": -76.04181671142578, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 0.9701535701751709, "rewards/margins": 5.302094459533691, "rewards/rejected": -4.331940650939941, "step": 131 }, { "epoch": 0.09266409266409266, "grad_norm": 5.646815299987793, "learning_rate": 3.5342898853430836e-05, "logits/chosen": -7.938549995422363, "logits/rejected": -7.93215274810791, "logps/chosen": -18.361181259155273, "logps/rejected": -100.20779418945312, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 1.3071396350860596, "rewards/margins": 8.148252487182617, "rewards/rejected": -6.841113567352295, "step": 132 }, { "epoch": 0.09336609336609336, "grad_norm": 0.17059233784675598, "learning_rate": 3.539752734945143e-05, "logits/chosen": -7.188520431518555, "logits/rejected": -7.188028335571289, "logps/chosen": -18.516159057617188, "logps/rejected": -98.57898712158203, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 1.4891870021820068, "rewards/margins": 8.072174072265625, "rewards/rejected": -6.582986354827881, "step": 133 }, { "epoch": 0.09406809406809406, "grad_norm": 6.119170188903809, "learning_rate": 3.5451746639413466e-05, "logits/chosen": -7.558382987976074, "logits/rejected": -7.564486503601074, "logps/chosen": -20.582725524902344, "logps/rejected": -92.52873992919922, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": 1.262923002243042, "rewards/margins": 7.219661712646484, "rewards/rejected": -5.9567389488220215, "step": 134 }, { "epoch": 0.09477009477009476, "grad_norm": 0.20077641308307648, "learning_rate": 3.550556280825011e-05, "logits/chosen": -7.218997001647949, "logits/rejected": -7.2096357345581055, "logps/chosen": -18.955135345458984, "logps/rejected": -99.68740844726562, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 1.3219819068908691, "rewards/margins": 8.04659366607666, "rewards/rejected": -6.724612236022949, "step": 135 }, { "epoch": 0.09547209547209547, "grad_norm": 0.651006817817688, "learning_rate": 3.55589818061703e-05, "logits/chosen": -7.717750072479248, "logits/rejected": -7.71749210357666, "logps/chosen": -19.00601577758789, "logps/rejected": -100.29142761230469, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": 1.0761585235595703, "rewards/margins": 8.000811576843262, "rewards/rejected": -6.924653053283691, "step": 136 }, { "epoch": 0.09617409617409617, "grad_norm": 0.32037070393562317, "learning_rate": 3.561200945260678e-05, "logits/chosen": -7.607232093811035, "logits/rejected": -7.609197616577148, "logps/chosen": -21.261714935302734, "logps/rejected": -90.03850555419922, "loss": 0.0673, "rewards/accuracies": 0.875, "rewards/chosen": 1.0572645664215088, "rewards/margins": 6.916416168212891, "rewards/rejected": -5.859151840209961, "step": 137 }, { "epoch": 0.09687609687609687, "grad_norm": 0.10127952694892883, "learning_rate": 3.5664651440020616e-05, "logits/chosen": -7.18588924407959, "logits/rejected": -7.196730613708496, "logps/chosen": -18.802722930908203, "logps/rejected": -99.88723754882812, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 1.3778181076049805, "rewards/margins": 8.064238548278809, "rewards/rejected": -6.68641996383667, "step": 138 }, { "epoch": 0.09757809757809757, "grad_norm": 2.0502114295959473, "learning_rate": 3.571691333756825e-05, "logits/chosen": -8.099369049072266, "logits/rejected": -8.110013961791992, "logps/chosen": -18.852176666259766, "logps/rejected": -99.44851684570312, "loss": 0.0811, "rewards/accuracies": 1.0, "rewards/chosen": 1.1565394401550293, "rewards/margins": 7.99715518951416, "rewards/rejected": -6.840615749359131, "step": 139 }, { "epoch": 0.09828009828009827, "grad_norm": 0.23849105834960938, "learning_rate": 3.5768800594637304e-05, "logits/chosen": -7.694960594177246, "logits/rejected": -7.677449703216553, "logps/chosen": -18.47509002685547, "logps/rejected": -98.63558959960938, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": 1.2368602752685547, "rewards/margins": 8.00662612915039, "rewards/rejected": -6.769765377044678, "step": 140 }, { "epoch": 0.09898209898209898, "grad_norm": 0.2149023860692978, "learning_rate": 3.582031854425634e-05, "logits/chosen": -7.690888404846191, "logits/rejected": -7.707386493682861, "logps/chosen": -23.874797821044922, "logps/rejected": -94.188232421875, "loss": 0.0367, "rewards/accuracies": 0.875, "rewards/chosen": 0.7733875513076782, "rewards/margins": 6.9607696533203125, "rewards/rejected": -6.187381744384766, "step": 141 }, { "epoch": 0.09968409968409968, "grad_norm": 1.8909997940063477, "learning_rate": 3.587147240638428e-05, "logits/chosen": -8.311124801635742, "logits/rejected": -8.300284385681152, "logps/chosen": -19.072885513305664, "logps/rejected": -97.82979583740234, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 1.292679786682129, "rewards/margins": 7.938316345214844, "rewards/rejected": -6.645636558532715, "step": 142 }, { "epoch": 0.10038610038610038, "grad_norm": 0.30346792936325073, "learning_rate": 3.5922267291084366e-05, "logits/chosen": -7.696118354797363, "logits/rejected": -7.703825950622559, "logps/chosen": -22.902557373046875, "logps/rejected": -92.21865844726562, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 0.9112571477890015, "rewards/margins": 6.895322799682617, "rewards/rejected": -5.984066009521484, "step": 143 }, { "epoch": 0.10108810108810108, "grad_norm": 0.46949970722198486, "learning_rate": 3.5972708201587496e-05, "logits/chosen": -8.103676795959473, "logits/rejected": -8.119165420532227, "logps/chosen": -20.409713745117188, "logps/rejected": -99.84727478027344, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 1.0930099487304688, "rewards/margins": 7.880669593811035, "rewards/rejected": -6.787659645080566, "step": 144 }, { "epoch": 0.10179010179010178, "grad_norm": 0.3480038642883301, "learning_rate": 3.6022800037249585e-05, "logits/chosen": -7.856291770935059, "logits/rejected": -7.858176231384277, "logps/chosen": -22.185009002685547, "logps/rejected": -91.70024108886719, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 0.9438284635543823, "rewards/margins": 6.884868621826172, "rewards/rejected": -5.941040515899658, "step": 145 }, { "epoch": 0.10249210249210249, "grad_norm": 0.2712872624397278, "learning_rate": 3.607254759640729e-05, "logits/chosen": -8.546213150024414, "logits/rejected": -8.539338111877441, "logps/chosen": -25.633617401123047, "logps/rejected": -88.52719116210938, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 0.5804653167724609, "rewards/margins": 6.258432865142822, "rewards/rejected": -5.677967071533203, "step": 146 }, { "epoch": 0.10319410319410319, "grad_norm": 0.2910563349723816, "learning_rate": 3.612195557913627e-05, "logits/chosen": -7.600732326507568, "logits/rejected": -7.593277454376221, "logps/chosen": -21.961027145385742, "logps/rejected": -87.4490966796875, "loss": 0.0659, "rewards/accuracies": 0.875, "rewards/chosen": 1.135108470916748, "rewards/margins": 6.646454334259033, "rewards/rejected": -5.511345863342285, "step": 147 }, { "epoch": 0.1038961038961039, "grad_norm": 0.489331990480423, "learning_rate": 3.6171028589915954e-05, "logits/chosen": -7.392334938049316, "logits/rejected": -7.384254455566406, "logps/chosen": -21.393178939819336, "logps/rejected": -99.15642547607422, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 1.0471261739730835, "rewards/margins": 7.6946234703063965, "rewards/rejected": -6.647497177124023, "step": 148 }, { "epoch": 0.1045981045981046, "grad_norm": 0.4649643003940582, "learning_rate": 3.6219771140204575e-05, "logits/chosen": -7.667098522186279, "logits/rejected": -7.653289794921875, "logps/chosen": -28.809518814086914, "logps/rejected": -78.83575439453125, "loss": 0.0707, "rewards/accuracies": 0.875, "rewards/chosen": 0.3456419110298157, "rewards/margins": 4.920358657836914, "rewards/rejected": -4.574717044830322, "step": 149 }, { "epoch": 0.1053001053001053, "grad_norm": 0.21343767642974854, "learning_rate": 3.626818765092802e-05, "logits/chosen": -6.828312397003174, "logits/rejected": -6.833069801330566, "logps/chosen": -21.54191780090332, "logps/rejected": -97.41004943847656, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 1.0394655466079712, "rewards/margins": 7.529290199279785, "rewards/rejected": -6.489824295043945, "step": 150 }, { "epoch": 0.10600210600210601, "grad_norm": 0.25068971514701843, "learning_rate": 3.6316282454886157e-05, "logits/chosen": -7.911776065826416, "logits/rejected": -7.9143595695495605, "logps/chosen": -24.534229278564453, "logps/rejected": -93.05718994140625, "loss": 0.0595, "rewards/accuracies": 0.875, "rewards/chosen": 0.8170626163482666, "rewards/margins": 6.798537254333496, "rewards/rejected": -5.981474876403809, "step": 151 }, { "epoch": 0.10670410670410671, "grad_norm": 2.0864453315734863, "learning_rate": 3.636405979907955e-05, "logits/chosen": -7.793551445007324, "logits/rejected": -7.787513732910156, "logps/chosen": -22.602632522583008, "logps/rejected": -98.13957977294922, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 0.872575044631958, "rewards/margins": 7.473413467407227, "rewards/rejected": -6.600838661193848, "step": 152 }, { "epoch": 0.10740610740610741, "grad_norm": 0.15609942376613617, "learning_rate": 3.6411523846959985e-05, "logits/chosen": -8.32400894165039, "logits/rejected": -8.33419132232666, "logps/chosen": -21.570890426635742, "logps/rejected": -100.617919921875, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 1.0258387327194214, "rewards/margins": 7.80034065246582, "rewards/rejected": -6.774501800537109, "step": 153 }, { "epoch": 0.10810810810810811, "grad_norm": 0.2295418530702591, "learning_rate": 3.645867868060772e-05, "logits/chosen": -7.748170852661133, "logits/rejected": -7.742678642272949, "logps/chosen": -24.657901763916016, "logps/rejected": -84.7904281616211, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 0.8009465932846069, "rewards/margins": 6.027615547180176, "rewards/rejected": -5.226668357849121, "step": 154 }, { "epoch": 0.10881010881010882, "grad_norm": 0.3484094440937042, "learning_rate": 3.6505528302838193e-05, "logits/chosen": -7.613758087158203, "logits/rejected": -7.628352165222168, "logps/chosen": -19.390789031982422, "logps/rejected": -94.78460693359375, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 1.2833950519561768, "rewards/margins": 7.631183624267578, "rewards/rejected": -6.3477888107299805, "step": 155 }, { "epoch": 0.10951210951210952, "grad_norm": 0.3267975151538849, "learning_rate": 3.6552076639241027e-05, "logits/chosen": -8.169540405273438, "logits/rejected": -8.187601089477539, "logps/chosen": -19.602323532104492, "logps/rejected": -99.122314453125, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": 1.1322873830795288, "rewards/margins": 7.940481185913086, "rewards/rejected": -6.808193683624268, "step": 156 }, { "epoch": 0.11021411021411022, "grad_norm": 0.27803105115890503, "learning_rate": 3.65983275401539e-05, "logits/chosen": -7.673957824707031, "logits/rejected": -7.6782073974609375, "logps/chosen": -18.952550888061523, "logps/rejected": -98.75310516357422, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 1.3143980503082275, "rewards/margins": 7.992189407348633, "rewards/rejected": -6.677791118621826, "step": 157 }, { "epoch": 0.11091611091611092, "grad_norm": 0.44256865978240967, "learning_rate": 3.664428478257371e-05, "logits/chosen": -7.716214179992676, "logits/rejected": -7.727974891662598, "logps/chosen": -19.164283752441406, "logps/rejected": -98.0462875366211, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 1.3505544662475586, "rewards/margins": 7.902575492858887, "rewards/rejected": -6.552021026611328, "step": 158 }, { "epoch": 0.11161811161811162, "grad_norm": 0.38108155131340027, "learning_rate": 3.668995207200753e-05, "logits/chosen": -7.9213457107543945, "logits/rejected": -7.907304763793945, "logps/chosen": -18.827259063720703, "logps/rejected": -102.6356430053711, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 1.1544685363769531, "rewards/margins": 8.177608489990234, "rewards/rejected": -7.023139953613281, "step": 159 }, { "epoch": 0.11232011232011233, "grad_norm": 0.32485055923461914, "learning_rate": 3.673533304426541e-05, "logits/chosen": -7.405879020690918, "logits/rejected": -7.421060085296631, "logps/chosen": -20.968263626098633, "logps/rejected": -91.50701904296875, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 1.1737148761749268, "rewards/margins": 7.070158958435059, "rewards/rejected": -5.8964433670043945, "step": 160 }, { "epoch": 0.11302211302211303, "grad_norm": 0.3755860924720764, "learning_rate": 3.67804312671975e-05, "logits/chosen": -7.51318883895874, "logits/rejected": -7.517733573913574, "logps/chosen": -21.601367950439453, "logps/rejected": -84.94363403320312, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 1.1227061748504639, "rewards/margins": 6.359409332275391, "rewards/rejected": -5.236702919006348, "step": 161 }, { "epoch": 0.11372411372411373, "grad_norm": 0.3406166136264801, "learning_rate": 3.682525024237719e-05, "logits/chosen": -7.927081108093262, "logits/rejected": -7.912776947021484, "logps/chosen": -22.705718994140625, "logps/rejected": -90.97901153564453, "loss": 0.0667, "rewards/accuracies": 0.875, "rewards/chosen": 0.9502506852149963, "rewards/margins": 6.844906806945801, "rewards/rejected": -5.894656181335449, "step": 162 }, { "epoch": 0.11442611442611443, "grad_norm": 0.3898352384567261, "learning_rate": 3.6869793406732636e-05, "logits/chosen": -8.6654691696167, "logits/rejected": -8.66085433959961, "logps/chosen": -21.918052673339844, "logps/rejected": -84.44361877441406, "loss": 0.0719, "rewards/accuracies": 1.0, "rewards/chosen": 1.008136510848999, "rewards/margins": 6.256683826446533, "rewards/rejected": -5.248547554016113, "step": 163 }, { "epoch": 0.11512811512811513, "grad_norm": 1.7123216390609741, "learning_rate": 3.69140641341283e-05, "logits/chosen": -8.098188400268555, "logits/rejected": -8.110034942626953, "logps/chosen": -19.854040145874023, "logps/rejected": -92.34028625488281, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 1.2161524295806885, "rewards/margins": 7.205447673797607, "rewards/rejected": -5.98929500579834, "step": 164 }, { "epoch": 0.11583011583011583, "grad_norm": 0.39066094160079956, "learning_rate": 3.695806573689844e-05, "logits/chosen": -7.760594844818115, "logits/rejected": -7.756313323974609, "logps/chosen": -19.322879791259766, "logps/rejected": -94.86058044433594, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 1.1274909973144531, "rewards/margins": 7.449814319610596, "rewards/rejected": -6.322322845458984, "step": 165 }, { "epoch": 0.11653211653211654, "grad_norm": 0.3673892319202423, "learning_rate": 3.700180146733426e-05, "logits/chosen": -8.240257263183594, "logits/rejected": -8.242159843444824, "logps/chosen": -21.187198638916016, "logps/rejected": -94.03084564208984, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 1.015844702720642, "rewards/margins": 7.237771987915039, "rewards/rejected": -6.221927165985107, "step": 166 }, { "epoch": 0.11723411723411724, "grad_norm": 0.40098920464515686, "learning_rate": 3.704527451912639e-05, "logits/chosen": -7.5457868576049805, "logits/rejected": -7.537236213684082, "logps/chosen": -16.912071228027344, "logps/rejected": -98.0260009765625, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 1.5237476825714111, "rewards/margins": 8.125090599060059, "rewards/rejected": -6.601343154907227, "step": 167 }, { "epoch": 0.11793611793611794, "grad_norm": 0.3912601172924042, "learning_rate": 3.708848802876438e-05, "logits/chosen": -7.886138916015625, "logits/rejected": -7.886955261230469, "logps/chosen": -20.72175407409668, "logps/rejected": -88.85093688964844, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 1.1486139297485352, "rewards/margins": 6.833949565887451, "rewards/rejected": -5.685335636138916, "step": 168 }, { "epoch": 0.11863811863811864, "grad_norm": 0.3221084773540497, "learning_rate": 3.7131445076894564e-05, "logits/chosen": -7.555947303771973, "logits/rejected": -7.554098606109619, "logps/chosen": -23.65464973449707, "logps/rejected": -86.67320251464844, "loss": 0.046, "rewards/accuracies": 0.875, "rewards/chosen": 0.9367548823356628, "rewards/margins": 6.2689619064331055, "rewards/rejected": -5.332207679748535, "step": 169 }, { "epoch": 0.11934011934011934, "grad_norm": 0.3625067472457886, "learning_rate": 3.717414868963791e-05, "logits/chosen": -7.572892189025879, "logits/rejected": -7.589142799377441, "logps/chosen": -17.07326889038086, "logps/rejected": -99.91954040527344, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 1.5612865686416626, "rewards/margins": 8.335865020751953, "rewards/rejected": -6.774579048156738, "step": 170 }, { "epoch": 0.12004212004212005, "grad_norm": 2.4849255084991455, "learning_rate": 3.721660183986924e-05, "logits/chosen": -7.001974105834961, "logits/rejected": -7.015968322753906, "logps/chosen": -20.972929000854492, "logps/rejected": -94.69083404541016, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": 1.16594398021698, "rewards/margins": 7.277277946472168, "rewards/rejected": -6.111333847045898, "step": 171 }, { "epoch": 0.12074412074412075, "grad_norm": 0.805355966091156, "learning_rate": 3.725880744845915e-05, "logits/chosen": -7.903604507446289, "logits/rejected": -7.874320030212402, "logps/chosen": -16.675830841064453, "logps/rejected": -99.74359130859375, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 1.5231497287750244, "rewards/margins": 8.371809005737305, "rewards/rejected": -6.848658561706543, "step": 172 }, { "epoch": 0.12144612144612145, "grad_norm": 0.42274701595306396, "learning_rate": 3.730076838547993e-05, "logits/chosen": -8.806150436401367, "logits/rejected": -8.80722427368164, "logps/chosen": -19.520233154296875, "logps/rejected": -101.85262298583984, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 0.9903805255889893, "rewards/margins": 8.148277282714844, "rewards/rejected": -7.157896995544434, "step": 173 }, { "epoch": 0.12214812214812215, "grad_norm": 0.4547980725765228, "learning_rate": 3.734248747137666e-05, "logits/chosen": -8.583671569824219, "logits/rejected": -8.57231330871582, "logps/chosen": -16.47041130065918, "logps/rejected": -97.65269470214844, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 1.5680454969406128, "rewards/margins": 8.250554084777832, "rewards/rejected": -6.68250846862793, "step": 174 }, { "epoch": 0.12285012285012285, "grad_norm": 0.6159536838531494, "learning_rate": 3.738396747810492e-05, "logits/chosen": -8.054576873779297, "logits/rejected": -8.064279556274414, "logps/chosen": -18.21646499633789, "logps/rejected": -99.63160705566406, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 1.2869718074798584, "rewards/margins": 8.142821311950684, "rewards/rejected": -6.855849742889404, "step": 175 }, { "epoch": 0.12355212355212356, "grad_norm": 2.258744716644287, "learning_rate": 3.7425211130235834e-05, "logits/chosen": -8.470221519470215, "logits/rejected": -8.468670845031738, "logps/chosen": -17.9901123046875, "logps/rejected": -99.63473510742188, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 1.3500969409942627, "rewards/margins": 8.153528213500977, "rewards/rejected": -6.803430557250977, "step": 176 }, { "epoch": 0.12425412425412426, "grad_norm": 2.190383195877075, "learning_rate": 3.7466221106030115e-05, "logits/chosen": -8.268542289733887, "logits/rejected": -8.262495040893555, "logps/chosen": -18.57537269592285, "logps/rejected": -100.00627136230469, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 1.3351540565490723, "rewards/margins": 8.13683032989502, "rewards/rejected": -6.801675796508789, "step": 177 }, { "epoch": 0.12495612495612496, "grad_norm": 0.2589016258716583, "learning_rate": 3.750700003848157e-05, "logits/chosen": -8.941194534301758, "logits/rejected": -8.953802108764648, "logps/chosen": -18.521690368652344, "logps/rejected": -94.37974548339844, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 1.2977834939956665, "rewards/margins": 7.639655590057373, "rewards/rejected": -6.341871738433838, "step": 178 }, { "epoch": 0.12565812565812565, "grad_norm": 0.9590855836868286, "learning_rate": 3.7547550516331555e-05, "logits/chosen": -8.757786750793457, "logits/rejected": -8.757072448730469, "logps/chosen": -18.814651489257812, "logps/rejected": -93.95803833007812, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 1.2435977458953857, "rewards/margins": 7.592794895172119, "rewards/rejected": -6.3491973876953125, "step": 179 }, { "epoch": 0.12636012636012636, "grad_norm": 1.1361846923828125, "learning_rate": 3.75878750850551e-05, "logits/chosen": -7.764183044433594, "logits/rejected": -7.776927947998047, "logps/chosen": -22.133222579956055, "logps/rejected": -95.98182678222656, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.9332303404808044, "rewards/margins": 7.389235496520996, "rewards/rejected": -6.456005096435547, "step": 180 }, { "epoch": 0.12706212706212705, "grad_norm": 0.20723015069961548, "learning_rate": 3.7627976247819744e-05, "logits/chosen": -7.788818359375, "logits/rejected": -7.80531120300293, "logps/chosen": -20.391544342041016, "logps/rejected": -90.06130981445312, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 1.2857881784439087, "rewards/margins": 7.103300094604492, "rewards/rejected": -5.817511558532715, "step": 181 }, { "epoch": 0.12776412776412777, "grad_norm": 2.2230331897735596, "learning_rate": 3.766785646641792e-05, "logits/chosen": -7.877173900604248, "logits/rejected": -7.867382526397705, "logps/chosen": -22.407503128051758, "logps/rejected": -101.93107604980469, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 0.8847008347511292, "rewards/margins": 7.8671464920043945, "rewards/rejected": -6.982445240020752, "step": 182 }, { "epoch": 0.12846612846612845, "grad_norm": 1.4594792127609253, "learning_rate": 3.770751816217383e-05, "logits/chosen": -8.004451751708984, "logits/rejected": -7.9826555252075195, "logps/chosen": -21.16651153564453, "logps/rejected": -91.38316345214844, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 1.1076135635375977, "rewards/margins": 7.062329292297363, "rewards/rejected": -5.954715728759766, "step": 183 }, { "epoch": 0.12916812916812917, "grad_norm": 0.7141205668449402, "learning_rate": 3.7746963716825615e-05, "logits/chosen": -7.890381813049316, "logits/rejected": -7.8951520919799805, "logps/chosen": -20.88627052307129, "logps/rejected": -98.81242370605469, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 1.2808811664581299, "rewards/margins": 7.896780967712402, "rewards/rejected": -6.615900039672852, "step": 184 }, { "epoch": 0.12987012987012986, "grad_norm": 2.7278614044189453, "learning_rate": 3.778619547338356e-05, "logits/chosen": -7.5416154861450195, "logits/rejected": -7.53035831451416, "logps/chosen": -20.868789672851562, "logps/rejected": -99.27313995361328, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.1446789503097534, "rewards/margins": 7.903213977813721, "rewards/rejected": -6.758535385131836, "step": 185 }, { "epoch": 0.13057213057213057, "grad_norm": 1.2759206295013428, "learning_rate": 3.782521573696528e-05, "logits/chosen": -8.762497901916504, "logits/rejected": -8.765325546264648, "logps/chosen": -24.639806747436523, "logps/rejected": -79.37983703613281, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 0.8256840705871582, "rewards/margins": 5.530773162841797, "rewards/rejected": -4.705089092254639, "step": 186 }, { "epoch": 0.13127413127413126, "grad_norm": 0.33815619349479675, "learning_rate": 3.786402677560832e-05, "logits/chosen": -8.515809059143066, "logits/rejected": -8.51432991027832, "logps/chosen": -23.561622619628906, "logps/rejected": -96.29591369628906, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.7219645380973816, "rewards/margins": 7.258949279785156, "rewards/rejected": -6.536984920501709, "step": 187 }, { "epoch": 0.13197613197613198, "grad_norm": 1.1443562507629395, "learning_rate": 3.790263082106134e-05, "logits/chosen": -7.9466657638549805, "logits/rejected": -7.965194225311279, "logps/chosen": -23.115966796875, "logps/rejected": -93.49640655517578, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 0.8524225950241089, "rewards/margins": 6.972733497619629, "rewards/rejected": -6.1203107833862305, "step": 188 }, { "epoch": 0.13267813267813267, "grad_norm": 1.8093936443328857, "learning_rate": 3.794103006955407e-05, "logits/chosen": -8.467307090759277, "logits/rejected": -8.484516143798828, "logps/chosen": -27.600868225097656, "logps/rejected": -90.8130111694336, "loss": 0.0502, "rewards/accuracies": 0.875, "rewards/chosen": 0.4182731509208679, "rewards/margins": 6.251033782958984, "rewards/rejected": -5.832760334014893, "step": 189 }, { "epoch": 0.13338013338013338, "grad_norm": 0.2653859555721283, "learning_rate": 3.797922668254715e-05, "logits/chosen": -8.861492156982422, "logits/rejected": -8.875917434692383, "logps/chosen": -25.85000991821289, "logps/rejected": -89.64067840576172, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 0.7729813456535339, "rewards/margins": 6.513546943664551, "rewards/rejected": -5.740566253662109, "step": 190 }, { "epoch": 0.13408213408213407, "grad_norm": 0.29263412952423096, "learning_rate": 3.801722278746213e-05, "logits/chosen": -8.680886268615723, "logits/rejected": -8.67392635345459, "logps/chosen": -25.32840347290039, "logps/rejected": -96.47200012207031, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 0.5885277390480042, "rewards/margins": 6.978774547576904, "rewards/rejected": -6.390246868133545, "step": 191 }, { "epoch": 0.13478413478413478, "grad_norm": 0.18127386271953583, "learning_rate": 3.8055020478392495e-05, "logits/chosen": -8.394521713256836, "logits/rejected": -8.402703285217285, "logps/chosen": -22.78615951538086, "logps/rejected": -101.85408782958984, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 0.9096255898475647, "rewards/margins": 7.891705513000488, "rewards/rejected": -6.982080459594727, "step": 192 }, { "epoch": 0.13548613548613547, "grad_norm": 0.14253807067871094, "learning_rate": 3.809262181679623e-05, "logits/chosen": -8.756370544433594, "logits/rejected": -8.743931770324707, "logps/chosen": -24.495193481445312, "logps/rejected": -94.77302551269531, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.8122169971466064, "rewards/margins": 7.081738471984863, "rewards/rejected": -6.269521713256836, "step": 193 }, { "epoch": 0.1361881361881362, "grad_norm": 0.14271044731140137, "learning_rate": 3.813002883217044e-05, "logits/chosen": -8.078531265258789, "logits/rejected": -8.082202911376953, "logps/chosen": -22.763477325439453, "logps/rejected": -97.46916198730469, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.8855063319206238, "rewards/margins": 7.473370552062988, "rewards/rejected": -6.587864398956299, "step": 194 }, { "epoch": 0.13689013689013688, "grad_norm": 0.44687098264694214, "learning_rate": 3.816724352270863e-05, "logits/chosen": -8.56879997253418, "logits/rejected": -8.611652374267578, "logps/chosen": -22.341203689575195, "logps/rejected": -99.25498962402344, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 0.9054560661315918, "rewards/margins": 7.605410099029541, "rewards/rejected": -6.699954032897949, "step": 195 }, { "epoch": 0.1375921375921376, "grad_norm": 0.07012852281332016, "learning_rate": 3.8204267855941266e-05, "logits/chosen": -8.069513320922852, "logits/rejected": -8.082135200500488, "logps/chosen": -21.048412322998047, "logps/rejected": -99.21280670166016, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.1062850952148438, "rewards/margins": 7.93264627456665, "rewards/rejected": -6.826361179351807, "step": 196 }, { "epoch": 0.1382941382941383, "grad_norm": 1.9623947143554688, "learning_rate": 3.824110376935989e-05, "logits/chosen": -8.078295707702637, "logits/rejected": -8.097434997558594, "logps/chosen": -23.69803237915039, "logps/rejected": -95.223388671875, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427775144577026, "rewards/margins": 7.229762077331543, "rewards/rejected": -6.286984443664551, "step": 197 }, { "epoch": 0.138996138996139, "grad_norm": 0.39703309535980225, "learning_rate": 3.827775317102552e-05, "logits/chosen": -8.174117088317871, "logits/rejected": -8.168572425842285, "logps/chosen": -22.995166778564453, "logps/rejected": -89.40516662597656, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 0.9517701864242554, "rewards/margins": 6.69978141784668, "rewards/rejected": -5.748010635375977, "step": 198 }, { "epoch": 0.1396981396981397, "grad_norm": 0.1890508532524109, "learning_rate": 3.831421794016178e-05, "logits/chosen": -8.797588348388672, "logits/rejected": -8.791847229003906, "logps/chosen": -23.07459831237793, "logps/rejected": -99.32400512695312, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 0.8383432626724243, "rewards/margins": 7.627185821533203, "rewards/rejected": -6.788843154907227, "step": 199 }, { "epoch": 0.1404001404001404, "grad_norm": 2.05440616607666, "learning_rate": 3.835049992773302e-05, "logits/chosen": -8.133867263793945, "logits/rejected": -8.144014358520508, "logps/chosen": -19.5064754486084, "logps/rejected": -103.37393951416016, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 1.1687363386154175, "rewards/margins": 8.308393478393555, "rewards/rejected": -7.139657974243164, "step": 200 }, { "epoch": 0.14110214110214112, "grad_norm": 0.34660065174102783, "learning_rate": 3.838660095700815e-05, "logits/chosen": -9.162574768066406, "logits/rejected": -9.177444458007812, "logps/chosen": -21.447856903076172, "logps/rejected": -99.5054931640625, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 0.9690049290657043, "rewards/margins": 7.781795501708984, "rewards/rejected": -6.812790870666504, "step": 201 }, { "epoch": 0.1418041418041418, "grad_norm": 1.4414465427398682, "learning_rate": 3.84225228241104e-05, "logits/chosen": -8.51369857788086, "logits/rejected": -8.518377304077148, "logps/chosen": -24.89828109741211, "logps/rejected": -94.57894897460938, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.8470261096954346, "rewards/margins": 7.059021949768066, "rewards/rejected": -6.211996078491211, "step": 202 }, { "epoch": 0.14250614250614252, "grad_norm": 0.8299551606178284, "learning_rate": 3.8458267298553554e-05, "logits/chosen": -8.830947875976562, "logits/rejected": -8.827447891235352, "logps/chosen": -18.43064308166504, "logps/rejected": -96.06614685058594, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 1.240814208984375, "rewards/margins": 7.757233619689941, "rewards/rejected": -6.516419410705566, "step": 203 }, { "epoch": 0.1432081432081432, "grad_norm": 0.44161731004714966, "learning_rate": 3.8493836123764984e-05, "logits/chosen": -9.020879745483398, "logits/rejected": -9.033422470092773, "logps/chosen": -14.32162857055664, "logps/rejected": -101.77822875976562, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 1.645437240600586, "rewards/margins": 8.776103973388672, "rewards/rejected": -7.130666732788086, "step": 204 }, { "epoch": 0.14391014391014392, "grad_norm": 0.04485098645091057, "learning_rate": 3.852923101759591e-05, "logits/chosen": -8.172466278076172, "logits/rejected": -8.18247127532959, "logps/chosen": -15.30668830871582, "logps/rejected": -101.7931137084961, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.6742885112762451, "rewards/margins": 8.687026977539062, "rewards/rejected": -7.012738227844238, "step": 205 }, { "epoch": 0.1446121446121446, "grad_norm": 0.1133572906255722, "learning_rate": 3.856445367281923e-05, "logits/chosen": -8.025765419006348, "logits/rejected": -8.045225143432617, "logps/chosen": -13.55118179321289, "logps/rejected": -103.484130859375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.729882001876831, "rewards/margins": 8.859251022338867, "rewards/rejected": -7.129369735717773, "step": 206 }, { "epoch": 0.14531414531414533, "grad_norm": 0.12442497164011002, "learning_rate": 3.859950575761529e-05, "logits/chosen": -8.566313743591309, "logits/rejected": -8.567716598510742, "logps/chosen": -14.974251747131348, "logps/rejected": -91.42678833007812, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 1.7499418258666992, "rewards/margins": 7.655340671539307, "rewards/rejected": -5.905398845672607, "step": 207 }, { "epoch": 0.14601614601614601, "grad_norm": 0.13546162843704224, "learning_rate": 3.8634388916046025e-05, "logits/chosen": -8.069467544555664, "logits/rejected": -8.052011489868164, "logps/chosen": -15.361384391784668, "logps/rejected": -100.42359924316406, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.6827210187911987, "rewards/margins": 8.465988159179688, "rewards/rejected": -6.783267498016357, "step": 208 }, { "epoch": 0.14671814671814673, "grad_norm": 0.4922577440738678, "learning_rate": 3.866910476851757e-05, "logits/chosen": -8.450356483459473, "logits/rejected": -8.453315734863281, "logps/chosen": -10.938240051269531, "logps/rejected": -101.92213439941406, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 2.1313893795013428, "rewards/margins": 9.188261032104492, "rewards/rejected": -7.056872367858887, "step": 209 }, { "epoch": 0.14742014742014742, "grad_norm": 0.4832911789417267, "learning_rate": 3.870365491223199e-05, "logits/chosen": -8.693262100219727, "logits/rejected": -8.692747116088867, "logps/chosen": -13.147320747375488, "logps/rejected": -103.72109985351562, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 1.8427300453186035, "rewards/margins": 8.936820983886719, "rewards/rejected": -7.094091415405273, "step": 210 }, { "epoch": 0.14812214812214813, "grad_norm": 0.1274886280298233, "learning_rate": 3.8738040921628215e-05, "logits/chosen": -8.83476448059082, "logits/rejected": -8.847158432006836, "logps/chosen": -13.432687759399414, "logps/rejected": -102.68789672851562, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.8294259309768677, "rewards/margins": 8.85903549194336, "rewards/rejected": -7.0296101570129395, "step": 211 }, { "epoch": 0.14882414882414882, "grad_norm": 2.23388409614563, "learning_rate": 3.877226434881253e-05, "logits/chosen": -9.305366516113281, "logits/rejected": -9.334085464477539, "logps/chosen": -14.652730941772461, "logps/rejected": -96.07176208496094, "loss": 0.0857, "rewards/accuracies": 1.0, "rewards/chosen": 1.6960638761520386, "rewards/margins": 8.187326431274414, "rewards/rejected": -6.491261959075928, "step": 212 }, { "epoch": 0.14952614952614954, "grad_norm": 0.5314605236053467, "learning_rate": 3.880632672397897e-05, "logits/chosen": -9.298408508300781, "logits/rejected": -9.286131858825684, "logps/chosen": -11.507303237915039, "logps/rejected": -103.00035095214844, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 1.9274749755859375, "rewards/margins": 9.10441780090332, "rewards/rejected": -7.176943302154541, "step": 213 }, { "epoch": 0.15022815022815023, "grad_norm": 1.5911074876785278, "learning_rate": 3.884022955581985e-05, "logits/chosen": -8.271459579467773, "logits/rejected": -8.291229248046875, "logps/chosen": -19.065574645996094, "logps/rejected": -91.94277954101562, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 1.3900065422058105, "rewards/margins": 7.274221420288086, "rewards/rejected": -5.884214878082275, "step": 214 }, { "epoch": 0.15093015093015094, "grad_norm": 0.04503808543086052, "learning_rate": 3.887397433192676e-05, "logits/chosen": -9.827035903930664, "logits/rejected": -9.804494857788086, "logps/chosen": -12.600761413574219, "logps/rejected": -102.53011322021484, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.8846367597579956, "rewards/margins": 8.992534637451172, "rewards/rejected": -7.107897758483887, "step": 215 }, { "epoch": 0.15163215163215163, "grad_norm": 0.07925863564014435, "learning_rate": 3.890756251918219e-05, "logits/chosen": -8.323220252990723, "logits/rejected": -8.306710243225098, "logps/chosen": -16.837505340576172, "logps/rejected": -98.24346160888672, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.4674417972564697, "rewards/margins": 8.095512390136719, "rewards/rejected": -6.628070831298828, "step": 216 }, { "epoch": 0.15233415233415235, "grad_norm": 0.4342501759529114, "learning_rate": 3.894099556414216e-05, "logits/chosen": -8.846985816955566, "logits/rejected": -8.83777141571045, "logps/chosen": -15.633768081665039, "logps/rejected": -101.31890869140625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.43574857711792, "rewards/margins": 8.517414093017578, "rewards/rejected": -7.0816650390625, "step": 217 }, { "epoch": 0.15303615303615303, "grad_norm": 0.027595961466431618, "learning_rate": 3.897427489341009e-05, "logits/chosen": -8.547807693481445, "logits/rejected": -8.562054634094238, "logps/chosen": -18.116458892822266, "logps/rejected": -97.04737854003906, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3461041450500488, "rewards/margins": 7.925119876861572, "rewards/rejected": -6.579015731811523, "step": 218 }, { "epoch": 0.15373815373815375, "grad_norm": 0.11770953238010406, "learning_rate": 3.900740191400198e-05, "logits/chosen": -8.328121185302734, "logits/rejected": -8.339106559753418, "logps/chosen": -18.163280487060547, "logps/rejected": -98.45510864257812, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 1.4789507389068604, "rewards/margins": 8.100175857543945, "rewards/rejected": -6.621225357055664, "step": 219 }, { "epoch": 0.15444015444015444, "grad_norm": 1.1351804733276367, "learning_rate": 3.904037801370344e-05, "logits/chosen": -8.54415225982666, "logits/rejected": -8.5723237991333, "logps/chosen": -18.644214630126953, "logps/rejected": -101.72767639160156, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 1.2777879238128662, "rewards/margins": 8.28019905090332, "rewards/rejected": -7.002411365509033, "step": 220 }, { "epoch": 0.15514215514215515, "grad_norm": 1.8886774778366089, "learning_rate": 3.9073204561418514e-05, "logits/chosen": -8.438125610351562, "logits/rejected": -8.44377326965332, "logps/chosen": -23.885263442993164, "logps/rejected": -95.69446563720703, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 0.7484297752380371, "rewards/margins": 7.172182083129883, "rewards/rejected": -6.423752784729004, "step": 221 }, { "epoch": 0.15584415584415584, "grad_norm": 0.6555754542350769, "learning_rate": 3.9105882907510644e-05, "logits/chosen": -8.70721435546875, "logits/rejected": -8.705801963806152, "logps/chosen": -19.70771026611328, "logps/rejected": -91.28721618652344, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": 1.3467071056365967, "rewards/margins": 7.245902061462402, "rewards/rejected": -5.899194717407227, "step": 222 }, { "epoch": 0.15654615654615656, "grad_norm": 1.7174272537231445, "learning_rate": 3.913841438413601e-05, "logits/chosen": -8.673357009887695, "logits/rejected": -8.654621124267578, "logps/chosen": -17.38541030883789, "logps/rejected": -102.1085433959961, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 1.3978283405303955, "rewards/margins": 8.478913307189941, "rewards/rejected": -7.081084251403809, "step": 223 }, { "epoch": 0.15724815724815724, "grad_norm": 1.7493647336959839, "learning_rate": 3.917080030556938e-05, "logits/chosen": -8.213982582092285, "logits/rejected": -8.236342430114746, "logps/chosen": -19.529003143310547, "logps/rejected": -98.78301239013672, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 1.355391502380371, "rewards/margins": 7.936057090759277, "rewards/rejected": -6.5806660652160645, "step": 224 }, { "epoch": 0.15795015795015796, "grad_norm": 1.0447288751602173, "learning_rate": 3.9203041968522716e-05, "logits/chosen": -9.664929389953613, "logits/rejected": -9.663713455200195, "logps/chosen": -19.708837509155273, "logps/rejected": -96.84849548339844, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 1.1690940856933594, "rewards/margins": 7.660174369812012, "rewards/rejected": -6.491079807281494, "step": 225 }, { "epoch": 0.15865215865215865, "grad_norm": 0.0818687453866005, "learning_rate": 3.923514065245669e-05, "logits/chosen": -8.913846015930176, "logits/rejected": -8.9315824508667, "logps/chosen": -16.09840202331543, "logps/rejected": -95.63426208496094, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.6346213817596436, "rewards/margins": 7.969356536865234, "rewards/rejected": -6.334734916687012, "step": 226 }, { "epoch": 0.15935415935415936, "grad_norm": 0.06316595524549484, "learning_rate": 3.926709761988538e-05, "logits/chosen": -8.940530776977539, "logits/rejected": -8.930830955505371, "logps/chosen": -20.191476821899414, "logps/rejected": -80.11697387695312, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.2111088037490845, "rewards/margins": 6.058352947235107, "rewards/rejected": -4.8472442626953125, "step": 227 }, { "epoch": 0.16005616005616005, "grad_norm": 0.04170745238661766, "learning_rate": 3.929891411667424e-05, "logits/chosen": -8.427237510681152, "logits/rejected": -8.413457870483398, "logps/chosen": -14.361978530883789, "logps/rejected": -100.16503143310547, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.7686448097229004, "rewards/margins": 8.634977340698242, "rewards/rejected": -6.8663330078125, "step": 228 }, { "epoch": 0.16075816075816077, "grad_norm": 0.08087937533855438, "learning_rate": 3.933059137233147e-05, "logits/chosen": -8.442742347717285, "logits/rejected": -8.44344425201416, "logps/chosen": -10.896326065063477, "logps/rejected": -101.30597686767578, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 2.0822510719299316, "rewards/margins": 9.071765899658203, "rewards/rejected": -6.9895148277282715, "step": 229 }, { "epoch": 0.16146016146016146, "grad_norm": 0.16632147133350372, "learning_rate": 3.9362130600293214e-05, "logits/chosen": -7.951422691345215, "logits/rejected": -7.958190441131592, "logps/chosen": -14.409790992736816, "logps/rejected": -99.968994140625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.902887225151062, "rewards/margins": 8.558490753173828, "rewards/rejected": -6.65560245513916, "step": 230 }, { "epoch": 0.16216216216216217, "grad_norm": 0.22838647663593292, "learning_rate": 3.9393532998202405e-05, "logits/chosen": -8.501943588256836, "logits/rejected": -8.492792129516602, "logps/chosen": -13.3207426071167, "logps/rejected": -97.33335876464844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.9666965007781982, "rewards/margins": 8.44558334350586, "rewards/rejected": -6.47888708114624, "step": 231 }, { "epoch": 0.16286416286416286, "grad_norm": 0.07375779002904892, "learning_rate": 3.942479974818166e-05, "logits/chosen": -8.814910888671875, "logits/rejected": -8.815544128417969, "logps/chosen": -13.254278182983398, "logps/rejected": -92.6255874633789, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 1.8217872381210327, "rewards/margins": 8.03392219543457, "rewards/rejected": -6.21213436126709, "step": 232 }, { "epoch": 0.16356616356616357, "grad_norm": 0.08125881850719452, "learning_rate": 3.945593201710032e-05, "logits/chosen": -8.685150146484375, "logits/rejected": -8.698638916015625, "logps/chosen": -10.890146255493164, "logps/rejected": -100.01165008544922, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 2.034757614135742, "rewards/margins": 8.898473739624023, "rewards/rejected": -6.8637166023254395, "step": 233 }, { "epoch": 0.16426816426816426, "grad_norm": 0.9403457641601562, "learning_rate": 3.9486930956835724e-05, "logits/chosen": -8.613592147827148, "logits/rejected": -8.627740859985352, "logps/chosen": -11.382429122924805, "logps/rejected": -98.87037658691406, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 1.9615437984466553, "rewards/margins": 8.686964988708496, "rewards/rejected": -6.725421905517578, "step": 234 }, { "epoch": 0.16497016497016498, "grad_norm": 3.8597731590270996, "learning_rate": 3.951779770452894e-05, "logits/chosen": -8.495304107666016, "logits/rejected": -8.501811981201172, "logps/chosen": -16.910070419311523, "logps/rejected": -90.88866424560547, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 1.5816395282745361, "rewards/margins": 7.418920516967773, "rewards/rejected": -5.837281227111816, "step": 235 }, { "epoch": 0.16567216567216567, "grad_norm": 0.0724283829331398, "learning_rate": 3.954853338283512e-05, "logits/chosen": -8.60200309753418, "logits/rejected": -8.58193588256836, "logps/chosen": -11.785398483276367, "logps/rejected": -101.2637710571289, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.9620437622070312, "rewards/margins": 8.867547035217285, "rewards/rejected": -6.905502796173096, "step": 236 }, { "epoch": 0.16637416637416638, "grad_norm": 0.054624803364276886, "learning_rate": 3.9579139100168404e-05, "logits/chosen": -8.042311668395996, "logits/rejected": -8.026243209838867, "logps/chosen": -13.633519172668457, "logps/rejected": -96.27269744873047, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.8081172704696655, "rewards/margins": 8.389942169189453, "rewards/rejected": -6.58182430267334, "step": 237 }, { "epoch": 0.16707616707616707, "grad_norm": 2.123426914215088, "learning_rate": 3.960961595094187e-05, "logits/chosen": -8.153595924377441, "logits/rejected": -8.15337085723877, "logps/chosen": -10.312664031982422, "logps/rejected": -100.57037353515625, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 1.9946398735046387, "rewards/margins": 9.014122009277344, "rewards/rejected": -7.019482612609863, "step": 238 }, { "epoch": 0.16777816777816779, "grad_norm": 0.05339115858078003, "learning_rate": 3.96399650158023e-05, "logits/chosen": -8.45573616027832, "logits/rejected": -8.456380844116211, "logps/chosen": -11.025104522705078, "logps/rejected": -101.03890228271484, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.0720810890197754, "rewards/margins": 8.930277824401855, "rewards/rejected": -6.858196258544922, "step": 239 }, { "epoch": 0.16848016848016847, "grad_norm": 0.06699275225400925, "learning_rate": 3.96701873618601e-05, "logits/chosen": -8.189186096191406, "logits/rejected": -8.198141098022461, "logps/chosen": -13.585570335388184, "logps/rejected": -95.56890106201172, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.8785713911056519, "rewards/margins": 8.170707702636719, "rewards/rejected": -6.292136192321777, "step": 240 }, { "epoch": 0.1691821691821692, "grad_norm": 0.04850306734442711, "learning_rate": 3.970028404291448e-05, "logits/chosen": -7.874259948730469, "logits/rejected": -7.874581336975098, "logps/chosen": -19.457456588745117, "logps/rejected": -95.2472915649414, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.4693249464035034, "rewards/margins": 7.5732879638671875, "rewards/rejected": -6.1039628982543945, "step": 241 }, { "epoch": 0.16988416988416988, "grad_norm": 0.04323434457182884, "learning_rate": 3.9730256099673865e-05, "logits/chosen": -7.59445858001709, "logits/rejected": -7.554975509643555, "logps/chosen": -10.746862411499023, "logps/rejected": -100.13307189941406, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.126997709274292, "rewards/margins": 8.908334732055664, "rewards/rejected": -6.781336307525635, "step": 242 }, { "epoch": 0.1705861705861706, "grad_norm": 0.07338925451040268, "learning_rate": 3.976010455997187e-05, "logits/chosen": -7.6927490234375, "logits/rejected": -7.696385383605957, "logps/chosen": -18.233022689819336, "logps/rejected": -97.99156951904297, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3827767372131348, "rewards/margins": 7.842388153076172, "rewards/rejected": -6.459611415863037, "step": 243 }, { "epoch": 0.17128817128817128, "grad_norm": 0.15339581668376923, "learning_rate": 3.978983043897883e-05, "logits/chosen": -8.625326156616211, "logits/rejected": -8.629570960998535, "logps/chosen": -12.882743835449219, "logps/rejected": -93.7718734741211, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.9896838665008545, "rewards/margins": 8.083964347839355, "rewards/rejected": -6.094281196594238, "step": 244 }, { "epoch": 0.171990171990172, "grad_norm": 0.05019182339310646, "learning_rate": 3.981943473940888e-05, "logits/chosen": -7.682768821716309, "logits/rejected": -7.6916422843933105, "logps/chosen": -17.80765151977539, "logps/rejected": -94.87046813964844, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.6559364795684814, "rewards/margins": 7.704854488372803, "rewards/rejected": -6.048918724060059, "step": 245 }, { "epoch": 0.17269217269217269, "grad_norm": 0.039299823343753815, "learning_rate": 3.984891845172299e-05, "logits/chosen": -9.337804794311523, "logits/rejected": -9.336587905883789, "logps/chosen": -11.470099449157715, "logps/rejected": -98.46244049072266, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.8730318546295166, "rewards/margins": 8.616815567016602, "rewards/rejected": -6.743783950805664, "step": 246 }, { "epoch": 0.1733941733941734, "grad_norm": 0.03497990965843201, "learning_rate": 3.987828255432777e-05, "logits/chosen": -8.216849327087402, "logits/rejected": -8.217277526855469, "logps/chosen": -14.677803039550781, "logps/rejected": -94.75424194335938, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.753398060798645, "rewards/margins": 8.024885177612305, "rewards/rejected": -6.271486759185791, "step": 247 }, { "epoch": 0.1740961740961741, "grad_norm": 0.021778099238872528, "learning_rate": 3.9907528013770276e-05, "logits/chosen": -8.58681869506836, "logits/rejected": -8.601800918579102, "logps/chosen": -14.047489166259766, "logps/rejected": -95.55862426757812, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7841475009918213, "rewards/margins": 8.17956256866455, "rewards/rejected": -6.395415306091309, "step": 248 }, { "epoch": 0.1747981747981748, "grad_norm": 0.02914053201675415, "learning_rate": 3.993665578492894e-05, "logits/chosen": -8.850407600402832, "logits/rejected": -8.839385986328125, "logps/chosen": -12.799951553344727, "logps/rejected": -94.8795166015625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.752589225769043, "rewards/margins": 8.20439338684082, "rewards/rejected": -6.451805114746094, "step": 249 }, { "epoch": 0.1755001755001755, "grad_norm": 0.0301784947514534, "learning_rate": 3.9965666811200624e-05, "logits/chosen": -7.602506160736084, "logits/rejected": -7.580987930297852, "logps/chosen": -12.962373733520508, "logps/rejected": -95.60194396972656, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.0274102687835693, "rewards/margins": 8.320815086364746, "rewards/rejected": -6.293405055999756, "step": 250 }, { "epoch": 0.1762021762021762, "grad_norm": 7.158124923706055, "learning_rate": 3.999456202468397e-05, "logits/chosen": -8.367471694946289, "logits/rejected": -8.346355438232422, "logps/chosen": -17.253734588623047, "logps/rejected": -97.21731567382812, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 1.4224286079406738, "rewards/margins": 7.909479141235352, "rewards/rejected": -6.487050533294678, "step": 251 }, { "epoch": 0.1769041769041769, "grad_norm": 0.9674628376960754, "learning_rate": 4.002334234635907e-05, "logits/chosen": -8.309431076049805, "logits/rejected": -8.303704261779785, "logps/chosen": -16.47026252746582, "logps/rejected": -90.54997253417969, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 1.7452831268310547, "rewards/margins": 7.52919864654541, "rewards/rejected": -5.783914566040039, "step": 252 }, { "epoch": 0.1776061776061776, "grad_norm": 0.4950415790081024, "learning_rate": 4.005200868626364e-05, "logits/chosen": -9.153397560119629, "logits/rejected": -9.179061889648438, "logps/chosen": -12.05160140991211, "logps/rejected": -101.62191009521484, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 1.9610238075256348, "rewards/margins": 8.890508651733398, "rewards/rejected": -6.929485321044922, "step": 253 }, { "epoch": 0.1783081783081783, "grad_norm": 0.024625038728117943, "learning_rate": 4.008056194366564e-05, "logits/chosen": -7.934206485748291, "logits/rejected": -7.93437385559082, "logps/chosen": -11.178647994995117, "logps/rejected": -101.86433410644531, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.9916726350784302, "rewards/margins": 9.026688575744629, "rewards/rejected": -7.035015106201172, "step": 254 }, { "epoch": 0.17901017901017902, "grad_norm": 0.26098373532295227, "learning_rate": 4.010900300723259e-05, "logits/chosen": -8.314605712890625, "logits/rejected": -8.324173927307129, "logps/chosen": -12.758441925048828, "logps/rejected": -90.79134368896484, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 1.987694263458252, "rewards/margins": 7.813441753387451, "rewards/rejected": -5.825747013092041, "step": 255 }, { "epoch": 0.1797121797121797, "grad_norm": 0.09866314381361008, "learning_rate": 4.013733275519749e-05, "logits/chosen": -8.715863227844238, "logits/rejected": -8.710840225219727, "logps/chosen": -10.770325660705566, "logps/rejected": -100.53877258300781, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.0349557399749756, "rewards/margins": 8.92311954498291, "rewards/rejected": -6.8881635665893555, "step": 256 }, { "epoch": 0.18041418041418042, "grad_norm": 0.11835075914859772, "learning_rate": 4.016555205552158e-05, "logits/chosen": -8.128640174865723, "logits/rejected": -8.139445304870605, "logps/chosen": -12.92670726776123, "logps/rejected": -94.51060485839844, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 1.9385038614273071, "rewards/margins": 8.118526458740234, "rewards/rejected": -6.180023193359375, "step": 257 }, { "epoch": 0.1811161811161811, "grad_norm": 0.07036490738391876, "learning_rate": 4.0193661766053834e-05, "logits/chosen": -8.980119705200195, "logits/rejected": -9.004888534545898, "logps/chosen": -10.857986450195312, "logps/rejected": -98.5386734008789, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 2.0185956954956055, "rewards/margins": 8.744546890258789, "rewards/rejected": -6.725951194763184, "step": 258 }, { "epoch": 0.18181818181818182, "grad_norm": 0.01131998561322689, "learning_rate": 4.022166273468753e-05, "logits/chosen": -8.5206937789917, "logits/rejected": -8.577882766723633, "logps/chosen": -14.081257820129395, "logps/rejected": -100.45974731445312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6076232194900513, "rewards/margins": 8.573431015014648, "rewards/rejected": -6.965807914733887, "step": 259 }, { "epoch": 0.1825201825201825, "grad_norm": 0.014436646364629269, "learning_rate": 4.024955579951363e-05, "logits/chosen": -8.044841766357422, "logits/rejected": -8.055949211120605, "logps/chosen": -10.199614524841309, "logps/rejected": -101.87124633789062, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.2080397605895996, "rewards/margins": 9.179252624511719, "rewards/rejected": -6.971213340759277, "step": 260 }, { "epoch": 0.18322218322218323, "grad_norm": 0.030855907127261162, "learning_rate": 4.027734178897136e-05, "logits/chosen": -7.6317644119262695, "logits/rejected": -7.641480445861816, "logps/chosen": -13.970993995666504, "logps/rejected": -100.91768646240234, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.801725149154663, "rewards/margins": 8.575414657592773, "rewards/rejected": -6.773689270019531, "step": 261 }, { "epoch": 0.18392418392418391, "grad_norm": 0.038669269531965256, "learning_rate": 4.030502152199576e-05, "logits/chosen": -8.56912612915039, "logits/rejected": -8.549825668334961, "logps/chosen": -13.367143630981445, "logps/rejected": -96.03999328613281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.0618300437927246, "rewards/margins": 8.330230712890625, "rewards/rejected": -6.2684006690979, "step": 262 }, { "epoch": 0.18462618462618463, "grad_norm": 0.029080411419272423, "learning_rate": 4.033259580816264e-05, "logits/chosen": -9.494409561157227, "logits/rejected": -9.490755081176758, "logps/chosen": -18.775066375732422, "logps/rejected": -93.71451568603516, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.2613294124603271, "rewards/margins": 7.470936298370361, "rewards/rejected": -6.209606647491455, "step": 263 }, { "epoch": 0.18532818532818532, "grad_norm": 0.12511292099952698, "learning_rate": 4.036006544783052e-05, "logits/chosen": -9.015152931213379, "logits/rejected": -9.017484664916992, "logps/chosen": -12.738801956176758, "logps/rejected": -98.46756744384766, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.9349381923675537, "rewards/margins": 8.598733901977539, "rewards/rejected": -6.6637959480285645, "step": 264 }, { "epoch": 0.18603018603018603, "grad_norm": 0.06973445415496826, "learning_rate": 4.0387431232280135e-05, "logits/chosen": -8.753440856933594, "logits/rejected": -8.771016120910645, "logps/chosen": -10.347978591918945, "logps/rejected": -101.62733459472656, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.097371816635132, "rewards/margins": 9.091360092163086, "rewards/rejected": -6.993988513946533, "step": 265 }, { "epoch": 0.18673218673218672, "grad_norm": 0.39182931184768677, "learning_rate": 4.041469394385112e-05, "logits/chosen": -8.636486053466797, "logits/rejected": -8.621065139770508, "logps/chosen": -12.733766555786133, "logps/rejected": -96.54627227783203, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.9544415473937988, "rewards/margins": 8.371745109558105, "rewards/rejected": -6.417303085327148, "step": 266 }, { "epoch": 0.18743418743418744, "grad_norm": 0.06179701164364815, "learning_rate": 4.0441854356076257e-05, "logits/chosen": -8.16397476196289, "logits/rejected": -8.17342472076416, "logps/chosen": -14.183650970458984, "logps/rejected": -99.59009552001953, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.7756614685058594, "rewards/margins": 8.497014999389648, "rewards/rejected": -6.721353530883789, "step": 267 }, { "epoch": 0.18813618813618813, "grad_norm": 0.02663104236125946, "learning_rate": 4.046891323381315e-05, "logits/chosen": -8.930896759033203, "logits/rejected": -8.92031478881836, "logps/chosen": -12.130513191223145, "logps/rejected": -97.16653442382812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.9763071537017822, "rewards/margins": 8.531848907470703, "rewards/rejected": -6.555541038513184, "step": 268 }, { "epoch": 0.18883818883818884, "grad_norm": 0.30379176139831543, "learning_rate": 4.049587133337347e-05, "logits/chosen": -8.61055850982666, "logits/rejected": -8.615470886230469, "logps/chosen": -10.614154815673828, "logps/rejected": -103.19973754882812, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 2.0391643047332764, "rewards/margins": 9.162639617919922, "rewards/rejected": -7.123475074768066, "step": 269 }, { "epoch": 0.18954018954018953, "grad_norm": 0.029227109625935555, "learning_rate": 4.0522729402649793e-05, "logits/chosen": -8.76578140258789, "logits/rejected": -8.763066291809082, "logps/chosen": -19.97441864013672, "logps/rejected": -99.6162109375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.0268900394439697, "rewards/margins": 7.837881565093994, "rewards/rejected": -6.810991287231445, "step": 270 }, { "epoch": 0.19024219024219025, "grad_norm": 0.039260704070329666, "learning_rate": 4.0549488181240096e-05, "logits/chosen": -8.432332992553711, "logits/rejected": -8.438749313354492, "logps/chosen": -10.762312889099121, "logps/rejected": -100.05078125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.1297948360443115, "rewards/margins": 8.981738090515137, "rewards/rejected": -6.851943016052246, "step": 271 }, { "epoch": 0.19094419094419093, "grad_norm": 1.4156996011734009, "learning_rate": 4.057614840056998e-05, "logits/chosen": -7.973563194274902, "logits/rejected": -7.995189666748047, "logps/chosen": -17.45932960510254, "logps/rejected": -102.02638244628906, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 1.386905312538147, "rewards/margins": 8.25082778930664, "rewards/rejected": -6.863922119140625, "step": 272 }, { "epoch": 0.19164619164619165, "grad_norm": 0.061032164841890335, "learning_rate": 4.06027107840126e-05, "logits/chosen": -8.056655883789062, "logits/rejected": -8.072223663330078, "logps/chosen": -12.285317420959473, "logps/rejected": -101.63723754882812, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.033947467803955, "rewards/margins": 8.929094314575195, "rewards/rejected": -6.89514684677124, "step": 273 }, { "epoch": 0.19234819234819234, "grad_norm": 0.05284848436713219, "learning_rate": 4.0629176047006474e-05, "logits/chosen": -9.562898635864258, "logits/rejected": -9.563494682312012, "logps/chosen": -10.539663314819336, "logps/rejected": -94.54891967773438, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.0336225032806396, "rewards/margins": 8.480271339416504, "rewards/rejected": -6.446648597717285, "step": 274 }, { "epoch": 0.19305019305019305, "grad_norm": 0.03229490667581558, "learning_rate": 4.065554489717105e-05, "logits/chosen": -8.059063911437988, "logits/rejected": -8.049444198608398, "logps/chosen": -12.119667053222656, "logps/rejected": -101.8021240234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.0256786346435547, "rewards/margins": 9.027280807495117, "rewards/rejected": -7.0016021728515625, "step": 275 }, { "epoch": 0.19375219375219374, "grad_norm": 0.012755614705383778, "learning_rate": 4.068181803442029e-05, "logits/chosen": -8.562145233154297, "logits/rejected": -8.531954765319824, "logps/chosen": -11.109854698181152, "logps/rejected": -102.3246841430664, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.217000961303711, "rewards/margins": 9.237144470214844, "rewards/rejected": -7.020143985748291, "step": 276 }, { "epoch": 0.19445419445419446, "grad_norm": 3.1403045654296875, "learning_rate": 4.0707996151074147e-05, "logits/chosen": -8.811005592346191, "logits/rejected": -8.792320251464844, "logps/chosen": -20.143600463867188, "logps/rejected": -98.3689193725586, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 1.2440783977508545, "rewards/margins": 7.798819541931152, "rewards/rejected": -6.554740905761719, "step": 277 }, { "epoch": 0.19515619515619514, "grad_norm": 0.060872882604599, "learning_rate": 4.073407993196794e-05, "logits/chosen": -9.097461700439453, "logits/rejected": -9.076940536499023, "logps/chosen": -14.756370544433594, "logps/rejected": -99.47417449951172, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.6920722723007202, "rewards/margins": 8.421127319335938, "rewards/rejected": -6.729055881500244, "step": 278 }, { "epoch": 0.19585819585819586, "grad_norm": 0.026953935623168945, "learning_rate": 4.076007005455996e-05, "logits/chosen": -8.78667163848877, "logits/rejected": -8.77279281616211, "logps/chosen": -13.507906913757324, "logps/rejected": -101.43273162841797, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.9016094207763672, "rewards/margins": 8.799139022827148, "rewards/rejected": -6.897529125213623, "step": 279 }, { "epoch": 0.19656019656019655, "grad_norm": 0.03867524117231369, "learning_rate": 4.0785967189036986e-05, "logits/chosen": -8.771544456481934, "logits/rejected": -8.77197265625, "logps/chosen": -15.220451354980469, "logps/rejected": -100.0452880859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.765640377998352, "rewards/margins": 8.50214672088623, "rewards/rejected": -6.736506462097168, "step": 280 }, { "epoch": 0.19726219726219726, "grad_norm": 0.0369332991540432, "learning_rate": 4.0811771998418e-05, "logits/chosen": -9.169656753540039, "logits/rejected": -9.182169914245605, "logps/chosen": -14.421768188476562, "logps/rejected": -101.57664489746094, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.7618005275726318, "rewards/margins": 8.613462448120117, "rewards/rejected": -6.851661682128906, "step": 281 }, { "epoch": 0.19796419796419795, "grad_norm": 0.0678400844335556, "learning_rate": 4.083748513865602e-05, "logits/chosen": -8.670317649841309, "logits/rejected": -8.647636413574219, "logps/chosen": -20.923797607421875, "logps/rejected": -93.75079345703125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.1718136072158813, "rewards/margins": 7.24702262878418, "rewards/rejected": -6.075209617614746, "step": 282 }, { "epoch": 0.19866619866619867, "grad_norm": 0.0666450560092926, "learning_rate": 4.086310725873818e-05, "logits/chosen": -8.136433601379395, "logits/rejected": -8.17440414428711, "logps/chosen": -16.453105926513672, "logps/rejected": -98.59405517578125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.5848039388656616, "rewards/margins": 8.226390838623047, "rewards/rejected": -6.641587257385254, "step": 283 }, { "epoch": 0.19936819936819936, "grad_norm": 0.0489591620862484, "learning_rate": 4.0888639000783966e-05, "logits/chosen": -8.681991577148438, "logits/rejected": -8.667882919311523, "logps/chosen": -15.109058380126953, "logps/rejected": -100.48627471923828, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.6681671142578125, "rewards/margins": 8.466772079467773, "rewards/rejected": -6.798604965209961, "step": 284 }, { "epoch": 0.20007020007020007, "grad_norm": 0.03423040732741356, "learning_rate": 4.0914081000141844e-05, "logits/chosen": -8.07885456085205, "logits/rejected": -8.043415069580078, "logps/chosen": -17.80687141418457, "logps/rejected": -98.38238525390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.584549903869629, "rewards/margins": 8.096050262451172, "rewards/rejected": -6.511499881744385, "step": 285 }, { "epoch": 0.20077220077220076, "grad_norm": 0.01423642598092556, "learning_rate": 4.0939433885484055e-05, "logits/chosen": -8.471600532531738, "logits/rejected": -8.45504379272461, "logps/chosen": -16.844375610351562, "logps/rejected": -98.44650268554688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.3471399545669556, "rewards/margins": 8.113492965698242, "rewards/rejected": -6.766353607177734, "step": 286 }, { "epoch": 0.20147420147420148, "grad_norm": 0.03953203931450844, "learning_rate": 4.0964698278899874e-05, "logits/chosen": -8.538389205932617, "logits/rejected": -8.566452026367188, "logps/chosen": -18.043346405029297, "logps/rejected": -97.50334930419922, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.3733323812484741, "rewards/margins": 7.869681358337402, "rewards/rejected": -6.496349334716797, "step": 287 }, { "epoch": 0.20217620217620216, "grad_norm": 0.17854955792427063, "learning_rate": 4.0989874795987185e-05, "logits/chosen": -8.194684028625488, "logits/rejected": -8.217040061950684, "logps/chosen": -18.203805923461914, "logps/rejected": -99.68656921386719, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3460875749588013, "rewards/margins": 8.11988639831543, "rewards/rejected": -6.773798942565918, "step": 288 }, { "epoch": 0.20287820287820288, "grad_norm": 0.8528527021408081, "learning_rate": 4.1014964045942465e-05, "logits/chosen": -8.76877212524414, "logits/rejected": -8.762212753295898, "logps/chosen": -15.108551025390625, "logps/rejected": -100.242431640625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.6052398681640625, "rewards/margins": 8.452356338500977, "rewards/rejected": -6.847115993499756, "step": 289 }, { "epoch": 0.20358020358020357, "grad_norm": 0.018171364441514015, "learning_rate": 4.103996663164927e-05, "logits/chosen": -9.16641616821289, "logits/rejected": -9.179755210876465, "logps/chosen": -17.977378845214844, "logps/rejected": -95.40016174316406, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.409759759902954, "rewards/margins": 7.731564521789551, "rewards/rejected": -6.321804046630859, "step": 290 }, { "epoch": 0.20428220428220428, "grad_norm": 0.10277897864580154, "learning_rate": 4.106488314976513e-05, "logits/chosen": -8.734413146972656, "logits/rejected": -8.735071182250977, "logps/chosen": -16.120512008666992, "logps/rejected": -99.74151611328125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.504415512084961, "rewards/margins": 8.320821762084961, "rewards/rejected": -6.81640625, "step": 291 }, { "epoch": 0.20498420498420497, "grad_norm": 0.019599217921495438, "learning_rate": 4.108971419080698e-05, "logits/chosen": -8.05855655670166, "logits/rejected": -8.05344295501709, "logps/chosen": -21.634288787841797, "logps/rejected": -93.97883605957031, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2626662254333496, "rewards/margins": 7.154860019683838, "rewards/rejected": -5.892193794250488, "step": 292 }, { "epoch": 0.2056862056862057, "grad_norm": 0.025813085958361626, "learning_rate": 4.111446033923516e-05, "logits/chosen": -8.188396453857422, "logits/rejected": -8.200550079345703, "logps/chosen": -11.154434204101562, "logps/rejected": -98.82417297363281, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 2.1732406616210938, "rewards/margins": 8.854095458984375, "rewards/rejected": -6.680854797363281, "step": 293 }, { "epoch": 0.20638820638820637, "grad_norm": 0.03658149018883705, "learning_rate": 4.113912217353596e-05, "logits/chosen": -8.814366340637207, "logits/rejected": -8.814653396606445, "logps/chosen": -12.348867416381836, "logps/rejected": -100.65960693359375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.9724292755126953, "rewards/margins": 8.86679458618164, "rewards/rejected": -6.894366264343262, "step": 294 }, { "epoch": 0.2070902070902071, "grad_norm": 3.996860980987549, "learning_rate": 4.116370026630272e-05, "logits/chosen": -8.195992469787598, "logits/rejected": -8.208494186401367, "logps/chosen": -19.90730094909668, "logps/rejected": -98.9251708984375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 1.3670495748519897, "rewards/margins": 7.7798991203308105, "rewards/rejected": -6.412849426269531, "step": 295 }, { "epoch": 0.2077922077922078, "grad_norm": 0.023338306695222855, "learning_rate": 4.118819518431564e-05, "logits/chosen": -8.194100379943848, "logits/rejected": -8.187079429626465, "logps/chosen": -17.384531021118164, "logps/rejected": -96.85806274414062, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.4214268922805786, "rewards/margins": 7.8734660148620605, "rewards/rejected": -6.452038764953613, "step": 296 }, { "epoch": 0.2084942084942085, "grad_norm": 0.027244191616773605, "learning_rate": 4.121260748862021e-05, "logits/chosen": -8.400263786315918, "logits/rejected": -8.391716957092285, "logps/chosen": -12.843732833862305, "logps/rejected": -100.65888214111328, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7150306701660156, "rewards/margins": 8.7464017868042, "rewards/rejected": -7.031371116638184, "step": 297 }, { "epoch": 0.2091962091962092, "grad_norm": 1.3168338537216187, "learning_rate": 4.123693773460426e-05, "logits/chosen": -8.579416275024414, "logits/rejected": -8.582653999328613, "logps/chosen": -17.653812408447266, "logps/rejected": -97.17681121826172, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.554112434387207, "rewards/margins": 7.950679779052734, "rewards/rejected": -6.396567344665527, "step": 298 }, { "epoch": 0.2098982098982099, "grad_norm": 5.182368278503418, "learning_rate": 4.126118647207383e-05, "logits/chosen": -7.808794021606445, "logits/rejected": -7.807418346405029, "logps/chosen": -18.688508987426758, "logps/rejected": -97.24798583984375, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 1.411212682723999, "rewards/margins": 7.859883785247803, "rewards/rejected": -6.448671340942383, "step": 299 }, { "epoch": 0.2106002106002106, "grad_norm": 0.031464193016290665, "learning_rate": 4.1285354245327715e-05, "logits/chosen": -8.72213077545166, "logits/rejected": -8.69766902923584, "logps/chosen": -11.706310272216797, "logps/rejected": -102.42788696289062, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.9339714050292969, "rewards/margins": 9.093713760375977, "rewards/rejected": -7.159742832183838, "step": 300 }, { "epoch": 0.2113022113022113, "grad_norm": 0.0054284026846289635, "learning_rate": 4.1309441593230726e-05, "logits/chosen": -8.278093338012695, "logits/rejected": -8.289163589477539, "logps/chosen": -15.214544296264648, "logps/rejected": -101.74449157714844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.6998469829559326, "rewards/margins": 8.562355041503906, "rewards/rejected": -6.862508296966553, "step": 301 }, { "epoch": 0.21200421200421202, "grad_norm": 0.00527734961360693, "learning_rate": 4.133344904928585e-05, "logits/chosen": -8.43124771118164, "logits/rejected": -8.433006286621094, "logps/chosen": -11.956754684448242, "logps/rejected": -101.44631958007812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0538506507873535, "rewards/margins": 8.99520492553711, "rewards/rejected": -6.941354751586914, "step": 302 }, { "epoch": 0.2127062127062127, "grad_norm": 0.010007710196077824, "learning_rate": 4.1357377141705084e-05, "logits/chosen": -8.435497283935547, "logits/rejected": -8.4461669921875, "logps/chosen": -16.428808212280273, "logps/rejected": -97.73359680175781, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6254689693450928, "rewards/margins": 8.133443832397461, "rewards/rejected": -6.507974624633789, "step": 303 }, { "epoch": 0.21340821340821342, "grad_norm": 0.013349405489861965, "learning_rate": 4.1381226393479236e-05, "logits/chosen": -8.224332809448242, "logits/rejected": -8.224615097045898, "logps/chosen": -15.320993423461914, "logps/rejected": -96.68037414550781, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.7579803466796875, "rewards/margins": 8.121363639831543, "rewards/rejected": -6.3633832931518555, "step": 304 }, { "epoch": 0.2141102141102141, "grad_norm": 0.014333044178783894, "learning_rate": 4.1404997322446435e-05, "logits/chosen": -8.264671325683594, "logits/rejected": -8.235285758972168, "logps/chosen": -13.91732406616211, "logps/rejected": -101.40872955322266, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.8038700819015503, "rewards/margins": 8.660833358764648, "rewards/rejected": -6.856964111328125, "step": 305 }, { "epoch": 0.21481221481221482, "grad_norm": 0.021341143175959587, "learning_rate": 4.142869044135967e-05, "logits/chosen": -9.012674331665039, "logits/rejected": -9.007782936096191, "logps/chosen": -20.970264434814453, "logps/rejected": -97.08370971679688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.199303388595581, "rewards/margins": 7.724829196929932, "rewards/rejected": -6.52552604675293, "step": 306 }, { "epoch": 0.2155142155142155, "grad_norm": 0.810597836971283, "learning_rate": 4.145230625795311e-05, "logits/chosen": -8.411235809326172, "logits/rejected": -8.399651527404785, "logps/chosen": -13.137958526611328, "logps/rejected": -95.92816162109375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.9202677011489868, "rewards/margins": 8.338675498962402, "rewards/rejected": -6.418407440185547, "step": 307 }, { "epoch": 0.21621621621621623, "grad_norm": 0.029872704297304153, "learning_rate": 4.14758452750074e-05, "logits/chosen": -7.753263473510742, "logits/rejected": -7.778371334075928, "logps/chosen": -15.409818649291992, "logps/rejected": -99.91984558105469, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.4826682806015015, "rewards/margins": 8.34415340423584, "rewards/rejected": -6.861485481262207, "step": 308 }, { "epoch": 0.21691821691821692, "grad_norm": 0.010794593021273613, "learning_rate": 4.149930799041392e-05, "logits/chosen": -8.902389526367188, "logits/rejected": -8.884074211120605, "logps/chosen": -13.391631126403809, "logps/rejected": -96.7863998413086, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.8309881687164307, "rewards/margins": 8.388866424560547, "rewards/rejected": -6.557877540588379, "step": 309 }, { "epoch": 0.21762021762021763, "grad_norm": 0.015561787411570549, "learning_rate": 4.152269489723788e-05, "logits/chosen": -8.655173301696777, "logits/rejected": -8.641276359558105, "logps/chosen": -15.033305168151855, "logps/rejected": -98.54629516601562, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.6981194019317627, "rewards/margins": 8.396872520446777, "rewards/rejected": -6.698753356933594, "step": 310 }, { "epoch": 0.21832221832221832, "grad_norm": 0.6822290420532227, "learning_rate": 4.1546006483780626e-05, "logits/chosen": -8.682979583740234, "logits/rejected": -8.667840003967285, "logps/chosen": -11.634486198425293, "logps/rejected": -98.20820617675781, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 2.0357871055603027, "rewards/margins": 8.672481536865234, "rewards/rejected": -6.636693954467773, "step": 311 }, { "epoch": 0.21902421902421904, "grad_norm": 0.014162633568048477, "learning_rate": 4.156924323364072e-05, "logits/chosen": -9.051889419555664, "logits/rejected": -9.078604698181152, "logps/chosen": -11.851667404174805, "logps/rejected": -98.29989624023438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.0644307136535645, "rewards/margins": 8.714835166931152, "rewards/rejected": -6.65040397644043, "step": 312 }, { "epoch": 0.21972621972621972, "grad_norm": 0.02579483948647976, "learning_rate": 4.1592405625774144e-05, "logits/chosen": -7.972561836242676, "logits/rejected": -7.985413074493408, "logps/chosen": -14.844849586486816, "logps/rejected": -99.30680084228516, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.7039482593536377, "rewards/margins": 8.41041374206543, "rewards/rejected": -6.706465721130371, "step": 313 }, { "epoch": 0.22042822042822044, "grad_norm": 0.01595437154173851, "learning_rate": 4.161549413455358e-05, "logits/chosen": -8.833253860473633, "logits/rejected": -8.838678359985352, "logps/chosen": -13.15753173828125, "logps/rejected": -99.64207458496094, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.7936038970947266, "rewards/margins": 8.679454803466797, "rewards/rejected": -6.8858513832092285, "step": 314 }, { "epoch": 0.22113022113022113, "grad_norm": 0.03728681057691574, "learning_rate": 4.163850922982668e-05, "logits/chosen": -7.372354984283447, "logits/rejected": -7.354386329650879, "logps/chosen": -15.846065521240234, "logps/rejected": -96.1905517578125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.6489317417144775, "rewards/margins": 8.03685188293457, "rewards/rejected": -6.38792085647583, "step": 315 }, { "epoch": 0.22183222183222184, "grad_norm": 0.9251024723052979, "learning_rate": 4.16614513769734e-05, "logits/chosen": -8.149184226989746, "logits/rejected": -8.156143188476562, "logps/chosen": -16.017257690429688, "logps/rejected": -97.28324890136719, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.6537699699401855, "rewards/margins": 8.10733413696289, "rewards/rejected": -6.453563690185547, "step": 316 }, { "epoch": 0.22253422253422253, "grad_norm": 0.0476548857986927, "learning_rate": 4.1684321036962526e-05, "logits/chosen": -8.610288619995117, "logits/rejected": -8.602516174316406, "logps/chosen": -17.66185760498047, "logps/rejected": -96.71820831298828, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.4002933502197266, "rewards/margins": 7.797935485839844, "rewards/rejected": -6.397642135620117, "step": 317 }, { "epoch": 0.22323622323622325, "grad_norm": 0.027853790670633316, "learning_rate": 4.170711866640721e-05, "logits/chosen": -8.577299118041992, "logits/rejected": -8.589197158813477, "logps/chosen": -20.0517578125, "logps/rejected": -97.60946655273438, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.2755486965179443, "rewards/margins": 7.699488639831543, "rewards/rejected": -6.4239397048950195, "step": 318 }, { "epoch": 0.22393822393822393, "grad_norm": 0.04707622155547142, "learning_rate": 4.1729844717619684e-05, "logits/chosen": -9.139261245727539, "logits/rejected": -9.112375259399414, "logps/chosen": -19.246402740478516, "logps/rejected": -96.5921401977539, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.1697962284088135, "rewards/margins": 7.782240867614746, "rewards/rejected": -6.6124444007873535, "step": 319 }, { "epoch": 0.22464022464022465, "grad_norm": 4.8473944664001465, "learning_rate": 4.17524996386651e-05, "logits/chosen": -8.760217666625977, "logits/rejected": -8.756261825561523, "logps/chosen": -20.226266860961914, "logps/rejected": -97.09477233886719, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 1.2907425165176392, "rewards/margins": 7.736563682556152, "rewards/rejected": -6.4458208084106445, "step": 320 }, { "epoch": 0.22534222534222534, "grad_norm": 0.03711394593119621, "learning_rate": 4.177508387341454e-05, "logits/chosen": -8.743501663208008, "logits/rejected": -8.757942199707031, "logps/chosen": -14.14346694946289, "logps/rejected": -97.36595153808594, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.716031789779663, "rewards/margins": 8.330545425415039, "rewards/rejected": -6.614513874053955, "step": 321 }, { "epoch": 0.22604422604422605, "grad_norm": 0.19018018245697021, "learning_rate": 4.179759786159719e-05, "logits/chosen": -8.568645477294922, "logits/rejected": -8.563835144042969, "logps/chosen": -19.182540893554688, "logps/rejected": -95.73019409179688, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.1612038612365723, "rewards/margins": 7.50034236907959, "rewards/rejected": -6.339138507843018, "step": 322 }, { "epoch": 0.22674622674622674, "grad_norm": 0.024802109226584435, "learning_rate": 4.182004203885172e-05, "logits/chosen": -8.409210205078125, "logits/rejected": -8.392257690429688, "logps/chosen": -18.358970642089844, "logps/rejected": -97.91949462890625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.3383710384368896, "rewards/margins": 7.919835090637207, "rewards/rejected": -6.581464767456055, "step": 323 }, { "epoch": 0.22744822744822746, "grad_norm": 0.044707927852869034, "learning_rate": 4.184241683677687e-05, "logits/chosen": -7.789756774902344, "logits/rejected": -7.756288528442383, "logps/chosen": -16.329586029052734, "logps/rejected": -95.88390350341797, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.6084058284759521, "rewards/margins": 7.935108661651611, "rewards/rejected": -6.32670259475708, "step": 324 }, { "epoch": 0.22815022815022815, "grad_norm": 0.021142814308404922, "learning_rate": 4.1864722682981245e-05, "logits/chosen": -8.112024307250977, "logits/rejected": -8.121979713439941, "logps/chosen": -22.516250610351562, "logps/rejected": -94.114990234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.0318107604980469, "rewards/margins": 7.110743522644043, "rewards/rejected": -6.078932285308838, "step": 325 }, { "epoch": 0.22885222885222886, "grad_norm": 0.06473670154809952, "learning_rate": 4.188696000113232e-05, "logits/chosen": -8.541786193847656, "logits/rejected": -8.524847030639648, "logps/chosen": -13.896973609924316, "logps/rejected": -98.306396484375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.8771497011184692, "rewards/margins": 8.528929710388184, "rewards/rejected": -6.65178108215332, "step": 326 }, { "epoch": 0.22955422955422955, "grad_norm": 0.017380740493535995, "learning_rate": 4.190912921100477e-05, "logits/chosen": -8.851766586303711, "logits/rejected": -8.863758087158203, "logps/chosen": -19.062904357910156, "logps/rejected": -95.89366149902344, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.2721573114395142, "rewards/margins": 7.600899696350098, "rewards/rejected": -6.328742504119873, "step": 327 }, { "epoch": 0.23025623025623027, "grad_norm": 0.03154124319553375, "learning_rate": 4.1931230728527994e-05, "logits/chosen": -8.774520874023438, "logits/rejected": -8.779754638671875, "logps/chosen": -17.5617733001709, "logps/rejected": -95.47010803222656, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3486015796661377, "rewards/margins": 7.710781097412109, "rewards/rejected": -6.362179756164551, "step": 328 }, { "epoch": 0.23095823095823095, "grad_norm": 0.0262030977755785, "learning_rate": 4.195326496583291e-05, "logits/chosen": -7.908943176269531, "logits/rejected": -7.911153793334961, "logps/chosen": -17.56151580810547, "logps/rejected": -99.10794067382812, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.4965591430664062, "rewards/margins": 8.11199951171875, "rewards/rejected": -6.615440845489502, "step": 329 }, { "epoch": 0.23166023166023167, "grad_norm": 0.021081745624542236, "learning_rate": 4.1975232331298125e-05, "logits/chosen": -8.25564956665039, "logits/rejected": -8.232112884521484, "logps/chosen": -12.94320297241211, "logps/rejected": -101.02361297607422, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.987985610961914, "rewards/margins": 8.81142807006836, "rewards/rejected": -6.8234429359436035, "step": 330 }, { "epoch": 0.23236223236223236, "grad_norm": 0.014708888716995716, "learning_rate": 4.1997133229595316e-05, "logits/chosen": -8.825480461120605, "logits/rejected": -8.825662612915039, "logps/chosen": -21.081783294677734, "logps/rejected": -95.20052337646484, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.163656234741211, "rewards/margins": 7.362332344055176, "rewards/rejected": -6.198676109313965, "step": 331 }, { "epoch": 0.23306423306423307, "grad_norm": 0.020604578778147697, "learning_rate": 4.201896806173394e-05, "logits/chosen": -8.494359016418457, "logits/rejected": -8.493246078491211, "logps/chosen": -13.205642700195312, "logps/rejected": -102.60633850097656, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.9356404542922974, "rewards/margins": 8.975809097290039, "rewards/rejected": -7.040168762207031, "step": 332 }, { "epoch": 0.23376623376623376, "grad_norm": 0.010628285817801952, "learning_rate": 4.2040737225105335e-05, "logits/chosen": -9.047785758972168, "logits/rejected": -9.033025741577148, "logps/chosen": -15.314519882202148, "logps/rejected": -100.19223022460938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.7054563760757446, "rewards/margins": 8.454951286315918, "rewards/rejected": -6.749494552612305, "step": 333 }, { "epoch": 0.23446823446823448, "grad_norm": 0.01635003462433815, "learning_rate": 4.206244111352608e-05, "logits/chosen": -9.086885452270508, "logits/rejected": -9.075584411621094, "logps/chosen": -15.54598331451416, "logps/rejected": -101.25452423095703, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.4819127321243286, "rewards/margins": 8.474639892578125, "rewards/rejected": -6.992727279663086, "step": 334 }, { "epoch": 0.23517023517023516, "grad_norm": 0.015002855099737644, "learning_rate": 4.2084080117280756e-05, "logits/chosen": -8.73969841003418, "logits/rejected": -8.72346305847168, "logps/chosen": -15.666435241699219, "logps/rejected": -99.66680908203125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.5380752086639404, "rewards/margins": 8.359989166259766, "rewards/rejected": -6.821914196014404, "step": 335 }, { "epoch": 0.23587223587223588, "grad_norm": 0.017557041719555855, "learning_rate": 4.210565462316407e-05, "logits/chosen": -9.264495849609375, "logits/rejected": -9.294807434082031, "logps/chosen": -13.774351119995117, "logps/rejected": -102.56745147705078, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.8506407737731934, "rewards/margins": 8.866278648376465, "rewards/rejected": -7.015637397766113, "step": 336 }, { "epoch": 0.23657423657423657, "grad_norm": 0.014712509699165821, "learning_rate": 4.2127165014522315e-05, "logits/chosen": -9.190164566040039, "logits/rejected": -9.203145980834961, "logps/chosen": -14.903305053710938, "logps/rejected": -101.86732482910156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6547489166259766, "rewards/margins": 8.623626708984375, "rewards/rejected": -6.968877792358398, "step": 337 }, { "epoch": 0.23727623727623728, "grad_norm": 0.3818150460720062, "learning_rate": 4.214861167129425e-05, "logits/chosen": -8.534960746765137, "logits/rejected": -8.530998229980469, "logps/chosen": -13.188438415527344, "logps/rejected": -103.04071807861328, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.9183341264724731, "rewards/margins": 8.975760459899902, "rewards/rejected": -7.057426452636719, "step": 338 }, { "epoch": 0.23797823797823797, "grad_norm": 0.017652850598096848, "learning_rate": 4.2169994970051365e-05, "logits/chosen": -8.941926002502441, "logits/rejected": -8.922578811645508, "logps/chosen": -14.58988094329834, "logps/rejected": -96.76219177246094, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.6332581043243408, "rewards/margins": 8.1468505859375, "rewards/rejected": -6.513591766357422, "step": 339 }, { "epoch": 0.2386802386802387, "grad_norm": 0.012156129814684391, "learning_rate": 4.219131528403759e-05, "logits/chosen": -9.384241104125977, "logits/rejected": -9.3800687789917, "logps/chosen": -15.244741439819336, "logps/rejected": -102.15894317626953, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.6059081554412842, "rewards/margins": 8.624770164489746, "rewards/rejected": -7.018861770629883, "step": 340 }, { "epoch": 0.23938223938223938, "grad_norm": 0.004327812232077122, "learning_rate": 4.22125729832083e-05, "logits/chosen": -8.707352638244629, "logits/rejected": -8.725425720214844, "logps/chosen": -16.022586822509766, "logps/rejected": -102.7892837524414, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4033958911895752, "rewards/margins": 8.562767028808594, "rewards/rejected": -7.159371376037598, "step": 341 }, { "epoch": 0.2400842400842401, "grad_norm": 0.01672125980257988, "learning_rate": 4.2233768434268914e-05, "logits/chosen": -7.923779010772705, "logits/rejected": -7.902153015136719, "logps/chosen": -10.855883598327637, "logps/rejected": -103.28215026855469, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.182199001312256, "rewards/margins": 9.300719261169434, "rewards/rejected": -7.1185197830200195, "step": 342 }, { "epoch": 0.24078624078624078, "grad_norm": 0.022948481142520905, "learning_rate": 4.225490200071284e-05, "logits/chosen": -8.76740837097168, "logits/rejected": -8.754072189331055, "logps/chosen": -12.780542373657227, "logps/rejected": -103.76377868652344, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.8090720176696777, "rewards/margins": 9.059675216674805, "rewards/rejected": -7.250603199005127, "step": 343 }, { "epoch": 0.2414882414882415, "grad_norm": 0.007489706389605999, "learning_rate": 4.227597404285883e-05, "logits/chosen": -9.294410705566406, "logits/rejected": -9.288492202758789, "logps/chosen": -14.16504192352295, "logps/rejected": -102.9162826538086, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.68551766872406, "rewards/margins": 8.80907154083252, "rewards/rejected": -7.123553276062012, "step": 344 }, { "epoch": 0.24219024219024218, "grad_norm": 0.017640523612499237, "learning_rate": 4.229698491788791e-05, "logits/chosen": -9.290267944335938, "logits/rejected": -9.30142593383789, "logps/chosen": -12.341556549072266, "logps/rejected": -102.94642639160156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.881321907043457, "rewards/margins": 8.988447189331055, "rewards/rejected": -7.1071248054504395, "step": 345 }, { "epoch": 0.2428922428922429, "grad_norm": 0.035235948860645294, "learning_rate": 4.231793497987961e-05, "logits/chosen": -9.168697357177734, "logits/rejected": -9.15848159790039, "logps/chosen": -11.67369270324707, "logps/rejected": -102.74595642089844, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.0423154830932617, "rewards/margins": 9.1300048828125, "rewards/rejected": -7.087688446044922, "step": 346 }, { "epoch": 0.2435942435942436, "grad_norm": 0.006035325583070517, "learning_rate": 4.2338824579847904e-05, "logits/chosen": -9.144936561584473, "logits/rejected": -9.162393569946289, "logps/chosen": -14.69560432434082, "logps/rejected": -101.5826416015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.6555249691009521, "rewards/margins": 8.647322654724121, "rewards/rejected": -6.99179744720459, "step": 347 }, { "epoch": 0.2442962442962443, "grad_norm": 0.027797924354672432, "learning_rate": 4.235965406577636e-05, "logits/chosen": -8.518011093139648, "logits/rejected": -8.525100708007812, "logps/chosen": -17.106822967529297, "logps/rejected": -96.9411849975586, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.3137876987457275, "rewards/margins": 7.892638206481934, "rewards/rejected": -6.578850269317627, "step": 348 }, { "epoch": 0.244998244998245, "grad_norm": 0.0036386263091117144, "learning_rate": 4.2380423782653e-05, "logits/chosen": -8.68684196472168, "logits/rejected": -8.711612701416016, "logps/chosen": -11.481578826904297, "logps/rejected": -102.00569152832031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.187025547027588, "rewards/margins": 9.209434509277344, "rewards/rejected": -7.022408485412598, "step": 349 }, { "epoch": 0.2457002457002457, "grad_norm": 0.03563088923692703, "learning_rate": 4.240113407250459e-05, "logits/chosen": -8.471993446350098, "logits/rejected": -8.477673530578613, "logps/chosen": -19.325672149658203, "logps/rejected": -100.38327026367188, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.420881986618042, "rewards/margins": 8.125086784362793, "rewards/rejected": -6.704204559326172, "step": 350 }, { "epoch": 0.2464022464022464, "grad_norm": 0.03128799423575401, "learning_rate": 4.24217852744304e-05, "logits/chosen": -8.998417854309082, "logits/rejected": -8.998491287231445, "logps/chosen": -12.80863094329834, "logps/rejected": -97.64549255371094, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.8725405931472778, "rewards/margins": 8.47817611694336, "rewards/rejected": -6.605634689331055, "step": 351 }, { "epoch": 0.2471042471042471, "grad_norm": 0.009455684572458267, "learning_rate": 4.244237772463552e-05, "logits/chosen": -8.192805290222168, "logits/rejected": -8.180776596069336, "logps/chosen": -11.573184967041016, "logps/rejected": -101.72544860839844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.2656702995300293, "rewards/margins": 9.179615020751953, "rewards/rejected": -6.913944721221924, "step": 352 }, { "epoch": 0.2478062478062478, "grad_norm": 0.044948939234018326, "learning_rate": 4.246291175646371e-05, "logits/chosen": -8.387259483337402, "logits/rejected": -8.38077449798584, "logps/chosen": -14.516016006469727, "logps/rejected": -102.48680114746094, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.7453663349151611, "rewards/margins": 8.746723175048828, "rewards/rejected": -7.001357078552246, "step": 353 }, { "epoch": 0.2485082485082485, "grad_norm": 0.013265806250274181, "learning_rate": 4.24833877004298e-05, "logits/chosen": -8.882245063781738, "logits/rejected": -8.84516716003418, "logps/chosen": -12.955568313598633, "logps/rejected": -102.57696533203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.9387290477752686, "rewards/margins": 8.944178581237793, "rewards/rejected": -7.005449295043945, "step": 354 }, { "epoch": 0.2492102492102492, "grad_norm": 0.007803776767104864, "learning_rate": 4.250380588425157e-05, "logits/chosen": -8.971009254455566, "logits/rejected": -8.972103118896484, "logps/chosen": -14.560026168823242, "logps/rejected": -98.0116958618164, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.7798752784729004, "rewards/margins": 8.303685188293457, "rewards/rejected": -6.523810386657715, "step": 355 }, { "epoch": 0.24991224991224992, "grad_norm": 0.004975307732820511, "learning_rate": 4.2524166632881255e-05, "logits/chosen": -8.661874771118164, "logits/rejected": -8.662666320800781, "logps/chosen": -14.52917766571045, "logps/rejected": -102.98495483398438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.813896656036377, "rewards/margins": 8.849998474121094, "rewards/rejected": -7.036101341247559, "step": 356 }, { "epoch": 0.25061425061425063, "grad_norm": 0.0030101435258984566, "learning_rate": 4.254447026853656e-05, "logits/chosen": -9.263458251953125, "logits/rejected": -9.266386032104492, "logps/chosen": -14.056510925292969, "logps/rejected": -102.77958679199219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.7914122343063354, "rewards/margins": 8.816732406616211, "rewards/rejected": -7.025320053100586, "step": 357 }, { "epoch": 0.2513162513162513, "grad_norm": 0.0027537040878087282, "learning_rate": 4.2564717110731244e-05, "logits/chosen": -8.123044967651367, "logits/rejected": -8.109119415283203, "logps/chosen": -12.87391471862793, "logps/rejected": -103.5916748046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8575119972229004, "rewards/margins": 9.093719482421875, "rewards/rejected": -7.236207008361816, "step": 358 }, { "epoch": 0.252018252018252, "grad_norm": 0.004410578869283199, "learning_rate": 4.258490747630532e-05, "logits/chosen": -8.831903457641602, "logits/rejected": -8.830035209655762, "logps/chosen": -14.991458892822266, "logps/rejected": -103.3505859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.714520812034607, "rewards/margins": 8.763434410095215, "rewards/rejected": -7.048913955688477, "step": 359 }, { "epoch": 0.2527202527202527, "grad_norm": 0.004903916269540787, "learning_rate": 4.260504167945479e-05, "logits/chosen": -8.11577320098877, "logits/rejected": -8.118885040283203, "logps/chosen": -9.92663860321045, "logps/rejected": -105.10163116455078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1726365089416504, "rewards/margins": 9.54694938659668, "rewards/rejected": -7.3743133544921875, "step": 360 }, { "epoch": 0.25342225342225344, "grad_norm": 0.010976762510836124, "learning_rate": 4.2625120031760965e-05, "logits/chosen": -8.810895919799805, "logits/rejected": -8.8107328414917, "logps/chosen": -12.250490188598633, "logps/rejected": -102.68292999267578, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.06425142288208, "rewards/margins": 9.173232078552246, "rewards/rejected": -7.108981132507324, "step": 361 }, { "epoch": 0.2541242541242541, "grad_norm": 0.016478439792990685, "learning_rate": 4.264514284221944e-05, "logits/chosen": -8.243749618530273, "logits/rejected": -8.249998092651367, "logps/chosen": -9.666500091552734, "logps/rejected": -104.3617935180664, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.242840051651001, "rewards/margins": 9.459299087524414, "rewards/rejected": -7.216459274291992, "step": 362 }, { "epoch": 0.2548262548262548, "grad_norm": 0.011237259954214096, "learning_rate": 4.266511041726854e-05, "logits/chosen": -9.280684471130371, "logits/rejected": -9.277170181274414, "logps/chosen": -18.476957321166992, "logps/rejected": -98.43708801269531, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4101130962371826, "rewards/margins": 7.942758560180664, "rewards/rejected": -6.532644748687744, "step": 363 }, { "epoch": 0.25552825552825553, "grad_norm": 0.005140771623700857, "learning_rate": 4.26850230608176e-05, "logits/chosen": -8.80666732788086, "logits/rejected": -8.801708221435547, "logps/chosen": -12.547006607055664, "logps/rejected": -104.91258239746094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8407496213912964, "rewards/margins": 9.176885604858398, "rewards/rejected": -7.336136341094971, "step": 364 }, { "epoch": 0.25623025623025625, "grad_norm": 0.00869286060333252, "learning_rate": 4.2704881074274584e-05, "logits/chosen": -8.431807518005371, "logits/rejected": -8.470769882202148, "logps/chosen": -10.422317504882812, "logps/rejected": -104.28846740722656, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.26432204246521, "rewards/margins": 9.391495704650879, "rewards/rejected": -7.127172946929932, "step": 365 }, { "epoch": 0.2569322569322569, "grad_norm": 0.00960502028465271, "learning_rate": 4.272468475657351e-05, "logits/chosen": -8.184975624084473, "logits/rejected": -8.19159984588623, "logps/chosen": -12.58192253112793, "logps/rejected": -102.42291259765625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.0002036094665527, "rewards/margins": 9.017631530761719, "rewards/rejected": -7.017427444458008, "step": 366 }, { "epoch": 0.2576342576342576, "grad_norm": 1.4775495529174805, "learning_rate": 4.2744434404201497e-05, "logits/chosen": -8.878284454345703, "logits/rejected": -8.87063980102539, "logps/chosen": -10.219234466552734, "logps/rejected": -100.70722198486328, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.1422476768493652, "rewards/margins": 9.074070930480957, "rewards/rejected": -6.931823253631592, "step": 367 }, { "epoch": 0.25833625833625834, "grad_norm": 0.12474162876605988, "learning_rate": 4.27641303112253e-05, "logits/chosen": -8.607194900512695, "logits/rejected": -8.604900360107422, "logps/chosen": -10.80789566040039, "logps/rejected": -105.18028259277344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.1023011207580566, "rewards/margins": 9.362726211547852, "rewards/rejected": -7.260425567626953, "step": 368 }, { "epoch": 0.25903825903825906, "grad_norm": 0.009206214919686317, "learning_rate": 4.278377276931767e-05, "logits/chosen": -7.207387924194336, "logits/rejected": -7.239879131317139, "logps/chosen": -12.383581161499023, "logps/rejected": -97.90454864501953, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8561244010925293, "rewards/margins": 8.422111511230469, "rewards/rejected": -6.5659871101379395, "step": 369 }, { "epoch": 0.2597402597402597, "grad_norm": 0.004749534651637077, "learning_rate": 4.2803362067783256e-05, "logits/chosen": -8.984800338745117, "logits/rejected": -9.001049041748047, "logps/chosen": -9.83215618133545, "logps/rejected": -104.1904067993164, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1918952465057373, "rewards/margins": 9.456433296203613, "rewards/rejected": -7.264538288116455, "step": 370 }, { "epoch": 0.26044226044226043, "grad_norm": 0.004696532618254423, "learning_rate": 4.2822898493584104e-05, "logits/chosen": -8.42540454864502, "logits/rejected": -8.425400733947754, "logps/chosen": -10.0515718460083, "logps/rejected": -104.91618347167969, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.139143943786621, "rewards/margins": 9.50865364074707, "rewards/rejected": -7.369510650634766, "step": 371 }, { "epoch": 0.26114426114426115, "grad_norm": 0.003659463720396161, "learning_rate": 4.284238233136496e-05, "logits/chosen": -8.594417572021484, "logits/rejected": -8.643302917480469, "logps/chosen": -10.014842987060547, "logps/rejected": -105.38887023925781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.037043571472168, "rewards/margins": 9.50550651550293, "rewards/rejected": -7.46846342086792, "step": 372 }, { "epoch": 0.26184626184626186, "grad_norm": 0.008511990308761597, "learning_rate": 4.286181386347813e-05, "logits/chosen": -8.753019332885742, "logits/rejected": -8.773117065429688, "logps/chosen": -11.276773452758789, "logps/rejected": -102.56612396240234, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.1634230613708496, "rewards/margins": 9.157890319824219, "rewards/rejected": -6.9944682121276855, "step": 373 }, { "epoch": 0.2625482625482625, "grad_norm": 0.010744635947048664, "learning_rate": 4.288119337000801e-05, "logits/chosen": -9.193941116333008, "logits/rejected": -9.18051815032959, "logps/chosen": -8.32534408569336, "logps/rejected": -104.86729431152344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.215668201446533, "rewards/margins": 9.68586540222168, "rewards/rejected": -7.470197677612305, "step": 374 }, { "epoch": 0.26325026325026324, "grad_norm": 0.01672777719795704, "learning_rate": 4.2900521128795315e-05, "logits/chosen": -9.274229049682617, "logits/rejected": -9.273017883300781, "logps/chosen": -9.211349487304688, "logps/rejected": -104.58003234863281, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.065396308898926, "rewards/margins": 9.546239852905273, "rewards/rejected": -7.480844020843506, "step": 375 }, { "epoch": 0.26395226395226395, "grad_norm": 0.006168079562485218, "learning_rate": 4.291979741546102e-05, "logits/chosen": -8.78345012664795, "logits/rejected": -8.805113792419434, "logps/chosen": -14.676271438598633, "logps/rejected": -99.82470703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8499813079833984, "rewards/margins": 8.538654327392578, "rewards/rejected": -6.6886725425720215, "step": 376 }, { "epoch": 0.26465426465426467, "grad_norm": 1.443017840385437, "learning_rate": 4.293902250342989e-05, "logits/chosen": -8.476943969726562, "logits/rejected": -8.48042106628418, "logps/chosen": -7.274722099304199, "logps/rejected": -103.81175994873047, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": 2.606414794921875, "rewards/margins": 9.74709701538086, "rewards/rejected": -7.140681266784668, "step": 377 }, { "epoch": 0.26535626535626533, "grad_norm": 0.0688285082578659, "learning_rate": 4.295819666395376e-05, "logits/chosen": -8.756513595581055, "logits/rejected": -8.758661270141602, "logps/chosen": -21.26804542541504, "logps/rejected": -92.90553283691406, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.3068147897720337, "rewards/margins": 7.010549545288086, "rewards/rejected": -5.703734874725342, "step": 378 }, { "epoch": 0.26605826605826605, "grad_norm": 0.08722803741693497, "learning_rate": 4.297732016613454e-05, "logits/chosen": -8.6237211227417, "logits/rejected": -8.611590385437012, "logps/chosen": -13.311307907104492, "logps/rejected": -103.3050537109375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.9669137001037598, "rewards/margins": 9.068014144897461, "rewards/rejected": -7.101100921630859, "step": 379 }, { "epoch": 0.26676026676026676, "grad_norm": 0.014267485588788986, "learning_rate": 4.299639327694684e-05, "logits/chosen": -9.031636238098145, "logits/rejected": -9.038860321044922, "logps/chosen": -9.397754669189453, "logps/rejected": -95.6617431640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.1842167377471924, "rewards/margins": 8.695402145385742, "rewards/rejected": -6.511185646057129, "step": 380 }, { "epoch": 0.2674622674622675, "grad_norm": 0.037356700748205185, "learning_rate": 4.3015416261260325e-05, "logits/chosen": -8.727376937866211, "logits/rejected": -8.744274139404297, "logps/chosen": -9.581253051757812, "logps/rejected": -93.10350799560547, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.363264560699463, "rewards/margins": 8.479890823364258, "rewards/rejected": -6.116626262664795, "step": 381 }, { "epoch": 0.26816426816426814, "grad_norm": 0.1336798071861267, "learning_rate": 4.303438938186182e-05, "logits/chosen": -9.572385787963867, "logits/rejected": -9.572587966918945, "logps/chosen": -14.64376449584961, "logps/rejected": -98.30955505371094, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.763279676437378, "rewards/margins": 8.306005477905273, "rewards/rejected": -6.542725086212158, "step": 382 }, { "epoch": 0.26886626886626885, "grad_norm": 0.02889818698167801, "learning_rate": 4.305331289947705e-05, "logits/chosen": -9.047636032104492, "logits/rejected": -9.036657333374023, "logps/chosen": -11.011117935180664, "logps/rejected": -94.56719970703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.9437636137008667, "rewards/margins": 8.31773567199707, "rewards/rejected": -6.373971939086914, "step": 383 }, { "epoch": 0.26956826956826957, "grad_norm": 0.028732268139719963, "learning_rate": 4.3072187072792184e-05, "logits/chosen": -7.823786735534668, "logits/rejected": -7.83189582824707, "logps/chosen": -15.373350143432617, "logps/rejected": -86.19059753417969, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.8924541473388672, "rewards/margins": 7.280056953430176, "rewards/rejected": -5.387602806091309, "step": 384 }, { "epoch": 0.2702702702702703, "grad_norm": 0.037155721336603165, "learning_rate": 4.309101215847502e-05, "logits/chosen": -8.298635482788086, "logits/rejected": -8.291547775268555, "logps/chosen": -14.992486953735352, "logps/rejected": -95.11180877685547, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.6696529388427734, "rewards/margins": 7.955430030822754, "rewards/rejected": -6.2857770919799805, "step": 385 }, { "epoch": 0.27097227097227095, "grad_norm": 0.13418908417224884, "learning_rate": 4.3109788411195924e-05, "logits/chosen": -9.323923110961914, "logits/rejected": -9.322628021240234, "logps/chosen": -10.279054641723633, "logps/rejected": -85.91163635253906, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.129293203353882, "rewards/margins": 7.608471870422363, "rewards/rejected": -5.479178428649902, "step": 386 }, { "epoch": 0.27167427167427166, "grad_norm": 0.02990160509943962, "learning_rate": 4.312851608364853e-05, "logits/chosen": -8.03729248046875, "logits/rejected": -8.051146507263184, "logps/chosen": -11.805295944213867, "logps/rejected": -87.75018310546875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.0746512413024902, "rewards/margins": 7.596149921417236, "rewards/rejected": -5.521498203277588, "step": 387 }, { "epoch": 0.2723762723762724, "grad_norm": 0.02539190649986267, "learning_rate": 4.314719542657013e-05, "logits/chosen": -8.956216812133789, "logits/rejected": -8.989641189575195, "logps/chosen": -11.902203559875488, "logps/rejected": -88.51610565185547, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.039719343185425, "rewards/margins": 7.710927486419678, "rewards/rejected": -5.671207904815674, "step": 388 }, { "epoch": 0.2730782730782731, "grad_norm": 0.0259188674390316, "learning_rate": 4.3165826688761796e-05, "logits/chosen": -8.546934127807617, "logits/rejected": -8.559269905090332, "logps/chosen": -15.813150405883789, "logps/rejected": -88.00244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.759721040725708, "rewards/margins": 7.216746807098389, "rewards/rejected": -5.457025527954102, "step": 389 }, { "epoch": 0.27378027378027375, "grad_norm": 0.026351574808359146, "learning_rate": 4.318441011710833e-05, "logits/chosen": -8.687335968017578, "logits/rejected": -8.708877563476562, "logps/chosen": -15.116119384765625, "logps/rejected": -85.29852294921875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.7615525722503662, "rewards/margins": 7.0110883712768555, "rewards/rejected": -5.24953556060791, "step": 390 }, { "epoch": 0.27448227448227447, "grad_norm": 0.04496662691235542, "learning_rate": 4.3202945956597786e-05, "logits/chosen": -8.619085311889648, "logits/rejected": -8.611741065979004, "logps/chosen": -11.355707168579102, "logps/rejected": -95.63081359863281, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.0512874126434326, "rewards/margins": 8.373516082763672, "rewards/rejected": -6.322227478027344, "step": 391 }, { "epoch": 0.2751842751842752, "grad_norm": 0.018022766336798668, "learning_rate": 4.3221434450340956e-05, "logits/chosen": -8.061605453491211, "logits/rejected": -8.069265365600586, "logps/chosen": -15.19871711730957, "logps/rejected": -94.92198181152344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.869349718093872, "rewards/margins": 7.886741638183594, "rewards/rejected": -6.017392158508301, "step": 392 }, { "epoch": 0.2758862758862759, "grad_norm": 0.03525719419121742, "learning_rate": 4.323987583959045e-05, "logits/chosen": -8.478246688842773, "logits/rejected": -8.49482536315918, "logps/chosen": -14.865188598632812, "logps/rejected": -92.71623229980469, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.8267018795013428, "rewards/margins": 7.766195297241211, "rewards/rejected": -5.939493179321289, "step": 393 }, { "epoch": 0.2765882765882766, "grad_norm": 0.1945619136095047, "learning_rate": 4.325827036375957e-05, "logits/chosen": -8.4051513671875, "logits/rejected": -8.385531425476074, "logps/chosen": -14.938945770263672, "logps/rejected": -95.06045532226562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.786920189857483, "rewards/margins": 7.972382545471191, "rewards/rejected": -6.18546199798584, "step": 394 }, { "epoch": 0.2772902772902773, "grad_norm": 0.019016245380043983, "learning_rate": 4.327661826044101e-05, "logits/chosen": -8.681941986083984, "logits/rejected": -8.686805725097656, "logps/chosen": -12.344802856445312, "logps/rejected": -100.46110534667969, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.923886775970459, "rewards/margins": 8.71706771850586, "rewards/rejected": -6.793179988861084, "step": 395 }, { "epoch": 0.277992277992278, "grad_norm": 0.011870593763887882, "learning_rate": 4.329491976542521e-05, "logits/chosen": -8.187358856201172, "logits/rejected": -8.18885326385498, "logps/chosen": -11.71027946472168, "logps/rejected": -93.1496353149414, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.037748336791992, "rewards/margins": 8.109048843383789, "rewards/rejected": -6.071300506591797, "step": 396 }, { "epoch": 0.2786942786942787, "grad_norm": 0.13726358115673065, "learning_rate": 4.331317511271859e-05, "logits/chosen": -8.77176284790039, "logits/rejected": -8.768787384033203, "logps/chosen": -11.026529312133789, "logps/rejected": -99.64085388183594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.023556709289551, "rewards/margins": 8.8003568649292, "rewards/rejected": -6.776800155639648, "step": 397 }, { "epoch": 0.2793962793962794, "grad_norm": 0.013859091326594353, "learning_rate": 4.333138453456147e-05, "logits/chosen": -9.696513175964355, "logits/rejected": -9.695594787597656, "logps/chosen": -14.014627456665039, "logps/rejected": -97.02165222167969, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8089866638183594, "rewards/margins": 8.256803512573242, "rewards/rejected": -6.447817325592041, "step": 398 }, { "epoch": 0.2800982800982801, "grad_norm": 0.00891526136547327, "learning_rate": 4.334954826144581e-05, "logits/chosen": -9.0324125289917, "logits/rejected": -9.041574478149414, "logps/chosen": -12.119179725646973, "logps/rejected": -102.98275756835938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8601319789886475, "rewards/margins": 9.018235206604004, "rewards/rejected": -7.158102989196777, "step": 399 }, { "epoch": 0.2808002808002808, "grad_norm": 0.007587415631860495, "learning_rate": 4.336766652213271e-05, "logits/chosen": -8.766959190368652, "logits/rejected": -8.763082504272461, "logps/chosen": -12.718477249145508, "logps/rejected": -99.76692199707031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8700942993164062, "rewards/margins": 8.724455833435059, "rewards/rejected": -6.854361534118652, "step": 400 }, { "epoch": 0.2815022815022815, "grad_norm": 0.006447991356253624, "learning_rate": 4.338573954366971e-05, "logits/chosen": -8.809225082397461, "logits/rejected": -8.794013977050781, "logps/chosen": -10.346450805664062, "logps/rejected": -99.96395874023438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1782329082489014, "rewards/margins": 8.908368110656738, "rewards/rejected": -6.730134963989258, "step": 401 }, { "epoch": 0.28220428220428223, "grad_norm": 0.008146814070641994, "learning_rate": 4.340376755140784e-05, "logits/chosen": -8.924603462219238, "logits/rejected": -8.916657447814941, "logps/chosen": -15.317176818847656, "logps/rejected": -95.65894317626953, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.7663565874099731, "rewards/margins": 8.087133407592773, "rewards/rejected": -6.320777416229248, "step": 402 }, { "epoch": 0.2829062829062829, "grad_norm": 0.005253266077488661, "learning_rate": 4.342175076901849e-05, "logits/chosen": -7.666966438293457, "logits/rejected": -7.659298419952393, "logps/chosen": -9.829215049743652, "logps/rejected": -103.0132827758789, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.0599894523620605, "rewards/margins": 9.222431182861328, "rewards/rejected": -7.162441253662109, "step": 403 }, { "epoch": 0.2836082836082836, "grad_norm": 3.931095600128174, "learning_rate": 4.343968941851009e-05, "logits/chosen": -9.486889839172363, "logits/rejected": -9.475420951843262, "logps/chosen": -8.732436180114746, "logps/rejected": -100.19575500488281, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 2.262428045272827, "rewards/margins": 9.184772491455078, "rewards/rejected": -6.922344207763672, "step": 404 }, { "epoch": 0.2843102843102843, "grad_norm": 0.0064927986823022366, "learning_rate": 4.345758372024448e-05, "logits/chosen": -9.402381896972656, "logits/rejected": -9.406651496887207, "logps/chosen": -8.652904510498047, "logps/rejected": -101.74069213867188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2845301628112793, "rewards/margins": 9.288580894470215, "rewards/rejected": -7.004050254821777, "step": 405 }, { "epoch": 0.28501228501228504, "grad_norm": 0.008318254724144936, "learning_rate": 4.347543389295324e-05, "logits/chosen": -8.34644889831543, "logits/rejected": -8.351264953613281, "logps/chosen": -8.227598190307617, "logps/rejected": -101.67599487304688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.505115032196045, "rewards/margins": 9.393898010253906, "rewards/rejected": -6.8887834548950195, "step": 406 }, { "epoch": 0.2857142857142857, "grad_norm": 0.010637782514095306, "learning_rate": 4.3493240153753666e-05, "logits/chosen": -8.568105697631836, "logits/rejected": -8.580371856689453, "logps/chosen": -11.929319381713867, "logps/rejected": -101.39810180664062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.9480371475219727, "rewards/margins": 8.948522567749023, "rewards/rejected": -7.000485897064209, "step": 407 }, { "epoch": 0.2864162864162864, "grad_norm": 0.004698682576417923, "learning_rate": 4.3511002718164666e-05, "logits/chosen": -8.969755172729492, "logits/rejected": -8.959392547607422, "logps/chosen": -12.071956634521484, "logps/rejected": -102.36813354492188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.9420652389526367, "rewards/margins": 9.014129638671875, "rewards/rejected": -7.07206392288208, "step": 408 }, { "epoch": 0.28711828711828713, "grad_norm": 0.008622453548014164, "learning_rate": 4.352872180012237e-05, "logits/chosen": -8.470559120178223, "logits/rejected": -8.468387603759766, "logps/chosen": -13.670578956604004, "logps/rejected": -103.3233642578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.7490339279174805, "rewards/margins": 8.834943771362305, "rewards/rejected": -7.085909843444824, "step": 409 }, { "epoch": 0.28782028782028785, "grad_norm": 1.327553153038025, "learning_rate": 4.35463976119956e-05, "logits/chosen": -8.668405532836914, "logits/rejected": -8.69038200378418, "logps/chosen": -9.212808609008789, "logps/rejected": -101.26510620117188, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.2963738441467285, "rewards/margins": 9.237621307373047, "rewards/rejected": -6.941247940063477, "step": 410 }, { "epoch": 0.2885222885222885, "grad_norm": 1.3664177656173706, "learning_rate": 4.356403036460115e-05, "logits/chosen": -8.126418113708496, "logits/rejected": -8.142708778381348, "logps/chosen": -11.97889518737793, "logps/rejected": -102.17686462402344, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 2.0456643104553223, "rewards/margins": 9.076631546020508, "rewards/rejected": -7.030967712402344, "step": 411 }, { "epoch": 0.2892242892242892, "grad_norm": 0.016403287649154663, "learning_rate": 4.3581620267218916e-05, "logits/chosen": -9.416509628295898, "logits/rejected": -9.393876075744629, "logps/chosen": -10.487689971923828, "logps/rejected": -104.86599731445312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1203482151031494, "rewards/margins": 9.40884017944336, "rewards/rejected": -7.288492202758789, "step": 412 }, { "epoch": 0.28992628992628994, "grad_norm": 0.11092265695333481, "learning_rate": 4.359916752760669e-05, "logits/chosen": -8.928506851196289, "logits/rejected": -8.911355972290039, "logps/chosen": -13.291553497314453, "logps/rejected": -103.23478698730469, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.7683855295181274, "rewards/margins": 8.943063735961914, "rewards/rejected": -7.174678802490234, "step": 413 }, { "epoch": 0.29062829062829065, "grad_norm": 0.015974948182702065, "learning_rate": 4.361667235201499e-05, "logits/chosen": -9.164084434509277, "logits/rejected": -9.163789749145508, "logps/chosen": -10.10418701171875, "logps/rejected": -102.38272094726562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.164968252182007, "rewards/margins": 9.231450080871582, "rewards/rejected": -7.066481590270996, "step": 414 }, { "epoch": 0.2913302913302913, "grad_norm": 0.06209737807512283, "learning_rate": 4.363413494520154e-05, "logits/chosen": -9.618644714355469, "logits/rejected": -9.61819839477539, "logps/chosen": -10.189172744750977, "logps/rejected": -104.1976318359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.2415919303894043, "rewards/margins": 9.44552993774414, "rewards/rejected": -7.203937530517578, "step": 415 }, { "epoch": 0.29203229203229203, "grad_norm": 0.040601614862680435, "learning_rate": 4.365155551044572e-05, "logits/chosen": -8.990056991577148, "logits/rejected": -9.003531455993652, "logps/chosen": -11.86796760559082, "logps/rejected": -102.19241333007812, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.0106143951416016, "rewards/margins": 9.078329086303711, "rewards/rejected": -7.067715644836426, "step": 416 }, { "epoch": 0.29273429273429274, "grad_norm": 0.0502256415784359, "learning_rate": 4.366893424956263e-05, "logits/chosen": -9.009932518005371, "logits/rejected": -9.019306182861328, "logps/chosen": -19.063846588134766, "logps/rejected": -102.1300048828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.3048362731933594, "rewards/margins": 8.157347679138184, "rewards/rejected": -6.852510929107666, "step": 417 }, { "epoch": 0.29343629343629346, "grad_norm": 2.4403977394104004, "learning_rate": 4.368627136291726e-05, "logits/chosen": -9.174825668334961, "logits/rejected": -9.16642951965332, "logps/chosen": -10.410409927368164, "logps/rejected": -104.28317260742188, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 2.1227455139160156, "rewards/margins": 9.386423110961914, "rewards/rejected": -7.263676643371582, "step": 418 }, { "epoch": 0.2941382941382941, "grad_norm": 0.11538813263177872, "learning_rate": 4.370356704943825e-05, "logits/chosen": -9.17447280883789, "logits/rejected": -9.19190502166748, "logps/chosen": -9.317389488220215, "logps/rejected": -105.18358612060547, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.197427272796631, "rewards/margins": 9.555366516113281, "rewards/rejected": -7.357938766479492, "step": 419 }, { "epoch": 0.29484029484029484, "grad_norm": 0.3338773548603058, "learning_rate": 4.372082150663168e-05, "logits/chosen": -8.922666549682617, "logits/rejected": -8.921577453613281, "logps/chosen": -19.398447036743164, "logps/rejected": -100.1949462890625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.4288138151168823, "rewards/margins": 8.112114906311035, "rewards/rejected": -6.683300495147705, "step": 420 }, { "epoch": 0.29554229554229555, "grad_norm": 0.18033447861671448, "learning_rate": 4.3738034930594475e-05, "logits/chosen": -8.827239990234375, "logits/rejected": -8.840787887573242, "logps/chosen": -13.585800170898438, "logps/rejected": -103.01160430908203, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.7627317905426025, "rewards/margins": 8.927143096923828, "rewards/rejected": -7.1644110679626465, "step": 421 }, { "epoch": 0.29624429624429627, "grad_norm": 0.014393203891813755, "learning_rate": 4.3755207516027904e-05, "logits/chosen": -9.697216033935547, "logits/rejected": -9.712255477905273, "logps/chosen": -8.92843246459961, "logps/rejected": -105.03852844238281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.177306890487671, "rewards/margins": 9.614389419555664, "rewards/rejected": -7.437082290649414, "step": 422 }, { "epoch": 0.29694629694629693, "grad_norm": 0.010388338938355446, "learning_rate": 4.377233945625071e-05, "logits/chosen": -8.608022689819336, "logits/rejected": -8.611534118652344, "logps/chosen": -11.491147994995117, "logps/rejected": -102.221435546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1229233741760254, "rewards/margins": 9.088215827941895, "rewards/rejected": -6.965292453765869, "step": 423 }, { "epoch": 0.29764829764829764, "grad_norm": 0.022490600124001503, "learning_rate": 4.378943094321221e-05, "logits/chosen": -9.519155502319336, "logits/rejected": -9.526168823242188, "logps/chosen": -8.568216323852539, "logps/rejected": -106.01402282714844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.301335334777832, "rewards/margins": 9.712969779968262, "rewards/rejected": -7.41163444519043, "step": 424 }, { "epoch": 0.29835029835029836, "grad_norm": 0.009200277738273144, "learning_rate": 4.3806482167505196e-05, "logits/chosen": -8.830179214477539, "logits/rejected": -8.824623107910156, "logps/chosen": -14.393718719482422, "logps/rejected": -100.36518096923828, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6130638122558594, "rewards/margins": 8.491326332092285, "rewards/rejected": -6.878262519836426, "step": 425 }, { "epoch": 0.2990522990522991, "grad_norm": 0.0034821454901248217, "learning_rate": 4.382349331837866e-05, "logits/chosen": -8.859567642211914, "logits/rejected": -8.84177303314209, "logps/chosen": -10.761014938354492, "logps/rejected": -104.66159057617188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.9885411262512207, "rewards/margins": 9.334516525268555, "rewards/rejected": -7.345975399017334, "step": 426 }, { "epoch": 0.29975429975429974, "grad_norm": 0.002967687090858817, "learning_rate": 4.3840464583750404e-05, "logits/chosen": -9.188234329223633, "logits/rejected": -9.193929672241211, "logps/chosen": -11.578794479370117, "logps/rejected": -105.22503662109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.967897891998291, "rewards/margins": 9.30006217956543, "rewards/rejected": -7.3321638107299805, "step": 427 }, { "epoch": 0.30045630045630045, "grad_norm": 0.00431620329618454, "learning_rate": 4.385739615021954e-05, "logits/chosen": -9.239370346069336, "logits/rejected": -9.220002174377441, "logps/chosen": -11.579229354858398, "logps/rejected": -102.18316650390625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1167993545532227, "rewards/margins": 9.108657836914062, "rewards/rejected": -6.99185848236084, "step": 428 }, { "epoch": 0.30115830115830117, "grad_norm": 0.0031616801861673594, "learning_rate": 4.387428820307874e-05, "logits/chosen": -9.102396011352539, "logits/rejected": -9.095512390136719, "logps/chosen": -8.726110458374023, "logps/rejected": -105.0847396850586, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2662534713745117, "rewards/margins": 9.586126327514648, "rewards/rejected": -7.319872856140137, "step": 429 }, { "epoch": 0.3018603018603019, "grad_norm": 0.05630970001220703, "learning_rate": 4.3891140926326446e-05, "logits/chosen": -9.456500053405762, "logits/rejected": -9.471039772033691, "logps/chosen": -9.200798034667969, "logps/rejected": -104.86781311035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.1904423236846924, "rewards/margins": 9.495420455932617, "rewards/rejected": -7.304978847503662, "step": 430 }, { "epoch": 0.30256230256230254, "grad_norm": 0.004142187535762787, "learning_rate": 4.390795450267886e-05, "logits/chosen": -9.434795379638672, "logits/rejected": -9.425863265991211, "logps/chosen": -10.058637619018555, "logps/rejected": -105.07878112792969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1835176944732666, "rewards/margins": 9.401215553283691, "rewards/rejected": -7.217698097229004, "step": 431 }, { "epoch": 0.30326430326430326, "grad_norm": 0.004226724151521921, "learning_rate": 4.3924729113581876e-05, "logits/chosen": -8.051304817199707, "logits/rejected": -8.061164855957031, "logps/chosen": -9.431507110595703, "logps/rejected": -103.56217956542969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3643836975097656, "rewards/margins": 9.390783309936523, "rewards/rejected": -7.0263991355896, "step": 432 }, { "epoch": 0.303966303966304, "grad_norm": 0.004388120956718922, "learning_rate": 4.394146493922276e-05, "logits/chosen": -9.46902847290039, "logits/rejected": -9.466257095336914, "logps/chosen": -8.290660858154297, "logps/rejected": -104.35013580322266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1665520668029785, "rewards/margins": 9.503792762756348, "rewards/rejected": -7.337240695953369, "step": 433 }, { "epoch": 0.3046683046683047, "grad_norm": 0.0026562754064798355, "learning_rate": 4.395816215854185e-05, "logits/chosen": -8.486305236816406, "logits/rejected": -8.475886344909668, "logps/chosen": -8.809309959411621, "logps/rejected": -104.24385070800781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4580318927764893, "rewards/margins": 9.571582794189453, "rewards/rejected": -7.113550186157227, "step": 434 }, { "epoch": 0.30537030537030535, "grad_norm": 0.0026626598555594683, "learning_rate": 4.397482094924396e-05, "logits/chosen": -8.328810691833496, "logits/rejected": -8.292753219604492, "logps/chosen": -9.169690132141113, "logps/rejected": -104.01224517822266, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.255077362060547, "rewards/margins": 9.400896072387695, "rewards/rejected": -7.14581823348999, "step": 435 }, { "epoch": 0.30607230607230607, "grad_norm": 0.0902949720621109, "learning_rate": 4.399144148780977e-05, "logits/chosen": -8.883628845214844, "logits/rejected": -8.8856201171875, "logps/chosen": -8.04377555847168, "logps/rejected": -99.00294494628906, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.56852388381958, "rewards/margins": 9.166719436645508, "rewards/rejected": -6.5981950759887695, "step": 436 }, { "epoch": 0.3067743067743068, "grad_norm": 0.010128587484359741, "learning_rate": 4.400802394950703e-05, "logits/chosen": -9.97817611694336, "logits/rejected": -9.989072799682617, "logps/chosen": -5.887510299682617, "logps/rejected": -105.55548095703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.5895721912384033, "rewards/margins": 9.943349838256836, "rewards/rejected": -7.353776931762695, "step": 437 }, { "epoch": 0.3074763074763075, "grad_norm": 0.00606644619256258, "learning_rate": 4.402456850840166e-05, "logits/chosen": -9.069456100463867, "logits/rejected": -9.05079460144043, "logps/chosen": -8.203450202941895, "logps/rejected": -105.76414489746094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.391185760498047, "rewards/margins": 9.655704498291016, "rewards/rejected": -7.264518737792969, "step": 438 }, { "epoch": 0.30817830817830816, "grad_norm": 0.004454936366528273, "learning_rate": 4.4041075337368695e-05, "logits/chosen": -9.858201026916504, "logits/rejected": -9.861167907714844, "logps/chosen": -8.042516708374023, "logps/rejected": -103.64226531982422, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.288447380065918, "rewards/margins": 9.630125999450684, "rewards/rejected": -7.341678619384766, "step": 439 }, { "epoch": 0.3088803088803089, "grad_norm": 0.25734585523605347, "learning_rate": 4.405754460810312e-05, "logits/chosen": -9.239215850830078, "logits/rejected": -9.230655670166016, "logps/chosen": -11.270944595336914, "logps/rejected": -105.16983032226562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.021956443786621, "rewards/margins": 9.23079776763916, "rewards/rejected": -7.208841323852539, "step": 440 }, { "epoch": 0.3095823095823096, "grad_norm": 0.04832395538687706, "learning_rate": 4.407397649113065e-05, "logits/chosen": -9.15053939819336, "logits/rejected": -9.158004760742188, "logps/chosen": -12.804474830627441, "logps/rejected": -102.78707122802734, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.923788070678711, "rewards/margins": 8.92500114440918, "rewards/rejected": -7.001214027404785, "step": 441 }, { "epoch": 0.3102843102843103, "grad_norm": 0.03402753174304962, "learning_rate": 4.40903711558182e-05, "logits/chosen": -8.399024963378906, "logits/rejected": -8.4088773727417, "logps/chosen": -6.605869293212891, "logps/rejected": -104.54576110839844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.5370607376098633, "rewards/margins": 9.693159103393555, "rewards/rejected": -7.156098365783691, "step": 442 }, { "epoch": 0.31098631098631097, "grad_norm": 0.013694651424884796, "learning_rate": 4.41067287703845e-05, "logits/chosen": -9.4154634475708, "logits/rejected": -9.418703079223633, "logps/chosen": -7.822390079498291, "logps/rejected": -102.34520721435547, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3522377014160156, "rewards/margins": 9.425531387329102, "rewards/rejected": -7.073293685913086, "step": 443 }, { "epoch": 0.3116883116883117, "grad_norm": 0.00770655507221818, "learning_rate": 4.412304950191033e-05, "logits/chosen": -8.778573989868164, "logits/rejected": -8.775468826293945, "logps/chosen": -5.020951747894287, "logps/rejected": -106.28600311279297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7465662956237793, "rewards/margins": 10.133211135864258, "rewards/rejected": -7.38664436340332, "step": 444 }, { "epoch": 0.3123903123903124, "grad_norm": 0.00721836369484663, "learning_rate": 4.413933351634886e-05, "logits/chosen": -9.908824920654297, "logits/rejected": -9.90350341796875, "logps/chosen": -9.05491828918457, "logps/rejected": -101.66683959960938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1880507469177246, "rewards/margins": 9.152610778808594, "rewards/rejected": -6.964559555053711, "step": 445 }, { "epoch": 0.3130923130923131, "grad_norm": 0.0023418946657329798, "learning_rate": 4.4155580978535707e-05, "logits/chosen": -9.563041687011719, "logits/rejected": -9.576682090759277, "logps/chosen": -8.998811721801758, "logps/rejected": -103.87435150146484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2065951824188232, "rewards/margins": 9.405595779418945, "rewards/rejected": -7.198999404907227, "step": 446 }, { "epoch": 0.3137943137943138, "grad_norm": 0.0020189068745821714, "learning_rate": 4.417179205219895e-05, "logits/chosen": -8.64323616027832, "logits/rejected": -8.64747428894043, "logps/chosen": -6.978670120239258, "logps/rejected": -103.53274536132812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5364890098571777, "rewards/margins": 9.650196075439453, "rewards/rejected": -7.113706111907959, "step": 447 }, { "epoch": 0.3144963144963145, "grad_norm": 0.003900434821844101, "learning_rate": 4.418796689996907e-05, "logits/chosen": -9.048709869384766, "logits/rejected": -9.058738708496094, "logps/chosen": -4.959263801574707, "logps/rejected": -107.12579345703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7681491374969482, "rewards/margins": 10.22479248046875, "rewards/rejected": -7.4566426277160645, "step": 448 }, { "epoch": 0.3151983151983152, "grad_norm": 0.0019036935409530997, "learning_rate": 4.420410568338872e-05, "logits/chosen": -10.264848709106445, "logits/rejected": -10.25308895111084, "logps/chosen": -7.671433925628662, "logps/rejected": -104.87330627441406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.438692569732666, "rewards/margins": 9.714122772216797, "rewards/rejected": -7.275431156158447, "step": 449 }, { "epoch": 0.3159003159003159, "grad_norm": 0.0019373418763279915, "learning_rate": 4.42202085629224e-05, "logits/chosen": -8.810646057128906, "logits/rejected": -8.787069320678711, "logps/chosen": -10.840066909790039, "logps/rejected": -101.85871887207031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0289902687072754, "rewards/margins": 9.063559532165527, "rewards/rejected": -7.034568786621094, "step": 450 }, { "epoch": 0.3166023166023166, "grad_norm": 0.0019215471111238003, "learning_rate": 4.423627569796601e-05, "logits/chosen": -8.735268592834473, "logits/rejected": -8.739690780639648, "logps/chosen": -6.226633071899414, "logps/rejected": -103.24983215332031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5647521018981934, "rewards/margins": 9.729413986206055, "rewards/rejected": -7.164661407470703, "step": 451 }, { "epoch": 0.3173043173043173, "grad_norm": 0.0018975220154970884, "learning_rate": 4.425230724685638e-05, "logits/chosen": -8.817325592041016, "logits/rejected": -8.818482398986816, "logps/chosen": -11.340200424194336, "logps/rejected": -104.24020385742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0943443775177, "rewards/margins": 9.243807792663574, "rewards/rejected": -7.149463653564453, "step": 452 }, { "epoch": 0.318006318006318, "grad_norm": 0.002718034666031599, "learning_rate": 4.4268303366880536e-05, "logits/chosen": -8.400851249694824, "logits/rejected": -8.413286209106445, "logps/chosen": -7.42848014831543, "logps/rejected": -103.06898498535156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4556145668029785, "rewards/margins": 9.583261489868164, "rewards/rejected": -7.127647399902344, "step": 453 }, { "epoch": 0.3187083187083187, "grad_norm": 0.0045762574300169945, "learning_rate": 4.428426421428507e-05, "logits/chosen": -9.140806198120117, "logits/rejected": -9.148019790649414, "logps/chosen": -5.052892684936523, "logps/rejected": -106.08197021484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7060441970825195, "rewards/margins": 10.166549682617188, "rewards/rejected": -7.460505485534668, "step": 454 }, { "epoch": 0.3194103194103194, "grad_norm": 0.003260900266468525, "learning_rate": 4.430018994428521e-05, "logits/chosen": -8.482109069824219, "logits/rejected": -8.482575416564941, "logps/chosen": -6.0478715896606445, "logps/rejected": -106.67313385009766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6101272106170654, "rewards/margins": 9.991515159606934, "rewards/rejected": -7.3813886642456055, "step": 455 }, { "epoch": 0.3201123201123201, "grad_norm": 0.001727921306155622, "learning_rate": 4.431608071107392e-05, "logits/chosen": -9.263931274414062, "logits/rejected": -9.262899398803711, "logps/chosen": -3.5214076042175293, "logps/rejected": -106.9378433227539, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7752017974853516, "rewards/margins": 10.284162521362305, "rewards/rejected": -7.508959770202637, "step": 456 }, { "epoch": 0.3208143208143208, "grad_norm": 0.003588385647162795, "learning_rate": 4.433193666783084e-05, "logits/chosen": -9.895387649536133, "logits/rejected": -9.902056694030762, "logps/chosen": -14.187888145446777, "logps/rejected": -100.52668762207031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9103288650512695, "rewards/margins": 8.619132995605469, "rewards/rejected": -6.708803176879883, "step": 457 }, { "epoch": 0.32151632151632153, "grad_norm": 0.012762986123561859, "learning_rate": 4.4347757966731156e-05, "logits/chosen": -9.720733642578125, "logits/rejected": -9.709232330322266, "logps/chosen": -7.377583980560303, "logps/rejected": -104.4515151977539, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.48225474357605, "rewards/margins": 9.752004623413086, "rewards/rejected": -7.269749641418457, "step": 458 }, { "epoch": 0.3222183222183222, "grad_norm": 0.0020145857706665993, "learning_rate": 4.436354475895436e-05, "logits/chosen": -8.403146743774414, "logits/rejected": -8.406874656677246, "logps/chosen": -7.887382507324219, "logps/rejected": -103.8184814453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5583572387695312, "rewards/margins": 9.604859352111816, "rewards/rejected": -7.046502590179443, "step": 459 }, { "epoch": 0.3229203229203229, "grad_norm": 0.0023633865639567375, "learning_rate": 4.437929719469291e-05, "logits/chosen": -9.129474639892578, "logits/rejected": -9.120658874511719, "logps/chosen": -4.3806047439575195, "logps/rejected": -107.43498229980469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.647362470626831, "rewards/margins": 10.208961486816406, "rewards/rejected": -7.561598777770996, "step": 460 }, { "epoch": 0.3236223236223236, "grad_norm": 0.0014451612951233983, "learning_rate": 4.4395015423160807e-05, "logits/chosen": -9.073966026306152, "logits/rejected": -9.05967903137207, "logps/chosen": -3.52982234954834, "logps/rejected": -106.19291687011719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8692755699157715, "rewards/margins": 10.369997024536133, "rewards/rejected": -7.5007219314575195, "step": 461 }, { "epoch": 0.32432432432432434, "grad_norm": 0.015185794793069363, "learning_rate": 4.4410699592602094e-05, "logits/chosen": -9.114996910095215, "logits/rejected": -9.099605560302734, "logps/chosen": -2.8004941940307617, "logps/rejected": -105.48733520507812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.993258237838745, "rewards/margins": 10.35361385345459, "rewards/rejected": -7.360355377197266, "step": 462 }, { "epoch": 0.325026325026325, "grad_norm": 0.023182472214102745, "learning_rate": 4.442634985029922e-05, "logits/chosen": -8.20903491973877, "logits/rejected": -8.176194190979004, "logps/chosen": -3.40376877784729, "logps/rejected": -106.50471496582031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8458778858184814, "rewards/margins": 10.222740173339844, "rewards/rejected": -7.376862525939941, "step": 463 }, { "epoch": 0.3257283257283257, "grad_norm": 0.025111017748713493, "learning_rate": 4.444196634258136e-05, "logits/chosen": -8.709789276123047, "logits/rejected": -8.707902908325195, "logps/chosen": -9.199222564697266, "logps/rejected": -102.39120483398438, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.3759806156158447, "rewards/margins": 9.3355712890625, "rewards/rejected": -6.959590911865234, "step": 464 }, { "epoch": 0.32643032643032643, "grad_norm": 0.010237106122076511, "learning_rate": 4.4457549214832566e-05, "logits/chosen": -8.945719718933105, "logits/rejected": -8.944843292236328, "logps/chosen": -7.566823482513428, "logps/rejected": -105.53155517578125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.5021004676818848, "rewards/margins": 9.766061782836914, "rewards/rejected": -7.2639617919921875, "step": 465 }, { "epoch": 0.32713232713232715, "grad_norm": 0.017543470486998558, "learning_rate": 4.44730986115e-05, "logits/chosen": -9.439754486083984, "logits/rejected": -9.440465927124023, "logps/chosen": -10.231224060058594, "logps/rejected": -102.79817199707031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.2234091758728027, "rewards/margins": 9.203022956848145, "rewards/rejected": -6.979613304138184, "step": 466 }, { "epoch": 0.3278343278343278, "grad_norm": 0.0033181332983076572, "learning_rate": 4.448861467610187e-05, "logits/chosen": -9.571135520935059, "logits/rejected": -9.569860458374023, "logps/chosen": -12.04247760772705, "logps/rejected": -102.4771728515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1045403480529785, "rewards/margins": 9.036831855773926, "rewards/rejected": -6.932291507720947, "step": 467 }, { "epoch": 0.3285363285363285, "grad_norm": 0.002444152720272541, "learning_rate": 4.4504097551235406e-05, "logits/chosen": -9.074725151062012, "logits/rejected": -9.095664978027344, "logps/chosen": -13.436149597167969, "logps/rejected": -102.49825286865234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.038726329803467, "rewards/margins": 8.922666549682617, "rewards/rejected": -6.88394021987915, "step": 468 }, { "epoch": 0.32923832923832924, "grad_norm": 0.0032048372086137533, "learning_rate": 4.4519547378584725e-05, "logits/chosen": -9.665696144104004, "logits/rejected": -9.673742294311523, "logps/chosen": -2.7922072410583496, "logps/rejected": -107.45103454589844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9187097549438477, "rewards/margins": 10.429853439331055, "rewards/rejected": -7.511143684387207, "step": 469 }, { "epoch": 0.32994032994032996, "grad_norm": 0.003030061721801758, "learning_rate": 4.453496429892863e-05, "logits/chosen": -8.536962509155273, "logits/rejected": -8.559761047363281, "logps/chosen": -6.8339409828186035, "logps/rejected": -104.04656982421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.48703670501709, "rewards/margins": 9.679725646972656, "rewards/rejected": -7.192688465118408, "step": 470 }, { "epoch": 0.3306423306423306, "grad_norm": 0.00397350313141942, "learning_rate": 4.455034845214827e-05, "logits/chosen": -8.294533729553223, "logits/rejected": -8.286943435668945, "logps/chosen": -8.566863059997559, "logps/rejected": -104.54025268554688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4382166862487793, "rewards/margins": 9.531927108764648, "rewards/rejected": -7.093709945678711, "step": 471 }, { "epoch": 0.33134433134433133, "grad_norm": 0.0021346004214137793, "learning_rate": 4.4565699977234796e-05, "logits/chosen": -9.18923568725586, "logits/rejected": -9.18464469909668, "logps/chosen": -5.716814041137695, "logps/rejected": -103.20806884765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6772613525390625, "rewards/margins": 9.738245964050293, "rewards/rejected": -7.0609846115112305, "step": 472 }, { "epoch": 0.33204633204633205, "grad_norm": 0.0030234691221266985, "learning_rate": 4.458101901229686e-05, "logits/chosen": -9.633935928344727, "logits/rejected": -9.622036933898926, "logps/chosen": -6.502987861633301, "logps/rejected": -106.01834106445312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6242403984069824, "rewards/margins": 9.951801300048828, "rewards/rejected": -7.3275604248046875, "step": 473 }, { "epoch": 0.33274833274833276, "grad_norm": 5.093133449554443, "learning_rate": 4.459630569456809e-05, "logits/chosen": -9.610869407653809, "logits/rejected": -9.602914810180664, "logps/chosen": -4.886628150939941, "logps/rejected": -103.71243286132812, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 2.5659494400024414, "rewards/margins": 9.836199760437012, "rewards/rejected": -7.270249843597412, "step": 474 }, { "epoch": 0.3334503334503334, "grad_norm": 0.0016451341798529029, "learning_rate": 4.461156016041444e-05, "logits/chosen": -9.60161018371582, "logits/rejected": -9.60666275024414, "logps/chosen": -2.5472044944763184, "logps/rejected": -107.05183410644531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0930466651916504, "rewards/margins": 10.540107727050781, "rewards/rejected": -7.447060585021973, "step": 475 }, { "epoch": 0.33415233415233414, "grad_norm": 0.005306497681885958, "learning_rate": 4.462678254534156e-05, "logits/chosen": -8.649223327636719, "logits/rejected": -8.630914688110352, "logps/chosen": -4.522772789001465, "logps/rejected": -106.27050018310547, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.6271255016326904, "rewards/margins": 10.12048625946045, "rewards/rejected": -7.493360996246338, "step": 476 }, { "epoch": 0.33485433485433486, "grad_norm": 0.007573090028017759, "learning_rate": 4.464197298400191e-05, "logits/chosen": -8.967031478881836, "logits/rejected": -8.95956802368164, "logps/chosen": -6.309453964233398, "logps/rejected": -104.97523498535156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.5078001022338867, "rewards/margins": 9.748696327209473, "rewards/rejected": -7.240896701812744, "step": 477 }, { "epoch": 0.33555633555633557, "grad_norm": 0.0030352724716067314, "learning_rate": 4.4657131610201994e-05, "logits/chosen": -9.069021224975586, "logits/rejected": -9.059247970581055, "logps/chosen": -1.9757153987884521, "logps/rejected": -107.63644409179688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9228477478027344, "rewards/margins": 10.566882133483887, "rewards/rejected": -7.644034385681152, "step": 478 }, { "epoch": 0.33625833625833623, "grad_norm": 0.004812523256987333, "learning_rate": 4.467225855690939e-05, "logits/chosen": -9.615201950073242, "logits/rejected": -9.601312637329102, "logps/chosen": -4.91041374206543, "logps/rejected": -104.0596694946289, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.58369779586792, "rewards/margins": 9.988218307495117, "rewards/rejected": -7.4045209884643555, "step": 479 }, { "epoch": 0.33696033696033695, "grad_norm": 0.0022141442168504, "learning_rate": 4.468735395625979e-05, "logits/chosen": -9.063701629638672, "logits/rejected": -9.064391136169434, "logps/chosen": -9.61534309387207, "logps/rejected": -103.38774871826172, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1978671550750732, "rewards/margins": 9.27752685546875, "rewards/rejected": -7.079659461975098, "step": 480 }, { "epoch": 0.33766233766233766, "grad_norm": 0.01424348633736372, "learning_rate": 4.470241793956387e-05, "logits/chosen": -9.462915420532227, "logits/rejected": -9.456448554992676, "logps/chosen": -2.246256113052368, "logps/rejected": -107.00674438476562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9612207412719727, "rewards/margins": 10.510830879211426, "rewards/rejected": -7.549610137939453, "step": 481 }, { "epoch": 0.3383643383643384, "grad_norm": 0.004952895920723677, "learning_rate": 4.471745063731416e-05, "logits/chosen": -8.547994613647461, "logits/rejected": -8.536848068237305, "logps/chosen": -9.130019187927246, "logps/rejected": -101.98003387451172, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.303354263305664, "rewards/margins": 9.338963508605957, "rewards/rejected": -7.035609245300293, "step": 482 }, { "epoch": 0.33906633906633904, "grad_norm": 0.0024836589582264423, "learning_rate": 4.473245217919187e-05, "logits/chosen": -8.957283973693848, "logits/rejected": -8.966776847839355, "logps/chosen": -3.8111422061920166, "logps/rejected": -108.22483825683594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7145328521728516, "rewards/margins": 10.274385452270508, "rewards/rejected": -7.559852600097656, "step": 483 }, { "epoch": 0.33976833976833976, "grad_norm": 0.23320446908473969, "learning_rate": 4.474742269407355e-05, "logits/chosen": -8.791566848754883, "logits/rejected": -8.79463005065918, "logps/chosen": -14.752519607543945, "logps/rejected": -104.78819274902344, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.8238199949264526, "rewards/margins": 9.009364128112793, "rewards/rejected": -7.185544013977051, "step": 484 }, { "epoch": 0.34047034047034047, "grad_norm": 0.03655384108424187, "learning_rate": 4.476236231003773e-05, "logits/chosen": -8.70830249786377, "logits/rejected": -8.70879077911377, "logps/chosen": -5.013664245605469, "logps/rejected": -103.77374267578125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.663484573364258, "rewards/margins": 9.902956008911133, "rewards/rejected": -7.239471435546875, "step": 485 }, { "epoch": 0.3411723411723412, "grad_norm": 0.07385007292032242, "learning_rate": 4.477727115437156e-05, "logits/chosen": -8.577463150024414, "logits/rejected": -8.582297325134277, "logps/chosen": -6.1719536781311035, "logps/rejected": -104.81988525390625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.4877424240112305, "rewards/margins": 9.897686958312988, "rewards/rejected": -7.409945011138916, "step": 486 }, { "epoch": 0.34187434187434185, "grad_norm": 0.5019731521606445, "learning_rate": 4.479214935357724e-05, "logits/chosen": -9.334588050842285, "logits/rejected": -9.324485778808594, "logps/chosen": -7.117732048034668, "logps/rejected": -106.22957611083984, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.4050133228302, "rewards/margins": 9.733072280883789, "rewards/rejected": -7.328059196472168, "step": 487 }, { "epoch": 0.34257634257634256, "grad_norm": 0.01665443927049637, "learning_rate": 4.480699703337852e-05, "logits/chosen": -10.069182395935059, "logits/rejected": -10.054916381835938, "logps/chosen": -3.34228253364563, "logps/rejected": -107.24215698242188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.89780330657959, "rewards/margins": 10.349628448486328, "rewards/rejected": -7.451825141906738, "step": 488 }, { "epoch": 0.3432783432783433, "grad_norm": 0.007334902882575989, "learning_rate": 4.4821814318727016e-05, "logits/chosen": -9.848747253417969, "logits/rejected": -9.849482536315918, "logps/chosen": -1.8042782545089722, "logps/rejected": -107.47917175292969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8951096534729004, "rewards/margins": 10.562529563903809, "rewards/rejected": -7.66741943359375, "step": 489 }, { "epoch": 0.343980343980344, "grad_norm": 0.0040780166164040565, "learning_rate": 4.483660133380856e-05, "logits/chosen": -9.23158073425293, "logits/rejected": -9.219949722290039, "logps/chosen": -13.88882064819336, "logps/rejected": -100.83013916015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8723931312561035, "rewards/margins": 8.63833236694336, "rewards/rejected": -6.765938758850098, "step": 490 }, { "epoch": 0.34468234468234465, "grad_norm": 0.02832343615591526, "learning_rate": 4.485135820204948e-05, "logits/chosen": -8.450630187988281, "logits/rejected": -8.44382095336914, "logps/chosen": -6.975223064422607, "logps/rejected": -105.8624496459961, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.57905912399292, "rewards/margins": 9.848142623901367, "rewards/rejected": -7.269083499908447, "step": 491 }, { "epoch": 0.34538434538434537, "grad_norm": 0.005573755595833063, "learning_rate": 4.486608504612267e-05, "logits/chosen": -8.38563346862793, "logits/rejected": -8.368837356567383, "logps/chosen": -6.303404808044434, "logps/rejected": -104.76734924316406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.624685049057007, "rewards/margins": 9.90676498413086, "rewards/rejected": -7.282079696655273, "step": 492 }, { "epoch": 0.3460863460863461, "grad_norm": 0.0207048486918211, "learning_rate": 4.488078198795383e-05, "logits/chosen": -9.027321815490723, "logits/rejected": -9.051240921020508, "logps/chosen": -1.9885761737823486, "logps/rejected": -106.60569763183594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9844040870666504, "rewards/margins": 10.46219253540039, "rewards/rejected": -7.47778844833374, "step": 493 }, { "epoch": 0.3467883467883468, "grad_norm": 0.13930493593215942, "learning_rate": 4.489544914872745e-05, "logits/chosen": -9.469765663146973, "logits/rejected": -9.479732513427734, "logps/chosen": -5.804605960845947, "logps/rejected": -106.32203674316406, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 2.755160331726074, "rewards/margins": 10.160484313964844, "rewards/rejected": -7.4053239822387695, "step": 494 }, { "epoch": 0.3474903474903475, "grad_norm": 0.0020910336170345545, "learning_rate": 4.4910086648892815e-05, "logits/chosen": -8.945512771606445, "logits/rejected": -8.925504684448242, "logps/chosen": -2.593893527984619, "logps/rejected": -107.7900390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8668737411499023, "rewards/margins": 10.418071746826172, "rewards/rejected": -7.551197528839111, "step": 495 }, { "epoch": 0.3481923481923482, "grad_norm": 0.007017074152827263, "learning_rate": 4.4924694608169965e-05, "logits/chosen": -8.510411262512207, "logits/rejected": -8.516077995300293, "logps/chosen": -8.197874069213867, "logps/rejected": -103.03970336914062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.5550620555877686, "rewards/margins": 9.503901481628418, "rewards/rejected": -6.94883918762207, "step": 496 }, { "epoch": 0.3488943488943489, "grad_norm": 0.0018913110252469778, "learning_rate": 4.4939273145555536e-05, "logits/chosen": -8.964625358581543, "logits/rejected": -8.992349624633789, "logps/chosen": -1.2271552085876465, "logps/rejected": -107.77609252929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.976773738861084, "rewards/margins": 10.561779022216797, "rewards/rejected": -7.585004806518555, "step": 497 }, { "epoch": 0.3495963495963496, "grad_norm": 0.0041789342649281025, "learning_rate": 4.495382237932863e-05, "logits/chosen": -9.398414611816406, "logits/rejected": -9.388962745666504, "logps/chosen": -5.616103649139404, "logps/rejected": -106.009521484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.696207046508789, "rewards/margins": 10.150822639465332, "rewards/rejected": -7.454615592956543, "step": 498 }, { "epoch": 0.3502983502983503, "grad_norm": 0.0017071334877982736, "learning_rate": 4.4968342427056505e-05, "logits/chosen": -9.377155303955078, "logits/rejected": -9.375895500183105, "logps/chosen": -1.171778678894043, "logps/rejected": -107.05342102050781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0208888053894043, "rewards/margins": 10.579320907592773, "rewards/rejected": -7.558431625366211, "step": 499 }, { "epoch": 0.351000351000351, "grad_norm": 0.0021445192396640778, "learning_rate": 4.498283340560031e-05, "logits/chosen": -9.537704467773438, "logits/rejected": -9.545515060424805, "logps/chosen": -1.8739668130874634, "logps/rejected": -106.87510681152344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.894268035888672, "rewards/margins": 10.536991119384766, "rewards/rejected": -7.642723083496094, "step": 500 }, { "epoch": 0.3517023517023517, "grad_norm": 0.005182094406336546, "learning_rate": 4.499729543112076e-05, "logits/chosen": -9.945930480957031, "logits/rejected": -9.949920654296875, "logps/chosen": -1.367124319076538, "logps/rejected": -107.73649597167969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9766147136688232, "rewards/margins": 10.663524627685547, "rewards/rejected": -7.6869096755981445, "step": 501 }, { "epoch": 0.3524043524043524, "grad_norm": 0.0031835834961384535, "learning_rate": 4.501172861908366e-05, "logits/chosen": -9.988163948059082, "logits/rejected": -9.990220069885254, "logps/chosen": -12.136592864990234, "logps/rejected": -100.35289764404297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1707372665405273, "rewards/margins": 8.892416954040527, "rewards/rejected": -6.7216796875, "step": 502 }, { "epoch": 0.35310635310635313, "grad_norm": 0.0026465882547199726, "learning_rate": 4.502613308426546e-05, "logits/chosen": -9.719749450683594, "logits/rejected": -9.724935531616211, "logps/chosen": -4.140077590942383, "logps/rejected": -104.721923828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6810312271118164, "rewards/margins": 10.03913688659668, "rewards/rejected": -7.35810661315918, "step": 503 }, { "epoch": 0.3538083538083538, "grad_norm": 0.002345727989450097, "learning_rate": 4.504050894075876e-05, "logits/chosen": -9.746755599975586, "logits/rejected": -9.734722137451172, "logps/chosen": -4.06167459487915, "logps/rejected": -104.94972229003906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8381357192993164, "rewards/margins": 10.192806243896484, "rewards/rejected": -7.354670524597168, "step": 504 }, { "epoch": 0.3545103545103545, "grad_norm": 0.0013900698395445943, "learning_rate": 4.5054856301977696e-05, "logits/chosen": -9.237285614013672, "logits/rejected": -9.219219207763672, "logps/chosen": -1.0296497344970703, "logps/rejected": -108.21893310546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0266120433807373, "rewards/margins": 10.637334823608398, "rewards/rejected": -7.610722541809082, "step": 505 }, { "epoch": 0.3552123552123552, "grad_norm": 0.001955584390088916, "learning_rate": 4.506917528066332e-05, "logits/chosen": -8.605432510375977, "logits/rejected": -8.634790420532227, "logps/chosen": -4.572240829467773, "logps/rejected": -104.6720199584961, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7826390266418457, "rewards/margins": 10.021307945251465, "rewards/rejected": -7.238668918609619, "step": 506 }, { "epoch": 0.35591435591435594, "grad_norm": 0.0016919844783842564, "learning_rate": 4.508346598888894e-05, "logits/chosen": -9.233402252197266, "logits/rejected": -9.239765167236328, "logps/chosen": -9.189287185668945, "logps/rejected": -103.86378479003906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.262518882751465, "rewards/margins": 9.451221466064453, "rewards/rejected": -7.188702583312988, "step": 507 }, { "epoch": 0.3566163566163566, "grad_norm": 0.00206765322946012, "learning_rate": 4.509772853806532e-05, "logits/chosen": -9.826201438903809, "logits/rejected": -9.83835506439209, "logps/chosen": -6.583255767822266, "logps/rejected": -106.59456634521484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4850800037384033, "rewards/margins": 9.859356880187988, "rewards/rejected": -7.374277114868164, "step": 508 }, { "epoch": 0.3573183573183573, "grad_norm": 0.0022384761832654476, "learning_rate": 4.511196303894598e-05, "logits/chosen": -8.724101066589355, "logits/rejected": -8.713859558105469, "logps/chosen": -0.6458878517150879, "logps/rejected": -107.32177734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.110473155975342, "rewards/margins": 10.71206283569336, "rewards/rejected": -7.601590156555176, "step": 509 }, { "epoch": 0.35802035802035803, "grad_norm": 0.0030829748138785362, "learning_rate": 4.512616960163227e-05, "logits/chosen": -9.140340805053711, "logits/rejected": -9.135198593139648, "logps/chosen": -1.0555680990219116, "logps/rejected": -106.8279037475586, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.058457136154175, "rewards/margins": 10.603517532348633, "rewards/rejected": -7.545060157775879, "step": 510 }, { "epoch": 0.35872235872235875, "grad_norm": 0.00702682975679636, "learning_rate": 4.5140348335578547e-05, "logits/chosen": -9.717981338500977, "logits/rejected": -9.712404251098633, "logps/chosen": -7.165702819824219, "logps/rejected": -103.78446197509766, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.581285238265991, "rewards/margins": 9.70358657836914, "rewards/rejected": -7.1223015785217285, "step": 511 }, { "epoch": 0.3594243594243594, "grad_norm": 0.003912303131073713, "learning_rate": 4.515449934959718e-05, "logits/chosen": -9.03824234008789, "logits/rejected": -9.043444633483887, "logps/chosen": -1.2725601196289062, "logps/rejected": -107.76139831542969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.090233325958252, "rewards/margins": 10.618816375732422, "rewards/rejected": -7.528583526611328, "step": 512 }, { "epoch": 0.3601263601263601, "grad_norm": 0.015898888930678368, "learning_rate": 4.516862275186361e-05, "logits/chosen": -9.466897964477539, "logits/rejected": -9.472142219543457, "logps/chosen": -7.347879409790039, "logps/rejected": -100.39363098144531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.332132339477539, "rewards/margins": 9.255291938781738, "rewards/rejected": -6.923159599304199, "step": 513 }, { "epoch": 0.36082836082836084, "grad_norm": 0.09853096306324005, "learning_rate": 4.518271864992127e-05, "logits/chosen": -9.483631134033203, "logits/rejected": -9.489583969116211, "logps/chosen": -9.495861053466797, "logps/rejected": -104.68412780761719, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.173600673675537, "rewards/margins": 9.432219505310059, "rewards/rejected": -7.258618354797363, "step": 514 }, { "epoch": 0.36153036153036155, "grad_norm": 0.0020290501415729523, "learning_rate": 4.519678715068652e-05, "logits/chosen": -10.16245174407959, "logits/rejected": -10.166131973266602, "logps/chosen": -7.159505844116211, "logps/rejected": -103.68550109863281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.432786226272583, "rewards/margins": 9.57815933227539, "rewards/rejected": -7.1453728675842285, "step": 515 }, { "epoch": 0.3622323622323622, "grad_norm": 0.0025947345420718193, "learning_rate": 4.521082836045353e-05, "logits/chosen": -8.619491577148438, "logits/rejected": -8.60577392578125, "logps/chosen": -5.085075378417969, "logps/rejected": -105.12588500976562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6705374717712402, "rewards/margins": 9.957820892333984, "rewards/rejected": -7.287282943725586, "step": 516 }, { "epoch": 0.36293436293436293, "grad_norm": 0.001725982641801238, "learning_rate": 4.5224842384899045e-05, "logits/chosen": -9.484079360961914, "logits/rejected": -9.500480651855469, "logps/chosen": -0.6814526319503784, "logps/rejected": -107.1943359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.181067705154419, "rewards/margins": 10.73162841796875, "rewards/rejected": -7.55056095123291, "step": 517 }, { "epoch": 0.36363636363636365, "grad_norm": 0.005188254173845053, "learning_rate": 4.523882932908722e-05, "logits/chosen": -8.912826538085938, "logits/rejected": -8.90532112121582, "logps/chosen": -5.387393951416016, "logps/rejected": -107.47100067138672, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.666182518005371, "rewards/margins": 10.110494613647461, "rewards/rejected": -7.444311618804932, "step": 518 }, { "epoch": 0.36433836433836436, "grad_norm": 0.004144442733377218, "learning_rate": 4.52527892974743e-05, "logits/chosen": -9.123167037963867, "logits/rejected": -9.120609283447266, "logps/chosen": -10.568292617797852, "logps/rejected": -102.86573791503906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2353157997131348, "rewards/margins": 9.258129119873047, "rewards/rejected": -7.022813320159912, "step": 519 }, { "epoch": 0.365040365040365, "grad_norm": 0.002470881910994649, "learning_rate": 4.526672239391333e-05, "logits/chosen": -9.41067123413086, "logits/rejected": -9.41063117980957, "logps/chosen": -0.517834484577179, "logps/rejected": -108.12416076660156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.199197769165039, "rewards/margins": 10.727334976196289, "rewards/rejected": -7.52813720703125, "step": 520 }, { "epoch": 0.36574236574236574, "grad_norm": 0.002389585366472602, "learning_rate": 4.528062872165875e-05, "logits/chosen": -9.423465728759766, "logits/rejected": -9.408394813537598, "logps/chosen": -4.9828057289123535, "logps/rejected": -105.16667175292969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7786197662353516, "rewards/margins": 10.145219802856445, "rewards/rejected": -7.366600036621094, "step": 521 }, { "epoch": 0.36644436644436645, "grad_norm": 0.005451335571706295, "learning_rate": 4.529450838337104e-05, "logits/chosen": -9.350272178649902, "logits/rejected": -9.350996017456055, "logps/chosen": -5.856687545776367, "logps/rejected": -106.34733581542969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7379465103149414, "rewards/margins": 10.093738555908203, "rewards/rejected": -7.355792045593262, "step": 522 }, { "epoch": 0.36714636714636717, "grad_norm": 0.0020313782151788473, "learning_rate": 4.530836148112124e-05, "logits/chosen": -9.252174377441406, "logits/rejected": -9.240900039672852, "logps/chosen": -3.6234819889068604, "logps/rejected": -105.86833190917969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.817425012588501, "rewards/margins": 10.185230255126953, "rewards/rejected": -7.367805480957031, "step": 523 }, { "epoch": 0.36784836784836783, "grad_norm": 0.005320240743458271, "learning_rate": 4.532218811639545e-05, "logits/chosen": -9.62649917602539, "logits/rejected": -9.649155616760254, "logps/chosen": -0.8575372695922852, "logps/rejected": -107.99919891357422, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0724258422851562, "rewards/margins": 10.525837898254395, "rewards/rejected": -7.4534125328063965, "step": 524 }, { "epoch": 0.36855036855036855, "grad_norm": 0.002152619417756796, "learning_rate": 4.5335988390099284e-05, "logits/chosen": -8.97576904296875, "logits/rejected": -8.953739166259766, "logps/chosen": -4.839704513549805, "logps/rejected": -106.65852355957031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8264191150665283, "rewards/margins": 10.200637817382812, "rewards/rejected": -7.374218940734863, "step": 525 }, { "epoch": 0.36925236925236926, "grad_norm": 0.000961911806371063, "learning_rate": 4.534976240256232e-05, "logits/chosen": -8.271437644958496, "logits/rejected": -8.296140670776367, "logps/chosen": -1.208186149597168, "logps/rejected": -106.34878540039062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0774412155151367, "rewards/margins": 10.458161354064941, "rewards/rejected": -7.380720138549805, "step": 526 }, { "epoch": 0.36995436995437, "grad_norm": 0.005005581304430962, "learning_rate": 4.536351025354245e-05, "logits/chosen": -9.591944694519043, "logits/rejected": -9.589746475219727, "logps/chosen": -1.0875871181488037, "logps/rejected": -108.26118469238281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9751322269439697, "rewards/margins": 10.710546493530273, "rewards/rejected": -7.735414028167725, "step": 527 }, { "epoch": 0.37065637065637064, "grad_norm": 0.0042818887159228325, "learning_rate": 4.537723204223021e-05, "logits/chosen": -9.08741569519043, "logits/rejected": -9.073931694030762, "logps/chosen": -4.66508674621582, "logps/rejected": -105.05795288085938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7004129886627197, "rewards/margins": 9.962942123413086, "rewards/rejected": -7.262529373168945, "step": 528 }, { "epoch": 0.37135837135837135, "grad_norm": 0.03709322214126587, "learning_rate": 4.53909278672531e-05, "logits/chosen": -8.584692001342773, "logits/rejected": -8.604194641113281, "logps/chosen": -0.6376703977584839, "logps/rejected": -108.24179077148438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.105680465698242, "rewards/margins": 10.794879913330078, "rewards/rejected": -7.689199447631836, "step": 529 }, { "epoch": 0.37206037206037207, "grad_norm": 0.0019659758545458317, "learning_rate": 4.5404597826679824e-05, "logits/chosen": -8.609170913696289, "logits/rejected": -8.609807968139648, "logps/chosen": -8.013566970825195, "logps/rejected": -103.7770767211914, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.529387950897217, "rewards/margins": 9.626293182373047, "rewards/rejected": -7.096905708312988, "step": 530 }, { "epoch": 0.3727623727623728, "grad_norm": 0.005019034259021282, "learning_rate": 4.541824201802449e-05, "logits/chosen": -8.992721557617188, "logits/rejected": -8.993072509765625, "logps/chosen": -3.6353964805603027, "logps/rejected": -105.67267608642578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8654184341430664, "rewards/margins": 10.16858959197998, "rewards/rejected": -7.303170680999756, "step": 531 }, { "epoch": 0.37346437346437344, "grad_norm": 0.007984096184372902, "learning_rate": 4.543186053825081e-05, "logits/chosen": -9.893957138061523, "logits/rejected": -9.896133422851562, "logps/chosen": -3.829714298248291, "logps/rejected": -106.64144134521484, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6501624584198, "rewards/margins": 10.203376770019531, "rewards/rejected": -7.553214073181152, "step": 532 }, { "epoch": 0.37416637416637416, "grad_norm": 0.9064089059829712, "learning_rate": 4.544545348377621e-05, "logits/chosen": -8.63446044921875, "logits/rejected": -8.624732971191406, "logps/chosen": -3.5007941722869873, "logps/rejected": -104.15332794189453, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.879364490509033, "rewards/margins": 10.114240646362305, "rewards/rejected": -7.23487663269043, "step": 533 }, { "epoch": 0.3748683748683749, "grad_norm": 0.0023448967840522528, "learning_rate": 4.5459020950475946e-05, "logits/chosen": -8.820938110351562, "logits/rejected": -8.805131912231445, "logps/chosen": -0.5324887633323669, "logps/rejected": -108.4061279296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.2898690700531006, "rewards/margins": 10.900247573852539, "rewards/rejected": -7.610378265380859, "step": 534 }, { "epoch": 0.3755703755703756, "grad_norm": 0.0011524524306878448, "learning_rate": 4.5472563033687145e-05, "logits/chosen": -9.335243225097656, "logits/rejected": -9.330076217651367, "logps/chosen": -0.4628337025642395, "logps/rejected": -107.06253051757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.2109973430633545, "rewards/margins": 10.662665367126465, "rewards/rejected": -7.451667785644531, "step": 535 }, { "epoch": 0.37627237627237625, "grad_norm": 0.0075366017408668995, "learning_rate": 4.548607982821284e-05, "logits/chosen": -8.862451553344727, "logits/rejected": -8.856921195983887, "logps/chosen": -3.9702210426330566, "logps/rejected": -105.21414184570312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.789499044418335, "rewards/margins": 10.129661560058594, "rewards/rejected": -7.340163230895996, "step": 536 }, { "epoch": 0.37697437697437697, "grad_norm": 0.001983840251341462, "learning_rate": 4.5499571428325935e-05, "logits/chosen": -8.99791145324707, "logits/rejected": -8.982507705688477, "logps/chosen": -0.35623466968536377, "logps/rejected": -108.90347290039062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.166356086730957, "rewards/margins": 10.85287857055664, "rewards/rejected": -7.686522483825684, "step": 537 }, { "epoch": 0.3776763776763777, "grad_norm": 0.003222499741241336, "learning_rate": 4.5513037927773155e-05, "logits/chosen": -8.711878776550293, "logits/rejected": -8.71352767944336, "logps/chosen": -4.111454010009766, "logps/rejected": -104.40894317626953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8619136810302734, "rewards/margins": 10.062344551086426, "rewards/rejected": -7.200430870056152, "step": 538 }, { "epoch": 0.3783783783783784, "grad_norm": 0.003504195949062705, "learning_rate": 4.5526479419778986e-05, "logits/chosen": -8.314419746398926, "logits/rejected": -8.310098648071289, "logps/chosen": -7.357254505157471, "logps/rejected": -103.05557250976562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4464006423950195, "rewards/margins": 9.523239135742188, "rewards/rejected": -7.076838493347168, "step": 539 }, { "epoch": 0.37908037908037906, "grad_norm": 0.0010640477994456887, "learning_rate": 4.553989599704948e-05, "logits/chosen": -8.887497901916504, "logits/rejected": -8.876548767089844, "logps/chosen": -0.5595270991325378, "logps/rejected": -108.86427307128906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.958388328552246, "rewards/margins": 10.747434616088867, "rewards/rejected": -7.789046764373779, "step": 540 }, { "epoch": 0.3797823797823798, "grad_norm": 0.0014699158491566777, "learning_rate": 4.555328775177616e-05, "logits/chosen": -9.124560356140137, "logits/rejected": -9.128927230834961, "logps/chosen": -0.42220717668533325, "logps/rejected": -109.025634765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1247363090515137, "rewards/margins": 10.862432479858398, "rewards/rejected": -7.737696170806885, "step": 541 }, { "epoch": 0.3804843804843805, "grad_norm": 0.001146857626736164, "learning_rate": 4.5566654775639785e-05, "logits/chosen": -8.742437362670898, "logits/rejected": -8.74575424194336, "logps/chosen": -4.742001533508301, "logps/rejected": -107.92111206054688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7226784229278564, "rewards/margins": 10.25217056274414, "rewards/rejected": -7.529491901397705, "step": 542 }, { "epoch": 0.3811863811863812, "grad_norm": 0.004715400747954845, "learning_rate": 4.5579997159814117e-05, "logits/chosen": -8.627483367919922, "logits/rejected": -8.600030899047852, "logps/chosen": -0.39315661787986755, "logps/rejected": -108.62820434570312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.089679718017578, "rewards/margins": 10.79023551940918, "rewards/rejected": -7.700556755065918, "step": 543 }, { "epoch": 0.38188838188838187, "grad_norm": 0.001307474565692246, "learning_rate": 4.5593314994969665e-05, "logits/chosen": -8.939509391784668, "logits/rejected": -8.924827575683594, "logps/chosen": -1.1387321949005127, "logps/rejected": -106.1434097290039, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.054776191711426, "rewards/margins": 10.487232208251953, "rewards/rejected": -7.432455062866211, "step": 544 }, { "epoch": 0.3825903825903826, "grad_norm": 0.0010821446776390076, "learning_rate": 4.560660837127738e-05, "logits/chosen": -9.355355262756348, "logits/rejected": -9.35153579711914, "logps/chosen": -7.090184211730957, "logps/rejected": -102.87977600097656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6059885025024414, "rewards/margins": 9.501943588256836, "rewards/rejected": -6.895954132080078, "step": 545 }, { "epoch": 0.3832923832923833, "grad_norm": 0.004283946473151445, "learning_rate": 4.561987737841229e-05, "logits/chosen": -9.234613418579102, "logits/rejected": -9.230507850646973, "logps/chosen": -1.0271888971328735, "logps/rejected": -107.85984802246094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0003018379211426, "rewards/margins": 10.634346008300781, "rewards/rejected": -7.634043216705322, "step": 546 }, { "epoch": 0.383994383994384, "grad_norm": 0.42665478587150574, "learning_rate": 4.563312210555719e-05, "logits/chosen": -9.728407859802246, "logits/rejected": -9.711305618286133, "logps/chosen": -8.165038108825684, "logps/rejected": -105.51144409179688, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 2.436627149581909, "rewards/margins": 9.77617359161377, "rewards/rejected": -7.339546203613281, "step": 547 }, { "epoch": 0.3846963846963847, "grad_norm": 0.034018345177173615, "learning_rate": 4.564634264140616e-05, "logits/chosen": -8.497684478759766, "logits/rejected": -8.52045726776123, "logps/chosen": -9.793234825134277, "logps/rejected": -102.14429473876953, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3219635486602783, "rewards/margins": 9.23347282409668, "rewards/rejected": -6.911509037017822, "step": 548 }, { "epoch": 0.3853983853983854, "grad_norm": 0.031340938061475754, "learning_rate": 4.56595390741682e-05, "logits/chosen": -8.336894035339355, "logits/rejected": -8.324618339538574, "logps/chosen": -6.786680698394775, "logps/rejected": -102.94927978515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.604433536529541, "rewards/margins": 9.622053146362305, "rewards/rejected": -7.0176191329956055, "step": 549 }, { "epoch": 0.3861003861003861, "grad_norm": 0.0013554951874539256, "learning_rate": 4.567271149157073e-05, "logits/chosen": -9.36636734008789, "logits/rejected": -9.357521057128906, "logps/chosen": -8.765366554260254, "logps/rejected": -105.54328918457031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.44626522064209, "rewards/margins": 9.606106758117676, "rewards/rejected": -7.159841537475586, "step": 550 }, { "epoch": 0.3868023868023868, "grad_norm": 0.0018148582894355059, "learning_rate": 4.5685859980863086e-05, "logits/chosen": -9.101829528808594, "logits/rejected": -9.102739334106445, "logps/chosen": -0.36864280700683594, "logps/rejected": -108.76660919189453, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1475064754486084, "rewards/margins": 10.924973487854004, "rewards/rejected": -7.777466773986816, "step": 551 }, { "epoch": 0.3875043875043875, "grad_norm": 0.0024502770975232124, "learning_rate": 4.569898462881999e-05, "logits/chosen": -9.029497146606445, "logits/rejected": -9.025859832763672, "logps/chosen": -0.5689732432365417, "logps/rejected": -108.08270263671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1134445667266846, "rewards/margins": 10.599691390991211, "rewards/rejected": -7.486246585845947, "step": 552 }, { "epoch": 0.3882063882063882, "grad_norm": 0.001308985287323594, "learning_rate": 4.571208552174497e-05, "logits/chosen": -8.861003875732422, "logits/rejected": -8.851272583007812, "logps/chosen": -3.9969117641448975, "logps/rejected": -106.55892181396484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8872339725494385, "rewards/margins": 10.226905822753906, "rewards/rejected": -7.3396711349487305, "step": 553 }, { "epoch": 0.3889083889083889, "grad_norm": 0.0015201395144686103, "learning_rate": 4.572516274547383e-05, "logits/chosen": -8.488177299499512, "logits/rejected": -8.482549667358398, "logps/chosen": -6.8537278175354, "logps/rejected": -103.65739440917969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6532883644104004, "rewards/margins": 9.75820541381836, "rewards/rejected": -7.104917526245117, "step": 554 }, { "epoch": 0.38961038961038963, "grad_norm": 0.0022309746127575636, "learning_rate": 4.573821638537794e-05, "logits/chosen": -8.75467586517334, "logits/rejected": -8.7571439743042, "logps/chosen": -0.2747952342033386, "logps/rejected": -109.32337951660156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.2098751068115234, "rewards/margins": 10.84086799621582, "rewards/rejected": -7.630992889404297, "step": 555 }, { "epoch": 0.3903123903123903, "grad_norm": 0.5816552639007568, "learning_rate": 4.575124652636763e-05, "logits/chosen": -8.87002182006836, "logits/rejected": -8.877584457397461, "logps/chosen": -6.139520645141602, "logps/rejected": -106.77107238769531, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.5746326446533203, "rewards/margins": 9.963581085205078, "rewards/rejected": -7.388948440551758, "step": 556 }, { "epoch": 0.391014391014391, "grad_norm": 0.0012250096770003438, "learning_rate": 4.5764253252895486e-05, "logits/chosen": -8.685739517211914, "logits/rejected": -8.679414749145508, "logps/chosen": -0.44397640228271484, "logps/rejected": -108.72040557861328, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.187035322189331, "rewards/margins": 10.867752075195312, "rewards/rejected": -7.680717468261719, "step": 557 }, { "epoch": 0.3917163917163917, "grad_norm": 0.0022035320289433002, "learning_rate": 4.577723664895965e-05, "logits/chosen": -9.443397521972656, "logits/rejected": -9.435532569885254, "logps/chosen": -6.673610210418701, "logps/rejected": -102.71745300292969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6148505210876465, "rewards/margins": 9.57620620727539, "rewards/rejected": -6.961355209350586, "step": 558 }, { "epoch": 0.39241839241839244, "grad_norm": 0.006401616148650646, "learning_rate": 4.579019679810706e-05, "logits/chosen": -8.802078247070312, "logits/rejected": -8.782264709472656, "logps/chosen": -0.30669593811035156, "logps/rejected": -108.96479797363281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1651294231414795, "rewards/margins": 10.81229305267334, "rewards/rejected": -7.647163391113281, "step": 559 }, { "epoch": 0.3931203931203931, "grad_norm": 0.0024000804405659437, "learning_rate": 4.5803133783436676e-05, "logits/chosen": -9.111019134521484, "logits/rejected": -9.107678413391113, "logps/chosen": -3.8050405979156494, "logps/rejected": -105.31234741210938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8019027709960938, "rewards/margins": 10.049945831298828, "rewards/rejected": -7.248042106628418, "step": 560 }, { "epoch": 0.3938223938223938, "grad_norm": 0.005485559348016977, "learning_rate": 4.581604768760269e-05, "logits/chosen": -10.141094207763672, "logits/rejected": -10.119389533996582, "logps/chosen": -1.2157633304595947, "logps/rejected": -108.99608612060547, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9582691192626953, "rewards/margins": 10.718435287475586, "rewards/rejected": -7.760166168212891, "step": 561 }, { "epoch": 0.39452439452439453, "grad_norm": 0.006114716175943613, "learning_rate": 4.582893859281769e-05, "logits/chosen": -8.722509384155273, "logits/rejected": -8.701807022094727, "logps/chosen": -8.81248664855957, "logps/rejected": -105.73091888427734, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.528571605682373, "rewards/margins": 9.678094863891602, "rewards/rejected": -7.14952278137207, "step": 562 }, { "epoch": 0.39522639522639524, "grad_norm": 0.027982445433735847, "learning_rate": 4.584180658085578e-05, "logits/chosen": -8.039392471313477, "logits/rejected": -8.0479736328125, "logps/chosen": -3.683530330657959, "logps/rejected": -103.60577392578125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.901210308074951, "rewards/margins": 9.957549095153809, "rewards/rejected": -7.056339263916016, "step": 563 }, { "epoch": 0.3959283959283959, "grad_norm": 0.0014069437747821212, "learning_rate": 4.585465173305571e-05, "logits/chosen": -9.432156562805176, "logits/rejected": -9.436823844909668, "logps/chosen": -4.132700443267822, "logps/rejected": -105.58613586425781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7425661087036133, "rewards/margins": 10.085938453674316, "rewards/rejected": -7.343372344970703, "step": 564 }, { "epoch": 0.3966303966303966, "grad_norm": 0.0021559607703238726, "learning_rate": 4.5867474130323984e-05, "logits/chosen": -8.797337532043457, "logits/rejected": -8.809667587280273, "logps/chosen": -6.988340854644775, "logps/rejected": -103.20601654052734, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6241750717163086, "rewards/margins": 9.628155708312988, "rewards/rejected": -7.00398063659668, "step": 565 }, { "epoch": 0.39733239733239734, "grad_norm": 0.0017381361685693264, "learning_rate": 4.588027385313786e-05, "logits/chosen": -9.11514663696289, "logits/rejected": -9.097770690917969, "logps/chosen": -3.7055749893188477, "logps/rejected": -106.81090545654297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8364953994750977, "rewards/margins": 10.32174301147461, "rewards/rejected": -7.4852471351623535, "step": 566 }, { "epoch": 0.39803439803439805, "grad_norm": 0.0036528995260596275, "learning_rate": 4.5893050981548446e-05, "logits/chosen": -9.094096183776855, "logits/rejected": -9.096189498901367, "logps/chosen": -0.5549576282501221, "logps/rejected": -109.52776336669922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1304452419281006, "rewards/margins": 10.928892135620117, "rewards/rejected": -7.7984466552734375, "step": 567 }, { "epoch": 0.3987363987363987, "grad_norm": 0.003102668793871999, "learning_rate": 4.5905805595183656e-05, "logits/chosen": -8.70571231842041, "logits/rejected": -8.735719680786133, "logps/chosen": -4.3330864906311035, "logps/rejected": -106.72956848144531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.708397388458252, "rewards/margins": 10.188492774963379, "rewards/rejected": -7.480095386505127, "step": 568 }, { "epoch": 0.3994383994383994, "grad_norm": 0.004725241102278233, "learning_rate": 4.591853777325119e-05, "logits/chosen": -8.946680068969727, "logits/rejected": -8.963560104370117, "logps/chosen": -4.721794128417969, "logps/rejected": -107.76620483398438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.8158841133117676, "rewards/margins": 10.309541702270508, "rewards/rejected": -7.493656635284424, "step": 569 }, { "epoch": 0.40014040014040014, "grad_norm": 0.0026139654219150543, "learning_rate": 4.593124759454153e-05, "logits/chosen": -8.991992950439453, "logits/rejected": -8.992889404296875, "logps/chosen": -4.129839897155762, "logps/rejected": -107.43036651611328, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6876378059387207, "rewards/margins": 10.268479347229004, "rewards/rejected": -7.580841064453125, "step": 570 }, { "epoch": 0.40084240084240086, "grad_norm": 0.03787726163864136, "learning_rate": 4.5943935137430806e-05, "logits/chosen": -9.052194595336914, "logits/rejected": -9.04607105255127, "logps/chosen": -0.29293012619018555, "logps/rejected": -109.23277282714844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.1129231452941895, "rewards/margins": 10.830791473388672, "rewards/rejected": -7.717867851257324, "step": 571 }, { "epoch": 0.4015444015444015, "grad_norm": 0.002234378596767783, "learning_rate": 4.595660047988374e-05, "logits/chosen": -8.145594596862793, "logits/rejected": -8.138455390930176, "logps/chosen": -3.547250747680664, "logps/rejected": -106.83649444580078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.723224401473999, "rewards/margins": 10.267186164855957, "rewards/rejected": -7.543961524963379, "step": 572 }, { "epoch": 0.40224640224640223, "grad_norm": 0.002455296693369746, "learning_rate": 4.59692436994565e-05, "logits/chosen": -9.116910934448242, "logits/rejected": -9.088706016540527, "logps/chosen": -0.4678025245666504, "logps/rejected": -109.64227294921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.1325783729553223, "rewards/margins": 10.893022537231445, "rewards/rejected": -7.760445594787598, "step": 573 }, { "epoch": 0.40294840294840295, "grad_norm": 0.009982574731111526, "learning_rate": 4.5981864873299563e-05, "logits/chosen": -9.222670555114746, "logits/rejected": -9.215982437133789, "logps/chosen": -3.6025032997131348, "logps/rejected": -107.43853759765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8110244274139404, "rewards/margins": 10.38371467590332, "rewards/rejected": -7.572690486907959, "step": 574 }, { "epoch": 0.40365040365040367, "grad_norm": 0.006703351624310017, "learning_rate": 4.599446407816052e-05, "logits/chosen": -10.22374439239502, "logits/rejected": -10.220067977905273, "logps/chosen": -14.586004257202148, "logps/rejected": -99.20763397216797, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.7880730628967285, "rewards/margins": 8.517242431640625, "rewards/rejected": -6.7291693687438965, "step": 575 }, { "epoch": 0.4043524043524043, "grad_norm": 0.00894266925752163, "learning_rate": 4.6007041390386874e-05, "logits/chosen": -8.288748741149902, "logits/rejected": -8.279921531677246, "logps/chosen": -5.088599681854248, "logps/rejected": -108.6777572631836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.775386095046997, "rewards/margins": 10.483353614807129, "rewards/rejected": -7.707967281341553, "step": 576 }, { "epoch": 0.40505440505440504, "grad_norm": 0.0035168854519724846, "learning_rate": 4.601959688592886e-05, "logits/chosen": -9.157872200012207, "logits/rejected": -9.153118133544922, "logps/chosen": -8.464950561523438, "logps/rejected": -103.61553955078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3578364849090576, "rewards/margins": 9.524032592773438, "rewards/rejected": -7.166195869445801, "step": 577 }, { "epoch": 0.40575640575640576, "grad_norm": 0.6919627785682678, "learning_rate": 4.603213064034216e-05, "logits/chosen": -8.60385799407959, "logits/rejected": -8.597031593322754, "logps/chosen": -0.33327239751815796, "logps/rejected": -108.32235717773438, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 3.081843376159668, "rewards/margins": 10.751598358154297, "rewards/rejected": -7.669755458831787, "step": 578 }, { "epoch": 0.4064584064584065, "grad_norm": 0.2690199017524719, "learning_rate": 4.604464272879061e-05, "logits/chosen": -8.86178207397461, "logits/rejected": -8.841301918029785, "logps/chosen": -7.055635452270508, "logps/rejected": -106.87814331054688, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.5782558917999268, "rewards/margins": 9.8028564453125, "rewards/rejected": -7.224600315093994, "step": 579 }, { "epoch": 0.40716040716040713, "grad_norm": 0.0022856525611132383, "learning_rate": 4.605713322604896e-05, "logits/chosen": -9.08082389831543, "logits/rejected": -9.085807800292969, "logps/chosen": -3.9702882766723633, "logps/rejected": -107.83747863769531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7603161334991455, "rewards/margins": 10.439268112182617, "rewards/rejected": -7.678952217102051, "step": 580 }, { "epoch": 0.40786240786240785, "grad_norm": 0.0037994764279574156, "learning_rate": 4.606960220650551e-05, "logits/chosen": -9.089805603027344, "logits/rejected": -9.09583568572998, "logps/chosen": -8.001964569091797, "logps/rejected": -104.55294799804688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4672515392303467, "rewards/margins": 9.646228790283203, "rewards/rejected": -7.178977966308594, "step": 581 }, { "epoch": 0.40856440856440857, "grad_norm": 0.015519216656684875, "learning_rate": 4.608204974416481e-05, "logits/chosen": -9.115259170532227, "logits/rejected": -9.119709014892578, "logps/chosen": -8.731271743774414, "logps/rejected": -104.49430084228516, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2683069705963135, "rewards/margins": 9.507906913757324, "rewards/rejected": -7.239599704742432, "step": 582 }, { "epoch": 0.4092664092664093, "grad_norm": 0.18259547650814056, "learning_rate": 4.6094475912650234e-05, "logits/chosen": -8.559019088745117, "logits/rejected": -8.53663444519043, "logps/chosen": -12.067331314086914, "logps/rejected": -103.19454956054688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.8983261585235596, "rewards/margins": 9.175088882446289, "rewards/rejected": -7.276762008666992, "step": 583 }, { "epoch": 0.40996840996840994, "grad_norm": 0.028627723455429077, "learning_rate": 4.610688078520666e-05, "logits/chosen": -8.90488052368164, "logits/rejected": -8.900003433227539, "logps/chosen": -12.514532089233398, "logps/rejected": -103.10539245605469, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.0406370162963867, "rewards/margins": 9.09585189819336, "rewards/rejected": -7.0552144050598145, "step": 584 }, { "epoch": 0.41067041067041066, "grad_norm": 0.03842032328248024, "learning_rate": 4.611926443470301e-05, "logits/chosen": -8.94662094116211, "logits/rejected": -8.946557998657227, "logps/chosen": -20.7670841217041, "logps/rejected": -94.15306091308594, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.1113418340682983, "rewards/margins": 7.258638381958008, "rewards/rejected": -6.147296905517578, "step": 585 }, { "epoch": 0.4113724113724114, "grad_norm": 0.050390999764204025, "learning_rate": 4.6131626933634844e-05, "logits/chosen": -8.583203315734863, "logits/rejected": -8.568052291870117, "logps/chosen": -10.57664966583252, "logps/rejected": -103.62555694580078, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.2398853302001953, "rewards/margins": 9.365667343139648, "rewards/rejected": -7.125782012939453, "step": 586 }, { "epoch": 0.4120744120744121, "grad_norm": 0.04279716685414314, "learning_rate": 4.6143968354126914e-05, "logits/chosen": -9.023279190063477, "logits/rejected": -9.023773193359375, "logps/chosen": -13.661493301391602, "logps/rejected": -102.09019470214844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.954648733139038, "rewards/margins": 8.87915325164795, "rewards/rejected": -6.924503803253174, "step": 587 }, { "epoch": 0.41277641277641275, "grad_norm": 0.1629287451505661, "learning_rate": 4.6156288767935646e-05, "logits/chosen": -9.240253448486328, "logits/rejected": -9.219122886657715, "logps/chosen": -16.455322265625, "logps/rejected": -99.28237915039062, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5113625526428223, "rewards/margins": 8.213860511779785, "rewards/rejected": -6.702497482299805, "step": 588 }, { "epoch": 0.41347841347841346, "grad_norm": 0.2593044340610504, "learning_rate": 4.61685882464517e-05, "logits/chosen": -9.524923324584961, "logits/rejected": -9.526156425476074, "logps/chosen": -18.915300369262695, "logps/rejected": -97.9605712890625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.0394213199615479, "rewards/margins": 7.825238227844238, "rewards/rejected": -6.785816669464111, "step": 589 }, { "epoch": 0.4141804141804142, "grad_norm": 0.03193063288927078, "learning_rate": 4.61808668607024e-05, "logits/chosen": -8.880998611450195, "logits/rejected": -8.894620895385742, "logps/chosen": -16.669940948486328, "logps/rejected": -102.13307189941406, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.4606120586395264, "rewards/margins": 8.482465744018555, "rewards/rejected": -7.021853446960449, "step": 590 }, { "epoch": 0.4148824148824149, "grad_norm": 0.015179728157818317, "learning_rate": 4.619312468135426e-05, "logits/chosen": -8.80389404296875, "logits/rejected": -8.80514144897461, "logps/chosen": -10.745655059814453, "logps/rejected": -106.474365234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2824320793151855, "rewards/margins": 9.601478576660156, "rewards/rejected": -7.319046974182129, "step": 591 }, { "epoch": 0.4155844155844156, "grad_norm": 0.010180286131799221, "learning_rate": 4.620536177871533e-05, "logits/chosen": -8.767969131469727, "logits/rejected": -8.784196853637695, "logps/chosen": -15.676675796508789, "logps/rejected": -104.72099304199219, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.5487146377563477, "rewards/margins": 8.889720916748047, "rewards/rejected": -7.341005802154541, "step": 592 }, { "epoch": 0.41628641628641627, "grad_norm": 0.008405203931033611, "learning_rate": 4.621757822273772e-05, "logits/chosen": -9.83257007598877, "logits/rejected": -9.82681941986084, "logps/chosen": -18.80691146850586, "logps/rejected": -102.27642822265625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.3332935571670532, "rewards/margins": 8.293333053588867, "rewards/rejected": -6.960039138793945, "step": 593 }, { "epoch": 0.416988416988417, "grad_norm": 3.9150025844573975, "learning_rate": 4.62297740830199e-05, "logits/chosen": -9.882944107055664, "logits/rejected": -9.86735725402832, "logps/chosen": -19.1162166595459, "logps/rejected": -104.84910583496094, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 1.1746554374694824, "rewards/margins": 8.432376861572266, "rewards/rejected": -7.257721424102783, "step": 594 }, { "epoch": 0.4176904176904177, "grad_norm": 0.005052560940384865, "learning_rate": 4.6241949428809165e-05, "logits/chosen": -8.268148422241211, "logits/rejected": -8.270316123962402, "logps/chosen": -11.597227096557617, "logps/rejected": -107.26275634765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.068894386291504, "rewards/margins": 9.57072639465332, "rewards/rejected": -7.5018310546875, "step": 595 }, { "epoch": 0.4183924183924184, "grad_norm": 0.003814245108515024, "learning_rate": 4.625410432900395e-05, "logits/chosen": -8.674753189086914, "logits/rejected": -8.66825008392334, "logps/chosen": -18.32672119140625, "logps/rejected": -103.25663757324219, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4258095026016235, "rewards/margins": 8.487272262573242, "rewards/rejected": -7.06146240234375, "step": 596 }, { "epoch": 0.4190944190944191, "grad_norm": 0.004269392695277929, "learning_rate": 4.626623885215616e-05, "logits/chosen": -9.139884948730469, "logits/rejected": -9.126933097839355, "logps/chosen": -19.19308090209961, "logps/rejected": -103.0250244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.3686612844467163, "rewards/margins": 8.3853178024292, "rewards/rejected": -7.016655921936035, "step": 597 }, { "epoch": 0.4197964197964198, "grad_norm": 0.0035933724138885736, "learning_rate": 4.627835306647352e-05, "logits/chosen": -9.134010314941406, "logits/rejected": -9.129789352416992, "logps/chosen": -13.728506088256836, "logps/rejected": -105.81696319580078, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8632196187973022, "rewards/margins": 9.23993968963623, "rewards/rejected": -7.376720905303955, "step": 598 }, { "epoch": 0.4204984204984205, "grad_norm": 0.00639535766094923, "learning_rate": 4.629044703982186e-05, "logits/chosen": -9.031082153320312, "logits/rejected": -9.028400421142578, "logps/chosen": -18.24537467956543, "logps/rejected": -105.47474670410156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.3338596820831299, "rewards/margins": 8.73360824584961, "rewards/rejected": -7.399748802185059, "step": 599 }, { "epoch": 0.4212004212004212, "grad_norm": 0.0037284225691109896, "learning_rate": 4.63025208397274e-05, "logits/chosen": -9.362751960754395, "logits/rejected": -9.348814010620117, "logps/chosen": -16.418628692626953, "logps/rejected": -106.25672912597656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4942569732666016, "rewards/margins": 8.971222877502441, "rewards/rejected": -7.47696590423584, "step": 600 }, { "epoch": 0.4219024219024219, "grad_norm": 0.005691895727068186, "learning_rate": 4.6314574533379e-05, "logits/chosen": -8.976402282714844, "logits/rejected": -8.958719253540039, "logps/chosen": -16.38129997253418, "logps/rejected": -104.5560302734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.5865464210510254, "rewards/margins": 8.771221160888672, "rewards/rejected": -7.184675216674805, "step": 601 }, { "epoch": 0.4226044226044226, "grad_norm": 0.0034169554710388184, "learning_rate": 4.632660818763041e-05, "logits/chosen": -8.938007354736328, "logits/rejected": -8.898028373718262, "logps/chosen": -19.235916137695312, "logps/rejected": -103.99669647216797, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.393949031829834, "rewards/margins": 8.501025199890137, "rewards/rejected": -7.107076644897461, "step": 602 }, { "epoch": 0.4233064233064233, "grad_norm": 0.016814323142170906, "learning_rate": 4.633862186900253e-05, "logits/chosen": -9.058176040649414, "logits/rejected": -9.058356285095215, "logps/chosen": -15.374520301818848, "logps/rejected": -107.60017395019531, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.6776628494262695, "rewards/margins": 9.148454666137695, "rewards/rejected": -7.470792770385742, "step": 603 }, { "epoch": 0.42400842400842403, "grad_norm": 0.02215074934065342, "learning_rate": 4.6350615643685535e-05, "logits/chosen": -8.713054656982422, "logits/rejected": -8.70323371887207, "logps/chosen": -19.071533203125, "logps/rejected": -103.71185302734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.3080081939697266, "rewards/margins": 8.488258361816406, "rewards/rejected": -7.18025016784668, "step": 604 }, { "epoch": 0.4247104247104247, "grad_norm": 0.08530601114034653, "learning_rate": 4.6362589577541154e-05, "logits/chosen": -9.079513549804688, "logits/rejected": -9.10315227508545, "logps/chosen": -15.418963432312012, "logps/rejected": -104.89862823486328, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.6434907913208008, "rewards/margins": 8.96321964263916, "rewards/rejected": -7.319728851318359, "step": 605 }, { "epoch": 0.4254124254124254, "grad_norm": 0.008084154687821865, "learning_rate": 4.637454373610477e-05, "logits/chosen": -8.831938743591309, "logits/rejected": -8.816082954406738, "logps/chosen": -16.708995819091797, "logps/rejected": -103.05284118652344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.4487273693084717, "rewards/margins": 8.629213333129883, "rewards/rejected": -7.180485725402832, "step": 606 }, { "epoch": 0.4261144261144261, "grad_norm": 0.0034802784211933613, "learning_rate": 4.638647818458763e-05, "logits/chosen": -7.20490026473999, "logits/rejected": -7.180264949798584, "logps/chosen": -17.346925735473633, "logps/rejected": -105.62297058105469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.6080198287963867, "rewards/margins": 8.819189071655273, "rewards/rejected": -7.211169242858887, "step": 607 }, { "epoch": 0.42681642681642684, "grad_norm": 0.3928808569908142, "learning_rate": 4.639839298787892e-05, "logits/chosen": -8.695526123046875, "logits/rejected": -8.685112953186035, "logps/chosen": -12.799803733825684, "logps/rejected": -107.4385986328125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.813517689704895, "rewards/margins": 9.461895942687988, "rewards/rejected": -7.648378372192383, "step": 608 }, { "epoch": 0.4275184275184275, "grad_norm": 0.021193578839302063, "learning_rate": 4.641028821054793e-05, "logits/chosen": -7.605978012084961, "logits/rejected": -7.599844932556152, "logps/chosen": -18.353675842285156, "logps/rejected": -103.60383605957031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6065102815628052, "rewards/margins": 8.5457124710083, "rewards/rejected": -6.939201831817627, "step": 609 }, { "epoch": 0.4282204282204282, "grad_norm": 0.003431899007409811, "learning_rate": 4.6422163916846124e-05, "logits/chosen": -9.13631534576416, "logits/rejected": -9.112180709838867, "logps/chosen": -16.62655258178711, "logps/rejected": -102.8824462890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.548832893371582, "rewards/margins": 8.651388168334961, "rewards/rejected": -7.102555274963379, "step": 610 }, { "epoch": 0.42892242892242893, "grad_norm": 0.005399459507316351, "learning_rate": 4.643402017070924e-05, "logits/chosen": -9.337211608886719, "logits/rejected": -9.345316886901855, "logps/chosen": -12.771368980407715, "logps/rejected": -105.88616943359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.7636001110076904, "rewards/margins": 9.212154388427734, "rewards/rejected": -7.448553562164307, "step": 611 }, { "epoch": 0.42962442962442965, "grad_norm": 0.25153377652168274, "learning_rate": 4.644585703575936e-05, "logits/chosen": -9.700007438659668, "logits/rejected": -9.705856323242188, "logps/chosen": -14.86815357208252, "logps/rejected": -105.33036804199219, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.5335431098937988, "rewards/margins": 8.874225616455078, "rewards/rejected": -7.340682029724121, "step": 612 }, { "epoch": 0.4303264303264303, "grad_norm": 0.004082137253135443, "learning_rate": 4.645767457530692e-05, "logits/chosen": -9.060602188110352, "logits/rejected": -9.050373077392578, "logps/chosen": -12.644998550415039, "logps/rejected": -107.06452178955078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.03184175491333, "rewards/margins": 9.48000717163086, "rewards/rejected": -7.448164939880371, "step": 613 }, { "epoch": 0.431028431028431, "grad_norm": 0.004219335969537497, "learning_rate": 4.64694728523528e-05, "logits/chosen": -8.863935470581055, "logits/rejected": -8.857860565185547, "logps/chosen": -15.728355407714844, "logps/rejected": -101.95661926269531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.5529217720031738, "rewards/margins": 8.562548637390137, "rewards/rejected": -7.009627342224121, "step": 614 }, { "epoch": 0.43173043173043174, "grad_norm": 0.01577799767255783, "learning_rate": 4.648125192959028e-05, "logits/chosen": -8.834552764892578, "logits/rejected": -8.831554412841797, "logps/chosen": -13.767776489257812, "logps/rejected": -103.99757385253906, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.7860212326049805, "rewards/margins": 8.97497844696045, "rewards/rejected": -7.188957691192627, "step": 615 }, { "epoch": 0.43243243243243246, "grad_norm": 0.002301097847521305, "learning_rate": 4.649301186940709e-05, "logits/chosen": -8.575980186462402, "logits/rejected": -8.587843894958496, "logps/chosen": -10.649076461791992, "logps/rejected": -106.72821044921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.120393753051758, "rewards/margins": 9.720080375671387, "rewards/rejected": -7.599686622619629, "step": 616 }, { "epoch": 0.4331344331344331, "grad_norm": 0.026333332061767578, "learning_rate": 4.650475273388737e-05, "logits/chosen": -9.259392738342285, "logits/rejected": -9.275139808654785, "logps/chosen": -12.842309951782227, "logps/rejected": -107.34990692138672, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8829271793365479, "rewards/margins": 9.443319320678711, "rewards/rejected": -7.560392379760742, "step": 617 }, { "epoch": 0.43383643383643383, "grad_norm": 0.004920284729450941, "learning_rate": 4.651647458481359e-05, "logits/chosen": -9.449658393859863, "logits/rejected": -9.469889640808105, "logps/chosen": -10.449399948120117, "logps/rejected": -106.87548065185547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0638389587402344, "rewards/margins": 9.664840698242188, "rewards/rejected": -7.601001739501953, "step": 618 }, { "epoch": 0.43453843453843455, "grad_norm": 0.006927684880793095, "learning_rate": 4.652817748366864e-05, "logits/chosen": -8.349481582641602, "logits/rejected": -8.366287231445312, "logps/chosen": -12.489608764648438, "logps/rejected": -108.62134552001953, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8143272399902344, "rewards/margins": 9.51036262512207, "rewards/rejected": -7.6960344314575195, "step": 619 }, { "epoch": 0.43524043524043526, "grad_norm": 0.00365955987945199, "learning_rate": 4.653986149163757e-05, "logits/chosen": -8.40023136138916, "logits/rejected": -8.426277160644531, "logps/chosen": -9.779703140258789, "logps/rejected": -106.60507202148438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1926777362823486, "rewards/margins": 9.756361961364746, "rewards/rejected": -7.563684463500977, "step": 620 }, { "epoch": 0.4359424359424359, "grad_norm": 0.002303390298038721, "learning_rate": 4.655152666960967e-05, "logits/chosen": -8.404130935668945, "logits/rejected": -8.41029167175293, "logps/chosen": -13.116798400878906, "logps/rejected": -105.4293212890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.897993803024292, "rewards/margins": 9.206439018249512, "rewards/rejected": -7.308444976806641, "step": 621 }, { "epoch": 0.43664443664443664, "grad_norm": 0.006935638841241598, "learning_rate": 4.6563173078180315e-05, "logits/chosen": -8.501910209655762, "logits/rejected": -8.504684448242188, "logps/chosen": -14.930350303649902, "logps/rejected": -102.6368408203125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.6698315143585205, "rewards/margins": 8.788480758666992, "rewards/rejected": -7.118649482727051, "step": 622 }, { "epoch": 0.43734643734643736, "grad_norm": 0.020430123433470726, "learning_rate": 4.657480077765283e-05, "logits/chosen": -7.96713399887085, "logits/rejected": -7.976930141448975, "logps/chosen": -13.743274688720703, "logps/rejected": -104.86383056640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8937320709228516, "rewards/margins": 9.092527389526367, "rewards/rejected": -7.198795318603516, "step": 623 }, { "epoch": 0.43804843804843807, "grad_norm": 3.0275301933288574, "learning_rate": 4.6586409828040405e-05, "logits/chosen": -9.373302459716797, "logits/rejected": -9.356645584106445, "logps/chosen": -12.916791915893555, "logps/rejected": -104.08287811279297, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 1.8474695682525635, "rewards/margins": 9.07949447631836, "rewards/rejected": -7.232024669647217, "step": 624 }, { "epoch": 0.43875043875043873, "grad_norm": 0.6789658069610596, "learning_rate": 4.659800028906792e-05, "logits/chosen": -8.672926902770996, "logits/rejected": -8.671446800231934, "logps/chosen": -9.439438819885254, "logps/rejected": -107.67446899414062, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 2.3766918182373047, "rewards/margins": 9.95029067993164, "rewards/rejected": -7.573599338531494, "step": 625 }, { "epoch": 0.43945243945243945, "grad_norm": 0.002006754046306014, "learning_rate": 4.660957222017383e-05, "logits/chosen": -8.458700180053711, "logits/rejected": -8.455964088439941, "logps/chosen": -14.065094947814941, "logps/rejected": -103.99471282958984, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9059545993804932, "rewards/margins": 8.998458862304688, "rewards/rejected": -7.092504024505615, "step": 626 }, { "epoch": 0.44015444015444016, "grad_norm": 0.030590182170271873, "learning_rate": 4.662112568051194e-05, "logits/chosen": -9.184097290039062, "logits/rejected": -9.196257591247559, "logps/chosen": -13.299565315246582, "logps/rejected": -106.76364135742188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.7610876560211182, "rewards/margins": 9.20255184173584, "rewards/rejected": -7.441463470458984, "step": 627 }, { "epoch": 0.4408564408564409, "grad_norm": 0.0014523010468110442, "learning_rate": 4.663266072895327e-05, "logits/chosen": -8.714282989501953, "logits/rejected": -8.698814392089844, "logps/chosen": -5.358798980712891, "logps/rejected": -107.76799011230469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.714617967605591, "rewards/margins": 10.329141616821289, "rewards/rejected": -7.614523410797119, "step": 628 }, { "epoch": 0.44155844155844154, "grad_norm": 0.009632195346057415, "learning_rate": 4.664417742408782e-05, "logits/chosen": -7.913036346435547, "logits/rejected": -7.902412414550781, "logps/chosen": -11.140863418579102, "logps/rejected": -107.55719757080078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2161409854888916, "rewards/margins": 9.558614730834961, "rewards/rejected": -7.342473983764648, "step": 629 }, { "epoch": 0.44226044226044225, "grad_norm": 0.007829696871340275, "learning_rate": 4.665567582422637e-05, "logits/chosen": -9.350183486938477, "logits/rejected": -9.326925277709961, "logps/chosen": -8.067788124084473, "logps/rejected": -107.29930114746094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.275130271911621, "rewards/margins": 9.841442108154297, "rewards/rejected": -7.566311836242676, "step": 630 }, { "epoch": 0.44296244296244297, "grad_norm": 0.0019082891521975398, "learning_rate": 4.666715598740224e-05, "logits/chosen": -8.986562728881836, "logits/rejected": -8.989130020141602, "logps/chosen": -8.44627571105957, "logps/rejected": -107.4915771484375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2965569496154785, "rewards/margins": 9.804357528686523, "rewards/rejected": -7.507800102233887, "step": 631 }, { "epoch": 0.4436644436644437, "grad_norm": 0.007839841768145561, "learning_rate": 4.667861797137309e-05, "logits/chosen": -8.784549713134766, "logits/rejected": -8.754714012145996, "logps/chosen": -3.497272253036499, "logps/rejected": -107.33972930908203, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.861927032470703, "rewards/margins": 10.494431495666504, "rewards/rejected": -7.632504463195801, "step": 632 }, { "epoch": 0.44436644436644435, "grad_norm": 0.0028748023323714733, "learning_rate": 4.669006183362258e-05, "logits/chosen": -7.919785499572754, "logits/rejected": -7.935317039489746, "logps/chosen": -9.49337100982666, "logps/rejected": -104.68313598632812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2896087169647217, "rewards/margins": 9.54275131225586, "rewards/rejected": -7.253143310546875, "step": 633 }, { "epoch": 0.44506844506844506, "grad_norm": 0.003970960155129433, "learning_rate": 4.670148763136221e-05, "logits/chosen": -8.991774559020996, "logits/rejected": -8.976700782775879, "logps/chosen": -5.043935298919678, "logps/rejected": -108.1717529296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.678466558456421, "rewards/margins": 10.393380165100098, "rewards/rejected": -7.714913368225098, "step": 634 }, { "epoch": 0.4457704457704458, "grad_norm": 0.003464267123490572, "learning_rate": 4.671289542153293e-05, "logits/chosen": -8.765070915222168, "logits/rejected": -8.757671356201172, "logps/chosen": -7.544814109802246, "logps/rejected": -107.1904296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.346165418624878, "rewards/margins": 9.899957656860352, "rewards/rejected": -7.5537919998168945, "step": 635 }, { "epoch": 0.4464724464724465, "grad_norm": 0.003959548193961382, "learning_rate": 4.672428526080691e-05, "logits/chosen": -9.031238555908203, "logits/rejected": -9.027105331420898, "logps/chosen": -8.109774589538574, "logps/rejected": -103.65416717529297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4567737579345703, "rewards/margins": 9.532648086547852, "rewards/rejected": -7.075875282287598, "step": 636 }, { "epoch": 0.44717444717444715, "grad_norm": 0.005230925045907497, "learning_rate": 4.673565720558918e-05, "logits/chosen": -7.818792819976807, "logits/rejected": -7.8076581954956055, "logps/chosen": -10.483681678771973, "logps/rejected": -101.34739685058594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.194065809249878, "rewards/margins": 9.115744590759277, "rewards/rejected": -6.92167854309082, "step": 637 }, { "epoch": 0.44787644787644787, "grad_norm": 0.003051496110856533, "learning_rate": 4.6747011312019374e-05, "logits/chosen": -8.336688995361328, "logits/rejected": -8.329050064086914, "logps/chosen": -10.631427764892578, "logps/rejected": -104.4743423461914, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1024763584136963, "rewards/margins": 9.29574203491211, "rewards/rejected": -7.193265438079834, "step": 638 }, { "epoch": 0.4485784485784486, "grad_norm": 0.0026226809713989496, "learning_rate": 4.6758347635973334e-05, "logits/chosen": -8.843090057373047, "logits/rejected": -8.838459014892578, "logps/chosen": -11.028423309326172, "logps/rejected": -104.26094055175781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1016993522644043, "rewards/margins": 9.243928909301758, "rewards/rejected": -7.142230033874512, "step": 639 }, { "epoch": 0.4492804492804493, "grad_norm": 0.0024404546711593866, "learning_rate": 4.676966623306479e-05, "logits/chosen": -9.058950424194336, "logits/rejected": -9.032112121582031, "logps/chosen": -7.417182922363281, "logps/rejected": -104.7947769165039, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.605015277862549, "rewards/margins": 9.793395042419434, "rewards/rejected": -7.188379764556885, "step": 640 }, { "epoch": 0.44998244998244996, "grad_norm": 0.0025964127853512764, "learning_rate": 4.678096715864696e-05, "logits/chosen": -8.980599403381348, "logits/rejected": -8.985420227050781, "logps/chosen": -11.968538284301758, "logps/rejected": -104.25639343261719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9288336038589478, "rewards/margins": 9.096182823181152, "rewards/rejected": -7.167349815368652, "step": 641 }, { "epoch": 0.4506844506844507, "grad_norm": 0.004369934089481831, "learning_rate": 4.679225046781422e-05, "logits/chosen": -8.492570877075195, "logits/rejected": -8.488555908203125, "logps/chosen": -6.13210391998291, "logps/rejected": -104.33349609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.391761064529419, "rewards/margins": 9.790864944458008, "rewards/rejected": -7.39910364151001, "step": 642 }, { "epoch": 0.4513864513864514, "grad_norm": 0.022510340437293053, "learning_rate": 4.68035162154037e-05, "logits/chosen": -9.10562801361084, "logits/rejected": -9.095724105834961, "logps/chosen": -10.90048599243164, "logps/rejected": -102.14271545410156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.0992064476013184, "rewards/margins": 9.10708236694336, "rewards/rejected": -7.007875919342041, "step": 643 }, { "epoch": 0.4520884520884521, "grad_norm": 0.002859686501324177, "learning_rate": 4.681476445599687e-05, "logits/chosen": -8.666586875915527, "logits/rejected": -8.64898681640625, "logps/chosen": -7.094790458679199, "logps/rejected": -105.81155395507812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.496978759765625, "rewards/margins": 9.781669616699219, "rewards/rejected": -7.28469181060791, "step": 644 }, { "epoch": 0.45279045279045277, "grad_norm": 0.00814884901046753, "learning_rate": 4.6825995243921137e-05, "logits/chosen": -8.319927215576172, "logits/rejected": -8.322864532470703, "logps/chosen": -5.826495170593262, "logps/rejected": -106.76339721679688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6304616928100586, "rewards/margins": 10.081075668334961, "rewards/rejected": -7.450613975524902, "step": 645 }, { "epoch": 0.4534924534924535, "grad_norm": 0.0026562439743429422, "learning_rate": 4.683720863325141e-05, "logits/chosen": -9.111251831054688, "logits/rejected": -9.108022689819336, "logps/chosen": -7.769123077392578, "logps/rejected": -105.6496810913086, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.255279779434204, "rewards/margins": 9.659412384033203, "rewards/rejected": -7.404131889343262, "step": 646 }, { "epoch": 0.4541944541944542, "grad_norm": 0.0025850299280136824, "learning_rate": 4.684840467781168e-05, "logits/chosen": -9.056230545043945, "logits/rejected": -9.049039840698242, "logps/chosen": -5.239814758300781, "logps/rejected": -104.89900207519531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.632582187652588, "rewards/margins": 9.968277931213379, "rewards/rejected": -7.335695743560791, "step": 647 }, { "epoch": 0.4548964548964549, "grad_norm": 0.0024756514467298985, "learning_rate": 4.685958343117656e-05, "logits/chosen": -8.590635299682617, "logits/rejected": -8.595808029174805, "logps/chosen": -7.118595123291016, "logps/rejected": -106.11064147949219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2775750160217285, "rewards/margins": 9.804985046386719, "rewards/rejected": -7.52741003036499, "step": 648 }, { "epoch": 0.4555984555984556, "grad_norm": 0.002659962046891451, "learning_rate": 4.6870744946672826e-05, "logits/chosen": -8.743327140808105, "logits/rejected": -8.729215621948242, "logps/chosen": -7.258363723754883, "logps/rejected": -104.8450927734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.480013608932495, "rewards/margins": 9.687477111816406, "rewards/rejected": -7.207462787628174, "step": 649 }, { "epoch": 0.4563004563004563, "grad_norm": 0.010408788919448853, "learning_rate": 4.688188927738093e-05, "logits/chosen": -9.248025894165039, "logits/rejected": -9.249759674072266, "logps/chosen": -6.366868495941162, "logps/rejected": -105.19477844238281, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4830405712127686, "rewards/margins": 9.864545822143555, "rewards/rejected": -7.381505012512207, "step": 650 }, { "epoch": 0.457002457002457, "grad_norm": 0.0028300555422902107, "learning_rate": 4.689301647613653e-05, "logits/chosen": -9.016534805297852, "logits/rejected": -9.002595901489258, "logps/chosen": -2.977700710296631, "logps/rejected": -106.96772766113281, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.9555771350860596, "rewards/margins": 10.512153625488281, "rewards/rejected": -7.556575775146484, "step": 651 }, { "epoch": 0.4577044577044577, "grad_norm": 0.0027832328341901302, "learning_rate": 4.6904126595532014e-05, "logits/chosen": -9.136636734008789, "logits/rejected": -9.13322925567627, "logps/chosen": -7.753105640411377, "logps/rejected": -105.83629608154297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3320372104644775, "rewards/margins": 9.717618942260742, "rewards/rejected": -7.385581016540527, "step": 652 }, { "epoch": 0.4584064584064584, "grad_norm": 0.004360685124993324, "learning_rate": 4.69152196879179e-05, "logits/chosen": -8.234676361083984, "logits/rejected": -8.234771728515625, "logps/chosen": -4.03495454788208, "logps/rejected": -106.48416900634766, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.829136610031128, "rewards/margins": 10.315851211547852, "rewards/rejected": -7.4867143630981445, "step": 653 }, { "epoch": 0.4591084591084591, "grad_norm": 0.003806489985436201, "learning_rate": 4.692629580540446e-05, "logits/chosen": -8.706917762756348, "logits/rejected": -8.676971435546875, "logps/chosen": -7.226866722106934, "logps/rejected": -104.63444519042969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4600534439086914, "rewards/margins": 9.766908645629883, "rewards/rejected": -7.306855201721191, "step": 654 }, { "epoch": 0.4598104598104598, "grad_norm": 0.003489489434286952, "learning_rate": 4.693735499986305e-05, "logits/chosen": -7.903153896331787, "logits/rejected": -7.900577545166016, "logps/chosen": -3.5718703269958496, "logps/rejected": -107.60049438476562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9326906204223633, "rewards/margins": 10.439170837402344, "rewards/rejected": -7.506479263305664, "step": 655 }, { "epoch": 0.46051246051246053, "grad_norm": 0.003982571419328451, "learning_rate": 4.694839732292767e-05, "logits/chosen": -8.586386680603027, "logits/rejected": -8.593162536621094, "logps/chosen": -8.293469429016113, "logps/rejected": -103.571533203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.397512912750244, "rewards/margins": 9.472261428833008, "rewards/rejected": -7.074748516082764, "step": 656 }, { "epoch": 0.4612144612144612, "grad_norm": 0.001603860640898347, "learning_rate": 4.6959422825996345e-05, "logits/chosen": -9.069883346557617, "logits/rejected": -9.038928985595703, "logps/chosen": -6.5117645263671875, "logps/rejected": -106.4656753540039, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4782607555389404, "rewards/margins": 9.876211166381836, "rewards/rejected": -7.397950172424316, "step": 657 }, { "epoch": 0.4619164619164619, "grad_norm": 0.0015721105737611651, "learning_rate": 4.69704315602326e-05, "logits/chosen": -8.348634719848633, "logits/rejected": -8.33160400390625, "logps/chosen": -4.43705415725708, "logps/rejected": -107.46866607666016, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7722291946411133, "rewards/margins": 10.347105979919434, "rewards/rejected": -7.57487678527832, "step": 658 }, { "epoch": 0.4626184626184626, "grad_norm": 0.0021823274437338114, "learning_rate": 4.698142357656684e-05, "logits/chosen": -8.295515060424805, "logits/rejected": -8.297492980957031, "logps/chosen": -5.325466156005859, "logps/rejected": -107.17835998535156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6231794357299805, "rewards/margins": 10.202880859375, "rewards/rejected": -7.5797014236450195, "step": 659 }, { "epoch": 0.46332046332046334, "grad_norm": 0.0025443199556320906, "learning_rate": 4.6992398925697814e-05, "logits/chosen": -8.397335052490234, "logits/rejected": -8.403736114501953, "logps/chosen": -8.290432929992676, "logps/rejected": -106.86045837402344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.283071279525757, "rewards/margins": 9.75039291381836, "rewards/rejected": -7.467321395874023, "step": 660 }, { "epoch": 0.464022464022464, "grad_norm": 0.0036649834364652634, "learning_rate": 4.7003357658094e-05, "logits/chosen": -8.383892059326172, "logits/rejected": -8.382780075073242, "logps/chosen": -5.119461536407471, "logps/rejected": -107.6535415649414, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.727698802947998, "rewards/margins": 10.20865535736084, "rewards/rejected": -7.480956554412842, "step": 661 }, { "epoch": 0.4647244647244647, "grad_norm": 0.0018270015716552734, "learning_rate": 4.7014299823995005e-05, "logits/chosen": -8.143403053283691, "logits/rejected": -8.131940841674805, "logps/chosen": -8.079638481140137, "logps/rejected": -104.40353393554688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.325813055038452, "rewards/margins": 9.631807327270508, "rewards/rejected": -7.305994510650635, "step": 662 }, { "epoch": 0.46542646542646543, "grad_norm": 0.0017440117662772536, "learning_rate": 4.702522547341289e-05, "logits/chosen": -9.098053932189941, "logits/rejected": -9.09000015258789, "logps/chosen": -5.62169075012207, "logps/rejected": -106.26213836669922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5647077560424805, "rewards/margins": 9.980596542358398, "rewards/rejected": -7.415888786315918, "step": 663 }, { "epoch": 0.46612846612846615, "grad_norm": 0.02472071908414364, "learning_rate": 4.703613465613363e-05, "logits/chosen": -9.00951099395752, "logits/rejected": -9.015718460083008, "logps/chosen": -3.771599292755127, "logps/rejected": -108.05705261230469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.719999313354492, "rewards/margins": 10.461920738220215, "rewards/rejected": -7.741921424865723, "step": 664 }, { "epoch": 0.4668304668304668, "grad_norm": 0.001542108366265893, "learning_rate": 4.704702742171841e-05, "logits/chosen": -9.15570068359375, "logits/rejected": -9.126781463623047, "logps/chosen": -4.0943779945373535, "logps/rejected": -107.08792114257812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6786911487579346, "rewards/margins": 10.317855834960938, "rewards/rejected": -7.63916540145874, "step": 665 }, { "epoch": 0.4675324675324675, "grad_norm": 0.0019985935650765896, "learning_rate": 4.7057903819505024e-05, "logits/chosen": -8.684255599975586, "logits/rejected": -8.676822662353516, "logps/chosen": -10.06159496307373, "logps/rejected": -106.22242736816406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2226123809814453, "rewards/margins": 9.476058959960938, "rewards/rejected": -7.253446578979492, "step": 666 }, { "epoch": 0.46823446823446824, "grad_norm": 0.0016623771516606212, "learning_rate": 4.7068763898609154e-05, "logits/chosen": -8.154001235961914, "logits/rejected": -8.150726318359375, "logps/chosen": -7.519370079040527, "logps/rejected": -103.38980102539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5497753620147705, "rewards/margins": 9.606088638305664, "rewards/rejected": -7.056313991546631, "step": 667 }, { "epoch": 0.46893646893646895, "grad_norm": 0.0028777921106666327, "learning_rate": 4.707960770792576e-05, "logits/chosen": -9.354473114013672, "logits/rejected": -9.359146118164062, "logps/chosen": -4.173748970031738, "logps/rejected": -107.84404754638672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.711977958679199, "rewards/margins": 10.363622665405273, "rewards/rejected": -7.651644706726074, "step": 668 }, { "epoch": 0.4696384696384696, "grad_norm": 0.0016034863656386733, "learning_rate": 4.709043529613039e-05, "logits/chosen": -9.379730224609375, "logits/rejected": -9.360246658325195, "logps/chosen": -9.399415016174316, "logps/rejected": -106.04895782470703, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2432479858398438, "rewards/margins": 9.584297180175781, "rewards/rejected": -7.3410491943359375, "step": 669 }, { "epoch": 0.47034047034047033, "grad_norm": 0.0016966272378340364, "learning_rate": 4.710124671168044e-05, "logits/chosen": -8.366256713867188, "logits/rejected": -8.346247673034668, "logps/chosen": -2.705819606781006, "logps/rejected": -108.50762176513672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.036612033843994, "rewards/margins": 10.591279983520508, "rewards/rejected": -7.55466890335083, "step": 670 }, { "epoch": 0.47104247104247104, "grad_norm": 0.0022190711461007595, "learning_rate": 4.711204200281654e-05, "logits/chosen": -9.197164535522461, "logits/rejected": -9.195253372192383, "logps/chosen": -14.913491249084473, "logps/rejected": -102.05030059814453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8013590574264526, "rewards/margins": 8.619856834411621, "rewards/rejected": -6.818497657775879, "step": 671 }, { "epoch": 0.47174447174447176, "grad_norm": 0.010406624525785446, "learning_rate": 4.712282121756376e-05, "logits/chosen": -8.79688835144043, "logits/rejected": -8.79867172241211, "logps/chosen": -8.793567657470703, "logps/rejected": -106.72920989990234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4608073234558105, "rewards/margins": 9.842246055603027, "rewards/rejected": -7.381438732147217, "step": 672 }, { "epoch": 0.4724464724464724, "grad_norm": 0.0029399723280221224, "learning_rate": 4.713358440373295e-05, "logits/chosen": -8.636778831481934, "logits/rejected": -8.59377384185791, "logps/chosen": -5.230677604675293, "logps/rejected": -107.89813232421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5933821201324463, "rewards/margins": 10.220407485961914, "rewards/rejected": -7.627025604248047, "step": 673 }, { "epoch": 0.47314847314847314, "grad_norm": 0.003287707455456257, "learning_rate": 4.7144331608922e-05, "logits/chosen": -7.974759101867676, "logits/rejected": -7.968532562255859, "logps/chosen": -6.357562065124512, "logps/rejected": -107.87250518798828, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.599616527557373, "rewards/margins": 10.138482093811035, "rewards/rejected": -7.538865089416504, "step": 674 }, { "epoch": 0.47385047385047385, "grad_norm": 0.0013791151577606797, "learning_rate": 4.715506288051709e-05, "logits/chosen": -9.190458297729492, "logits/rejected": -9.186809539794922, "logps/chosen": -4.584577560424805, "logps/rejected": -108.16497802734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6655216217041016, "rewards/margins": 10.338294982910156, "rewards/rejected": -7.672773838043213, "step": 675 }, { "epoch": 0.47455247455247457, "grad_norm": 0.0026988754980266094, "learning_rate": 4.7165778265693935e-05, "logits/chosen": -9.930255889892578, "logits/rejected": -9.910259246826172, "logps/chosen": -1.5449419021606445, "logps/rejected": -108.73675537109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9328973293304443, "rewards/margins": 10.812238693237305, "rewards/rejected": -7.879340171813965, "step": 676 }, { "epoch": 0.47525447525447523, "grad_norm": 0.0021016194950789213, "learning_rate": 4.7176477811419076e-05, "logits/chosen": -8.823284149169922, "logits/rejected": -8.822736740112305, "logps/chosen": -6.89982795715332, "logps/rejected": -105.64725494384766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5191807746887207, "rewards/margins": 9.92713451385498, "rewards/rejected": -7.40795373916626, "step": 677 }, { "epoch": 0.47595647595647594, "grad_norm": 0.001350301317870617, "learning_rate": 4.718716156445106e-05, "logits/chosen": -8.378057479858398, "logits/rejected": -8.356675148010254, "logps/chosen": -5.83522891998291, "logps/rejected": -106.0318603515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.711214542388916, "rewards/margins": 10.087566375732422, "rewards/rejected": -7.376351356506348, "step": 678 }, { "epoch": 0.47665847665847666, "grad_norm": 0.006158636882901192, "learning_rate": 4.7197829571341704e-05, "logits/chosen": -8.75899887084961, "logits/rejected": -8.76502513885498, "logps/chosen": -5.4244160652160645, "logps/rejected": -105.10411834716797, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6429905891418457, "rewards/margins": 9.98530101776123, "rewards/rejected": -7.342310428619385, "step": 679 }, { "epoch": 0.4773604773604774, "grad_norm": 0.06997643411159515, "learning_rate": 4.720848187843727e-05, "logits/chosen": -8.159282684326172, "logits/rejected": -8.139219284057617, "logps/chosen": -7.794161319732666, "logps/rejected": -107.350830078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.435638427734375, "rewards/margins": 9.853105545043945, "rewards/rejected": -7.4174675941467285, "step": 680 }, { "epoch": 0.47806247806247804, "grad_norm": 3.6980478763580322, "learning_rate": 4.721911853187975e-05, "logits/chosen": -8.646184921264648, "logits/rejected": -8.616464614868164, "logps/chosen": -5.990159034729004, "logps/rejected": -107.79096984863281, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 2.439467191696167, "rewards/margins": 10.10079574584961, "rewards/rejected": -7.6613287925720215, "step": 681 }, { "epoch": 0.47876447876447875, "grad_norm": 0.004010606557130814, "learning_rate": 4.722973957760799e-05, "logits/chosen": -8.064681053161621, "logits/rejected": -8.076053619384766, "logps/chosen": -5.569210052490234, "logps/rejected": -105.7097396850586, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.840930461883545, "rewards/margins": 10.119251251220703, "rewards/rejected": -7.278321266174316, "step": 682 }, { "epoch": 0.47946647946647947, "grad_norm": 0.0013547695707529783, "learning_rate": 4.724034506135888e-05, "logits/chosen": -8.672317504882812, "logits/rejected": -8.677755355834961, "logps/chosen": -8.166535377502441, "logps/rejected": -106.35745239257812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.385406970977783, "rewards/margins": 9.810632705688477, "rewards/rejected": -7.425225734710693, "step": 683 }, { "epoch": 0.4801684801684802, "grad_norm": 0.0013147592544555664, "learning_rate": 4.725093502866861e-05, "logits/chosen": -9.42085075378418, "logits/rejected": -9.397031784057617, "logps/chosen": -5.1849775314331055, "logps/rejected": -108.83834838867188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5617408752441406, "rewards/margins": 10.317981719970703, "rewards/rejected": -7.7562408447265625, "step": 684 }, { "epoch": 0.48087048087048084, "grad_norm": 0.0028106821700930595, "learning_rate": 4.7261509524873764e-05, "logits/chosen": -9.449993133544922, "logits/rejected": -9.459720611572266, "logps/chosen": -16.133644104003906, "logps/rejected": -99.16061401367188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.6818492412567139, "rewards/margins": 8.257284164428711, "rewards/rejected": -6.575434684753418, "step": 685 }, { "epoch": 0.48157248157248156, "grad_norm": 0.007590763736516237, "learning_rate": 4.727206859511253e-05, "logits/chosen": -7.9707441329956055, "logits/rejected": -7.95932674407959, "logps/chosen": -8.590436935424805, "logps/rejected": -102.91007995605469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3674731254577637, "rewards/margins": 9.431222915649414, "rewards/rejected": -7.063750267028809, "step": 686 }, { "epoch": 0.4822744822744823, "grad_norm": 0.001486493507400155, "learning_rate": 4.7282612284325846e-05, "logits/chosen": -9.216705322265625, "logits/rejected": -9.223567008972168, "logps/chosen": -3.72346830368042, "logps/rejected": -108.58055114746094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8125009536743164, "rewards/margins": 10.557449340820312, "rewards/rejected": -7.744948387145996, "step": 687 }, { "epoch": 0.482976482976483, "grad_norm": 0.00256081810221076, "learning_rate": 4.729314063725853e-05, "logits/chosen": -8.375566482543945, "logits/rejected": -8.380475997924805, "logps/chosen": -3.6361002922058105, "logps/rejected": -108.3837890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9084296226501465, "rewards/margins": 10.505258560180664, "rewards/rejected": -7.596828460693359, "step": 688 }, { "epoch": 0.4836784836784837, "grad_norm": 0.0015320382080972195, "learning_rate": 4.730365369846044e-05, "logits/chosen": -9.156126022338867, "logits/rejected": -9.140539169311523, "logps/chosen": -9.561094284057617, "logps/rejected": -106.82182312011719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1339194774627686, "rewards/margins": 9.633591651916504, "rewards/rejected": -7.499671936035156, "step": 689 }, { "epoch": 0.48438048438048437, "grad_norm": 0.004131741356104612, "learning_rate": 4.7314151512287594e-05, "logits/chosen": -9.029459953308105, "logits/rejected": -9.017335891723633, "logps/chosen": -4.546167373657227, "logps/rejected": -107.89044952392578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7707204818725586, "rewards/margins": 10.385335922241211, "rewards/rejected": -7.614615440368652, "step": 690 }, { "epoch": 0.4850824850824851, "grad_norm": 0.4843467175960541, "learning_rate": 4.732463412290331e-05, "logits/chosen": -8.736452102661133, "logits/rejected": -8.72667121887207, "logps/chosen": -11.252941131591797, "logps/rejected": -102.49738311767578, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.083989143371582, "rewards/margins": 9.082074165344238, "rewards/rejected": -6.998084545135498, "step": 691 }, { "epoch": 0.4857844857844858, "grad_norm": 0.001520531834103167, "learning_rate": 4.73351015742793e-05, "logits/chosen": -8.355815887451172, "logits/rejected": -8.35678482055664, "logps/chosen": -4.688126564025879, "logps/rejected": -108.15470123291016, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7649521827697754, "rewards/margins": 10.37275505065918, "rewards/rejected": -7.607802391052246, "step": 692 }, { "epoch": 0.4864864864864865, "grad_norm": 0.0018118192674592137, "learning_rate": 4.7345553910196785e-05, "logits/chosen": -8.57109260559082, "logits/rejected": -8.575272560119629, "logps/chosen": -4.818983554840088, "logps/rejected": -108.60260009765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.636788845062256, "rewards/margins": 10.378755569458008, "rewards/rejected": -7.74196720123291, "step": 693 }, { "epoch": 0.4871884871884872, "grad_norm": 0.003150681033730507, "learning_rate": 4.735599117424759e-05, "logits/chosen": -8.120869636535645, "logits/rejected": -8.103998184204102, "logps/chosen": -7.9437174797058105, "logps/rejected": -104.59860229492188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.241875648498535, "rewards/margins": 9.57906723022461, "rewards/rejected": -7.337191581726074, "step": 694 }, { "epoch": 0.4878904878904879, "grad_norm": 0.210810124874115, "learning_rate": 4.7366413409835235e-05, "logits/chosen": -8.40847396850586, "logits/rejected": -8.403294563293457, "logps/chosen": -6.056929111480713, "logps/rejected": -107.07583618164062, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.563556671142578, "rewards/margins": 10.083219528198242, "rewards/rejected": -7.519663333892822, "step": 695 }, { "epoch": 0.4885924885924886, "grad_norm": 0.0025089753326028585, "learning_rate": 4.737682066017604e-05, "logits/chosen": -8.698734283447266, "logits/rejected": -8.689369201660156, "logps/chosen": -6.3203277587890625, "logps/rejected": -108.4161605834961, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.494344472885132, "rewards/margins": 10.04498291015625, "rewards/rejected": -7.550638198852539, "step": 696 }, { "epoch": 0.4892944892944893, "grad_norm": 0.0019691064953804016, "learning_rate": 4.7387212968300166e-05, "logits/chosen": -8.580059051513672, "logits/rejected": -8.57678508758545, "logps/chosen": -6.546647071838379, "logps/rejected": -106.08235168457031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5968501567840576, "rewards/margins": 9.96843147277832, "rewards/rejected": -7.37158203125, "step": 697 }, { "epoch": 0.48999648999649, "grad_norm": 0.0020705547649413347, "learning_rate": 4.7397590377052686e-05, "logits/chosen": -9.246721267700195, "logits/rejected": -9.241491317749023, "logps/chosen": -5.418006896972656, "logps/rejected": -107.9409408569336, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.464059352874756, "rewards/margins": 10.199390411376953, "rewards/rejected": -7.735330581665039, "step": 698 }, { "epoch": 0.4906984906984907, "grad_norm": 0.004461263306438923, "learning_rate": 4.74079529290947e-05, "logits/chosen": -8.579840660095215, "logits/rejected": -8.574850082397461, "logps/chosen": -6.403408050537109, "logps/rejected": -104.06167602539062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6656033992767334, "rewards/margins": 9.840957641601562, "rewards/rejected": -7.175354480743408, "step": 699 }, { "epoch": 0.4914004914004914, "grad_norm": 0.0022544634994119406, "learning_rate": 4.741830066690428e-05, "logits/chosen": -9.293903350830078, "logits/rejected": -9.294572830200195, "logps/chosen": -4.858672618865967, "logps/rejected": -107.70165252685547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5685601234436035, "rewards/margins": 10.226008415222168, "rewards/rejected": -7.657448768615723, "step": 700 }, { "epoch": 0.49210249210249213, "grad_norm": 0.005233956500887871, "learning_rate": 4.742863363277765e-05, "logits/chosen": -8.713666915893555, "logits/rejected": -8.708990097045898, "logps/chosen": -7.079583168029785, "logps/rejected": -109.03148651123047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.369781017303467, "rewards/margins": 9.946842193603516, "rewards/rejected": -7.577060699462891, "step": 701 }, { "epoch": 0.4928044928044928, "grad_norm": 0.003894644556567073, "learning_rate": 4.743895186883009e-05, "logits/chosen": -8.92329216003418, "logits/rejected": -8.941970825195312, "logps/chosen": -8.211753845214844, "logps/rejected": -106.92286682128906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.510342597961426, "rewards/margins": 9.877175331115723, "rewards/rejected": -7.366832733154297, "step": 702 }, { "epoch": 0.4935064935064935, "grad_norm": 0.005967606790363789, "learning_rate": 4.7449255416997075e-05, "logits/chosen": -8.673913955688477, "logits/rejected": -8.672346115112305, "logps/chosen": -10.203048706054688, "logps/rejected": -103.07797241210938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.166426181793213, "rewards/margins": 9.18739128112793, "rewards/rejected": -7.020963668823242, "step": 703 }, { "epoch": 0.4942084942084942, "grad_norm": 0.07417977601289749, "learning_rate": 4.7459544319035206e-05, "logits/chosen": -8.992324829101562, "logits/rejected": -8.983848571777344, "logps/chosen": -8.32938003540039, "logps/rejected": -106.45121765136719, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.4176156520843506, "rewards/margins": 9.752052307128906, "rewards/rejected": -7.334437370300293, "step": 704 }, { "epoch": 0.49491049491049494, "grad_norm": 0.0032106004655361176, "learning_rate": 4.746981861652332e-05, "logits/chosen": -8.923319816589355, "logits/rejected": -8.936894416809082, "logps/chosen": -5.694147109985352, "logps/rejected": -107.95343017578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5528573989868164, "rewards/margins": 10.229455947875977, "rewards/rejected": -7.67659854888916, "step": 705 }, { "epoch": 0.4956124956124956, "grad_norm": 0.0037430922966450453, "learning_rate": 4.74800783508634e-05, "logits/chosen": -8.130697250366211, "logits/rejected": -8.152563095092773, "logps/chosen": -6.113738536834717, "logps/rejected": -109.18199157714844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.625483512878418, "rewards/margins": 10.162992477416992, "rewards/rejected": -7.537508487701416, "step": 706 }, { "epoch": 0.4963144963144963, "grad_norm": 0.0015666732797399163, "learning_rate": 4.7490323563281665e-05, "logits/chosen": -9.364144325256348, "logits/rejected": -9.368759155273438, "logps/chosen": -10.001514434814453, "logps/rejected": -104.73504638671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.167564868927002, "rewards/margins": 9.360458374023438, "rewards/rejected": -7.192892551422119, "step": 707 }, { "epoch": 0.497016497016497, "grad_norm": 0.0011695263674482703, "learning_rate": 4.750055429482949e-05, "logits/chosen": -9.4371976852417, "logits/rejected": -9.404834747314453, "logps/chosen": -5.4184441566467285, "logps/rejected": -107.56593322753906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.537022113800049, "rewards/margins": 10.189204216003418, "rewards/rejected": -7.652182102203369, "step": 708 }, { "epoch": 0.49771849771849774, "grad_norm": 0.00445494893938303, "learning_rate": 4.751077058638445e-05, "logits/chosen": -8.75928020477295, "logits/rejected": -8.774913787841797, "logps/chosen": -5.9621076583862305, "logps/rejected": -107.49061584472656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6018433570861816, "rewards/margins": 10.159769058227539, "rewards/rejected": -7.557926177978516, "step": 709 }, { "epoch": 0.4984204984204984, "grad_norm": 0.0014164233580231667, "learning_rate": 4.752097247865126e-05, "logits/chosen": -9.667526245117188, "logits/rejected": -9.689804077148438, "logps/chosen": -10.836703300476074, "logps/rejected": -104.4063949584961, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.15368914604187, "rewards/margins": 9.359125137329102, "rewards/rejected": -7.205435752868652, "step": 710 }, { "epoch": 0.4991224991224991, "grad_norm": 0.0012632710859179497, "learning_rate": 4.753116001216277e-05, "logits/chosen": -9.229812622070312, "logits/rejected": -9.239179611206055, "logps/chosen": -8.815649032592773, "logps/rejected": -105.42941284179688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.277641773223877, "rewards/margins": 9.659610748291016, "rewards/rejected": -7.3819684982299805, "step": 711 }, { "epoch": 0.49982449982449983, "grad_norm": 0.0019728918559849262, "learning_rate": 4.7541333227280944e-05, "logits/chosen": -8.971508026123047, "logits/rejected": -8.954763412475586, "logps/chosen": -5.217925548553467, "logps/rejected": -108.4991226196289, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5172533988952637, "rewards/margins": 10.28138542175293, "rewards/rejected": -7.764132976531982, "step": 712 }, { "epoch": 0.5005265005265005, "grad_norm": 0.0014994324883446097, "learning_rate": 4.755149216419776e-05, "logits/chosen": -8.769657135009766, "logits/rejected": -8.778206825256348, "logps/chosen": -5.759805202484131, "logps/rejected": -108.9544906616211, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6230926513671875, "rewards/margins": 10.2225341796875, "rewards/rejected": -7.5994415283203125, "step": 713 }, { "epoch": 0.5012285012285013, "grad_norm": 0.007691496983170509, "learning_rate": 4.756163686293624e-05, "logits/chosen": -8.771562576293945, "logits/rejected": -8.759931564331055, "logps/chosen": -4.82569694519043, "logps/rejected": -108.61520385742188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.621975898742676, "rewards/margins": 10.339275360107422, "rewards/rejected": -7.717299461364746, "step": 714 }, { "epoch": 0.5019305019305019, "grad_norm": 0.0010066860122606158, "learning_rate": 4.7571767363351344e-05, "logits/chosen": -9.042348861694336, "logits/rejected": -9.015921592712402, "logps/chosen": -9.896235466003418, "logps/rejected": -102.51879119873047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3170714378356934, "rewards/margins": 9.336048126220703, "rewards/rejected": -7.018976211547852, "step": 715 }, { "epoch": 0.5026325026325026, "grad_norm": 0.0019704371225088835, "learning_rate": 4.758188370513093e-05, "logits/chosen": -9.142595291137695, "logits/rejected": -9.150362014770508, "logps/chosen": -3.6404948234558105, "logps/rejected": -107.87734985351562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.717132091522217, "rewards/margins": 10.431014060974121, "rewards/rejected": -7.713881492614746, "step": 716 }, { "epoch": 0.5033345033345034, "grad_norm": 0.003384291660040617, "learning_rate": 4.759198592779667e-05, "logits/chosen": -8.494671821594238, "logits/rejected": -8.483345985412598, "logps/chosen": -4.792608737945557, "logps/rejected": -108.7902603149414, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6909985542297363, "rewards/margins": 10.381782531738281, "rewards/rejected": -7.690783500671387, "step": 717 }, { "epoch": 0.504036504036504, "grad_norm": 0.006241960916668177, "learning_rate": 4.760207407070501e-05, "logits/chosen": -8.340167045593262, "logits/rejected": -8.333430290222168, "logps/chosen": -4.2335591316223145, "logps/rejected": -108.09042358398438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.846318244934082, "rewards/margins": 10.382411003112793, "rewards/rejected": -7.536092758178711, "step": 718 }, { "epoch": 0.5047385047385048, "grad_norm": 0.0071311467327177525, "learning_rate": 4.761214817304805e-05, "logits/chosen": -8.835451126098633, "logits/rejected": -8.831838607788086, "logps/chosen": -8.616400718688965, "logps/rejected": -107.32347106933594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3320999145507812, "rewards/margins": 9.782787322998047, "rewards/rejected": -7.450687885284424, "step": 719 }, { "epoch": 0.5054405054405054, "grad_norm": 0.0016469588736072183, "learning_rate": 4.762220827385448e-05, "logits/chosen": -7.557332992553711, "logits/rejected": -7.566674709320068, "logps/chosen": -10.195344924926758, "logps/rejected": -104.18801879882812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.191349744796753, "rewards/margins": 9.280325889587402, "rewards/rejected": -7.088976860046387, "step": 720 }, { "epoch": 0.5061425061425061, "grad_norm": 0.0011833437019959092, "learning_rate": 4.763225441199049e-05, "logits/chosen": -8.755386352539062, "logits/rejected": -8.740362167358398, "logps/chosen": -4.233767509460449, "logps/rejected": -108.63204193115234, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.887852191925049, "rewards/margins": 10.53865909576416, "rewards/rejected": -7.650806427001953, "step": 721 }, { "epoch": 0.5068445068445069, "grad_norm": 0.0011223329929634929, "learning_rate": 4.7642286626160654e-05, "logits/chosen": -9.033134460449219, "logits/rejected": -9.017525672912598, "logps/chosen": -11.555051803588867, "logps/rejected": -105.0460433959961, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0578255653381348, "rewards/margins": 9.318023681640625, "rewards/rejected": -7.260197639465332, "step": 722 }, { "epoch": 0.5075465075465075, "grad_norm": 0.0013281663414090872, "learning_rate": 4.765230495490885e-05, "logits/chosen": -9.575471878051758, "logits/rejected": -9.557741165161133, "logps/chosen": -4.248085975646973, "logps/rejected": -109.3125228881836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5913991928100586, "rewards/margins": 10.386030197143555, "rewards/rejected": -7.794631004333496, "step": 723 }, { "epoch": 0.5082485082485082, "grad_norm": 0.021408911794424057, "learning_rate": 4.7662309436619115e-05, "logits/chosen": -9.288820266723633, "logits/rejected": -9.275434494018555, "logps/chosen": -3.213972806930542, "logps/rejected": -108.38875579833984, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.764920473098755, "rewards/margins": 10.556934356689453, "rewards/rejected": -7.792015075683594, "step": 724 }, { "epoch": 0.508950508950509, "grad_norm": 0.0009773427154868841, "learning_rate": 4.7672300109516563e-05, "logits/chosen": -9.058398246765137, "logits/rejected": -9.093381881713867, "logps/chosen": -3.6362204551696777, "logps/rejected": -108.09066772460938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7986135482788086, "rewards/margins": 10.494203567504883, "rewards/rejected": -7.695590019226074, "step": 725 }, { "epoch": 0.5096525096525096, "grad_norm": 0.0015803189016878605, "learning_rate": 4.768227701166823e-05, "logits/chosen": -9.423173904418945, "logits/rejected": -9.42386245727539, "logps/chosen": -6.1403703689575195, "logps/rejected": -106.22123718261719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.492971897125244, "rewards/margins": 10.029213905334473, "rewards/rejected": -7.53624153137207, "step": 726 }, { "epoch": 0.5103545103545104, "grad_norm": 0.0012818590039387345, "learning_rate": 4.7692240180983964e-05, "logits/chosen": -8.039255142211914, "logits/rejected": -8.022526741027832, "logps/chosen": -3.691120147705078, "logps/rejected": -108.57221984863281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.944822311401367, "rewards/margins": 10.576433181762695, "rewards/rejected": -7.631610870361328, "step": 727 }, { "epoch": 0.5110565110565111, "grad_norm": 0.0073013207875192165, "learning_rate": 4.770218965521729e-05, "logits/chosen": -8.592602729797363, "logits/rejected": -8.59332275390625, "logps/chosen": -8.850079536437988, "logps/rejected": -107.45578002929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3953399658203125, "rewards/margins": 9.756957054138184, "rewards/rejected": -7.361617565155029, "step": 728 }, { "epoch": 0.5117585117585117, "grad_norm": 0.0012156926095485687, "learning_rate": 4.7712125471966245e-05, "logits/chosen": -8.145329475402832, "logits/rejected": -8.127080917358398, "logps/chosen": -3.995387315750122, "logps/rejected": -108.74705505371094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.777749538421631, "rewards/margins": 10.499252319335938, "rewards/rejected": -7.721502304077148, "step": 729 }, { "epoch": 0.5124605124605125, "grad_norm": 0.0012002689763903618, "learning_rate": 4.7722047668674267e-05, "logits/chosen": -8.662242889404297, "logits/rejected": -8.652141571044922, "logps/chosen": -3.260681629180908, "logps/rejected": -108.5836410522461, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.914794445037842, "rewards/margins": 10.594694137573242, "rewards/rejected": -7.679900169372559, "step": 730 }, { "epoch": 0.5131625131625132, "grad_norm": 0.0013688582694157958, "learning_rate": 4.7731956282631004e-05, "logits/chosen": -9.934555053710938, "logits/rejected": -9.937253952026367, "logps/chosen": -18.2783203125, "logps/rejected": -99.41154479980469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.6272706985473633, "rewards/margins": 8.285408973693848, "rewards/rejected": -6.658138751983643, "step": 731 }, { "epoch": 0.5138645138645138, "grad_norm": 0.0013430098770186305, "learning_rate": 4.77418513509732e-05, "logits/chosen": -9.41046142578125, "logits/rejected": -9.413104057312012, "logps/chosen": -5.8980536460876465, "logps/rejected": -105.94361877441406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.528184175491333, "rewards/margins": 9.987564086914062, "rewards/rejected": -7.45937967300415, "step": 732 }, { "epoch": 0.5145665145665146, "grad_norm": 0.0011951917549595237, "learning_rate": 4.775173291068547e-05, "logits/chosen": -8.949190139770508, "logits/rejected": -8.958956718444824, "logps/chosen": -11.870645523071289, "logps/rejected": -103.13993835449219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.067636489868164, "rewards/margins": 8.940896987915039, "rewards/rejected": -6.873259544372559, "step": 733 }, { "epoch": 0.5152685152685152, "grad_norm": 0.0011295708827674389, "learning_rate": 4.776160099860117e-05, "logits/chosen": -8.802639961242676, "logits/rejected": -8.77609920501709, "logps/chosen": -3.8986446857452393, "logps/rejected": -108.42381286621094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9468343257904053, "rewards/margins": 10.52299690246582, "rewards/rejected": -7.576162338256836, "step": 734 }, { "epoch": 0.515970515970516, "grad_norm": 0.0014922542031854391, "learning_rate": 4.777145565140325e-05, "logits/chosen": -8.074104309082031, "logits/rejected": -8.085395812988281, "logps/chosen": -6.7395734786987305, "logps/rejected": -105.70323181152344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5173416137695312, "rewards/margins": 9.832706451416016, "rewards/rejected": -7.315364837646484, "step": 735 }, { "epoch": 0.5166725166725167, "grad_norm": 0.0020384248346090317, "learning_rate": 4.7781296905624986e-05, "logits/chosen": -9.18210220336914, "logits/rejected": -9.190692901611328, "logps/chosen": -6.7743964195251465, "logps/rejected": -106.88031005859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4368975162506104, "rewards/margins": 10.021064758300781, "rewards/rejected": -7.584166526794434, "step": 736 }, { "epoch": 0.5173745173745173, "grad_norm": 0.0017656050622463226, "learning_rate": 4.779112479765086e-05, "logits/chosen": -8.697786331176758, "logits/rejected": -8.677882194519043, "logps/chosen": -5.302433013916016, "logps/rejected": -104.91688537597656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.67218017578125, "rewards/margins": 10.048554420471191, "rewards/rejected": -7.376374244689941, "step": 737 }, { "epoch": 0.5180765180765181, "grad_norm": 0.036214280873537064, "learning_rate": 4.780093936371736e-05, "logits/chosen": -8.462875366210938, "logits/rejected": -8.45949935913086, "logps/chosen": -4.293979644775391, "logps/rejected": -108.03392028808594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7524795532226562, "rewards/margins": 10.415108680725098, "rewards/rejected": -7.662628173828125, "step": 738 }, { "epoch": 0.5187785187785188, "grad_norm": 0.0010223283898085356, "learning_rate": 4.781074063991376e-05, "logits/chosen": -8.70595932006836, "logits/rejected": -8.711629867553711, "logps/chosen": -8.005993843078613, "logps/rejected": -105.65260314941406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4707398414611816, "rewards/margins": 9.792610168457031, "rewards/rejected": -7.321869850158691, "step": 739 }, { "epoch": 0.5194805194805194, "grad_norm": 0.0036031208001077175, "learning_rate": 4.782052866218294e-05, "logits/chosen": -9.60185432434082, "logits/rejected": -9.596271514892578, "logps/chosen": -3.749642848968506, "logps/rejected": -109.66471099853516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.633838176727295, "rewards/margins": 10.478559494018555, "rewards/rejected": -7.844720840454102, "step": 740 }, { "epoch": 0.5201825201825202, "grad_norm": 0.0011466331779956818, "learning_rate": 4.783030346632214e-05, "logits/chosen": -8.422893524169922, "logits/rejected": -8.404111862182617, "logps/chosen": -6.970730304718018, "logps/rejected": -105.76197052001953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.589175224304199, "rewards/margins": 9.91620922088623, "rewards/rejected": -7.3270344734191895, "step": 741 }, { "epoch": 0.5208845208845209, "grad_norm": 0.0011848454596474767, "learning_rate": 4.7840065087983786e-05, "logits/chosen": -9.40322494506836, "logits/rejected": -9.402043342590332, "logps/chosen": -6.748039722442627, "logps/rejected": -107.44580841064453, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5768790245056152, "rewards/margins": 10.05600357055664, "rewards/rejected": -7.479124069213867, "step": 742 }, { "epoch": 0.5215865215865216, "grad_norm": 0.0009989376412704587, "learning_rate": 4.784981356267626e-05, "logits/chosen": -8.624664306640625, "logits/rejected": -8.62614631652832, "logps/chosen": -3.4744038581848145, "logps/rejected": -108.96136474609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6783690452575684, "rewards/margins": 10.535453796386719, "rewards/rejected": -7.857085227966309, "step": 743 }, { "epoch": 0.5222885222885223, "grad_norm": 0.0016475076554343104, "learning_rate": 4.785954892576465e-05, "logits/chosen": -8.79848861694336, "logits/rejected": -8.769512176513672, "logps/chosen": -7.801493167877197, "logps/rejected": -105.62007141113281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.482661247253418, "rewards/margins": 9.809654235839844, "rewards/rejected": -7.326993465423584, "step": 744 }, { "epoch": 0.522990522990523, "grad_norm": 0.0014024041593074799, "learning_rate": 4.7869271212471554e-05, "logits/chosen": -9.405363082885742, "logits/rejected": -9.389379501342773, "logps/chosen": -6.571925640106201, "logps/rejected": -107.68011474609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5198824405670166, "rewards/margins": 10.119159698486328, "rewards/rejected": -7.599277019500732, "step": 745 }, { "epoch": 0.5236925236925237, "grad_norm": 0.001408567768521607, "learning_rate": 4.7878980457877814e-05, "logits/chosen": -9.31932258605957, "logits/rejected": -9.299598693847656, "logps/chosen": -3.2187089920043945, "logps/rejected": -109.23919677734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.81150484085083, "rewards/margins": 10.578777313232422, "rewards/rejected": -7.767271518707275, "step": 746 }, { "epoch": 0.5243945243945244, "grad_norm": 0.005045863799750805, "learning_rate": 4.7888676696923315e-05, "logits/chosen": -8.694091796875, "logits/rejected": -8.695772171020508, "logps/chosen": -2.8736934661865234, "logps/rejected": -109.06175994873047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.891765832901001, "rewards/margins": 10.617607116699219, "rewards/rejected": -7.725842475891113, "step": 747 }, { "epoch": 0.525096525096525, "grad_norm": 0.0015434316592290998, "learning_rate": 4.7898359964407695e-05, "logits/chosen": -8.634785652160645, "logits/rejected": -8.628132820129395, "logps/chosen": -3.4848852157592773, "logps/rejected": -108.80697631835938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8793110847473145, "rewards/margins": 10.616485595703125, "rewards/rejected": -7.737174034118652, "step": 748 }, { "epoch": 0.5257985257985258, "grad_norm": 0.0013662249548360705, "learning_rate": 4.790803029499111e-05, "logits/chosen": -9.428342819213867, "logits/rejected": -9.428982734680176, "logps/chosen": -5.207060813903809, "logps/rejected": -106.98579406738281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.570021390914917, "rewards/margins": 10.192556381225586, "rewards/rejected": -7.62253475189209, "step": 749 }, { "epoch": 0.5265005265005265, "grad_norm": 0.002207741839811206, "learning_rate": 4.7917687723195004e-05, "logits/chosen": -9.099411010742188, "logits/rejected": -9.105018615722656, "logps/chosen": -12.229572296142578, "logps/rejected": -106.89104461669922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.042616844177246, "rewards/margins": 9.405548095703125, "rewards/rejected": -7.362931728363037, "step": 750 }, { "epoch": 0.5272025272025272, "grad_norm": 0.0012440034188330173, "learning_rate": 4.792733228340281e-05, "logits/chosen": -8.827529907226562, "logits/rejected": -8.83348274230957, "logps/chosen": -2.8711354732513428, "logps/rejected": -108.41040802001953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8714163303375244, "rewards/margins": 10.681488990783691, "rewards/rejected": -7.810072898864746, "step": 751 }, { "epoch": 0.5279045279045279, "grad_norm": 0.09422644972801208, "learning_rate": 4.793696400986071e-05, "logits/chosen": -8.202922821044922, "logits/rejected": -8.19430160522461, "logps/chosen": -2.4837422370910645, "logps/rejected": -109.69305419921875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.912045478820801, "rewards/margins": 10.74793815612793, "rewards/rejected": -7.835893154144287, "step": 752 }, { "epoch": 0.5286065286065286, "grad_norm": 0.17059707641601562, "learning_rate": 4.7946582936678344e-05, "logits/chosen": -8.602169036865234, "logits/rejected": -8.603347778320312, "logps/chosen": -10.079854011535645, "logps/rejected": -104.20110321044922, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.152456045150757, "rewards/margins": 9.361837387084961, "rewards/rejected": -7.209381580352783, "step": 753 }, { "epoch": 0.5293085293085293, "grad_norm": 0.001598062110133469, "learning_rate": 4.795618909782957e-05, "logits/chosen": -8.074943542480469, "logits/rejected": -8.06580924987793, "logps/chosen": -3.04628586769104, "logps/rejected": -109.20523071289062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.897214412689209, "rewards/margins": 10.639505386352539, "rewards/rejected": -7.742291450500488, "step": 754 }, { "epoch": 0.53001053001053, "grad_norm": 0.0009593103313818574, "learning_rate": 4.796578252715314e-05, "logits/chosen": -7.924717903137207, "logits/rejected": -7.924848556518555, "logps/chosen": -7.726203918457031, "logps/rejected": -105.39225769042969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5202977657318115, "rewards/margins": 9.702953338623047, "rewards/rejected": -7.182656288146973, "step": 755 }, { "epoch": 0.5307125307125307, "grad_norm": 0.0017839007778093219, "learning_rate": 4.797536325835345e-05, "logits/chosen": -9.66338062286377, "logits/rejected": -9.6580810546875, "logps/chosen": -10.695937156677246, "logps/rejected": -102.53106689453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0890378952026367, "rewards/margins": 9.174357414245605, "rewards/rejected": -7.085319519042969, "step": 756 }, { "epoch": 0.5314145314145314, "grad_norm": 0.0021303456742316484, "learning_rate": 4.7984931325001216e-05, "logits/chosen": -8.909147262573242, "logits/rejected": -8.900476455688477, "logps/chosen": -5.721314430236816, "logps/rejected": -106.9058837890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7221035957336426, "rewards/margins": 10.224705696105957, "rewards/rejected": -7.502601623535156, "step": 757 }, { "epoch": 0.5321165321165321, "grad_norm": 0.0012417212128639221, "learning_rate": 4.799448676053423e-05, "logits/chosen": -8.916329383850098, "logits/rejected": -8.901568412780762, "logps/chosen": -10.024208068847656, "logps/rejected": -103.9007568359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3059425354003906, "rewards/margins": 9.381187438964844, "rewards/rejected": -7.075244903564453, "step": 758 }, { "epoch": 0.5328185328185329, "grad_norm": 0.0013099053176119924, "learning_rate": 4.800402959825802e-05, "logits/chosen": -8.474567413330078, "logits/rejected": -8.448532104492188, "logps/chosen": -3.2898943424224854, "logps/rejected": -109.00992584228516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.942716598510742, "rewards/margins": 10.573246955871582, "rewards/rejected": -7.630529880523682, "step": 759 }, { "epoch": 0.5335205335205335, "grad_norm": 0.0008023087866604328, "learning_rate": 4.801355987134653e-05, "logits/chosen": -8.010520935058594, "logits/rejected": -7.989349842071533, "logps/chosen": -2.304471254348755, "logps/rejected": -109.8748779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.036374807357788, "rewards/margins": 10.768665313720703, "rewards/rejected": -7.732290744781494, "step": 760 }, { "epoch": 0.5342225342225342, "grad_norm": 0.0008604503236711025, "learning_rate": 4.802307761284289e-05, "logits/chosen": -9.33313274383545, "logits/rejected": -9.327446937561035, "logps/chosen": -8.209661483764648, "logps/rejected": -104.57563781738281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3496358394622803, "rewards/margins": 9.62690544128418, "rewards/rejected": -7.27726936340332, "step": 761 }, { "epoch": 0.534924534924535, "grad_norm": 0.0020852817688137293, "learning_rate": 4.8032582855660014e-05, "logits/chosen": -8.902711868286133, "logits/rejected": -8.858587265014648, "logps/chosen": -2.1583480834960938, "logps/rejected": -109.22232055664062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1390132904052734, "rewards/margins": 10.797798156738281, "rewards/rejected": -7.658785343170166, "step": 762 }, { "epoch": 0.5356265356265356, "grad_norm": 0.0010942997178062797, "learning_rate": 4.8042075632581346e-05, "logits/chosen": -8.616594314575195, "logits/rejected": -8.585611343383789, "logps/chosen": -2.6124520301818848, "logps/rejected": -109.6781234741211, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9984755516052246, "rewards/margins": 10.709400177001953, "rewards/rejected": -7.7109246253967285, "step": 763 }, { "epoch": 0.5363285363285363, "grad_norm": 0.0014006331330165267, "learning_rate": 4.80515559762615e-05, "logits/chosen": -8.315877914428711, "logits/rejected": -8.316553115844727, "logps/chosen": -5.46417236328125, "logps/rejected": -106.85687255859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7054331302642822, "rewards/margins": 10.171050071716309, "rewards/rejected": -7.4656171798706055, "step": 764 }, { "epoch": 0.537030537030537, "grad_norm": 0.11716674268245697, "learning_rate": 4.8061023919226964e-05, "logits/chosen": -8.719602584838867, "logits/rejected": -8.72358512878418, "logps/chosen": -2.805163860321045, "logps/rejected": -109.46644592285156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.831911325454712, "rewards/margins": 10.661290168762207, "rewards/rejected": -7.829379558563232, "step": 765 }, { "epoch": 0.5377325377325377, "grad_norm": 0.001946399686858058, "learning_rate": 4.807047949387674e-05, "logits/chosen": -8.399606704711914, "logits/rejected": -8.38398551940918, "logps/chosen": -7.644145488739014, "logps/rejected": -104.27769470214844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.657339096069336, "rewards/margins": 9.766029357910156, "rewards/rejected": -7.108691215515137, "step": 766 }, { "epoch": 0.5384345384345385, "grad_norm": 0.0024355812929570675, "learning_rate": 4.807992273248302e-05, "logits/chosen": -8.452861785888672, "logits/rejected": -8.433866500854492, "logps/chosen": -1.7705808877944946, "logps/rejected": -109.63716125488281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.903911590576172, "rewards/margins": 10.77595329284668, "rewards/rejected": -7.872042179107666, "step": 767 }, { "epoch": 0.5391365391365391, "grad_norm": 0.0009131710394285619, "learning_rate": 4.808935366719187e-05, "logits/chosen": -9.246526718139648, "logits/rejected": -9.248634338378906, "logps/chosen": -2.6448376178741455, "logps/rejected": -109.49697875976562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9842700958251953, "rewards/margins": 10.678611755371094, "rewards/rejected": -7.694341659545898, "step": 768 }, { "epoch": 0.5398385398385398, "grad_norm": 0.0013768133940175176, "learning_rate": 4.8098772330023855e-05, "logits/chosen": -8.77442741394043, "logits/rejected": -8.774759292602539, "logps/chosen": -1.832148790359497, "logps/rejected": -109.51081085205078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0486719608306885, "rewards/margins": 10.839962005615234, "rewards/rejected": -7.791290283203125, "step": 769 }, { "epoch": 0.5405405405405406, "grad_norm": 0.0009712298633530736, "learning_rate": 4.81081787528747e-05, "logits/chosen": -9.502687454223633, "logits/rejected": -9.501945495605469, "logps/chosen": -12.535123825073242, "logps/rejected": -103.69288635253906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0986435413360596, "rewards/margins": 9.062698364257812, "rewards/rejected": -6.964055061340332, "step": 770 }, { "epoch": 0.5412425412425412, "grad_norm": 0.0006363438442349434, "learning_rate": 4.811757296751595e-05, "logits/chosen": -8.704866409301758, "logits/rejected": -8.694694519042969, "logps/chosen": -4.390791893005371, "logps/rejected": -109.98870849609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7375006675720215, "rewards/margins": 10.46002197265625, "rewards/rejected": -7.7225213050842285, "step": 771 }, { "epoch": 0.5419445419445419, "grad_norm": 0.0011236956343054771, "learning_rate": 4.812695500559561e-05, "logits/chosen": -8.305606842041016, "logits/rejected": -8.303318977355957, "logps/chosen": -10.837596893310547, "logps/rejected": -106.54613494873047, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1715738773345947, "rewards/margins": 9.519303321838379, "rewards/rejected": -7.347729206085205, "step": 772 }, { "epoch": 0.5426465426465427, "grad_norm": 0.004043136723339558, "learning_rate": 4.8136324898638756e-05, "logits/chosen": -9.424785614013672, "logits/rejected": -9.433906555175781, "logps/chosen": -3.188676357269287, "logps/rejected": -110.02290344238281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.684321641921997, "rewards/margins": 10.594432830810547, "rewards/rejected": -7.910110950469971, "step": 773 }, { "epoch": 0.5433485433485433, "grad_norm": 0.0011948919855058193, "learning_rate": 4.8145682678048214e-05, "logits/chosen": -8.654035568237305, "logits/rejected": -8.660467147827148, "logps/chosen": -4.8112077713012695, "logps/rejected": -107.76048278808594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7596802711486816, "rewards/margins": 10.369970321655273, "rewards/rejected": -7.61029052734375, "step": 774 }, { "epoch": 0.5440505440505441, "grad_norm": 0.008132479153573513, "learning_rate": 4.815502837510518e-05, "logits/chosen": -8.475614547729492, "logits/rejected": -8.478496551513672, "logps/chosen": -1.8742742538452148, "logps/rejected": -109.43970489501953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9850645065307617, "rewards/margins": 10.814352035522461, "rewards/rejected": -7.829287528991699, "step": 775 }, { "epoch": 0.5447525447525448, "grad_norm": 0.0015435823006555438, "learning_rate": 4.816436202096981e-05, "logits/chosen": -8.583593368530273, "logits/rejected": -8.574150085449219, "logps/chosen": -10.084345817565918, "logps/rejected": -104.48324584960938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1863436698913574, "rewards/margins": 9.3430814743042, "rewards/rejected": -7.156737804412842, "step": 776 }, { "epoch": 0.5454545454545454, "grad_norm": 0.0009839730337262154, "learning_rate": 4.81736836466819e-05, "logits/chosen": -8.087800025939941, "logits/rejected": -8.07923698425293, "logps/chosen": -5.3107523918151855, "logps/rejected": -108.32565307617188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.884737730026245, "rewards/margins": 10.427452087402344, "rewards/rejected": -7.5427141189575195, "step": 777 }, { "epoch": 0.5461565461565462, "grad_norm": 0.0009364220895804465, "learning_rate": 4.8182993283161485e-05, "logits/chosen": -8.850781440734863, "logits/rejected": -8.83418083190918, "logps/chosen": -7.358994960784912, "logps/rejected": -108.05783081054688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.474531650543213, "rewards/margins": 10.043612480163574, "rewards/rejected": -7.569080829620361, "step": 778 }, { "epoch": 0.5468585468585468, "grad_norm": 0.001130172167904675, "learning_rate": 4.819229096120941e-05, "logits/chosen": -8.249344825744629, "logits/rejected": -8.251434326171875, "logps/chosen": -6.301486492156982, "logps/rejected": -107.04267120361328, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7063686847686768, "rewards/margins": 10.09840202331543, "rewards/rejected": -7.392033576965332, "step": 779 }, { "epoch": 0.5475605475605475, "grad_norm": 0.4795147180557251, "learning_rate": 4.820157671150801e-05, "logits/chosen": -8.556113243103027, "logits/rejected": -8.541938781738281, "logps/chosen": -18.571247100830078, "logps/rejected": -94.4859848022461, "loss": 0.0791, "rewards/accuracies": 0.875, "rewards/chosen": 1.4208952188491821, "rewards/margins": 7.550260543823242, "rewards/rejected": -6.129364967346191, "step": 780 }, { "epoch": 0.5482625482625483, "grad_norm": 0.001289957552216947, "learning_rate": 4.821085056462168e-05, "logits/chosen": -8.696268081665039, "logits/rejected": -8.686346054077148, "logps/chosen": -3.4769864082336426, "logps/rejected": -109.454833984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.865220546722412, "rewards/margins": 10.554282188415527, "rewards/rejected": -7.689061164855957, "step": 781 }, { "epoch": 0.5489645489645489, "grad_norm": 0.0013481107307597995, "learning_rate": 4.822011255099747e-05, "logits/chosen": -8.727834701538086, "logits/rejected": -8.73442268371582, "logps/chosen": -6.1369147300720215, "logps/rejected": -104.41954040527344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7124485969543457, "rewards/margins": 9.902654647827148, "rewards/rejected": -7.190206527709961, "step": 782 }, { "epoch": 0.5496665496665497, "grad_norm": 0.001619346672669053, "learning_rate": 4.8229362700965726e-05, "logits/chosen": -8.694768905639648, "logits/rejected": -8.71143913269043, "logps/chosen": -6.137058258056641, "logps/rejected": -107.63661193847656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.562241554260254, "rewards/margins": 10.162853240966797, "rewards/rejected": -7.600611686706543, "step": 783 }, { "epoch": 0.5503685503685504, "grad_norm": 0.002299916697666049, "learning_rate": 4.8238601044740645e-05, "logits/chosen": -8.544599533081055, "logits/rejected": -8.550745010375977, "logps/chosen": -14.268799781799316, "logps/rejected": -104.39581298828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0039572715759277, "rewards/margins": 9.072098731994629, "rewards/rejected": -7.068141460418701, "step": 784 }, { "epoch": 0.551070551070551, "grad_norm": 0.0031967710237950087, "learning_rate": 4.824782761242088e-05, "logits/chosen": -8.242158889770508, "logits/rejected": -8.231325149536133, "logps/chosen": -10.888910293579102, "logps/rejected": -102.45674896240234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1575560569763184, "rewards/margins": 9.17074203491211, "rewards/rejected": -7.013186454772949, "step": 785 }, { "epoch": 0.5517725517725518, "grad_norm": 0.008967027999460697, "learning_rate": 4.8257042433990135e-05, "logits/chosen": -9.379656791687012, "logits/rejected": -9.372620582580566, "logps/chosen": -11.895328521728516, "logps/rejected": -103.20112609863281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.948524832725525, "rewards/margins": 9.13250732421875, "rewards/rejected": -7.183981895446777, "step": 786 }, { "epoch": 0.5524745524745525, "grad_norm": 0.014441384933888912, "learning_rate": 4.826624553931775e-05, "logits/chosen": -8.893465042114258, "logits/rejected": -8.885377883911133, "logps/chosen": -14.660493850708008, "logps/rejected": -101.0829086303711, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8159823417663574, "rewards/margins": 8.670825004577637, "rewards/rejected": -6.854842185974121, "step": 787 }, { "epoch": 0.5531765531765532, "grad_norm": 0.02166915312409401, "learning_rate": 4.827543695815926e-05, "logits/chosen": -8.954629898071289, "logits/rejected": -8.949178695678711, "logps/chosen": -15.868846893310547, "logps/rejected": -99.16162109375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6098979711532593, "rewards/margins": 8.385254859924316, "rewards/rejected": -6.775356292724609, "step": 788 }, { "epoch": 0.5538785538785539, "grad_norm": 0.0657285526394844, "learning_rate": 4.8284616720157006e-05, "logits/chosen": -9.162504196166992, "logits/rejected": -9.153884887695312, "logps/chosen": -15.259696960449219, "logps/rejected": -101.84060668945312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.7392350435256958, "rewards/margins": 8.60622501373291, "rewards/rejected": -6.866990089416504, "step": 789 }, { "epoch": 0.5545805545805546, "grad_norm": 0.04148022457957268, "learning_rate": 4.82937848548407e-05, "logits/chosen": -7.497060775756836, "logits/rejected": -7.502013206481934, "logps/chosen": -21.439762115478516, "logps/rejected": -95.89418029785156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.1917335987091064, "rewards/margins": 7.550511360168457, "rewards/rejected": -6.35877799987793, "step": 790 }, { "epoch": 0.5552825552825553, "grad_norm": 0.04137840494513512, "learning_rate": 4.8302941391627947e-05, "logits/chosen": -8.437350273132324, "logits/rejected": -8.420761108398438, "logps/chosen": -16.39451789855957, "logps/rejected": -101.54385375976562, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.5746902227401733, "rewards/margins": 8.499521255493164, "rewards/rejected": -6.924831867218018, "step": 791 }, { "epoch": 0.555984555984556, "grad_norm": 0.043380651623010635, "learning_rate": 4.83120863598249e-05, "logits/chosen": -8.517017364501953, "logits/rejected": -8.509651184082031, "logps/chosen": -14.515480995178223, "logps/rejected": -101.30467224121094, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.7400554418563843, "rewards/margins": 8.695048332214355, "rewards/rejected": -6.95499324798584, "step": 792 }, { "epoch": 0.5566865566865566, "grad_norm": 0.029155688360333443, "learning_rate": 4.832121978862673e-05, "logits/chosen": -8.090412139892578, "logits/rejected": -8.07435417175293, "logps/chosen": -15.815364837646484, "logps/rejected": -102.80746459960938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.7052831649780273, "rewards/margins": 8.688949584960938, "rewards/rejected": -6.98366641998291, "step": 793 }, { "epoch": 0.5573885573885574, "grad_norm": 0.01690620742738247, "learning_rate": 4.8330341707118276e-05, "logits/chosen": -8.818367004394531, "logits/rejected": -8.82099723815918, "logps/chosen": -15.555193901062012, "logps/rejected": -101.40800476074219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.5205729007720947, "rewards/margins": 8.621219635009766, "rewards/rejected": -7.100646018981934, "step": 794 }, { "epoch": 0.5580905580905581, "grad_norm": 0.006844535004347563, "learning_rate": 4.833945214427451e-05, "logits/chosen": -9.68975830078125, "logits/rejected": -9.6803617477417, "logps/chosen": -12.884297370910645, "logps/rejected": -105.11066436767578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.784005880355835, "rewards/margins": 9.09776782989502, "rewards/rejected": -7.313762664794922, "step": 795 }, { "epoch": 0.5587925587925588, "grad_norm": 0.017127495259046555, "learning_rate": 4.834855112896116e-05, "logits/chosen": -8.659440994262695, "logits/rejected": -8.678764343261719, "logps/chosen": -12.952125549316406, "logps/rejected": -105.94232177734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.877216100692749, "rewards/margins": 9.21990966796875, "rewards/rejected": -7.342693328857422, "step": 796 }, { "epoch": 0.5594945594945595, "grad_norm": 0.004244522657245398, "learning_rate": 4.835763868993521e-05, "logits/chosen": -9.602258682250977, "logits/rejected": -9.624227523803711, "logps/chosen": -8.701371192932129, "logps/rejected": -108.3935546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2234573364257812, "rewards/margins": 9.864055633544922, "rewards/rejected": -7.640599250793457, "step": 797 }, { "epoch": 0.5601965601965602, "grad_norm": 0.0030378082301467657, "learning_rate": 4.8366714855845496e-05, "logits/chosen": -8.605302810668945, "logits/rejected": -8.626501083374023, "logps/chosen": -17.24557113647461, "logps/rejected": -102.6240234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.7057867050170898, "rewards/margins": 8.514486312866211, "rewards/rejected": -6.808699607849121, "step": 798 }, { "epoch": 0.5608985608985609, "grad_norm": 0.003497328143566847, "learning_rate": 4.837577965523319e-05, "logits/chosen": -8.873054504394531, "logits/rejected": -8.885976791381836, "logps/chosen": -8.524447441101074, "logps/rejected": -107.27721405029297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.358153820037842, "rewards/margins": 9.812738418579102, "rewards/rejected": -7.454584121704102, "step": 799 }, { "epoch": 0.5616005616005616, "grad_norm": 0.00515150697901845, "learning_rate": 4.8384833116532396e-05, "logits/chosen": -8.816047668457031, "logits/rejected": -8.842328071594238, "logps/chosen": -8.639659881591797, "logps/rejected": -107.31562805175781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.287039041519165, "rewards/margins": 9.814929962158203, "rewards/rejected": -7.527891159057617, "step": 800 }, { "epoch": 0.5623025623025623, "grad_norm": 0.013157667592167854, "learning_rate": 4.8393875268070636e-05, "logits/chosen": -8.659713745117188, "logits/rejected": -8.664427757263184, "logps/chosen": -9.766263008117676, "logps/rejected": -106.94911193847656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.166693687438965, "rewards/margins": 9.61198616027832, "rewards/rejected": -7.445291519165039, "step": 801 }, { "epoch": 0.563004563004563, "grad_norm": 0.002364583546295762, "learning_rate": 4.84029061380694e-05, "logits/chosen": -8.951957702636719, "logits/rejected": -8.940651893615723, "logps/chosen": -8.487149238586426, "logps/rejected": -107.79931640625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2751171588897705, "rewards/margins": 9.843202590942383, "rewards/rejected": -7.568085670471191, "step": 802 }, { "epoch": 0.5637065637065637, "grad_norm": 0.0025606227573007345, "learning_rate": 4.841192575464469e-05, "logits/chosen": -9.077948570251465, "logits/rejected": -9.076491355895996, "logps/chosen": -10.91256046295166, "logps/rejected": -107.34426879882812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0975558757781982, "rewards/margins": 9.503459930419922, "rewards/rejected": -7.4059038162231445, "step": 803 }, { "epoch": 0.5644085644085645, "grad_norm": 0.0015107672661542892, "learning_rate": 4.842093414580753e-05, "logits/chosen": -8.418102264404297, "logits/rejected": -8.411396026611328, "logps/chosen": -13.681364059448242, "logps/rejected": -102.3269271850586, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8907766342163086, "rewards/margins": 8.851215362548828, "rewards/rejected": -6.960438251495361, "step": 804 }, { "epoch": 0.5651105651105651, "grad_norm": 0.0018911404768005013, "learning_rate": 4.842993133946448e-05, "logits/chosen": -8.174800872802734, "logits/rejected": -8.184349060058594, "logps/chosen": -8.523653984069824, "logps/rejected": -108.34701538085938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.367168426513672, "rewards/margins": 9.886362075805664, "rewards/rejected": -7.51919412612915, "step": 805 }, { "epoch": 0.5658125658125658, "grad_norm": 0.0016133477911353111, "learning_rate": 4.843891736341818e-05, "logits/chosen": -9.263206481933594, "logits/rejected": -9.261146545410156, "logps/chosen": -12.568992614746094, "logps/rejected": -105.9271240234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9651182889938354, "rewards/margins": 9.3151216506958, "rewards/rejected": -7.350003242492676, "step": 806 }, { "epoch": 0.5665145665145666, "grad_norm": 0.0024484251625835896, "learning_rate": 4.8447892245367846e-05, "logits/chosen": -7.983319282531738, "logits/rejected": -7.9999189376831055, "logps/chosen": -12.743620872497559, "logps/rejected": -103.30072784423828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9895380735397339, "rewards/margins": 9.114009857177734, "rewards/rejected": -7.124472618103027, "step": 807 }, { "epoch": 0.5672165672165672, "grad_norm": 0.003982552792876959, "learning_rate": 4.845685601290977e-05, "logits/chosen": -9.10179328918457, "logits/rejected": -9.102402687072754, "logps/chosen": -8.860532760620117, "logps/rejected": -107.72117614746094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.270397186279297, "rewards/margins": 9.967124938964844, "rewards/rejected": -7.696728706359863, "step": 808 }, { "epoch": 0.5679185679185679, "grad_norm": 0.002218119101598859, "learning_rate": 4.846580869353787e-05, "logits/chosen": -9.179718017578125, "logits/rejected": -9.188321113586426, "logps/chosen": -7.16670036315918, "logps/rejected": -108.48532104492188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.395047426223755, "rewards/margins": 10.037242889404297, "rewards/rejected": -7.642195224761963, "step": 809 }, { "epoch": 0.5686205686205686, "grad_norm": 0.001275999704375863, "learning_rate": 4.847475031464416e-05, "logits/chosen": -8.394241333007812, "logits/rejected": -8.370417594909668, "logps/chosen": -6.4784135818481445, "logps/rejected": -109.36567687988281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5115766525268555, "rewards/margins": 10.179767608642578, "rewards/rejected": -7.668191909790039, "step": 810 }, { "epoch": 0.5693225693225693, "grad_norm": 0.0011090150801464915, "learning_rate": 4.8483680903519274e-05, "logits/chosen": -9.31763744354248, "logits/rejected": -9.313959121704102, "logps/chosen": -6.243013381958008, "logps/rejected": -109.92302703857422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.282254219055176, "rewards/margins": 10.177050590515137, "rewards/rejected": -7.894796371459961, "step": 811 }, { "epoch": 0.5700245700245701, "grad_norm": 0.0012019382556900382, "learning_rate": 4.8492600487352926e-05, "logits/chosen": -9.099868774414062, "logits/rejected": -9.106141090393066, "logps/chosen": -9.439476013183594, "logps/rejected": -104.90809631347656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0926589965820312, "rewards/margins": 9.591092109680176, "rewards/rejected": -7.4984331130981445, "step": 812 }, { "epoch": 0.5707265707265707, "grad_norm": 0.866608202457428, "learning_rate": 4.850150909323447e-05, "logits/chosen": -8.851543426513672, "logits/rejected": -8.852827072143555, "logps/chosen": -11.540472030639648, "logps/rejected": -104.4913101196289, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 2.08758807182312, "rewards/margins": 9.262866973876953, "rewards/rejected": -7.175278186798096, "step": 813 }, { "epoch": 0.5714285714285714, "grad_norm": 0.0011250453535467386, "learning_rate": 4.8510406748153355e-05, "logits/chosen": -8.737077713012695, "logits/rejected": -8.759603500366211, "logps/chosen": -11.152482986450195, "logps/rejected": -107.40867614746094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.094961166381836, "rewards/margins": 9.600442886352539, "rewards/rejected": -7.505482196807861, "step": 814 }, { "epoch": 0.5721305721305722, "grad_norm": 0.008440857753157616, "learning_rate": 4.8519293478999614e-05, "logits/chosen": -8.451850891113281, "logits/rejected": -8.457151412963867, "logps/chosen": -11.930255889892578, "logps/rejected": -104.21678161621094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.083807945251465, "rewards/margins": 9.179810523986816, "rewards/rejected": -7.096001625061035, "step": 815 }, { "epoch": 0.5728325728325728, "grad_norm": 0.0013659625547006726, "learning_rate": 4.8528169312564355e-05, "logits/chosen": -8.694467544555664, "logits/rejected": -8.694093704223633, "logps/chosen": -9.245306968688965, "logps/rejected": -106.59961700439453, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.325214385986328, "rewards/margins": 9.759449005126953, "rewards/rejected": -7.434234619140625, "step": 816 }, { "epoch": 0.5735345735345735, "grad_norm": 0.003321623895317316, "learning_rate": 4.8537034275540264e-05, "logits/chosen": -9.030479431152344, "logits/rejected": -9.029563903808594, "logps/chosen": -9.322093963623047, "logps/rejected": -106.44234466552734, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.139808177947998, "rewards/margins": 9.59125804901123, "rewards/rejected": -7.451450347900391, "step": 817 }, { "epoch": 0.5742365742365743, "grad_norm": 0.0010090676369145513, "learning_rate": 4.854588839452205e-05, "logits/chosen": -8.716009140014648, "logits/rejected": -8.694723129272461, "logps/chosen": -6.165672779083252, "logps/rejected": -108.52906799316406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.517564296722412, "rewards/margins": 10.227574348449707, "rewards/rejected": -7.710009574890137, "step": 818 }, { "epoch": 0.5749385749385749, "grad_norm": 0.0008187644416466355, "learning_rate": 4.855473169600698e-05, "logits/chosen": -8.120474815368652, "logits/rejected": -8.118013381958008, "logps/chosen": -10.040306091308594, "logps/rejected": -105.67489624023438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.447495937347412, "rewards/margins": 9.734652519226074, "rewards/rejected": -7.287156581878662, "step": 819 }, { "epoch": 0.5756405756405757, "grad_norm": 0.0014248997904360294, "learning_rate": 4.856356420639528e-05, "logits/chosen": -9.024009704589844, "logits/rejected": -9.043819427490234, "logps/chosen": -7.817588806152344, "logps/rejected": -105.83973693847656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4095044136047363, "rewards/margins": 9.773676872253418, "rewards/rejected": -7.364172458648682, "step": 820 }, { "epoch": 0.5763425763425764, "grad_norm": 0.001368628814816475, "learning_rate": 4.857238595199068e-05, "logits/chosen": -9.073637008666992, "logits/rejected": -9.083944320678711, "logps/chosen": -6.532811164855957, "logps/rejected": -108.32661437988281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.466822862625122, "rewards/margins": 10.090776443481445, "rewards/rejected": -7.623954772949219, "step": 821 }, { "epoch": 0.577044577044577, "grad_norm": 0.0010115507757291198, "learning_rate": 4.858119695900084e-05, "logits/chosen": -8.45067024230957, "logits/rejected": -8.44902515411377, "logps/chosen": -5.962835788726807, "logps/rejected": -108.60452270507812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6773529052734375, "rewards/margins": 10.289905548095703, "rewards/rejected": -7.612553119659424, "step": 822 }, { "epoch": 0.5777465777465778, "grad_norm": 0.0009575331350788474, "learning_rate": 4.858999725353783e-05, "logits/chosen": -8.395423889160156, "logits/rejected": -8.3895845413208, "logps/chosen": -8.361837387084961, "logps/rejected": -106.48816680908203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.442354440689087, "rewards/margins": 9.821231842041016, "rewards/rejected": -7.37887716293335, "step": 823 }, { "epoch": 0.5784485784485784, "grad_norm": 0.00322050997056067, "learning_rate": 4.8598786861618605e-05, "logits/chosen": -9.196242332458496, "logits/rejected": -9.214263916015625, "logps/chosen": -5.749598979949951, "logps/rejected": -108.7440185546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5951528549194336, "rewards/margins": 10.276445388793945, "rewards/rejected": -7.681292533874512, "step": 824 }, { "epoch": 0.5791505791505791, "grad_norm": 0.0014292385894805193, "learning_rate": 4.860756580916542e-05, "logits/chosen": -9.107752799987793, "logits/rejected": -9.113485336303711, "logps/chosen": -9.250249862670898, "logps/rejected": -107.55489349365234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2092151641845703, "rewards/margins": 9.7693452835083, "rewards/rejected": -7.5601301193237305, "step": 825 }, { "epoch": 0.5798525798525799, "grad_norm": 0.002983823651447892, "learning_rate": 4.861633412200637e-05, "logits/chosen": -8.60444450378418, "logits/rejected": -8.59028434753418, "logps/chosen": -6.036561012268066, "logps/rejected": -108.28424072265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5366435050964355, "rewards/margins": 10.123214721679688, "rewards/rejected": -7.586570739746094, "step": 826 }, { "epoch": 0.5805545805545805, "grad_norm": 0.05771346762776375, "learning_rate": 4.862509182587578e-05, "logits/chosen": -9.174978256225586, "logits/rejected": -9.149166107177734, "logps/chosen": -6.184833526611328, "logps/rejected": -109.01427459716797, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.5660459995269775, "rewards/margins": 10.237205505371094, "rewards/rejected": -7.671159267425537, "step": 827 }, { "epoch": 0.5812565812565813, "grad_norm": 0.0009440591675229371, "learning_rate": 4.863383894641467e-05, "logits/chosen": -9.8035249710083, "logits/rejected": -9.78483772277832, "logps/chosen": -8.023303985595703, "logps/rejected": -106.92167663574219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3573615550994873, "rewards/margins": 9.928810119628906, "rewards/rejected": -7.57144832611084, "step": 828 }, { "epoch": 0.581958581958582, "grad_norm": 0.0011475121136754751, "learning_rate": 4.864257550917123e-05, "logits/chosen": -8.235757827758789, "logits/rejected": -8.231483459472656, "logps/chosen": -5.952023983001709, "logps/rejected": -108.45573425292969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.665865898132324, "rewards/margins": 10.333908081054688, "rewards/rejected": -7.668041229248047, "step": 829 }, { "epoch": 0.5826605826605826, "grad_norm": 0.00459166057407856, "learning_rate": 4.865130153960124e-05, "logits/chosen": -8.746826171875, "logits/rejected": -8.713333129882812, "logps/chosen": -8.451322555541992, "logps/rejected": -106.21990966796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1745595932006836, "rewards/margins": 9.76824951171875, "rewards/rejected": -7.593689918518066, "step": 830 }, { "epoch": 0.5833625833625834, "grad_norm": 0.0011921742698177695, "learning_rate": 4.8660017063068526e-05, "logits/chosen": -9.311868667602539, "logits/rejected": -9.311565399169922, "logps/chosen": -6.0236101150512695, "logps/rejected": -106.999267578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.595024585723877, "rewards/margins": 10.064658164978027, "rewards/rejected": -7.46963357925415, "step": 831 }, { "epoch": 0.5840645840645841, "grad_norm": 0.007036360912024975, "learning_rate": 4.8668722104845403e-05, "logits/chosen": -9.355838775634766, "logits/rejected": -9.352253913879395, "logps/chosen": -5.464478492736816, "logps/rejected": -109.05416870117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3818349838256836, "rewards/margins": 10.242984771728516, "rewards/rejected": -7.861149311065674, "step": 832 }, { "epoch": 0.5847665847665847, "grad_norm": 0.0012344353599473834, "learning_rate": 4.8677416690113134e-05, "logits/chosen": -9.185543060302734, "logits/rejected": -9.204849243164062, "logps/chosen": -5.06199836730957, "logps/rejected": -108.58436584472656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.523122787475586, "rewards/margins": 10.23022747039795, "rewards/rejected": -7.707104682922363, "step": 833 }, { "epoch": 0.5854685854685855, "grad_norm": 0.012298448011279106, "learning_rate": 4.868610084396232e-05, "logits/chosen": -8.316173553466797, "logits/rejected": -8.304098129272461, "logps/chosen": -5.016010761260986, "logps/rejected": -108.5943832397461, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.635772705078125, "rewards/margins": 10.381033897399902, "rewards/rejected": -7.745261192321777, "step": 834 }, { "epoch": 0.5861705861705861, "grad_norm": 0.0014168998459354043, "learning_rate": 4.869477459139337e-05, "logits/chosen": -8.656925201416016, "logits/rejected": -8.656158447265625, "logps/chosen": -5.370882987976074, "logps/rejected": -107.95636749267578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.709601879119873, "rewards/margins": 10.29806900024414, "rewards/rejected": -7.588466644287109, "step": 835 }, { "epoch": 0.5868725868725869, "grad_norm": 0.0018463272135704756, "learning_rate": 4.870343795731694e-05, "logits/chosen": -9.416364669799805, "logits/rejected": -9.40399169921875, "logps/chosen": -9.665182113647461, "logps/rejected": -106.7479248046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2018589973449707, "rewards/margins": 9.621466636657715, "rewards/rejected": -7.419607639312744, "step": 836 }, { "epoch": 0.5875745875745876, "grad_norm": 0.070364810526371, "learning_rate": 4.8712090966554334e-05, "logits/chosen": -8.398119926452637, "logits/rejected": -8.381172180175781, "logps/chosen": -5.71483039855957, "logps/rejected": -108.45726013183594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6182541847229004, "rewards/margins": 10.272666931152344, "rewards/rejected": -7.654412269592285, "step": 837 }, { "epoch": 0.5882765882765882, "grad_norm": 0.0013004206120967865, "learning_rate": 4.872073364383795e-05, "logits/chosen": -8.597923278808594, "logits/rejected": -8.60110855102539, "logps/chosen": -6.09662389755249, "logps/rejected": -108.31307983398438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5407814979553223, "rewards/margins": 10.174915313720703, "rewards/rejected": -7.634133815765381, "step": 838 }, { "epoch": 0.588978588978589, "grad_norm": 0.0015698865754529834, "learning_rate": 4.8729366013811674e-05, "logits/chosen": -8.197444915771484, "logits/rejected": -8.175642013549805, "logps/chosen": -5.607226848602295, "logps/rejected": -107.55786895751953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6873559951782227, "rewards/margins": 10.1956787109375, "rewards/rejected": -7.508321762084961, "step": 839 }, { "epoch": 0.5896805896805897, "grad_norm": 0.0012066636700183153, "learning_rate": 4.8737988101031366e-05, "logits/chosen": -9.03207015991211, "logits/rejected": -9.03626537322998, "logps/chosen": -4.79427433013916, "logps/rejected": -108.51715087890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6639699935913086, "rewards/margins": 10.417922019958496, "rewards/rejected": -7.7539520263671875, "step": 840 }, { "epoch": 0.5903825903825903, "grad_norm": 0.0015788882737979293, "learning_rate": 4.874659992996521e-05, "logits/chosen": -8.296993255615234, "logits/rejected": -8.276058197021484, "logps/chosen": -10.506001472473145, "logps/rejected": -101.99684143066406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1949844360351562, "rewards/margins": 9.200193405151367, "rewards/rejected": -7.005209922790527, "step": 841 }, { "epoch": 0.5910845910845911, "grad_norm": 0.002304231282323599, "learning_rate": 4.875520152499416e-05, "logits/chosen": -8.9407958984375, "logits/rejected": -8.93700885772705, "logps/chosen": -5.175708770751953, "logps/rejected": -105.73090362548828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5279531478881836, "rewards/margins": 10.116104125976562, "rewards/rejected": -7.588151931762695, "step": 842 }, { "epoch": 0.5917865917865918, "grad_norm": 0.014292419888079166, "learning_rate": 4.876379291041238e-05, "logits/chosen": -8.897431373596191, "logits/rejected": -8.89138412475586, "logps/chosen": -7.791264533996582, "logps/rejected": -106.7822265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.350132942199707, "rewards/margins": 9.822037696838379, "rewards/rejected": -7.471905708312988, "step": 843 }, { "epoch": 0.5924885924885925, "grad_norm": 0.000914954231120646, "learning_rate": 4.8772374110427594e-05, "logits/chosen": -8.218647003173828, "logits/rejected": -8.21878719329834, "logps/chosen": -5.149745464324951, "logps/rejected": -108.63346099853516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.755650281906128, "rewards/margins": 10.411834716796875, "rewards/rejected": -7.656184196472168, "step": 844 }, { "epoch": 0.5931905931905932, "grad_norm": 1.7738027572631836, "learning_rate": 4.878094514916154e-05, "logits/chosen": -9.205510139465332, "logits/rejected": -9.184986114501953, "logps/chosen": -11.050666809082031, "logps/rejected": -104.32987976074219, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 2.28961443901062, "rewards/margins": 9.424076080322266, "rewards/rejected": -7.134461402893066, "step": 845 }, { "epoch": 0.5938925938925939, "grad_norm": 0.0010371508542448282, "learning_rate": 4.8789506050650396e-05, "logits/chosen": -8.818378448486328, "logits/rejected": -8.816324234008789, "logps/chosen": -4.3941850662231445, "logps/rejected": -108.92313385009766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.732261896133423, "rewards/margins": 10.431451797485352, "rewards/rejected": -7.699190616607666, "step": 846 }, { "epoch": 0.5945945945945946, "grad_norm": 0.0012976306024938822, "learning_rate": 4.879805683884512e-05, "logits/chosen": -9.033533096313477, "logits/rejected": -9.013592720031738, "logps/chosen": -4.134301662445068, "logps/rejected": -108.86420440673828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7525432109832764, "rewards/margins": 10.373069763183594, "rewards/rejected": -7.620527267456055, "step": 847 }, { "epoch": 0.5952965952965953, "grad_norm": 0.001377119217067957, "learning_rate": 4.8806597537611906e-05, "logits/chosen": -8.80250358581543, "logits/rejected": -8.803317070007324, "logps/chosen": -11.022027969360352, "logps/rejected": -104.97239685058594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1769092082977295, "rewards/margins": 9.232925415039062, "rewards/rejected": -7.056015968322754, "step": 848 }, { "epoch": 0.595998595998596, "grad_norm": 0.0008766084210947156, "learning_rate": 4.881512817073255e-05, "logits/chosen": -8.965917587280273, "logits/rejected": -8.976173400878906, "logps/chosen": -5.041679859161377, "logps/rejected": -108.83822631835938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.595964193344116, "rewards/margins": 10.373044967651367, "rewards/rejected": -7.77708101272583, "step": 849 }, { "epoch": 0.5967005967005967, "grad_norm": 0.001718753483146429, "learning_rate": 4.882364876190489e-05, "logits/chosen": -9.365053176879883, "logits/rejected": -9.371176719665527, "logps/chosen": -10.585657119750977, "logps/rejected": -104.31231689453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.161494731903076, "rewards/margins": 9.288041114807129, "rewards/rejected": -7.1265459060668945, "step": 850 }, { "epoch": 0.5974025974025974, "grad_norm": 0.001427427283488214, "learning_rate": 4.8832159334743136e-05, "logits/chosen": -8.67275619506836, "logits/rejected": -8.659271240234375, "logps/chosen": -5.6801838874816895, "logps/rejected": -108.24015045166016, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.548590898513794, "rewards/margins": 10.181556701660156, "rewards/rejected": -7.632966041564941, "step": 851 }, { "epoch": 0.5981045981045982, "grad_norm": 0.004394554533064365, "learning_rate": 4.884065991277833e-05, "logits/chosen": -8.043885231018066, "logits/rejected": -8.043161392211914, "logps/chosen": -5.9920735359191895, "logps/rejected": -108.54672241210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5471620559692383, "rewards/margins": 10.262139320373535, "rewards/rejected": -7.714977264404297, "step": 852 }, { "epoch": 0.5988065988065988, "grad_norm": 0.0010171354515478015, "learning_rate": 4.8849150519458726e-05, "logits/chosen": -8.68580150604248, "logits/rejected": -8.695882797241211, "logps/chosen": -6.941376686096191, "logps/rejected": -106.49728393554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.511807441711426, "rewards/margins": 9.932601928710938, "rewards/rejected": -7.420795440673828, "step": 853 }, { "epoch": 0.5995085995085995, "grad_norm": 0.001236928510479629, "learning_rate": 4.885763117815009e-05, "logits/chosen": -9.090753555297852, "logits/rejected": -9.100685119628906, "logps/chosen": -8.404790878295898, "logps/rejected": -107.16856384277344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2679948806762695, "rewards/margins": 9.806989669799805, "rewards/rejected": -7.538994312286377, "step": 854 }, { "epoch": 0.6002106002106002, "grad_norm": 0.0008053958299569786, "learning_rate": 4.886610191213622e-05, "logits/chosen": -8.712761878967285, "logits/rejected": -8.711603164672852, "logps/chosen": -11.655157089233398, "logps/rejected": -104.69689178466797, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.1618213653564453, "rewards/margins": 9.163466453552246, "rewards/rejected": -7.001645088195801, "step": 855 }, { "epoch": 0.6009126009126009, "grad_norm": 0.004652590956538916, "learning_rate": 4.887456274461922e-05, "logits/chosen": -7.824297904968262, "logits/rejected": -7.824055194854736, "logps/chosen": -11.625175476074219, "logps/rejected": -103.02159118652344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2317986488342285, "rewards/margins": 9.273417472839355, "rewards/rejected": -7.041619300842285, "step": 856 }, { "epoch": 0.6016146016146016, "grad_norm": 0.0047489735297858715, "learning_rate": 4.8883013698719973e-05, "logits/chosen": -8.419425010681152, "logits/rejected": -8.401058197021484, "logps/chosen": -12.133695602416992, "logps/rejected": -104.43089294433594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.203155994415283, "rewards/margins": 9.261242866516113, "rewards/rejected": -7.058087348937988, "step": 857 }, { "epoch": 0.6023166023166023, "grad_norm": 0.0014157986734062433, "learning_rate": 4.889145479747843e-05, "logits/chosen": -8.530903816223145, "logits/rejected": -8.512306213378906, "logps/chosen": -9.550545692443848, "logps/rejected": -105.604736328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3693394660949707, "rewards/margins": 9.69320297241211, "rewards/rejected": -7.3238630294799805, "step": 858 }, { "epoch": 0.603018603018603, "grad_norm": 0.0011628716019913554, "learning_rate": 4.889988606385404e-05, "logits/chosen": -8.907187461853027, "logits/rejected": -8.913789749145508, "logps/chosen": -10.117774963378906, "logps/rejected": -103.32380676269531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2074546813964844, "rewards/margins": 9.238800048828125, "rewards/rejected": -7.031345367431641, "step": 859 }, { "epoch": 0.6037206037206038, "grad_norm": 0.0017478404333814979, "learning_rate": 4.8908307520726135e-05, "logits/chosen": -9.053573608398438, "logits/rejected": -9.051345825195312, "logps/chosen": -12.05803108215332, "logps/rejected": -104.618408203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.915645956993103, "rewards/margins": 9.297245025634766, "rewards/rejected": -7.381598472595215, "step": 860 }, { "epoch": 0.6044226044226044, "grad_norm": 0.001005186466500163, "learning_rate": 4.891671919089425e-05, "logits/chosen": -9.347505569458008, "logits/rejected": -9.341620445251465, "logps/chosen": -6.6672821044921875, "logps/rejected": -105.92105102539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5765187740325928, "rewards/margins": 9.994436264038086, "rewards/rejected": -7.4179182052612305, "step": 861 }, { "epoch": 0.6051246051246051, "grad_norm": 0.001887626014649868, "learning_rate": 4.892512109707855e-05, "logits/chosen": -8.738136291503906, "logits/rejected": -8.746734619140625, "logps/chosen": -8.732959747314453, "logps/rejected": -108.16818237304688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2721409797668457, "rewards/margins": 9.78727912902832, "rewards/rejected": -7.515139102935791, "step": 862 }, { "epoch": 0.6058266058266059, "grad_norm": 0.03766294941306114, "learning_rate": 4.893351326192016e-05, "logits/chosen": -10.245002746582031, "logits/rejected": -10.257491111755371, "logps/chosen": -5.224938869476318, "logps/rejected": -108.75010681152344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5580317974090576, "rewards/margins": 10.320388793945312, "rewards/rejected": -7.762357711791992, "step": 863 }, { "epoch": 0.6065286065286065, "grad_norm": 0.002506781602278352, "learning_rate": 4.894189570798156e-05, "logits/chosen": -9.744770050048828, "logits/rejected": -9.734413146972656, "logps/chosen": -10.150396347045898, "logps/rejected": -103.83333587646484, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0559816360473633, "rewards/margins": 9.274637222290039, "rewards/rejected": -7.218656063079834, "step": 864 }, { "epoch": 0.6072306072306072, "grad_norm": 0.0015141721814870834, "learning_rate": 4.895026845774691e-05, "logits/chosen": -8.976905822753906, "logits/rejected": -8.987510681152344, "logps/chosen": -10.311260223388672, "logps/rejected": -106.59916687011719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2732629776000977, "rewards/margins": 9.660629272460938, "rewards/rejected": -7.38736629486084, "step": 865 }, { "epoch": 0.607932607932608, "grad_norm": 0.0015548091614618897, "learning_rate": 4.895863153362244e-05, "logits/chosen": -8.710716247558594, "logits/rejected": -8.708256721496582, "logps/chosen": -5.920620441436768, "logps/rejected": -107.62777709960938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.554816246032715, "rewards/margins": 10.176761627197266, "rewards/rejected": -7.621945381164551, "step": 866 }, { "epoch": 0.6086346086346086, "grad_norm": 0.0014348265249282122, "learning_rate": 4.896698495793684e-05, "logits/chosen": -8.586776733398438, "logits/rejected": -8.602252960205078, "logps/chosen": -9.36868667602539, "logps/rejected": -106.53557586669922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2533388137817383, "rewards/margins": 9.625864028930664, "rewards/rejected": -7.372525691986084, "step": 867 }, { "epoch": 0.6093366093366094, "grad_norm": 0.0011227860813960433, "learning_rate": 4.897532875294154e-05, "logits/chosen": -8.567679405212402, "logits/rejected": -8.558425903320312, "logps/chosen": -6.284788131713867, "logps/rejected": -108.04927062988281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.53149151802063, "rewards/margins": 10.173431396484375, "rewards/rejected": -7.641940116882324, "step": 868 }, { "epoch": 0.61003861003861, "grad_norm": 0.002005854854360223, "learning_rate": 4.8983662940811115e-05, "logits/chosen": -9.37265396118164, "logits/rejected": -9.371294021606445, "logps/chosen": -6.516753196716309, "logps/rejected": -106.10113525390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.436152458190918, "rewards/margins": 9.952102661132812, "rewards/rejected": -7.5159502029418945, "step": 869 }, { "epoch": 0.6107406107406107, "grad_norm": 0.008363050408661366, "learning_rate": 4.899198754364365e-05, "logits/chosen": -9.716363906860352, "logits/rejected": -9.714273452758789, "logps/chosen": -7.926741123199463, "logps/rejected": -105.71940612792969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.31788969039917, "rewards/margins": 9.77056884765625, "rewards/rejected": -7.452678680419922, "step": 870 }, { "epoch": 0.6114426114426115, "grad_norm": 0.001090935547836125, "learning_rate": 4.900030258346106e-05, "logits/chosen": -8.72464370727539, "logits/rejected": -8.740806579589844, "logps/chosen": -6.220740795135498, "logps/rejected": -107.99868774414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6002321243286133, "rewards/margins": 10.091802597045898, "rewards/rejected": -7.491569519042969, "step": 871 }, { "epoch": 0.6121446121446121, "grad_norm": 0.0021275500766932964, "learning_rate": 4.900860808220946e-05, "logits/chosen": -8.686620712280273, "logits/rejected": -8.675960540771484, "logps/chosen": -5.659488677978516, "logps/rejected": -108.27674865722656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.620176315307617, "rewards/margins": 10.21834659576416, "rewards/rejected": -7.598170280456543, "step": 872 }, { "epoch": 0.6128466128466128, "grad_norm": 0.001168177928775549, "learning_rate": 4.90169040617595e-05, "logits/chosen": -9.224075317382812, "logits/rejected": -9.193672180175781, "logps/chosen": -9.85411262512207, "logps/rejected": -104.23222351074219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.196812629699707, "rewards/margins": 9.50344467163086, "rewards/rejected": -7.306632041931152, "step": 873 }, { "epoch": 0.6135486135486136, "grad_norm": 0.16783538460731506, "learning_rate": 4.9025190543906715e-05, "logits/chosen": -8.62374496459961, "logits/rejected": -8.620660781860352, "logps/chosen": -9.08531379699707, "logps/rejected": -106.03945922851562, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.5173819065093994, "rewards/margins": 9.760746002197266, "rewards/rejected": -7.243363380432129, "step": 874 }, { "epoch": 0.6142506142506142, "grad_norm": 0.002496796427294612, "learning_rate": 4.903346755037189e-05, "logits/chosen": -9.610979080200195, "logits/rejected": -9.630014419555664, "logps/chosen": -5.799032211303711, "logps/rejected": -106.86172485351562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.716242790222168, "rewards/margins": 10.223159790039062, "rewards/rejected": -7.506916046142578, "step": 875 }, { "epoch": 0.614952614952615, "grad_norm": 0.042944785207509995, "learning_rate": 4.904173510280135e-05, "logits/chosen": -9.159337997436523, "logits/rejected": -9.14841079711914, "logps/chosen": -13.098037719726562, "logps/rejected": -101.28591918945312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.824453592300415, "rewards/margins": 8.770405769348145, "rewards/rejected": -6.945952415466309, "step": 876 }, { "epoch": 0.6156546156546157, "grad_norm": 0.0009552069241181016, "learning_rate": 4.904999322276735e-05, "logits/chosen": -9.228607177734375, "logits/rejected": -9.23497200012207, "logps/chosen": -4.846240997314453, "logps/rejected": -109.19275665283203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.608826160430908, "rewards/margins": 10.327986717224121, "rewards/rejected": -7.719161033630371, "step": 877 }, { "epoch": 0.6163566163566163, "grad_norm": 0.0025520427152514458, "learning_rate": 4.9058241931768385e-05, "logits/chosen": -9.579360961914062, "logits/rejected": -9.599390029907227, "logps/chosen": -6.001381874084473, "logps/rejected": -108.4110336303711, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6416709423065186, "rewards/margins": 10.201996803283691, "rewards/rejected": -7.560326099395752, "step": 878 }, { "epoch": 0.6170586170586171, "grad_norm": 0.0012313163606449962, "learning_rate": 4.9066481251229535e-05, "logits/chosen": -9.199045181274414, "logits/rejected": -9.199995040893555, "logps/chosen": -4.774941444396973, "logps/rejected": -109.0562744140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6888251304626465, "rewards/margins": 10.410531997680664, "rewards/rejected": -7.721707344055176, "step": 879 }, { "epoch": 0.6177606177606177, "grad_norm": 0.001164157991297543, "learning_rate": 4.907471120250281e-05, "logits/chosen": -9.44837760925293, "logits/rejected": -9.43372917175293, "logps/chosen": -8.406961441040039, "logps/rejected": -107.1942367553711, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.280444860458374, "rewards/margins": 9.77673053741455, "rewards/rejected": -7.496285438537598, "step": 880 }, { "epoch": 0.6184626184626184, "grad_norm": 0.0012425517197698355, "learning_rate": 4.9082931806867474e-05, "logits/chosen": -8.544235229492188, "logits/rejected": -8.556316375732422, "logps/chosen": -6.1341552734375, "logps/rejected": -108.94070434570312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.60884428024292, "rewards/margins": 10.272724151611328, "rewards/rejected": -7.66387939453125, "step": 881 }, { "epoch": 0.6191646191646192, "grad_norm": 0.002231762045994401, "learning_rate": 4.909114308553033e-05, "logits/chosen": -8.846393585205078, "logits/rejected": -8.82966423034668, "logps/chosen": -10.27626895904541, "logps/rejected": -107.10533142089844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.238783597946167, "rewards/margins": 9.686811447143555, "rewards/rejected": -7.448027610778809, "step": 882 }, { "epoch": 0.6198666198666198, "grad_norm": 0.0009200823842547834, "learning_rate": 4.909934505962615e-05, "logits/chosen": -9.749054908752441, "logits/rejected": -9.753387451171875, "logps/chosen": -5.175971031188965, "logps/rejected": -108.87267303466797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.537977695465088, "rewards/margins": 10.361797332763672, "rewards/rejected": -7.823819160461426, "step": 883 }, { "epoch": 0.6205686205686206, "grad_norm": 0.022009365260601044, "learning_rate": 4.9107537750217886e-05, "logits/chosen": -8.932882308959961, "logits/rejected": -8.913253784179688, "logps/chosen": -16.200611114501953, "logps/rejected": -101.21717834472656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.640045166015625, "rewards/margins": 8.472705841064453, "rewards/rejected": -6.8326616287231445, "step": 884 }, { "epoch": 0.6212706212706213, "grad_norm": 0.0015230284770950675, "learning_rate": 4.9115721178297093e-05, "logits/chosen": -8.949197769165039, "logits/rejected": -8.955541610717773, "logps/chosen": -5.233624458312988, "logps/rejected": -108.92601013183594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.558483123779297, "rewards/margins": 10.341344833374023, "rewards/rejected": -7.78286075592041, "step": 885 }, { "epoch": 0.6219726219726219, "grad_norm": 0.002078535035252571, "learning_rate": 4.9123895364784184e-05, "logits/chosen": -9.595235824584961, "logits/rejected": -9.570574760437012, "logps/chosen": -8.641376495361328, "logps/rejected": -106.53976440429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3492918014526367, "rewards/margins": 9.829713821411133, "rewards/rejected": -7.480422019958496, "step": 886 }, { "epoch": 0.6226746226746227, "grad_norm": 0.0018171969568356872, "learning_rate": 4.913206033052877e-05, "logits/chosen": -8.573932647705078, "logits/rejected": -8.565458297729492, "logps/chosen": -9.04563045501709, "logps/rejected": -106.61473846435547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.319110631942749, "rewards/margins": 9.729511260986328, "rewards/rejected": -7.410401344299316, "step": 887 }, { "epoch": 0.6233766233766234, "grad_norm": 0.0010463614016771317, "learning_rate": 4.914021609631002e-05, "logits/chosen": -9.05457878112793, "logits/rejected": -9.060861587524414, "logps/chosen": -5.155213832855225, "logps/rejected": -107.77243041992188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5755817890167236, "rewards/margins": 10.2259521484375, "rewards/rejected": -7.6503705978393555, "step": 888 }, { "epoch": 0.6240786240786241, "grad_norm": 0.015606410801410675, "learning_rate": 4.91483626828369e-05, "logits/chosen": -8.927923202514648, "logits/rejected": -8.932291984558105, "logps/chosen": -5.955909729003906, "logps/rejected": -108.13748931884766, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6650350093841553, "rewards/margins": 10.286545753479004, "rewards/rejected": -7.6215105056762695, "step": 889 }, { "epoch": 0.6247806247806248, "grad_norm": 0.0012701937230303884, "learning_rate": 4.915650011074855e-05, "logits/chosen": -8.950790405273438, "logits/rejected": -8.955760955810547, "logps/chosen": -8.881040573120117, "logps/rejected": -107.43340301513672, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.411454200744629, "rewards/margins": 9.819565773010254, "rewards/rejected": -7.408111572265625, "step": 890 }, { "epoch": 0.6254826254826255, "grad_norm": 0.0013404261553660035, "learning_rate": 4.916462840061458e-05, "logits/chosen": -9.142322540283203, "logits/rejected": -9.139548301696777, "logps/chosen": -7.925081729888916, "logps/rejected": -107.19796752929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.371152877807617, "rewards/margins": 9.892337799072266, "rewards/rejected": -7.521184921264648, "step": 891 }, { "epoch": 0.6261846261846262, "grad_norm": 0.0014703880297020078, "learning_rate": 4.917274757293539e-05, "logits/chosen": -8.081055641174316, "logits/rejected": -8.077601432800293, "logps/chosen": -11.504232406616211, "logps/rejected": -106.24162292480469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0981950759887695, "rewards/margins": 9.356291770935059, "rewards/rejected": -7.258096694946289, "step": 892 }, { "epoch": 0.6268866268866269, "grad_norm": 0.002024304587393999, "learning_rate": 4.918085764814244e-05, "logits/chosen": -8.819598197937012, "logits/rejected": -8.835470199584961, "logps/chosen": -4.746373176574707, "logps/rejected": -107.00665283203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.656069755554199, "rewards/margins": 10.221323013305664, "rewards/rejected": -7.565252780914307, "step": 893 }, { "epoch": 0.6275886275886275, "grad_norm": 0.0007227042224258184, "learning_rate": 4.9188958646598624e-05, "logits/chosen": -8.889853477478027, "logits/rejected": -8.87545394897461, "logps/chosen": -10.864418029785156, "logps/rejected": -104.06709289550781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.174468994140625, "rewards/margins": 9.302505493164062, "rewards/rejected": -7.1280364990234375, "step": 894 }, { "epoch": 0.6282906282906283, "grad_norm": 0.0011626139748841524, "learning_rate": 4.919705058859854e-05, "logits/chosen": -8.899118423461914, "logits/rejected": -8.890379905700684, "logps/chosen": -9.96264934539795, "logps/rejected": -105.02481079101562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.132132053375244, "rewards/margins": 9.3655366897583, "rewards/rejected": -7.233404636383057, "step": 895 }, { "epoch": 0.628992628992629, "grad_norm": 2.7660388946533203, "learning_rate": 4.920513349436875e-05, "logits/chosen": -8.845502853393555, "logits/rejected": -8.830401420593262, "logps/chosen": -7.408384799957275, "logps/rejected": -103.64834594726562, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 2.4953041076660156, "rewards/margins": 9.675033569335938, "rewards/rejected": -7.179729461669922, "step": 896 }, { "epoch": 0.6296946296946297, "grad_norm": 0.0008619902655482292, "learning_rate": 4.92132073840682e-05, "logits/chosen": -8.046656608581543, "logits/rejected": -8.045738220214844, "logps/chosen": -8.81241226196289, "logps/rejected": -106.01707458496094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.392993450164795, "rewards/margins": 9.75584602355957, "rewards/rejected": -7.362852573394775, "step": 897 }, { "epoch": 0.6303966303966304, "grad_norm": 0.0019692389760166407, "learning_rate": 4.922127227778841e-05, "logits/chosen": -8.278484344482422, "logits/rejected": -8.27791976928711, "logps/chosen": -6.318984031677246, "logps/rejected": -107.57890319824219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5559630393981934, "rewards/margins": 9.949230194091797, "rewards/rejected": -7.393267631530762, "step": 898 }, { "epoch": 0.6310986310986311, "grad_norm": 0.006537633016705513, "learning_rate": 4.9229328195553815e-05, "logits/chosen": -8.72286605834961, "logits/rejected": -8.708873748779297, "logps/chosen": -8.91301441192627, "logps/rejected": -104.16442108154297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2834689617156982, "rewards/margins": 9.409013748168945, "rewards/rejected": -7.125544548034668, "step": 899 }, { "epoch": 0.6318006318006318, "grad_norm": 0.00869343988597393, "learning_rate": 4.923737515732209e-05, "logits/chosen": -8.82007884979248, "logits/rejected": -8.806669235229492, "logps/chosen": -6.30649471282959, "logps/rejected": -105.570556640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.562723159790039, "rewards/margins": 9.889886856079102, "rewards/rejected": -7.3271636962890625, "step": 900 }, { "epoch": 0.6325026325026325, "grad_norm": 0.004643296357244253, "learning_rate": 4.924541318298438e-05, "logits/chosen": -8.469995498657227, "logits/rejected": -8.500274658203125, "logps/chosen": -7.062673568725586, "logps/rejected": -104.77264404296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4913148880004883, "rewards/margins": 9.714265823364258, "rewards/rejected": -7.222950458526611, "step": 901 }, { "epoch": 0.6332046332046332, "grad_norm": 0.007271741516888142, "learning_rate": 4.92534422923657e-05, "logits/chosen": -9.113142013549805, "logits/rejected": -9.10998821258545, "logps/chosen": -10.317974090576172, "logps/rejected": -98.40482330322266, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2538938522338867, "rewards/margins": 8.787298202514648, "rewards/rejected": -6.5334038734436035, "step": 902 }, { "epoch": 0.6339066339066339, "grad_norm": 0.015458953566849232, "learning_rate": 4.9261462505225106e-05, "logits/chosen": -8.995553016662598, "logits/rejected": -8.9736328125, "logps/chosen": -5.744424819946289, "logps/rejected": -103.46612548828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.568699359893799, "rewards/margins": 9.799274444580078, "rewards/rejected": -7.2305755615234375, "step": 903 }, { "epoch": 0.6346086346086346, "grad_norm": 0.020070478320121765, "learning_rate": 4.926947384125606e-05, "logits/chosen": -8.823341369628906, "logits/rejected": -8.821897506713867, "logps/chosen": -9.024658203125, "logps/rejected": -97.43479919433594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.203962802886963, "rewards/margins": 8.709453582763672, "rewards/rejected": -6.505489349365234, "step": 904 }, { "epoch": 0.6353106353106354, "grad_norm": 0.018627164885401726, "learning_rate": 4.927747632008672e-05, "logits/chosen": -9.322990417480469, "logits/rejected": -9.305274963378906, "logps/chosen": -7.030367851257324, "logps/rejected": -96.30842590332031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.336911201477051, "rewards/margins": 8.930957794189453, "rewards/rejected": -6.5940470695495605, "step": 905 }, { "epoch": 0.636012636012636, "grad_norm": 0.012317284941673279, "learning_rate": 4.9285469961280226e-05, "logits/chosen": -8.865346908569336, "logits/rejected": -8.86063003540039, "logps/chosen": -6.259883403778076, "logps/rejected": -95.40560913085938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4937281608581543, "rewards/margins": 8.872359275817871, "rewards/rejected": -6.378631591796875, "step": 906 }, { "epoch": 0.6367146367146367, "grad_norm": 0.01575290970504284, "learning_rate": 4.9293454784334924e-05, "logits/chosen": -8.736815452575684, "logits/rejected": -8.717534065246582, "logps/chosen": -6.52513313293457, "logps/rejected": -103.5310287475586, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4795994758605957, "rewards/margins": 9.736471176147461, "rewards/rejected": -7.256872177124023, "step": 907 }, { "epoch": 0.6374166374166375, "grad_norm": 0.02369009703397751, "learning_rate": 4.9301430808684754e-05, "logits/chosen": -8.73065185546875, "logits/rejected": -8.724020004272461, "logps/chosen": -10.125040054321289, "logps/rejected": -92.63265991210938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.2219858169555664, "rewards/margins": 8.3214111328125, "rewards/rejected": -6.099425315856934, "step": 908 }, { "epoch": 0.6381186381186381, "grad_norm": 0.01887308619916439, "learning_rate": 4.930939805369946e-05, "logits/chosen": -8.5166015625, "logits/rejected": -8.505952835083008, "logps/chosen": -9.899580955505371, "logps/rejected": -99.47238159179688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2845101356506348, "rewards/margins": 8.946724891662598, "rewards/rejected": -6.662215232849121, "step": 909 }, { "epoch": 0.6388206388206388, "grad_norm": 0.02289261855185032, "learning_rate": 4.93173565386849e-05, "logits/chosen": -9.165473937988281, "logits/rejected": -9.154375076293945, "logps/chosen": -12.057870864868164, "logps/rejected": -98.59538269042969, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.0715136528015137, "rewards/margins": 8.589723587036133, "rewards/rejected": -6.518210411071777, "step": 910 }, { "epoch": 0.6395226395226395, "grad_norm": 0.03610272333025932, "learning_rate": 4.932530628288331e-05, "logits/chosen": -8.771965026855469, "logits/rejected": -8.760311126708984, "logps/chosen": -6.1324052810668945, "logps/rejected": -96.33507537841797, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.588499069213867, "rewards/margins": 8.975955963134766, "rewards/rejected": -6.387456893920898, "step": 911 }, { "epoch": 0.6402246402246402, "grad_norm": 0.014464144594967365, "learning_rate": 4.933324730547361e-05, "logits/chosen": -9.027294158935547, "logits/rejected": -9.02662467956543, "logps/chosen": -7.429254531860352, "logps/rejected": -98.4267807006836, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4118728637695312, "rewards/margins": 9.009817123413086, "rewards/rejected": -6.597943305969238, "step": 912 }, { "epoch": 0.640926640926641, "grad_norm": 0.013210304081439972, "learning_rate": 4.934117962557165e-05, "logits/chosen": -8.428668022155762, "logits/rejected": -8.441402435302734, "logps/chosen": -6.776463985443115, "logps/rejected": -99.95592498779297, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.606537103652954, "rewards/margins": 9.371805191040039, "rewards/rejected": -6.765267848968506, "step": 913 }, { "epoch": 0.6416286416286416, "grad_norm": 0.0047167628072202206, "learning_rate": 4.9349103262230524e-05, "logits/chosen": -9.3408203125, "logits/rejected": -9.323325157165527, "logps/chosen": -5.562750816345215, "logps/rejected": -106.76718139648438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.731215715408325, "rewards/margins": 10.182788848876953, "rewards/rejected": -7.451573371887207, "step": 914 }, { "epoch": 0.6423306423306423, "grad_norm": 0.005400040652602911, "learning_rate": 4.935701823444081e-05, "logits/chosen": -8.979085922241211, "logits/rejected": -8.976125717163086, "logps/chosen": -5.217706680297852, "logps/rejected": -106.4228744506836, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6741280555725098, "rewards/margins": 10.114095687866211, "rewards/rejected": -7.439968109130859, "step": 915 }, { "epoch": 0.6430326430326431, "grad_norm": 0.0021081448066979647, "learning_rate": 4.9364924561130845e-05, "logits/chosen": -8.68722915649414, "logits/rejected": -8.673404693603516, "logps/chosen": -7.755281448364258, "logps/rejected": -102.59539794921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4095802307128906, "rewards/margins": 9.37879467010498, "rewards/rejected": -6.96921443939209, "step": 916 }, { "epoch": 0.6437346437346437, "grad_norm": 0.006361627951264381, "learning_rate": 4.937282226116702e-05, "logits/chosen": -9.315523147583008, "logits/rejected": -9.310081481933594, "logps/chosen": -5.240076065063477, "logps/rejected": -107.66484069824219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.636838674545288, "rewards/margins": 10.278047561645508, "rewards/rejected": -7.641208648681641, "step": 917 }, { "epoch": 0.6444366444366444, "grad_norm": 0.0020310634281486273, "learning_rate": 4.938071135335405e-05, "logits/chosen": -8.890424728393555, "logits/rejected": -8.911195755004883, "logps/chosen": -4.749798774719238, "logps/rejected": -107.45814514160156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5088486671447754, "rewards/margins": 10.100418090820312, "rewards/rejected": -7.591569423675537, "step": 918 }, { "epoch": 0.6451386451386452, "grad_norm": 0.0030574908014386892, "learning_rate": 4.938859185643519e-05, "logits/chosen": -9.250082015991211, "logits/rejected": -9.229222297668457, "logps/chosen": -4.5389909744262695, "logps/rejected": -108.58207702636719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.670727252960205, "rewards/margins": 10.437097549438477, "rewards/rejected": -7.7663702964782715, "step": 919 }, { "epoch": 0.6458406458406458, "grad_norm": 0.005115822423249483, "learning_rate": 4.939646378909259e-05, "logits/chosen": -8.928770065307617, "logits/rejected": -8.920198440551758, "logps/chosen": -4.879809379577637, "logps/rejected": -108.52532196044922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7524876594543457, "rewards/margins": 10.372201919555664, "rewards/rejected": -7.61971378326416, "step": 920 }, { "epoch": 0.6465426465426466, "grad_norm": 0.002205691998824477, "learning_rate": 4.940432716994748e-05, "logits/chosen": -8.994126319885254, "logits/rejected": -9.011215209960938, "logps/chosen": -4.454686641693115, "logps/rejected": -109.03201293945312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.72847318649292, "rewards/margins": 10.400543212890625, "rewards/rejected": -7.672069549560547, "step": 921 }, { "epoch": 0.6472446472446473, "grad_norm": 0.09392383694648743, "learning_rate": 4.9412182017560496e-05, "logits/chosen": -9.18979263305664, "logits/rejected": -9.189201354980469, "logps/chosen": -4.117437839508057, "logps/rejected": -109.39729309082031, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.6492037773132324, "rewards/margins": 10.414934158325195, "rewards/rejected": -7.765730381011963, "step": 922 }, { "epoch": 0.6479466479466479, "grad_norm": 0.00128161848988384, "learning_rate": 4.942002835043187e-05, "logits/chosen": -9.527193069458008, "logits/rejected": -9.498604774475098, "logps/chosen": -4.04109001159668, "logps/rejected": -109.5810317993164, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.786182165145874, "rewards/margins": 10.473591804504395, "rewards/rejected": -7.687409400939941, "step": 923 }, { "epoch": 0.6486486486486487, "grad_norm": 0.0013625527499243617, "learning_rate": 4.942786618700178e-05, "logits/chosen": -8.855087280273438, "logits/rejected": -8.860750198364258, "logps/chosen": -10.001920700073242, "logps/rejected": -101.72731018066406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0905160903930664, "rewards/margins": 9.08090591430664, "rewards/rejected": -6.990390777587891, "step": 924 }, { "epoch": 0.6493506493506493, "grad_norm": 0.0018368285382166505, "learning_rate": 4.9435695545650545e-05, "logits/chosen": -8.80607795715332, "logits/rejected": -8.794294357299805, "logps/chosen": -4.471872806549072, "logps/rejected": -108.40843963623047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7049591541290283, "rewards/margins": 10.422611236572266, "rewards/rejected": -7.717652320861816, "step": 925 }, { "epoch": 0.65005265005265, "grad_norm": 0.004686048254370689, "learning_rate": 4.944351644469891e-05, "logits/chosen": -8.56399154663086, "logits/rejected": -8.55919361114502, "logps/chosen": -6.8763628005981445, "logps/rejected": -105.8408432006836, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5105926990509033, "rewards/margins": 9.928789138793945, "rewards/rejected": -7.418196678161621, "step": 926 }, { "epoch": 0.6507546507546508, "grad_norm": 0.0009687680285423994, "learning_rate": 4.945132890240829e-05, "logits/chosen": -9.07375431060791, "logits/rejected": -9.080269813537598, "logps/chosen": -3.2901594638824463, "logps/rejected": -109.0177993774414, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.785722255706787, "rewards/margins": 10.519857406616211, "rewards/rejected": -7.734134674072266, "step": 927 }, { "epoch": 0.6514566514566514, "grad_norm": 0.0010447966633364558, "learning_rate": 4.945913293698104e-05, "logits/chosen": -9.413228988647461, "logits/rejected": -9.411554336547852, "logps/chosen": -7.17263650894165, "logps/rejected": -106.83622741699219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.446523904800415, "rewards/margins": 9.933679580688477, "rewards/rejected": -7.487155914306641, "step": 928 }, { "epoch": 0.6521586521586522, "grad_norm": 0.16634352505207062, "learning_rate": 4.9466928566560696e-05, "logits/chosen": -9.148160934448242, "logits/rejected": -9.153639793395996, "logps/chosen": -3.8832035064697266, "logps/rejected": -108.82771301269531, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.873206615447998, "rewards/margins": 10.53136920928955, "rewards/rejected": -7.658162593841553, "step": 929 }, { "epoch": 0.6528606528606529, "grad_norm": 0.0013413522392511368, "learning_rate": 4.9474715809232256e-05, "logits/chosen": -8.298539161682129, "logits/rejected": -8.283679962158203, "logps/chosen": -10.610395431518555, "logps/rejected": -104.74999237060547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2450318336486816, "rewards/margins": 9.372228622436523, "rewards/rejected": -7.127197265625, "step": 930 }, { "epoch": 0.6535626535626535, "grad_norm": 0.0012878018897026777, "learning_rate": 4.948249468302239e-05, "logits/chosen": -9.598811149597168, "logits/rejected": -9.599848747253418, "logps/chosen": -4.966952323913574, "logps/rejected": -107.90792083740234, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.609907627105713, "rewards/margins": 10.317779541015625, "rewards/rejected": -7.707871437072754, "step": 931 }, { "epoch": 0.6542646542646543, "grad_norm": 0.002019251696765423, "learning_rate": 4.9490265205899697e-05, "logits/chosen": -8.986863136291504, "logits/rejected": -8.987728118896484, "logps/chosen": -7.182128429412842, "logps/rejected": -106.67804718017578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.589468002319336, "rewards/margins": 9.974092483520508, "rewards/rejected": -7.384624481201172, "step": 932 }, { "epoch": 0.654966654966655, "grad_norm": 0.000866169051732868, "learning_rate": 4.9498027395775006e-05, "logits/chosen": -9.051610946655273, "logits/rejected": -9.053556442260742, "logps/chosen": -3.179856777191162, "logps/rejected": -109.34086608886719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.813748836517334, "rewards/margins": 10.54212474822998, "rewards/rejected": -7.728376388549805, "step": 933 }, { "epoch": 0.6556686556686556, "grad_norm": 0.002028288086876273, "learning_rate": 4.950578127050156e-05, "logits/chosen": -8.600628852844238, "logits/rejected": -8.603188514709473, "logps/chosen": -4.272668838500977, "logps/rejected": -107.27603149414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7913272380828857, "rewards/margins": 10.299834251403809, "rewards/rejected": -7.508506774902344, "step": 934 }, { "epoch": 0.6563706563706564, "grad_norm": 0.0010598251828923821, "learning_rate": 4.95135268478753e-05, "logits/chosen": -8.761273384094238, "logits/rejected": -8.760156631469727, "logps/chosen": -3.708561420440674, "logps/rejected": -109.43055725097656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8289546966552734, "rewards/margins": 10.5779390335083, "rewards/rejected": -7.748983383178711, "step": 935 }, { "epoch": 0.657072657072657, "grad_norm": 0.0010907758260145783, "learning_rate": 4.952126414563509e-05, "logits/chosen": -9.028377532958984, "logits/rejected": -9.037530899047852, "logps/chosen": -7.591670036315918, "logps/rejected": -106.894287109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3947911262512207, "rewards/margins": 9.95589828491211, "rewards/rejected": -7.561107635498047, "step": 936 }, { "epoch": 0.6577746577746578, "grad_norm": 0.000992443528957665, "learning_rate": 4.952899318146297e-05, "logits/chosen": -8.597383499145508, "logits/rejected": -8.592742919921875, "logps/chosen": -4.3426432609558105, "logps/rejected": -108.65370178222656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.691950559616089, "rewards/margins": 10.472017288208008, "rewards/rejected": -7.780066013336182, "step": 937 }, { "epoch": 0.6584766584766585, "grad_norm": 0.0008623283356428146, "learning_rate": 4.9536713972984414e-05, "logits/chosen": -8.65474796295166, "logits/rejected": -8.657163619995117, "logps/chosen": -4.323099136352539, "logps/rejected": -108.62677764892578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.851806640625, "rewards/margins": 10.52605152130127, "rewards/rejected": -7.6742448806762695, "step": 938 }, { "epoch": 0.6591786591786591, "grad_norm": 0.0008433107286691666, "learning_rate": 4.954442653776852e-05, "logits/chosen": -8.597095489501953, "logits/rejected": -8.598745346069336, "logps/chosen": -6.650193214416504, "logps/rejected": -105.72523498535156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6008076667785645, "rewards/margins": 9.959844589233398, "rewards/rejected": -7.359036445617676, "step": 939 }, { "epoch": 0.6598806598806599, "grad_norm": 0.0010480578057467937, "learning_rate": 4.955213089332832e-05, "logits/chosen": -8.938982009887695, "logits/rejected": -8.918594360351562, "logps/chosen": -3.787079334259033, "logps/rejected": -108.4339599609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7061429023742676, "rewards/margins": 10.466767311096191, "rewards/rejected": -7.760624885559082, "step": 940 }, { "epoch": 0.6605826605826606, "grad_norm": 0.0012945744674652815, "learning_rate": 4.955982705712095e-05, "logits/chosen": -8.222956657409668, "logits/rejected": -8.218900680541992, "logps/chosen": -6.547910690307617, "logps/rejected": -106.80987548828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6588077545166016, "rewards/margins": 10.041354179382324, "rewards/rejected": -7.382546901702881, "step": 941 }, { "epoch": 0.6612846612846612, "grad_norm": 0.0018045780016109347, "learning_rate": 4.956751504654796e-05, "logits/chosen": -8.61664867401123, "logits/rejected": -8.627726554870605, "logps/chosen": -7.76535701751709, "logps/rejected": -107.37715911865234, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.556581497192383, "rewards/margins": 9.95026683807373, "rewards/rejected": -7.393686294555664, "step": 942 }, { "epoch": 0.661986661986662, "grad_norm": 0.0025173614267259836, "learning_rate": 4.957519487895548e-05, "logits/chosen": -9.024872779846191, "logits/rejected": -9.031637191772461, "logps/chosen": -3.6538777351379395, "logps/rejected": -109.12171936035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7441115379333496, "rewards/margins": 10.549139022827148, "rewards/rejected": -7.805027484893799, "step": 943 }, { "epoch": 0.6626886626886627, "grad_norm": 0.0009523750632070005, "learning_rate": 4.9582866571634485e-05, "logits/chosen": -9.060213088989258, "logits/rejected": -9.051355361938477, "logps/chosen": -9.518270492553711, "logps/rejected": -104.66175079345703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.267993450164795, "rewards/margins": 9.611603736877441, "rewards/rejected": -7.343609809875488, "step": 944 }, { "epoch": 0.6633906633906634, "grad_norm": 0.0012294636107981205, "learning_rate": 4.959053014182106e-05, "logits/chosen": -8.738917350769043, "logits/rejected": -8.741446495056152, "logps/chosen": -8.138568878173828, "logps/rejected": -107.86700439453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4705605506896973, "rewards/margins": 9.994149208068848, "rewards/rejected": -7.523589134216309, "step": 945 }, { "epoch": 0.6640926640926641, "grad_norm": 0.003668507793918252, "learning_rate": 4.959818560669655e-05, "logits/chosen": -8.717252731323242, "logits/rejected": -8.738383293151855, "logps/chosen": -8.707441329956055, "logps/rejected": -105.97514343261719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3463706970214844, "rewards/margins": 9.651391983032227, "rewards/rejected": -7.305021286010742, "step": 946 }, { "epoch": 0.6647946647946648, "grad_norm": 0.0007248771726153791, "learning_rate": 4.96058329833879e-05, "logits/chosen": -8.936223983764648, "logits/rejected": -8.936548233032227, "logps/chosen": -8.0654296875, "logps/rejected": -105.47958374023438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.493450164794922, "rewards/margins": 9.889420509338379, "rewards/rejected": -7.395971298217773, "step": 947 }, { "epoch": 0.6654966654966655, "grad_norm": 0.0010928985429927707, "learning_rate": 4.961347228896777e-05, "logits/chosen": -8.668157577514648, "logits/rejected": -8.654058456420898, "logps/chosen": -6.58783483505249, "logps/rejected": -106.9991226196289, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.559743881225586, "rewards/margins": 10.088510513305664, "rewards/rejected": -7.528765678405762, "step": 948 }, { "epoch": 0.6661986661986662, "grad_norm": 0.0010134306503459811, "learning_rate": 4.962110354045488e-05, "logits/chosen": -9.038410186767578, "logits/rejected": -9.018400192260742, "logps/chosen": -10.798612594604492, "logps/rejected": -105.94322204589844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4214749336242676, "rewards/margins": 9.574506759643555, "rewards/rejected": -7.153031349182129, "step": 949 }, { "epoch": 0.6669006669006668, "grad_norm": 0.0009867604821920395, "learning_rate": 4.962872675481414e-05, "logits/chosen": -9.710186004638672, "logits/rejected": -9.685428619384766, "logps/chosen": -3.3629636764526367, "logps/rejected": -108.43968963623047, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.786191463470459, "rewards/margins": 10.507118225097656, "rewards/rejected": -7.720925807952881, "step": 950 }, { "epoch": 0.6676026676026676, "grad_norm": 0.0010624363785609603, "learning_rate": 4.9636341948956906e-05, "logits/chosen": -9.149959564208984, "logits/rejected": -9.150047302246094, "logps/chosen": -2.7340359687805176, "logps/rejected": -109.77670288085938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7878079414367676, "rewards/margins": 10.594679832458496, "rewards/rejected": -7.80687141418457, "step": 951 }, { "epoch": 0.6683046683046683, "grad_norm": 0.0007567398133687675, "learning_rate": 4.964394913974124e-05, "logits/chosen": -8.929086685180664, "logits/rejected": -8.913289070129395, "logps/chosen": -8.165750503540039, "logps/rejected": -106.89803314208984, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4471325874328613, "rewards/margins": 9.872335433959961, "rewards/rejected": -7.4252028465271, "step": 952 }, { "epoch": 0.669006669006669, "grad_norm": 0.0016459105536341667, "learning_rate": 4.965154834397211e-05, "logits/chosen": -9.589859008789062, "logits/rejected": -9.556949615478516, "logps/chosen": -3.2809853553771973, "logps/rejected": -108.74665832519531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.792609691619873, "rewards/margins": 10.583460807800293, "rewards/rejected": -7.79085111618042, "step": 953 }, { "epoch": 0.6697086697086697, "grad_norm": 0.002020621206611395, "learning_rate": 4.965913957840159e-05, "logits/chosen": -9.144918441772461, "logits/rejected": -9.13918399810791, "logps/chosen": -5.687076091766357, "logps/rejected": -106.68489837646484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.53719162940979, "rewards/margins": 10.005773544311523, "rewards/rejected": -7.4685821533203125, "step": 954 }, { "epoch": 0.6704106704106704, "grad_norm": 0.001585791353136301, "learning_rate": 4.966672285972911e-05, "logits/chosen": -9.089003562927246, "logits/rejected": -9.10055160522461, "logps/chosen": -2.4905643463134766, "logps/rejected": -109.86001586914062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8487367630004883, "rewards/margins": 10.590906143188477, "rewards/rejected": -7.742170333862305, "step": 955 }, { "epoch": 0.6711126711126711, "grad_norm": 0.0013410740066319704, "learning_rate": 4.967429820460167e-05, "logits/chosen": -8.686140060424805, "logits/rejected": -8.688528060913086, "logps/chosen": -7.465663433074951, "logps/rejected": -106.50377655029297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4334607124328613, "rewards/margins": 9.938196182250977, "rewards/rejected": -7.504734992980957, "step": 956 }, { "epoch": 0.6718146718146718, "grad_norm": 0.0009089465020224452, "learning_rate": 4.9681865629614064e-05, "logits/chosen": -8.92219352722168, "logits/rejected": -8.934272766113281, "logps/chosen": -3.3486971855163574, "logps/rejected": -109.21687316894531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.834575891494751, "rewards/margins": 10.570670127868652, "rewards/rejected": -7.7360944747924805, "step": 957 }, { "epoch": 0.6725166725166725, "grad_norm": 0.0005745120579376817, "learning_rate": 4.9689425151309074e-05, "logits/chosen": -8.233536720275879, "logits/rejected": -8.213333129882812, "logps/chosen": -3.499022960662842, "logps/rejected": -108.62591552734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9962806701660156, "rewards/margins": 10.532682418823242, "rewards/rejected": -7.536402225494385, "step": 958 }, { "epoch": 0.6732186732186732, "grad_norm": 0.0009178604232147336, "learning_rate": 4.969697678617773e-05, "logits/chosen": -9.144672393798828, "logits/rejected": -9.123957633972168, "logps/chosen": -11.197942733764648, "logps/rejected": -102.35413360595703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1591315269470215, "rewards/margins": 9.054394721984863, "rewards/rejected": -6.895263671875, "step": 959 }, { "epoch": 0.6739206739206739, "grad_norm": 0.0008297275053337216, "learning_rate": 4.970452055065948e-05, "logits/chosen": -8.561744689941406, "logits/rejected": -8.54381275177002, "logps/chosen": -10.606348991394043, "logps/rejected": -105.15558624267578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.322047710418701, "rewards/margins": 9.46983528137207, "rewards/rejected": -7.147788047790527, "step": 960 }, { "epoch": 0.6746226746226747, "grad_norm": 2.820699453353882, "learning_rate": 4.9712056461142423e-05, "logits/chosen": -8.645087242126465, "logits/rejected": -8.665874481201172, "logps/chosen": -5.102206707000732, "logps/rejected": -106.52542877197266, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 2.640368700027466, "rewards/margins": 10.10455322265625, "rewards/rejected": -7.464184761047363, "step": 961 }, { "epoch": 0.6753246753246753, "grad_norm": 0.0006472196546383202, "learning_rate": 4.971958453396355e-05, "logits/chosen": -8.896978378295898, "logits/rejected": -8.857407569885254, "logps/chosen": -3.766944169998169, "logps/rejected": -107.80648040771484, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6066484451293945, "rewards/margins": 10.204809188842773, "rewards/rejected": -7.598161220550537, "step": 962 }, { "epoch": 0.676026676026676, "grad_norm": 0.0012331383768469095, "learning_rate": 4.972710478540891e-05, "logits/chosen": -8.177507400512695, "logits/rejected": -8.185270309448242, "logps/chosen": -6.339359283447266, "logps/rejected": -107.08909606933594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6126654148101807, "rewards/margins": 9.962713241577148, "rewards/rejected": -7.3500471115112305, "step": 963 }, { "epoch": 0.6767286767286768, "grad_norm": 0.018712276592850685, "learning_rate": 4.973461723171385e-05, "logits/chosen": -7.999045372009277, "logits/rejected": -7.979865550994873, "logps/chosen": -3.5173532962799072, "logps/rejected": -109.882080078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8887593746185303, "rewards/margins": 10.581183433532715, "rewards/rejected": -7.6924238204956055, "step": 964 }, { "epoch": 0.6774306774306774, "grad_norm": 0.015470731072127819, "learning_rate": 4.9742121889063213e-05, "logits/chosen": -9.160021781921387, "logits/rejected": -9.159405708312988, "logps/chosen": -2.866997003555298, "logps/rejected": -109.71839141845703, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.820526123046875, "rewards/margins": 10.684016227722168, "rewards/rejected": -7.863490104675293, "step": 965 }, { "epoch": 0.6781326781326781, "grad_norm": 1.0054374933242798, "learning_rate": 4.974961877359156e-05, "logits/chosen": -8.336505889892578, "logits/rejected": -8.35267448425293, "logps/chosen": -3.4311795234680176, "logps/rejected": -108.53981018066406, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.863910675048828, "rewards/margins": 10.611902236938477, "rewards/rejected": -7.747991561889648, "step": 966 }, { "epoch": 0.6788346788346789, "grad_norm": 0.002341507701203227, "learning_rate": 4.975710790138336e-05, "logits/chosen": -8.822267532348633, "logits/rejected": -8.829996109008789, "logps/chosen": -5.132882595062256, "logps/rejected": -106.73304748535156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5454859733581543, "rewards/margins": 10.05339241027832, "rewards/rejected": -7.507905960083008, "step": 967 }, { "epoch": 0.6795366795366795, "grad_norm": 0.002837076550349593, "learning_rate": 4.976458928847323e-05, "logits/chosen": -8.351652145385742, "logits/rejected": -8.35923957824707, "logps/chosen": -8.904438018798828, "logps/rejected": -103.7623519897461, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.399000644683838, "rewards/margins": 9.494670867919922, "rewards/rejected": -7.095670700073242, "step": 968 }, { "epoch": 0.6802386802386803, "grad_norm": 0.02260526455938816, "learning_rate": 4.977206295084609e-05, "logits/chosen": -9.185693740844727, "logits/rejected": -9.18507194519043, "logps/chosen": -7.647906303405762, "logps/rejected": -107.06575012207031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.405646800994873, "rewards/margins": 10.012604713439941, "rewards/rejected": -7.606956958770752, "step": 969 }, { "epoch": 0.6809406809406809, "grad_norm": 0.006483196280896664, "learning_rate": 4.9779528904437424e-05, "logits/chosen": -8.228086471557617, "logits/rejected": -8.24319076538086, "logps/chosen": -9.973493576049805, "logps/rejected": -99.7860107421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2686376571655273, "rewards/margins": 8.953023910522461, "rewards/rejected": -6.684386253356934, "step": 970 }, { "epoch": 0.6816426816426816, "grad_norm": 0.05075029283761978, "learning_rate": 4.978698716513342e-05, "logits/chosen": -7.967087745666504, "logits/rejected": -7.933586597442627, "logps/chosen": -4.115879535675049, "logps/rejected": -108.98789978027344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.884720802307129, "rewards/margins": 10.484914779663086, "rewards/rejected": -7.600193500518799, "step": 971 }, { "epoch": 0.6823446823446824, "grad_norm": 0.0018477918347343802, "learning_rate": 4.9794437748771244e-05, "logits/chosen": -9.146283149719238, "logits/rejected": -9.159252166748047, "logps/chosen": -3.3701252937316895, "logps/rejected": -107.81623840332031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.86566424369812, "rewards/margins": 10.451406478881836, "rewards/rejected": -7.585742950439453, "step": 972 }, { "epoch": 0.683046683046683, "grad_norm": 0.046633280813694, "learning_rate": 4.9801880671139204e-05, "logits/chosen": -9.27958869934082, "logits/rejected": -9.285544395446777, "logps/chosen": -4.780791759490967, "logps/rejected": -106.15154266357422, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.707021713256836, "rewards/margins": 10.062519073486328, "rewards/rejected": -7.355497360229492, "step": 973 }, { "epoch": 0.6837486837486837, "grad_norm": 0.014296461828052998, "learning_rate": 4.980931594797693e-05, "logits/chosen": -8.341432571411133, "logits/rejected": -8.338397026062012, "logps/chosen": -11.08648681640625, "logps/rejected": -108.90377807617188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3075881004333496, "rewards/margins": 9.555686950683594, "rewards/rejected": -7.248099327087402, "step": 974 }, { "epoch": 0.6844506844506845, "grad_norm": 0.003373765153810382, "learning_rate": 4.981674359497562e-05, "logits/chosen": -8.894247055053711, "logits/rejected": -8.872978210449219, "logps/chosen": -3.0739245414733887, "logps/rejected": -108.59784698486328, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8793797492980957, "rewards/margins": 10.50045394897461, "rewards/rejected": -7.6210737228393555, "step": 975 }, { "epoch": 0.6851526851526851, "grad_norm": 0.0015874539967626333, "learning_rate": 4.98241636277782e-05, "logits/chosen": -9.372074127197266, "logits/rejected": -9.364776611328125, "logps/chosen": -10.202269554138184, "logps/rejected": -105.82728576660156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1956019401550293, "rewards/margins": 9.620183944702148, "rewards/rejected": -7.424582481384277, "step": 976 }, { "epoch": 0.6858546858546859, "grad_norm": 0.07395212352275848, "learning_rate": 4.983157606197955e-05, "logits/chosen": -8.35920524597168, "logits/rejected": -8.361339569091797, "logps/chosen": -6.420226097106934, "logps/rejected": -107.1788330078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6387715339660645, "rewards/margins": 10.083033561706543, "rewards/rejected": -7.444262504577637, "step": 977 }, { "epoch": 0.6865566865566866, "grad_norm": 0.0009759090025909245, "learning_rate": 4.98389809131267e-05, "logits/chosen": -8.15243911743164, "logits/rejected": -8.138494491577148, "logps/chosen": -6.781317234039307, "logps/rejected": -106.14510345458984, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.637796401977539, "rewards/margins": 9.975442886352539, "rewards/rejected": -7.337646484375, "step": 978 }, { "epoch": 0.6872586872586872, "grad_norm": 0.0009965329663828015, "learning_rate": 4.984637819671897e-05, "logits/chosen": -8.883325576782227, "logits/rejected": -8.90013313293457, "logps/chosen": -2.302198886871338, "logps/rejected": -108.75856018066406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9090230464935303, "rewards/margins": 10.55366325378418, "rewards/rejected": -7.644640922546387, "step": 979 }, { "epoch": 0.687960687960688, "grad_norm": 0.0013770494842901826, "learning_rate": 4.985376792820825e-05, "logits/chosen": -8.692614555358887, "logits/rejected": -8.725296020507812, "logps/chosen": -6.434783935546875, "logps/rejected": -107.13558959960938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4738869667053223, "rewards/margins": 9.96664810180664, "rewards/rejected": -7.49276065826416, "step": 980 }, { "epoch": 0.6886626886626886, "grad_norm": 1.5428539514541626, "learning_rate": 4.986115012299915e-05, "logits/chosen": -9.003129959106445, "logits/rejected": -9.005659103393555, "logps/chosen": -10.713037490844727, "logps/rejected": -99.67405700683594, "loss": 0.0311, "rewards/accuracies": 0.875, "rewards/chosen": 1.9912598133087158, "rewards/margins": 8.848079681396484, "rewards/rejected": -6.8568196296691895, "step": 981 }, { "epoch": 0.6893646893646893, "grad_norm": 0.34597906470298767, "learning_rate": 4.986852479644916e-05, "logits/chosen": -8.954846382141113, "logits/rejected": -8.952079772949219, "logps/chosen": -5.141438961029053, "logps/rejected": -107.4439468383789, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.743126153945923, "rewards/margins": 10.188899040222168, "rewards/rejected": -7.445772647857666, "step": 982 }, { "epoch": 0.6900666900666901, "grad_norm": 0.0026875054463744164, "learning_rate": 4.987589196386893e-05, "logits/chosen": -8.403742790222168, "logits/rejected": -8.386112213134766, "logps/chosen": -6.210978031158447, "logps/rejected": -104.95140075683594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.634922742843628, "rewards/margins": 9.891609191894531, "rewards/rejected": -7.256687164306641, "step": 983 }, { "epoch": 0.6907686907686907, "grad_norm": 0.009720595553517342, "learning_rate": 4.988325164052236e-05, "logits/chosen": -9.206750869750977, "logits/rejected": -9.211536407470703, "logps/chosen": -3.9758706092834473, "logps/rejected": -104.89710235595703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.741844654083252, "rewards/margins": 10.111444473266602, "rewards/rejected": -7.36959981918335, "step": 984 }, { "epoch": 0.6914706914706915, "grad_norm": 2.100283145904541, "learning_rate": 4.9890603841626866e-05, "logits/chosen": -8.542610168457031, "logits/rejected": -8.54669189453125, "logps/chosen": -1.84628427028656, "logps/rejected": -108.61184692382812, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": 2.9751055240631104, "rewards/margins": 10.616186141967773, "rewards/rejected": -7.641080379486084, "step": 985 }, { "epoch": 0.6921726921726922, "grad_norm": 0.05424442142248154, "learning_rate": 4.989794858235352e-05, "logits/chosen": -9.548425674438477, "logits/rejected": -9.54455852508545, "logps/chosen": -5.2943243980407715, "logps/rejected": -106.31561279296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.651407241821289, "rewards/margins": 10.098573684692383, "rewards/rejected": -7.447166442871094, "step": 986 }, { "epoch": 0.6928746928746928, "grad_norm": 0.35307934880256653, "learning_rate": 4.990528587782729e-05, "logits/chosen": -8.558634757995605, "logits/rejected": -8.565092086791992, "logps/chosen": -2.6710872650146484, "logps/rejected": -108.24162292480469, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.93449330329895, "rewards/margins": 10.615394592285156, "rewards/rejected": -7.680901527404785, "step": 987 }, { "epoch": 0.6935766935766936, "grad_norm": 0.22379176318645477, "learning_rate": 4.9912615743127146e-05, "logits/chosen": -8.89384937286377, "logits/rejected": -8.887595176696777, "logps/chosen": -6.6560564041137695, "logps/rejected": -106.0225830078125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.596670150756836, "rewards/margins": 9.99736499786377, "rewards/rejected": -7.400694370269775, "step": 988 }, { "epoch": 0.6942786942786943, "grad_norm": 0.012225447222590446, "learning_rate": 4.991993819328633e-05, "logits/chosen": -9.525252342224121, "logits/rejected": -9.518180847167969, "logps/chosen": -2.7024435997009277, "logps/rejected": -106.21002197265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8332107067108154, "rewards/margins": 10.245793342590332, "rewards/rejected": -7.412582874298096, "step": 989 }, { "epoch": 0.694980694980695, "grad_norm": 0.009187855757772923, "learning_rate": 4.9927253243292505e-05, "logits/chosen": -9.376376152038574, "logits/rejected": -9.362960815429688, "logps/chosen": -5.217571258544922, "logps/rejected": -105.05477142333984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5357913970947266, "rewards/margins": 9.960129737854004, "rewards/rejected": -7.4243388175964355, "step": 990 }, { "epoch": 0.6956826956826957, "grad_norm": 0.0012468240456655622, "learning_rate": 4.993456090808793e-05, "logits/chosen": -8.407273292541504, "logits/rejected": -8.39706039428711, "logps/chosen": -6.167862892150879, "logps/rejected": -106.23961639404297, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.597719192504883, "rewards/margins": 10.016653060913086, "rewards/rejected": -7.418933868408203, "step": 991 }, { "epoch": 0.6963846963846964, "grad_norm": 0.00174127914942801, "learning_rate": 4.994186120256965e-05, "logits/chosen": -9.788656234741211, "logits/rejected": -9.785331726074219, "logps/chosen": -1.7114769220352173, "logps/rejected": -108.39680480957031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.839611291885376, "rewards/margins": 10.54701042175293, "rewards/rejected": -7.707398891448975, "step": 992 }, { "epoch": 0.6970866970866971, "grad_norm": 0.0038266980554908514, "learning_rate": 4.9949154141589696e-05, "logits/chosen": -8.331832885742188, "logits/rejected": -8.330713272094727, "logps/chosen": -5.593840599060059, "logps/rejected": -104.78450775146484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.616225242614746, "rewards/margins": 9.786722183227539, "rewards/rejected": -7.170495986938477, "step": 993 }, { "epoch": 0.6977886977886978, "grad_norm": 0.0746409222483635, "learning_rate": 4.995643973995523e-05, "logits/chosen": -8.346939086914062, "logits/rejected": -8.34863567352295, "logps/chosen": -6.066847324371338, "logps/rejected": -107.4844741821289, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.6427505016326904, "rewards/margins": 10.033770561218262, "rewards/rejected": -7.391019821166992, "step": 994 }, { "epoch": 0.6984906984906984, "grad_norm": 0.002209673635661602, "learning_rate": 4.9963718012428765e-05, "logits/chosen": -9.216997146606445, "logits/rejected": -9.208126068115234, "logps/chosen": -2.871528387069702, "logps/rejected": -108.61660766601562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8565163612365723, "rewards/margins": 10.570398330688477, "rewards/rejected": -7.7138824462890625, "step": 995 }, { "epoch": 0.6991926991926992, "grad_norm": 0.005070014391094446, "learning_rate": 4.9970988973728314e-05, "logits/chosen": -9.894582748413086, "logits/rejected": -9.889062881469727, "logps/chosen": -5.483277320861816, "logps/rejected": -107.2376708984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.57481050491333, "rewards/margins": 10.137191772460938, "rewards/rejected": -7.562381744384766, "step": 996 }, { "epoch": 0.6998946998946999, "grad_norm": 0.001706116134300828, "learning_rate": 4.99782526385276e-05, "logits/chosen": -9.4254150390625, "logits/rejected": -9.436277389526367, "logps/chosen": -7.709285259246826, "logps/rejected": -103.17377471923828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.219356060028076, "rewards/margins": 9.56740951538086, "rewards/rejected": -7.348053932189941, "step": 997 }, { "epoch": 0.7005967005967007, "grad_norm": 0.7436209917068481, "learning_rate": 4.998550902145619e-05, "logits/chosen": -8.167625427246094, "logits/rejected": -8.14587688446045, "logps/chosen": -4.52590799331665, "logps/rejected": -104.21467590332031, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.7888221740722656, "rewards/margins": 9.954761505126953, "rewards/rejected": -7.165938854217529, "step": 998 }, { "epoch": 0.7012987012987013, "grad_norm": 0.0038634801749140024, "learning_rate": 4.999275813709971e-05, "logits/chosen": -9.065649032592773, "logits/rejected": -9.063300132751465, "logps/chosen": -2.118589401245117, "logps/rejected": -109.28277587890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9323267936706543, "rewards/margins": 10.603015899658203, "rewards/rejected": -7.670688152313232, "step": 999 }, { "epoch": 0.702000702000702, "grad_norm": 0.27056068181991577, "learning_rate": 5e-05, "logits/chosen": -8.361536979675293, "logits/rejected": -8.346529960632324, "logps/chosen": -7.092010021209717, "logps/rejected": -106.09686279296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.54880690574646, "rewards/margins": 9.896037101745605, "rewards/rejected": -7.347229957580566, "step": 1000 }, { "epoch": 0.7027027027027027, "grad_norm": 0.0012268010759726167, "learning_rate": 4.999931382857749e-05, "logits/chosen": -9.223247528076172, "logits/rejected": -9.227653503417969, "logps/chosen": -8.967948913574219, "logps/rejected": -104.31336975097656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.302248001098633, "rewards/margins": 9.539461135864258, "rewards/rejected": -7.237214088439941, "step": 1001 }, { "epoch": 0.7034047034047034, "grad_norm": 0.00593750923871994, "learning_rate": 4.9997255351980225e-05, "logits/chosen": -9.17724895477295, "logits/rejected": -9.178215026855469, "logps/chosen": -6.2656941413879395, "logps/rejected": -107.07585144042969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.602689266204834, "rewards/margins": 10.144128799438477, "rewards/rejected": -7.541438579559326, "step": 1002 }, { "epoch": 0.7041067041067041, "grad_norm": 0.005843866150826216, "learning_rate": 4.999382468321693e-05, "logits/chosen": -8.96492862701416, "logits/rejected": -8.96467399597168, "logps/chosen": -4.340137958526611, "logps/rejected": -108.50267791748047, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.727724552154541, "rewards/margins": 10.39712142944336, "rewards/rejected": -7.66939640045166, "step": 1003 }, { "epoch": 0.7048087048087048, "grad_norm": 0.011775449849665165, "learning_rate": 4.9989022010628595e-05, "logits/chosen": -8.721294403076172, "logits/rejected": -8.686796188354492, "logps/chosen": -3.765620231628418, "logps/rejected": -108.78058624267578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.865610122680664, "rewards/margins": 10.542180061340332, "rewards/rejected": -7.676569938659668, "step": 1004 }, { "epoch": 0.7055107055107055, "grad_norm": 0.025173859670758247, "learning_rate": 4.99828475978781e-05, "logits/chosen": -7.764649391174316, "logits/rejected": -7.757041931152344, "logps/chosen": -7.582302093505859, "logps/rejected": -104.91786193847656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4729700088500977, "rewards/margins": 9.74973201751709, "rewards/rejected": -7.27676248550415, "step": 1005 }, { "epoch": 0.7062127062127063, "grad_norm": 0.002708728890866041, "learning_rate": 4.997530178393581e-05, "logits/chosen": -9.038423538208008, "logits/rejected": -9.0181884765625, "logps/chosen": -7.981420040130615, "logps/rejected": -107.80186462402344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.493407964706421, "rewards/margins": 9.993814468383789, "rewards/rejected": -7.5004072189331055, "step": 1006 }, { "epoch": 0.7069147069147069, "grad_norm": 0.002602911554276943, "learning_rate": 4.99663849830609e-05, "logits/chosen": -9.28097915649414, "logits/rejected": -9.262073516845703, "logps/chosen": -9.319086074829102, "logps/rejected": -105.1153564453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1870052814483643, "rewards/margins": 9.58493423461914, "rewards/rejected": -7.3979291915893555, "step": 1007 }, { "epoch": 0.7076167076167076, "grad_norm": 0.0042364574037492275, "learning_rate": 4.995609768477863e-05, "logits/chosen": -8.521207809448242, "logits/rejected": -8.517193794250488, "logps/chosen": -3.7284626960754395, "logps/rejected": -109.26519012451172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7046122550964355, "rewards/margins": 10.565170288085938, "rewards/rejected": -7.860558032989502, "step": 1008 }, { "epoch": 0.7083187083187084, "grad_norm": 0.003927813842892647, "learning_rate": 4.994444045385348e-05, "logits/chosen": -8.172853469848633, "logits/rejected": -8.16762638092041, "logps/chosen": -11.122672080993652, "logps/rejected": -100.71736907958984, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.1412413120269775, "rewards/margins": 8.915751457214355, "rewards/rejected": -6.774509906768799, "step": 1009 }, { "epoch": 0.709020709020709, "grad_norm": 0.03765709325671196, "learning_rate": 4.993141393025816e-05, "logits/chosen": -9.024906158447266, "logits/rejected": -9.027116775512695, "logps/chosen": -7.907480716705322, "logps/rejected": -107.94783020019531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.552839756011963, "rewards/margins": 10.102409362792969, "rewards/rejected": -7.549570083618164, "step": 1010 }, { "epoch": 0.7097227097227097, "grad_norm": 0.002265612594783306, "learning_rate": 4.9917018829138434e-05, "logits/chosen": -8.5015869140625, "logits/rejected": -8.49515151977539, "logps/chosen": -6.566116809844971, "logps/rejected": -107.88589477539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6039161682128906, "rewards/margins": 10.213939666748047, "rewards/rejected": -7.610023498535156, "step": 1011 }, { "epoch": 0.7104247104247104, "grad_norm": 0.07267352193593979, "learning_rate": 4.990125594077389e-05, "logits/chosen": -8.686239242553711, "logits/rejected": -8.675270080566406, "logps/chosen": -7.099653244018555, "logps/rejected": -107.49678802490234, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.4597134590148926, "rewards/margins": 10.012331008911133, "rewards/rejected": -7.552617073059082, "step": 1012 }, { "epoch": 0.7111267111267111, "grad_norm": 0.00396944722160697, "learning_rate": 4.988412613053455e-05, "logits/chosen": -8.009740829467773, "logits/rejected": -8.011014938354492, "logps/chosen": -7.958301067352295, "logps/rejected": -107.99824523925781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4680733680725098, "rewards/margins": 9.996828079223633, "rewards/rejected": -7.528755187988281, "step": 1013 }, { "epoch": 0.7118287118287119, "grad_norm": 0.0009391521452926099, "learning_rate": 4.986563033883335e-05, "logits/chosen": -8.573081970214844, "logits/rejected": -8.545981407165527, "logps/chosen": -9.768707275390625, "logps/rejected": -103.47964477539062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.282301187515259, "rewards/margins": 9.392932891845703, "rewards/rejected": -7.110630989074707, "step": 1014 }, { "epoch": 0.7125307125307125, "grad_norm": 0.025842875242233276, "learning_rate": 4.9845769581074534e-05, "logits/chosen": -9.014991760253906, "logits/rejected": -9.007856369018555, "logps/chosen": -4.236358642578125, "logps/rejected": -109.59085845947266, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7435030937194824, "rewards/margins": 10.509519577026367, "rewards/rejected": -7.766016960144043, "step": 1015 }, { "epoch": 0.7132327132327132, "grad_norm": 0.0029032889287918806, "learning_rate": 4.9824544947597894e-05, "logits/chosen": -8.48843002319336, "logits/rejected": -8.474185943603516, "logps/chosen": -6.870312690734863, "logps/rejected": -107.86450958251953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4446897506713867, "rewards/margins": 10.054931640625, "rewards/rejected": -7.61024284362793, "step": 1016 }, { "epoch": 0.713934713934714, "grad_norm": 0.0020692371763288975, "learning_rate": 4.980195760361889e-05, "logits/chosen": -8.795137405395508, "logits/rejected": -8.788344383239746, "logps/chosen": -4.277478218078613, "logps/rejected": -109.63301086425781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7205023765563965, "rewards/margins": 10.541292190551758, "rewards/rejected": -7.8207902908325195, "step": 1017 }, { "epoch": 0.7146367146367146, "grad_norm": 0.0017475102795287967, "learning_rate": 4.977800878916474e-05, "logits/chosen": -8.36575984954834, "logits/rejected": -8.35770034790039, "logps/chosen": -4.336680889129639, "logps/rejected": -109.30559539794922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7229394912719727, "rewards/margins": 10.47648811340332, "rewards/rejected": -7.753548622131348, "step": 1018 }, { "epoch": 0.7153387153387153, "grad_norm": 0.3341398239135742, "learning_rate": 4.975269981900627e-05, "logits/chosen": -9.372381210327148, "logits/rejected": -9.369328498840332, "logps/chosen": -3.933842897415161, "logps/rejected": -109.55409240722656, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.765256881713867, "rewards/margins": 10.547815322875977, "rewards/rejected": -7.782557487487793, "step": 1019 }, { "epoch": 0.7160407160407161, "grad_norm": 0.0015504619805142283, "learning_rate": 4.97260320825858e-05, "logits/chosen": -8.445056915283203, "logits/rejected": -8.438835144042969, "logps/chosen": -3.8094184398651123, "logps/rejected": -109.8012466430664, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.809751033782959, "rewards/margins": 10.597919464111328, "rewards/rejected": -7.788167953491211, "step": 1020 }, { "epoch": 0.7167427167427167, "grad_norm": 1.7815498113632202, "learning_rate": 4.969800704394081e-05, "logits/chosen": -8.3463134765625, "logits/rejected": -8.322458267211914, "logps/chosen": -7.61067533493042, "logps/rejected": -108.02999877929688, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": 2.697385311126709, "rewards/margins": 10.161039352416992, "rewards/rejected": -7.463654518127441, "step": 1021 }, { "epoch": 0.7174447174447175, "grad_norm": 0.0011026364518329501, "learning_rate": 4.9668626241623635e-05, "logits/chosen": -9.328977584838867, "logits/rejected": -9.316388130187988, "logps/chosen": -7.490750312805176, "logps/rejected": -107.34168243408203, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.500336170196533, "rewards/margins": 10.057511329650879, "rewards/rejected": -7.557175159454346, "step": 1022 }, { "epoch": 0.7181467181467182, "grad_norm": 0.010195756331086159, "learning_rate": 4.963789128861689e-05, "logits/chosen": -8.208311080932617, "logits/rejected": -8.2136869430542, "logps/chosen": -8.097630500793457, "logps/rejected": -107.39784240722656, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4351398944854736, "rewards/margins": 9.847148895263672, "rewards/rejected": -7.412009239196777, "step": 1023 }, { "epoch": 0.7188487188487188, "grad_norm": 0.0011243007611483335, "learning_rate": 4.960580387224505e-05, "logits/chosen": -8.452814102172852, "logits/rejected": -8.468320846557617, "logps/chosen": -5.3363823890686035, "logps/rejected": -106.5301513671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6269853115081787, "rewards/margins": 10.086844444274902, "rewards/rejected": -7.4598588943481445, "step": 1024 }, { "epoch": 0.7195507195507196, "grad_norm": 0.004286118317395449, "learning_rate": 4.957236575408172e-05, "logits/chosen": -8.588497161865234, "logits/rejected": -8.579384803771973, "logps/chosen": -6.625768184661865, "logps/rejected": -106.61639404296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.610419750213623, "rewards/margins": 9.97111701965332, "rewards/rejected": -7.3606977462768555, "step": 1025 }, { "epoch": 0.7202527202527202, "grad_norm": 0.0023268877994269133, "learning_rate": 4.953757876985294e-05, "logits/chosen": -8.605910301208496, "logits/rejected": -8.58769416809082, "logps/chosen": -2.7427549362182617, "logps/rejected": -108.26266479492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.86726450920105, "rewards/margins": 10.611282348632812, "rewards/rejected": -7.744017601013184, "step": 1026 }, { "epoch": 0.7209547209547209, "grad_norm": 0.0008682281477376819, "learning_rate": 4.9501444829336464e-05, "logits/chosen": -8.594427108764648, "logits/rejected": -8.609682083129883, "logps/chosen": -5.486394882202148, "logps/rejected": -106.61356353759766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.674640655517578, "rewards/margins": 10.069279670715332, "rewards/rejected": -7.394639015197754, "step": 1027 }, { "epoch": 0.7216567216567217, "grad_norm": 0.010534292086958885, "learning_rate": 4.9463965916256854e-05, "logits/chosen": -9.663558959960938, "logits/rejected": -9.67332649230957, "logps/chosen": -2.8699042797088623, "logps/rejected": -108.43853759765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.825015068054199, "rewards/margins": 10.561454772949219, "rewards/rejected": -7.736439228057861, "step": 1028 }, { "epoch": 0.7223587223587223, "grad_norm": 0.01085755042731762, "learning_rate": 4.9425144088176586e-05, "logits/chosen": -9.252580642700195, "logits/rejected": -9.239480972290039, "logps/chosen": -8.67245101928711, "logps/rejected": -103.90802001953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3882901668548584, "rewards/margins": 9.465763092041016, "rewards/rejected": -7.077474117279053, "step": 1029 }, { "epoch": 0.7230607230607231, "grad_norm": 0.0015683851670473814, "learning_rate": 4.938498147638311e-05, "logits/chosen": -9.545220375061035, "logits/rejected": -9.529458999633789, "logps/chosen": -2.795137405395508, "logps/rejected": -108.48755645751953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.80749773979187, "rewards/margins": 10.525470733642578, "rewards/rejected": -7.7179718017578125, "step": 1030 }, { "epoch": 0.7237627237627238, "grad_norm": 0.0011460700770840049, "learning_rate": 4.9343480285771825e-05, "logits/chosen": -8.353163719177246, "logits/rejected": -8.349479675292969, "logps/chosen": -4.553289413452148, "logps/rejected": -107.96846771240234, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6983144283294678, "rewards/margins": 10.377872467041016, "rewards/rejected": -7.679558753967285, "step": 1031 }, { "epoch": 0.7244647244647244, "grad_norm": 1.1489983797073364, "learning_rate": 4.930064279472507e-05, "logits/chosen": -8.514686584472656, "logits/rejected": -8.513692855834961, "logps/chosen": -4.4177751541137695, "logps/rejected": -107.62681579589844, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 2.7176718711853027, "rewards/margins": 10.375585556030273, "rewards/rejected": -7.657915115356445, "step": 1032 }, { "epoch": 0.7251667251667252, "grad_norm": 0.013727670535445213, "learning_rate": 4.925647135498698e-05, "logits/chosen": -8.608744621276855, "logits/rejected": -8.61436939239502, "logps/chosen": -13.796196937561035, "logps/rejected": -104.46963500976562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.239619255065918, "rewards/margins": 9.196405410766602, "rewards/rejected": -6.956786155700684, "step": 1033 }, { "epoch": 0.7258687258687259, "grad_norm": 0.0015913991956040263, "learning_rate": 4.921096839153443e-05, "logits/chosen": -8.88969612121582, "logits/rejected": -8.893884658813477, "logps/chosen": -3.7775139808654785, "logps/rejected": -106.711669921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8870162963867188, "rewards/margins": 10.329615592956543, "rewards/rejected": -7.442598819732666, "step": 1034 }, { "epoch": 0.7265707265707265, "grad_norm": 0.00206986372359097, "learning_rate": 4.916413640244386e-05, "logits/chosen": -9.125913619995117, "logits/rejected": -9.114376068115234, "logps/chosen": -7.9152936935424805, "logps/rejected": -102.4566650390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.305094003677368, "rewards/margins": 9.380559921264648, "rewards/rejected": -7.075466632843018, "step": 1035 }, { "epoch": 0.7272727272727273, "grad_norm": 0.001317084883339703, "learning_rate": 4.91159779587542e-05, "logits/chosen": -9.182975769042969, "logits/rejected": -9.16839599609375, "logps/chosen": -3.521531105041504, "logps/rejected": -106.81242370605469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.763153076171875, "rewards/margins": 10.346090316772461, "rewards/rejected": -7.582937240600586, "step": 1036 }, { "epoch": 0.727974727974728, "grad_norm": 0.0017087877495214343, "learning_rate": 4.906649570432566e-05, "logits/chosen": -9.631949424743652, "logits/rejected": -9.641960144042969, "logps/chosen": -6.389821529388428, "logps/rejected": -104.8048324584961, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5531005859375, "rewards/margins": 9.812897682189941, "rewards/rejected": -7.259797096252441, "step": 1037 }, { "epoch": 0.7286767286767287, "grad_norm": 0.005706769414246082, "learning_rate": 4.90156923556946e-05, "logits/chosen": -8.910030364990234, "logits/rejected": -8.896661758422852, "logps/chosen": -6.610097408294678, "logps/rejected": -107.12095642089844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4741225242614746, "rewards/margins": 9.962482452392578, "rewards/rejected": -7.488358974456787, "step": 1038 }, { "epoch": 0.7293787293787294, "grad_norm": 0.002156425267457962, "learning_rate": 4.8963570701924404e-05, "logits/chosen": -8.526999473571777, "logits/rejected": -8.52464485168457, "logps/chosen": -8.853584289550781, "logps/rejected": -103.80242156982422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4336633682250977, "rewards/margins": 9.546956062316895, "rewards/rejected": -7.113293647766113, "step": 1039 }, { "epoch": 0.73008073008073, "grad_norm": 0.0031477909069508314, "learning_rate": 4.8910133604452376e-05, "logits/chosen": -9.54861831665039, "logits/rejected": -9.554950714111328, "logps/chosen": -5.9484381675720215, "logps/rejected": -103.09799194335938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3713996410369873, "rewards/margins": 9.701369285583496, "rewards/rejected": -7.32996940612793, "step": 1040 }, { "epoch": 0.7307827307827308, "grad_norm": 0.0037793624214828014, "learning_rate": 4.88553839969326e-05, "logits/chosen": -8.758234024047852, "logits/rejected": -8.74787712097168, "logps/chosen": -8.866317749023438, "logps/rejected": -103.668701171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4439382553100586, "rewards/margins": 9.524602890014648, "rewards/rejected": -7.08066463470459, "step": 1041 }, { "epoch": 0.7314847314847315, "grad_norm": 0.002150552347302437, "learning_rate": 4.879932488507493e-05, "logits/chosen": -8.907407760620117, "logits/rejected": -8.87508487701416, "logps/chosen": -6.08596658706665, "logps/rejected": -103.81814575195312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6634178161621094, "rewards/margins": 9.846409797668457, "rewards/rejected": -7.182991981506348, "step": 1042 }, { "epoch": 0.7321867321867321, "grad_norm": 0.005313067231327295, "learning_rate": 4.874195934647999e-05, "logits/chosen": -8.410106658935547, "logits/rejected": -8.415563583374023, "logps/chosen": -6.5918474197387695, "logps/rejected": -102.16026306152344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.650787830352783, "rewards/margins": 9.670526504516602, "rewards/rejected": -7.019739151000977, "step": 1043 }, { "epoch": 0.7328887328887329, "grad_norm": 0.026137221604585648, "learning_rate": 4.8683290530470146e-05, "logits/chosen": -8.669836044311523, "logits/rejected": -8.663743019104004, "logps/chosen": -4.831940650939941, "logps/rejected": -104.81051635742188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.736043930053711, "rewards/margins": 9.986885070800781, "rewards/rejected": -7.250842094421387, "step": 1044 }, { "epoch": 0.7335907335907336, "grad_norm": 0.002068733097985387, "learning_rate": 4.8623321657916704e-05, "logits/chosen": -8.727191925048828, "logits/rejected": -8.73214054107666, "logps/chosen": -2.4697306156158447, "logps/rejected": -108.91468811035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.958616256713867, "rewards/margins": 10.498278617858887, "rewards/rejected": -7.5396623611450195, "step": 1045 }, { "epoch": 0.7342927342927343, "grad_norm": 0.003749322844669223, "learning_rate": 4.8562056021063e-05, "logits/chosen": -9.3921537399292, "logits/rejected": -9.372808456420898, "logps/chosen": -9.756439208984375, "logps/rejected": -103.43930053710938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4071009159088135, "rewards/margins": 9.419949531555176, "rewards/rejected": -7.012848854064941, "step": 1046 }, { "epoch": 0.734994734994735, "grad_norm": 0.0031361014116555452, "learning_rate": 4.8499496983343704e-05, "logits/chosen": -9.43799114227295, "logits/rejected": -9.430107116699219, "logps/chosen": -13.800378799438477, "logps/rejected": -94.59033203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9966052770614624, "rewards/margins": 8.181751251220703, "rewards/rejected": -6.185145854949951, "step": 1047 }, { "epoch": 0.7356967356967357, "grad_norm": 0.0030404741410166025, "learning_rate": 4.84356479792002e-05, "logits/chosen": -8.51938533782959, "logits/rejected": -8.519571304321289, "logps/chosen": -7.925787925720215, "logps/rejected": -104.59512329101562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4156956672668457, "rewards/margins": 9.547065734863281, "rewards/rejected": -7.131371021270752, "step": 1048 }, { "epoch": 0.7363987363987364, "grad_norm": 0.10398770868778229, "learning_rate": 4.8370512513891935e-05, "logits/chosen": -9.26296615600586, "logits/rejected": -9.277349472045898, "logps/chosen": -5.346919059753418, "logps/rejected": -104.41441345214844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 2.6947059631347656, "rewards/margins": 9.991933822631836, "rewards/rejected": -7.297226905822754, "step": 1049 }, { "epoch": 0.7371007371007371, "grad_norm": 0.17798513174057007, "learning_rate": 4.8304094163304124e-05, "logits/chosen": -9.861695289611816, "logits/rejected": -9.859405517578125, "logps/chosen": -7.803020000457764, "logps/rejected": -100.31060791015625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.270366668701172, "rewards/margins": 9.269729614257812, "rewards/rejected": -6.999362945556641, "step": 1050 }, { "epoch": 0.7378027378027378, "grad_norm": 0.003256521187722683, "learning_rate": 4.8236396573751306e-05, "logits/chosen": -9.142609596252441, "logits/rejected": -9.131745338439941, "logps/chosen": -6.455196380615234, "logps/rejected": -106.50381469726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7073941230773926, "rewards/margins": 9.916910171508789, "rewards/rejected": -7.209516525268555, "step": 1051 }, { "epoch": 0.7385047385047385, "grad_norm": 0.0021719445940107107, "learning_rate": 4.8167423461777255e-05, "logits/chosen": -8.82609748840332, "logits/rejected": -8.808719635009766, "logps/chosen": -5.327970027923584, "logps/rejected": -103.45545959472656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7465147972106934, "rewards/margins": 9.84272289276123, "rewards/rejected": -7.096207618713379, "step": 1052 }, { "epoch": 0.7392067392067392, "grad_norm": 0.002316446742042899, "learning_rate": 4.80971786139509e-05, "logits/chosen": -8.344741821289062, "logits/rejected": -8.35004711151123, "logps/chosen": -8.319478988647461, "logps/rejected": -101.16810607910156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.306238889694214, "rewards/margins": 9.264419555664062, "rewards/rejected": -6.958180904388428, "step": 1053 }, { "epoch": 0.73990873990874, "grad_norm": 0.0048733921721577644, "learning_rate": 4.802566588665842e-05, "logits/chosen": -8.417311668395996, "logits/rejected": -8.410717010498047, "logps/chosen": -9.727775573730469, "logps/rejected": -100.42353057861328, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.325913906097412, "rewards/margins": 9.101404190063477, "rewards/rejected": -6.775489330291748, "step": 1054 }, { "epoch": 0.7406107406107406, "grad_norm": 0.0030687940306961536, "learning_rate": 4.7952889205891635e-05, "logits/chosen": -9.30984878540039, "logits/rejected": -9.307855606079102, "logps/chosen": -5.340876579284668, "logps/rejected": -106.7603988647461, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.532466173171997, "rewards/margins": 10.03196907043457, "rewards/rejected": -7.499502182006836, "step": 1055 }, { "epoch": 0.7413127413127413, "grad_norm": 0.0012669505085796118, "learning_rate": 4.787885256703236e-05, "logits/chosen": -8.586193084716797, "logits/rejected": -8.57686710357666, "logps/chosen": -5.694167613983154, "logps/rejected": -104.98165893554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.782954692840576, "rewards/margins": 10.04830551147461, "rewards/rejected": -7.265350818634033, "step": 1056 }, { "epoch": 0.742014742014742, "grad_norm": 0.002595028607174754, "learning_rate": 4.78035600346331e-05, "logits/chosen": -8.436113357543945, "logits/rejected": -8.442420959472656, "logps/chosen": -7.399031162261963, "logps/rejected": -104.32888793945312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.422450304031372, "rewards/margins": 9.658333778381348, "rewards/rejected": -7.235882759094238, "step": 1057 }, { "epoch": 0.7427167427167427, "grad_norm": 0.002428911393508315, "learning_rate": 4.7727015742193945e-05, "logits/chosen": -8.368290901184082, "logits/rejected": -8.365116119384766, "logps/chosen": -10.122030258178711, "logps/rejected": -103.07577514648438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.305039405822754, "rewards/margins": 9.262821197509766, "rewards/rejected": -6.957781791687012, "step": 1058 }, { "epoch": 0.7434187434187434, "grad_norm": 4.05342435836792, "learning_rate": 4.7649223891935594e-05, "logits/chosen": -9.710041999816895, "logits/rejected": -9.711601257324219, "logps/chosen": -4.732644081115723, "logps/rejected": -105.44696044921875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 2.7537596225738525, "rewards/margins": 10.068292617797852, "rewards/rejected": -7.314533233642578, "step": 1059 }, { "epoch": 0.7441207441207441, "grad_norm": 0.008156750351190567, "learning_rate": 4.757018875456868e-05, "logits/chosen": -9.696601867675781, "logits/rejected": -9.685569763183594, "logps/chosen": -7.916329383850098, "logps/rejected": -106.32865905761719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.323664426803589, "rewards/margins": 9.772764205932617, "rewards/rejected": -7.449100017547607, "step": 1060 }, { "epoch": 0.7448227448227448, "grad_norm": 0.006657208316028118, "learning_rate": 4.748991466905931e-05, "logits/chosen": -8.499704360961914, "logits/rejected": -8.504343032836914, "logps/chosen": -9.818764686584473, "logps/rejected": -103.2306137084961, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3195624351501465, "rewards/margins": 9.268195152282715, "rewards/rejected": -6.94863224029541, "step": 1061 }, { "epoch": 0.7455247455247456, "grad_norm": 0.001928415964357555, "learning_rate": 4.740840604239086e-05, "logits/chosen": -8.894515991210938, "logits/rejected": -8.902408599853516, "logps/chosen": -10.903565406799316, "logps/rejected": -103.04353332519531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.146817207336426, "rewards/margins": 9.212789535522461, "rewards/rejected": -7.065973281860352, "step": 1062 }, { "epoch": 0.7462267462267462, "grad_norm": 0.004459368530660868, "learning_rate": 4.7325667349322035e-05, "logits/chosen": -8.34304428100586, "logits/rejected": -8.332541465759277, "logps/chosen": -11.356977462768555, "logps/rejected": -98.17176818847656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.14943790435791, "rewards/margins": 8.764177322387695, "rewards/rejected": -6.614739418029785, "step": 1063 }, { "epoch": 0.7469287469287469, "grad_norm": 1.7379379272460938, "learning_rate": 4.72417031321412e-05, "logits/chosen": -8.80396556854248, "logits/rejected": -8.803340911865234, "logps/chosen": -7.893672943115234, "logps/rejected": -106.68942260742188, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 2.4457690715789795, "rewards/margins": 9.863764762878418, "rewards/rejected": -7.417996406555176, "step": 1064 }, { "epoch": 0.7476307476307477, "grad_norm": 0.9017641544342041, "learning_rate": 4.715651800041701e-05, "logits/chosen": -9.904816627502441, "logits/rejected": -9.907365798950195, "logps/chosen": -8.868206024169922, "logps/rejected": -104.56394958496094, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.2927236557006836, "rewards/margins": 9.54307746887207, "rewards/rejected": -7.250354290008545, "step": 1065 }, { "epoch": 0.7483327483327483, "grad_norm": 0.005697725806385279, "learning_rate": 4.707011663074538e-05, "logits/chosen": -8.813304901123047, "logits/rejected": -8.794107437133789, "logps/chosen": -9.518684387207031, "logps/rejected": -100.41761779785156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3205666542053223, "rewards/margins": 9.136664390563965, "rewards/rejected": -6.816098213195801, "step": 1066 }, { "epoch": 0.749034749034749, "grad_norm": 0.0015864805318415165, "learning_rate": 4.698250376649269e-05, "logits/chosen": -9.046417236328125, "logits/rejected": -9.049603462219238, "logps/chosen": -4.463600158691406, "logps/rejected": -108.20648193359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.785405158996582, "rewards/margins": 10.32480239868164, "rewards/rejected": -7.539396286010742, "step": 1067 }, { "epoch": 0.7497367497367498, "grad_norm": 0.8191275596618652, "learning_rate": 4.6893684217535426e-05, "logits/chosen": -9.10501766204834, "logits/rejected": -9.099365234375, "logps/chosen": -7.078818321228027, "logps/rejected": -106.36658477783203, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.407724142074585, "rewards/margins": 9.869100570678711, "rewards/rejected": -7.461377143859863, "step": 1068 }, { "epoch": 0.7504387504387504, "grad_norm": 0.03466533496975899, "learning_rate": 4.6803662859996126e-05, "logits/chosen": -8.855403900146484, "logits/rejected": -8.851434707641602, "logps/chosen": -7.510464191436768, "logps/rejected": -104.57545471191406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3694424629211426, "rewards/margins": 9.605908393859863, "rewards/rejected": -7.2364654541015625, "step": 1069 }, { "epoch": 0.7511407511407512, "grad_norm": 1.5301357507705688, "learning_rate": 4.67124446359756e-05, "logits/chosen": -8.375207901000977, "logits/rejected": -8.386210441589355, "logps/chosen": -8.798465728759766, "logps/rejected": -105.52764892578125, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 2.3255393505096436, "rewards/margins": 9.596593856811523, "rewards/rejected": -7.271055221557617, "step": 1070 }, { "epoch": 0.7518427518427518, "grad_norm": 0.027162037789821625, "learning_rate": 4.662003455328173e-05, "logits/chosen": -8.636449813842773, "logits/rejected": -8.631000518798828, "logps/chosen": -9.441484451293945, "logps/rejected": -101.97847747802734, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.2805447578430176, "rewards/margins": 9.302162170410156, "rewards/rejected": -7.021617412567139, "step": 1071 }, { "epoch": 0.7525447525447525, "grad_norm": 1.7150911092758179, "learning_rate": 4.652643768515447e-05, "logits/chosen": -8.376364707946777, "logits/rejected": -8.38484001159668, "logps/chosen": -10.884279251098633, "logps/rejected": -101.71783447265625, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 2.1580660343170166, "rewards/margins": 9.060480117797852, "rewards/rejected": -6.902414798736572, "step": 1072 }, { "epoch": 0.7532467532467533, "grad_norm": 0.054095763713121414, "learning_rate": 4.643165916998735e-05, "logits/chosen": -8.50536060333252, "logits/rejected": -8.486649513244629, "logps/chosen": -10.77769947052002, "logps/rejected": -102.42198181152344, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.2420735359191895, "rewards/margins": 9.177336692810059, "rewards/rejected": -6.935263633728027, "step": 1073 }, { "epoch": 0.7539487539487539, "grad_norm": 0.004826526623219252, "learning_rate": 4.633570421104539e-05, "logits/chosen": -9.030445098876953, "logits/rejected": -9.02934455871582, "logps/chosen": -7.398488521575928, "logps/rejected": -105.12763977050781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4345078468322754, "rewards/margins": 9.73592758178711, "rewards/rejected": -7.301421165466309, "step": 1074 }, { "epoch": 0.7546507546507546, "grad_norm": 0.0035372376441955566, "learning_rate": 4.6238578076179414e-05, "logits/chosen": -8.5231351852417, "logits/rejected": -8.540294647216797, "logps/chosen": -4.683618545532227, "logps/rejected": -105.4693603515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.691410541534424, "rewards/margins": 10.101755142211914, "rewards/rejected": -7.410345077514648, "step": 1075 }, { "epoch": 0.7553527553527554, "grad_norm": 0.06480871140956879, "learning_rate": 4.614028609753689e-05, "logits/chosen": -8.888848304748535, "logits/rejected": -8.905357360839844, "logps/chosen": -8.237481117248535, "logps/rejected": -102.07820892333984, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.514596462249756, "rewards/margins": 9.397239685058594, "rewards/rejected": -6.882643699645996, "step": 1076 }, { "epoch": 0.756054756054756, "grad_norm": 0.0033754718024283648, "learning_rate": 4.6040833671269204e-05, "logits/chosen": -8.090929985046387, "logits/rejected": -8.058685302734375, "logps/chosen": -8.020112991333008, "logps/rejected": -98.70536041259766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.300363779067993, "rewards/margins": 8.931620597839355, "rewards/rejected": -6.631257057189941, "step": 1077 }, { "epoch": 0.7567567567567568, "grad_norm": 0.01535646989941597, "learning_rate": 4.594022625723533e-05, "logits/chosen": -7.796875953674316, "logits/rejected": -7.780384063720703, "logps/chosen": -8.515716552734375, "logps/rejected": -101.65373229980469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.481574773788452, "rewards/margins": 9.379274368286133, "rewards/rejected": -6.897700309753418, "step": 1078 }, { "epoch": 0.7574587574587575, "grad_norm": 0.006260486785322428, "learning_rate": 4.58384693787022e-05, "logits/chosen": -8.765915870666504, "logits/rejected": -8.763837814331055, "logps/chosen": -5.619418144226074, "logps/rejected": -102.56797790527344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.550363063812256, "rewards/margins": 9.652854919433594, "rewards/rejected": -7.102492332458496, "step": 1079 }, { "epoch": 0.7581607581607581, "grad_norm": 0.024930575862526894, "learning_rate": 4.573556862204142e-05, "logits/chosen": -8.225576400756836, "logits/rejected": -8.21065902709961, "logps/chosen": -6.05092191696167, "logps/rejected": -98.10794067382812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.717177391052246, "rewards/margins": 9.257137298583984, "rewards/rejected": -6.539959907531738, "step": 1080 }, { "epoch": 0.7588627588627589, "grad_norm": 0.020253285765647888, "learning_rate": 4.56315296364226e-05, "logits/chosen": -8.617375373840332, "logits/rejected": -8.625936508178711, "logps/chosen": -6.3098859786987305, "logps/rejected": -103.28431701660156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.543961524963379, "rewards/margins": 9.557648658752441, "rewards/rejected": -7.0136871337890625, "step": 1081 }, { "epoch": 0.7595647595647596, "grad_norm": 0.03242947533726692, "learning_rate": 4.552635813350319e-05, "logits/chosen": -8.627272605895996, "logits/rejected": -8.593059539794922, "logps/chosen": -9.220138549804688, "logps/rejected": -101.54685974121094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2647666931152344, "rewards/margins": 9.286711692810059, "rewards/rejected": -7.021944999694824, "step": 1082 }, { "epoch": 0.7602667602667603, "grad_norm": 0.013268508948385715, "learning_rate": 4.542005988711497e-05, "logits/chosen": -9.2227783203125, "logits/rejected": -9.206493377685547, "logps/chosen": -5.890615940093994, "logps/rejected": -105.872314453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.579252004623413, "rewards/margins": 9.97365951538086, "rewards/rejected": -7.394408226013184, "step": 1083 }, { "epoch": 0.760968760968761, "grad_norm": 0.00866982713341713, "learning_rate": 4.531264073294702e-05, "logits/chosen": -8.477767944335938, "logits/rejected": -8.454360961914062, "logps/chosen": -5.948091506958008, "logps/rejected": -100.80966186523438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5381689071655273, "rewards/margins": 9.479878425598145, "rewards/rejected": -6.941709518432617, "step": 1084 }, { "epoch": 0.7616707616707616, "grad_norm": 0.02187407575547695, "learning_rate": 4.5204106568225364e-05, "logits/chosen": -8.107698440551758, "logits/rejected": -8.104738235473633, "logps/chosen": -9.495615005493164, "logps/rejected": -99.89112091064453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.345778465270996, "rewards/margins": 9.06700325012207, "rewards/rejected": -6.721225261688232, "step": 1085 }, { "epoch": 0.7623727623727624, "grad_norm": 0.004980184603482485, "learning_rate": 4.5094463351389225e-05, "logits/chosen": -8.784870147705078, "logits/rejected": -8.765630722045898, "logps/chosen": -8.236637115478516, "logps/rejected": -101.31944274902344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.407823324203491, "rewards/margins": 9.386673927307129, "rewards/rejected": -6.978850364685059, "step": 1086 }, { "epoch": 0.7630747630747631, "grad_norm": 0.003115938976407051, "learning_rate": 4.4983717101763894e-05, "logits/chosen": -8.548591613769531, "logits/rejected": -8.550249099731445, "logps/chosen": -8.599196434020996, "logps/rejected": -101.64927673339844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.335099220275879, "rewards/margins": 9.282031059265137, "rewards/rejected": -6.946931838989258, "step": 1087 }, { "epoch": 0.7637767637767637, "grad_norm": 0.10018693655729294, "learning_rate": 4.48718738992303e-05, "logits/chosen": -9.175332069396973, "logits/rejected": -9.14633560180664, "logps/chosen": -5.146000862121582, "logps/rejected": -104.1231689453125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.612870216369629, "rewards/margins": 9.917726516723633, "rewards/rejected": -7.30485725402832, "step": 1088 }, { "epoch": 0.7644787644787645, "grad_norm": 0.007615898735821247, "learning_rate": 4.4758939883891196e-05, "logits/chosen": -8.87993335723877, "logits/rejected": -8.874822616577148, "logps/chosen": -8.552892684936523, "logps/rejected": -98.56887817382812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2446253299713135, "rewards/margins": 8.975013732910156, "rewards/rejected": -6.730388641357422, "step": 1089 }, { "epoch": 0.7651807651807652, "grad_norm": 0.16663262248039246, "learning_rate": 4.464492125573411e-05, "logits/chosen": -9.419349670410156, "logits/rejected": -9.398153305053711, "logps/chosen": -5.313534259796143, "logps/rejected": -104.12272644042969, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.526463508605957, "rewards/margins": 9.828997611999512, "rewards/rejected": -7.302533149719238, "step": 1090 }, { "epoch": 0.7658827658827659, "grad_norm": 0.00535274064168334, "learning_rate": 4.452982427429092e-05, "logits/chosen": -8.612276077270508, "logits/rejected": -8.58729076385498, "logps/chosen": -8.633564949035645, "logps/rejected": -103.705322265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3601510524749756, "rewards/margins": 9.54452133178711, "rewards/rejected": -7.184370994567871, "step": 1091 }, { "epoch": 0.7665847665847666, "grad_norm": 0.004508001264184713, "learning_rate": 4.441365525829427e-05, "logits/chosen": -9.421880722045898, "logits/rejected": -9.410762786865234, "logps/chosen": -6.782583236694336, "logps/rejected": -106.55768585205078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4810287952423096, "rewards/margins": 9.922093391418457, "rewards/rejected": -7.441065311431885, "step": 1092 }, { "epoch": 0.7672867672867673, "grad_norm": 0.003539337543770671, "learning_rate": 4.4296420585330625e-05, "logits/chosen": -8.950288772583008, "logits/rejected": -8.91672134399414, "logps/chosen": -7.624026298522949, "logps/rejected": -105.58937072753906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3439624309539795, "rewards/margins": 9.819503784179688, "rewards/rejected": -7.475541114807129, "step": 1093 }, { "epoch": 0.767988767988768, "grad_norm": 0.02028316631913185, "learning_rate": 4.417812669149019e-05, "logits/chosen": -9.71080207824707, "logits/rejected": -9.708568572998047, "logps/chosen": -5.159681797027588, "logps/rejected": -106.82047271728516, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.501831293106079, "rewards/margins": 10.127755165100098, "rewards/rejected": -7.625924110412598, "step": 1094 }, { "epoch": 0.7686907686907687, "grad_norm": 0.0015135441208258271, "learning_rate": 4.405878007101354e-05, "logits/chosen": -9.309835433959961, "logits/rejected": -9.304685592651367, "logps/chosen": -4.929189682006836, "logps/rejected": -106.77926635742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.498772382736206, "rewards/margins": 10.17509937286377, "rewards/rejected": -7.676326751708984, "step": 1095 }, { "epoch": 0.7693927693927693, "grad_norm": 0.004359577316790819, "learning_rate": 4.39383872759351e-05, "logits/chosen": -9.056137084960938, "logits/rejected": -9.033926963806152, "logps/chosen": -14.291450500488281, "logps/rejected": -101.88372039794922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8955268859863281, "rewards/margins": 8.710466384887695, "rewards/rejected": -6.814939498901367, "step": 1096 }, { "epoch": 0.7700947700947701, "grad_norm": 0.0019314191304147243, "learning_rate": 4.381695491572346e-05, "logits/chosen": -9.041913986206055, "logits/rejected": -9.036823272705078, "logps/chosen": -5.476228713989258, "logps/rejected": -104.29603576660156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.67014217376709, "rewards/margins": 9.818453788757324, "rewards/rejected": -7.148311614990234, "step": 1097 }, { "epoch": 0.7707967707967708, "grad_norm": 0.36782047152519226, "learning_rate": 4.36944896569185e-05, "logits/chosen": -8.180076599121094, "logits/rejected": -8.20188045501709, "logps/chosen": -10.470603942871094, "logps/rejected": -97.248291015625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.255629539489746, "rewards/margins": 8.669414520263672, "rewards/rejected": -6.413784027099609, "step": 1098 }, { "epoch": 0.7714987714987716, "grad_norm": 0.007272388786077499, "learning_rate": 4.3570998222765404e-05, "logits/chosen": -9.024726867675781, "logits/rejected": -9.013312339782715, "logps/chosen": -7.431600093841553, "logps/rejected": -105.4210433959961, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.461352825164795, "rewards/margins": 9.771629333496094, "rewards/rejected": -7.310276508331299, "step": 1099 }, { "epoch": 0.7722007722007722, "grad_norm": 0.0013618816155940294, "learning_rate": 4.344648739284558e-05, "logits/chosen": -8.517049789428711, "logits/rejected": -8.50128173828125, "logps/chosen": -4.811728477478027, "logps/rejected": -107.59443664550781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.629908561706543, "rewards/margins": 10.21812915802002, "rewards/rejected": -7.588220596313477, "step": 1100 }, { "epoch": 0.7729027729027729, "grad_norm": 0.0016405978240072727, "learning_rate": 4.332096400270444e-05, "logits/chosen": -8.722904205322266, "logits/rejected": -8.702173233032227, "logps/chosen": -6.183206558227539, "logps/rejected": -105.92369079589844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5706686973571777, "rewards/margins": 9.994735717773438, "rewards/rejected": -7.424066543579102, "step": 1101 }, { "epoch": 0.7736047736047736, "grad_norm": 0.0019273809157311916, "learning_rate": 4.3194434943476174e-05, "logits/chosen": -8.480945587158203, "logits/rejected": -8.486138343811035, "logps/chosen": -4.530284881591797, "logps/rejected": -107.36279296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7169411182403564, "rewards/margins": 10.256858825683594, "rewards/rejected": -7.539917945861816, "step": 1102 }, { "epoch": 0.7743067743067743, "grad_norm": 0.0020035940688103437, "learning_rate": 4.3066907161505356e-05, "logits/chosen": -8.179206848144531, "logits/rejected": -8.160257339477539, "logps/chosen": -4.66074275970459, "logps/rejected": -107.05854797363281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8007659912109375, "rewards/margins": 10.144891738891602, "rewards/rejected": -7.344125747680664, "step": 1103 }, { "epoch": 0.775008775008775, "grad_norm": 0.0016186016146093607, "learning_rate": 4.2938387657965666e-05, "logits/chosen": -9.1971435546875, "logits/rejected": -9.186012268066406, "logps/chosen": -13.65294075012207, "logps/rejected": -101.40450286865234, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9659464359283447, "rewards/margins": 8.738699913024902, "rewards/rejected": -6.772754192352295, "step": 1104 }, { "epoch": 0.7757107757107757, "grad_norm": 0.007541649509221315, "learning_rate": 4.2808883488475535e-05, "logits/chosen": -8.337118148803711, "logits/rejected": -8.331045150756836, "logps/chosen": -8.96242904663086, "logps/rejected": -106.40534973144531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.196218729019165, "rewards/margins": 9.633790016174316, "rewards/rejected": -7.4375715255737305, "step": 1105 }, { "epoch": 0.7764127764127764, "grad_norm": 0.019926130771636963, "learning_rate": 4.2678401762710726e-05, "logits/chosen": -8.61520767211914, "logits/rejected": -8.60596752166748, "logps/chosen": -3.331937789916992, "logps/rejected": -107.53520202636719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.779810667037964, "rewards/margins": 10.34055233001709, "rewards/rejected": -7.560741424560547, "step": 1106 }, { "epoch": 0.7771147771147772, "grad_norm": 0.001917229383252561, "learning_rate": 4.2546949644014096e-05, "logits/chosen": -8.419210433959961, "logits/rejected": -8.406822204589844, "logps/chosen": -7.009217262268066, "logps/rejected": -104.18763732910156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.463688373565674, "rewards/margins": 9.718406677246094, "rewards/rejected": -7.254718780517578, "step": 1107 }, { "epoch": 0.7778167778167778, "grad_norm": 0.001413197023794055, "learning_rate": 4.241453434900228e-05, "logits/chosen": -8.363043785095215, "logits/rejected": -8.364679336547852, "logps/chosen": -8.124727249145508, "logps/rejected": -105.55946350097656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4620447158813477, "rewards/margins": 9.617176055908203, "rewards/rejected": -7.1551313400268555, "step": 1108 }, { "epoch": 0.7785187785187785, "grad_norm": 0.002091263188049197, "learning_rate": 4.2281163147169525e-05, "logits/chosen": -8.18874454498291, "logits/rejected": -8.179981231689453, "logps/chosen": -3.6471571922302246, "logps/rejected": -107.26643371582031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.891394853591919, "rewards/margins": 10.413564682006836, "rewards/rejected": -7.52216911315918, "step": 1109 }, { "epoch": 0.7792207792207793, "grad_norm": 0.002260390669107437, "learning_rate": 4.2146843360488635e-05, "logits/chosen": -8.60820484161377, "logits/rejected": -8.607751846313477, "logps/chosen": -7.863031387329102, "logps/rejected": -105.81903076171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4421863555908203, "rewards/margins": 9.766399383544922, "rewards/rejected": -7.324213027954102, "step": 1110 }, { "epoch": 0.7799227799227799, "grad_norm": 0.952869176864624, "learning_rate": 4.2011582363008926e-05, "logits/chosen": -8.798023223876953, "logits/rejected": -8.812860488891602, "logps/chosen": -7.480142116546631, "logps/rejected": -105.45799255371094, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 2.5950124263763428, "rewards/margins": 9.79582405090332, "rewards/rejected": -7.200810432434082, "step": 1111 }, { "epoch": 0.7806247806247806, "grad_norm": 0.001331625273451209, "learning_rate": 4.187538758045147e-05, "logits/chosen": -8.213577270507812, "logits/rejected": -8.2052640914917, "logps/chosen": -6.518065929412842, "logps/rejected": -104.23114013671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.536404609680176, "rewards/margins": 9.820295333862305, "rewards/rejected": -7.283891201019287, "step": 1112 }, { "epoch": 0.7813267813267813, "grad_norm": 0.0014123670989647508, "learning_rate": 4.173826648980136e-05, "logits/chosen": -8.579612731933594, "logits/rejected": -8.559008598327637, "logps/chosen": -11.629363059997559, "logps/rejected": -104.44172668457031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.0220372676849365, "rewards/margins": 9.26633358001709, "rewards/rejected": -7.244297027587891, "step": 1113 }, { "epoch": 0.782028782028782, "grad_norm": 0.00860548671334982, "learning_rate": 4.160022661889731e-05, "logits/chosen": -8.843610763549805, "logits/rejected": -8.862859725952148, "logps/chosen": -2.716545820236206, "logps/rejected": -106.3544692993164, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.784879207611084, "rewards/margins": 10.251505851745605, "rewards/rejected": -7.466626167297363, "step": 1114 }, { "epoch": 0.7827307827307828, "grad_norm": 0.020707212388515472, "learning_rate": 4.146127554601829e-05, "logits/chosen": -8.763882637023926, "logits/rejected": -8.751635551452637, "logps/chosen": -6.850902557373047, "logps/rejected": -105.01146697998047, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.530205726623535, "rewards/margins": 9.81549072265625, "rewards/rejected": -7.285284996032715, "step": 1115 }, { "epoch": 0.7834327834327834, "grad_norm": 0.001994227059185505, "learning_rate": 4.132142089946757e-05, "logits/chosen": -8.331887245178223, "logits/rejected": -8.337822914123535, "logps/chosen": -8.927190780639648, "logps/rejected": -101.16856384277344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4182963371276855, "rewards/margins": 9.158565521240234, "rewards/rejected": -6.740268707275391, "step": 1116 }, { "epoch": 0.7841347841347841, "grad_norm": 0.0014339275658130646, "learning_rate": 4.1180670357153876e-05, "logits/chosen": -8.801259994506836, "logits/rejected": -8.81518840789795, "logps/chosen": -8.884659767150879, "logps/rejected": -103.56937408447266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4129092693328857, "rewards/margins": 9.495148658752441, "rewards/rejected": -7.082239151000977, "step": 1117 }, { "epoch": 0.7848367848367849, "grad_norm": 0.0013041570782661438, "learning_rate": 4.1039031646169915e-05, "logits/chosen": -8.36539363861084, "logits/rejected": -8.353204727172852, "logps/chosen": -2.4094417095184326, "logps/rejected": -106.21018981933594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8583788871765137, "rewards/margins": 10.232132911682129, "rewards/rejected": -7.373753547668457, "step": 1118 }, { "epoch": 0.7855387855387855, "grad_norm": 0.0012682665837928653, "learning_rate": 4.089651254236812e-05, "logits/chosen": -8.862547874450684, "logits/rejected": -8.839067459106445, "logps/chosen": -5.586452484130859, "logps/rejected": -104.18351745605469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5505788326263428, "rewards/margins": 9.78059196472168, "rewards/rejected": -7.230012893676758, "step": 1119 }, { "epoch": 0.7862407862407862, "grad_norm": 0.0018768125446513295, "learning_rate": 4.0753120869933834e-05, "logits/chosen": -8.51213264465332, "logits/rejected": -8.502565383911133, "logps/chosen": -3.1832945346832275, "logps/rejected": -106.52851104736328, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.837869167327881, "rewards/margins": 10.300721168518066, "rewards/rejected": -7.4628520011901855, "step": 1120 }, { "epoch": 0.786942786942787, "grad_norm": 1.8788843154907227, "learning_rate": 4.060886450095565e-05, "logits/chosen": -8.579328536987305, "logits/rejected": -8.575950622558594, "logps/chosen": -4.076665878295898, "logps/rejected": -104.3017349243164, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 2.708813428878784, "rewards/margins": 10.035955429077148, "rewards/rejected": -7.327141761779785, "step": 1121 }, { "epoch": 0.7876447876447876, "grad_norm": 0.004338810220360756, "learning_rate": 4.0463751354993395e-05, "logits/chosen": -10.278366088867188, "logits/rejected": -10.274736404418945, "logps/chosen": -17.78689193725586, "logps/rejected": -102.05235290527344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4251458644866943, "rewards/margins": 8.260004043579102, "rewards/rejected": -6.8348588943481445, "step": 1122 }, { "epoch": 0.7883467883467884, "grad_norm": 0.001031085615977645, "learning_rate": 4.0317789398643214e-05, "logits/chosen": -8.980735778808594, "logits/rejected": -8.986675262451172, "logps/chosen": -7.093111038208008, "logps/rejected": -104.10784912109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4827699661254883, "rewards/margins": 9.717230796813965, "rewards/rejected": -7.234460830688477, "step": 1123 }, { "epoch": 0.7890487890487891, "grad_norm": 0.0028119899798184633, "learning_rate": 4.0170986645100286e-05, "logits/chosen": -8.47291374206543, "logits/rejected": -8.476081848144531, "logps/chosen": -3.455239772796631, "logps/rejected": -106.44837951660156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.99403715133667, "rewards/margins": 10.393253326416016, "rewards/rejected": -7.399216651916504, "step": 1124 }, { "epoch": 0.7897507897507897, "grad_norm": 0.0016209216555580497, "learning_rate": 4.002335115371889e-05, "logits/chosen": -7.611870765686035, "logits/rejected": -7.624817848205566, "logps/chosen": -7.08134651184082, "logps/rejected": -103.53781127929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5461480617523193, "rewards/margins": 9.645915031433105, "rewards/rejected": -7.099767684936523, "step": 1125 }, { "epoch": 0.7904527904527905, "grad_norm": 0.005041207652539015, "learning_rate": 3.987489102956994e-05, "logits/chosen": -8.197057723999023, "logits/rejected": -8.188756942749023, "logps/chosen": -3.2224559783935547, "logps/rejected": -107.2030029296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9752867221832275, "rewards/margins": 10.449918746948242, "rewards/rejected": -7.474632263183594, "step": 1126 }, { "epoch": 0.7911547911547911, "grad_norm": 0.07781370729207993, "learning_rate": 3.972561442299604e-05, "logits/chosen": -9.231878280639648, "logits/rejected": -9.23183822631836, "logps/chosen": -13.778694152832031, "logps/rejected": -100.18368530273438, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.0248870849609375, "rewards/margins": 8.731613159179688, "rewards/rejected": -6.706727027893066, "step": 1127 }, { "epoch": 0.7918567918567918, "grad_norm": 0.0011096613015979528, "learning_rate": 3.957552952916402e-05, "logits/chosen": -8.731405258178711, "logits/rejected": -8.731413841247559, "logps/chosen": -10.341630935668945, "logps/rejected": -102.88822937011719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3042666912078857, "rewards/margins": 9.33969497680664, "rewards/rejected": -7.035428047180176, "step": 1128 }, { "epoch": 0.7925587925587926, "grad_norm": 0.0014381028013303876, "learning_rate": 3.942464458761504e-05, "logits/chosen": -8.452211380004883, "logits/rejected": -8.463947296142578, "logps/chosen": -7.355834007263184, "logps/rejected": -105.64353942871094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6098384857177734, "rewards/margins": 9.893962860107422, "rewards/rejected": -7.284124374389648, "step": 1129 }, { "epoch": 0.7932607932607932, "grad_norm": 0.0014048466691747308, "learning_rate": 3.9272967881812254e-05, "logits/chosen": -8.82635498046875, "logits/rejected": -8.829750061035156, "logps/chosen": -10.248631477355957, "logps/rejected": -105.95938110351562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.016709327697754, "rewards/margins": 9.523845672607422, "rewards/rejected": -7.507136821746826, "step": 1130 }, { "epoch": 0.793962793962794, "grad_norm": 0.0023909315932542086, "learning_rate": 3.912050773868602e-05, "logits/chosen": -8.515875816345215, "logits/rejected": -8.525644302368164, "logps/chosen": -5.048182010650635, "logps/rejected": -104.31661987304688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5693774223327637, "rewards/margins": 9.93763256072998, "rewards/rejected": -7.368255138397217, "step": 1131 }, { "epoch": 0.7946647946647947, "grad_norm": 0.0013618732336908579, "learning_rate": 3.8967272528176805e-05, "logits/chosen": -8.629212379455566, "logits/rejected": -8.634988784790039, "logps/chosen": -1.7559112310409546, "logps/rejected": -107.78245544433594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9476613998413086, "rewards/margins": 10.547952651977539, "rewards/rejected": -7.600290298461914, "step": 1132 }, { "epoch": 0.7953667953667953, "grad_norm": 0.0014289607061073184, "learning_rate": 3.881327066277565e-05, "logits/chosen": -7.844362258911133, "logits/rejected": -7.853912353515625, "logps/chosen": -6.23482084274292, "logps/rejected": -105.03468322753906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6009485721588135, "rewards/margins": 9.817785263061523, "rewards/rejected": -7.216836929321289, "step": 1133 }, { "epoch": 0.7960687960687961, "grad_norm": 0.19767896831035614, "learning_rate": 3.865851059706233e-05, "logits/chosen": -7.890077590942383, "logits/rejected": -7.886649131774902, "logps/chosen": -3.3669965267181396, "logps/rejected": -107.73713684082031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.722421169281006, "rewards/margins": 10.290234565734863, "rewards/rejected": -7.567813873291016, "step": 1134 }, { "epoch": 0.7967707967707968, "grad_norm": 0.006072353571653366, "learning_rate": 3.850300082724122e-05, "logits/chosen": -7.79393196105957, "logits/rejected": -7.777135848999023, "logps/chosen": -5.147307872772217, "logps/rejected": -105.89179992675781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6857807636260986, "rewards/margins": 10.005683898925781, "rewards/rejected": -7.31990385055542, "step": 1135 }, { "epoch": 0.7974727974727974, "grad_norm": 0.003586187958717346, "learning_rate": 3.8346749890674853e-05, "logits/chosen": -8.333419799804688, "logits/rejected": -8.323662757873535, "logps/chosen": -5.873533725738525, "logps/rejected": -104.44596099853516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5070176124572754, "rewards/margins": 9.804500579833984, "rewards/rejected": -7.297482967376709, "step": 1136 }, { "epoch": 0.7981747981747982, "grad_norm": 0.0011658202856779099, "learning_rate": 3.818976636541524e-05, "logits/chosen": -8.54665470123291, "logits/rejected": -8.558752059936523, "logps/chosen": -8.013855934143066, "logps/rejected": -102.29216003417969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3061273097991943, "rewards/margins": 9.399225234985352, "rewards/rejected": -7.093097686767578, "step": 1137 }, { "epoch": 0.7988767988767989, "grad_norm": 0.0032000932842493057, "learning_rate": 3.8032058869732905e-05, "logits/chosen": -8.824020385742188, "logits/rejected": -8.811841011047363, "logps/chosen": -3.4102869033813477, "logps/rejected": -106.62423706054688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8539509773254395, "rewards/margins": 10.38072681427002, "rewards/rejected": -7.526775360107422, "step": 1138 }, { "epoch": 0.7995787995787996, "grad_norm": 0.0015822292771190405, "learning_rate": 3.787363606164378e-05, "logits/chosen": -9.055427551269531, "logits/rejected": -9.050338745117188, "logps/chosen": -7.215968608856201, "logps/rejected": -107.43922424316406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.590596914291382, "rewards/margins": 10.004430770874023, "rewards/rejected": -7.413834095001221, "step": 1139 }, { "epoch": 0.8002808002808003, "grad_norm": 0.0017485395073890686, "learning_rate": 3.7714506638433895e-05, "logits/chosen": -8.403158187866211, "logits/rejected": -8.393829345703125, "logps/chosen": -4.050954818725586, "logps/rejected": -106.56256103515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.824866771697998, "rewards/margins": 10.268898010253906, "rewards/rejected": -7.444031238555908, "step": 1140 }, { "epoch": 0.800982800982801, "grad_norm": 0.0019889965187758207, "learning_rate": 3.7554679336181845e-05, "logits/chosen": -8.321976661682129, "logits/rejected": -8.302933692932129, "logps/chosen": -2.3793487548828125, "logps/rejected": -108.64395904541016, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8394055366516113, "rewards/margins": 10.578068733215332, "rewards/rejected": -7.7386627197265625, "step": 1141 }, { "epoch": 0.8016848016848017, "grad_norm": 0.0020817373879253864, "learning_rate": 3.739416292927929e-05, "logits/chosen": -9.718402862548828, "logits/rejected": -9.716854095458984, "logps/chosen": -3.105069637298584, "logps/rejected": -106.4480209350586, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.771394729614258, "rewards/margins": 10.378425598144531, "rewards/rejected": -7.607030391693115, "step": 1142 }, { "epoch": 0.8023868023868024, "grad_norm": 0.0010717479744926095, "learning_rate": 3.7232966229949124e-05, "logits/chosen": -8.889596939086914, "logits/rejected": -8.893943786621094, "logps/chosen": -5.589236259460449, "logps/rejected": -106.94070434570312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.691187858581543, "rewards/margins": 10.11473274230957, "rewards/rejected": -7.423544406890869, "step": 1143 }, { "epoch": 0.803088803088803, "grad_norm": 0.005052848719060421, "learning_rate": 3.7071098087761795e-05, "logits/chosen": -8.780584335327148, "logits/rejected": -8.810478210449219, "logps/chosen": -5.609302520751953, "logps/rejected": -106.36163330078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6393818855285645, "rewards/margins": 9.992496490478516, "rewards/rejected": -7.353114604949951, "step": 1144 }, { "epoch": 0.8037908037908038, "grad_norm": 0.0010487453546375036, "learning_rate": 3.6908567389149424e-05, "logits/chosen": -8.889941215515137, "logits/rejected": -8.860092163085938, "logps/chosen": -8.888648986816406, "logps/rejected": -103.43858337402344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3654327392578125, "rewards/margins": 9.416129112243652, "rewards/rejected": -7.05069637298584, "step": 1145 }, { "epoch": 0.8044928044928045, "grad_norm": 0.04563824087381363, "learning_rate": 3.674538305691793e-05, "logits/chosen": -8.45068359375, "logits/rejected": -8.419279098510742, "logps/chosen": -9.264415740966797, "logps/rejected": -104.90736389160156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2938013076782227, "rewards/margins": 9.558034896850586, "rewards/rejected": -7.2642340660095215, "step": 1146 }, { "epoch": 0.8051948051948052, "grad_norm": 0.0014958787942305207, "learning_rate": 3.6581554049757226e-05, "logits/chosen": -7.643654823303223, "logits/rejected": -7.638471603393555, "logps/chosen": -9.359468460083008, "logps/rejected": -104.42039489746094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.363110065460205, "rewards/margins": 9.387741088867188, "rewards/rejected": -7.024631500244141, "step": 1147 }, { "epoch": 0.8058968058968059, "grad_norm": 0.002102995989844203, "learning_rate": 3.6417089361749344e-05, "logits/chosen": -8.128787994384766, "logits/rejected": -8.133545875549316, "logps/chosen": -2.649073600769043, "logps/rejected": -107.42196655273438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9375100135803223, "rewards/margins": 10.519548416137695, "rewards/rejected": -7.582038402557373, "step": 1148 }, { "epoch": 0.8065988065988066, "grad_norm": 0.007021204102784395, "learning_rate": 3.62519980218747e-05, "logits/chosen": -8.385887145996094, "logits/rejected": -8.388465881347656, "logps/chosen": -2.90756893157959, "logps/rejected": -105.95188903808594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8325798511505127, "rewards/margins": 10.315285682678223, "rewards/rejected": -7.482705593109131, "step": 1149 }, { "epoch": 0.8073008073008073, "grad_norm": 0.003864078316837549, "learning_rate": 3.6086289093516385e-05, "logits/chosen": -8.764470100402832, "logits/rejected": -8.761754035949707, "logps/chosen": -2.914264440536499, "logps/rejected": -107.95099639892578, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.7723441123962402, "rewards/margins": 10.520489692687988, "rewards/rejected": -7.748145580291748, "step": 1150 }, { "epoch": 0.808002808002808, "grad_norm": 0.0011691893450915813, "learning_rate": 3.591997167396263e-05, "logits/chosen": -8.785320281982422, "logits/rejected": -8.768362045288086, "logps/chosen": -3.7067172527313232, "logps/rejected": -108.0243911743164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8744401931762695, "rewards/margins": 10.462003707885742, "rewards/rejected": -7.587563991546631, "step": 1151 }, { "epoch": 0.8087048087048087, "grad_norm": 0.00111247762106359, "learning_rate": 3.5753054893907326e-05, "logits/chosen": -8.376245498657227, "logits/rejected": -8.3797607421875, "logps/chosen": -2.024533271789551, "logps/rejected": -107.98150634765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0589089393615723, "rewards/margins": 10.616559982299805, "rewards/rejected": -7.557651519775391, "step": 1152 }, { "epoch": 0.8094068094068094, "grad_norm": 0.0014319041511043906, "learning_rate": 3.558554791694878e-05, "logits/chosen": -8.71976089477539, "logits/rejected": -8.72659683227539, "logps/chosen": -2.7800374031066895, "logps/rejected": -107.90174865722656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.869541883468628, "rewards/margins": 10.476975440979004, "rewards/rejected": -7.607433795928955, "step": 1153 }, { "epoch": 0.8101088101088101, "grad_norm": 0.0013763734605163336, "learning_rate": 3.541745993908666e-05, "logits/chosen": -8.837188720703125, "logits/rejected": -8.830129623413086, "logps/chosen": -2.7763404846191406, "logps/rejected": -107.84539794921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8621225357055664, "rewards/margins": 10.475502014160156, "rewards/rejected": -7.613378524780273, "step": 1154 }, { "epoch": 0.8108108108108109, "grad_norm": 0.0020694201812148094, "learning_rate": 3.524880018821708e-05, "logits/chosen": -9.023881912231445, "logits/rejected": -9.00479507446289, "logps/chosen": -8.630013465881348, "logps/rejected": -104.45146179199219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.345278263092041, "rewards/margins": 9.519613265991211, "rewards/rejected": -7.174334526062012, "step": 1155 }, { "epoch": 0.8115128115128115, "grad_norm": 0.005653561092913151, "learning_rate": 3.507957792362609e-05, "logits/chosen": -8.241077423095703, "logits/rejected": -8.234683990478516, "logps/chosen": -1.8720288276672363, "logps/rejected": -108.7031478881836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0083439350128174, "rewards/margins": 10.576376914978027, "rewards/rejected": -7.568033218383789, "step": 1156 }, { "epoch": 0.8122148122148122, "grad_norm": 0.0017216111300513148, "learning_rate": 3.4909802435481256e-05, "logits/chosen": -8.003997802734375, "logits/rejected": -7.983097553253174, "logps/chosen": -6.09763240814209, "logps/rejected": -106.24835205078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5932812690734863, "rewards/margins": 10.04770278930664, "rewards/rejected": -7.4544219970703125, "step": 1157 }, { "epoch": 0.812916812916813, "grad_norm": 0.0011556849349290133, "learning_rate": 3.473948304432168e-05, "logits/chosen": -8.882314682006836, "logits/rejected": -8.88238525390625, "logps/chosen": -4.915391445159912, "logps/rejected": -105.76376342773438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6921133995056152, "rewards/margins": 10.074142456054688, "rewards/rejected": -7.382030010223389, "step": 1158 }, { "epoch": 0.8136188136188136, "grad_norm": 0.0011058829259127378, "learning_rate": 3.4568629100546333e-05, "logits/chosen": -9.418510437011719, "logits/rejected": -9.41895580291748, "logps/chosen": -2.604191780090332, "logps/rejected": -107.0248794555664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.762326240539551, "rewards/margins": 10.406907081604004, "rewards/rejected": -7.644581317901611, "step": 1159 }, { "epoch": 0.8143208143208143, "grad_norm": 0.001825247542001307, "learning_rate": 3.439724998390067e-05, "logits/chosen": -8.209193229675293, "logits/rejected": -8.19375228881836, "logps/chosen": -4.068811416625977, "logps/rejected": -105.791259765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7697536945343018, "rewards/margins": 10.107060432434082, "rewards/rejected": -7.337306976318359, "step": 1160 }, { "epoch": 0.815022815022815, "grad_norm": 0.0014160505961626768, "learning_rate": 3.4225355102961737e-05, "logits/chosen": -8.581966400146484, "logits/rejected": -8.587563514709473, "logps/chosen": -6.170955657958984, "logps/rejected": -105.69026947021484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.755800247192383, "rewards/margins": 10.041313171386719, "rewards/rejected": -7.2855119705200195, "step": 1161 }, { "epoch": 0.8157248157248157, "grad_norm": 0.33366015553474426, "learning_rate": 3.405295389462161e-05, "logits/chosen": -9.337043762207031, "logits/rejected": -9.334235191345215, "logps/chosen": -1.9719918966293335, "logps/rejected": -108.50094604492188, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 2.9290518760681152, "rewards/margins": 10.671268463134766, "rewards/rejected": -7.742216110229492, "step": 1162 }, { "epoch": 0.8164268164268165, "grad_norm": 0.0039055333472788334, "learning_rate": 3.388005582356935e-05, "logits/chosen": -9.659735679626465, "logits/rejected": -9.660457611083984, "logps/chosen": -10.724674224853516, "logps/rejected": -102.98651123046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.196143627166748, "rewards/margins": 9.162003517150879, "rewards/rejected": -6.965859889984131, "step": 1163 }, { "epoch": 0.8171288171288171, "grad_norm": 0.012713572010397911, "learning_rate": 3.370667038177139e-05, "logits/chosen": -8.435569763183594, "logits/rejected": -8.430601119995117, "logps/chosen": -4.049836158752441, "logps/rejected": -105.76458740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8958756923675537, "rewards/margins": 10.25837230682373, "rewards/rejected": -7.3624958992004395, "step": 1164 }, { "epoch": 0.8178308178308178, "grad_norm": 0.001044990960508585, "learning_rate": 3.353280708795041e-05, "logits/chosen": -8.995260238647461, "logits/rejected": -8.991540908813477, "logps/chosen": -5.19993782043457, "logps/rejected": -103.98625946044922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7400262355804443, "rewards/margins": 9.900556564331055, "rewards/rejected": -7.160530090332031, "step": 1165 }, { "epoch": 0.8185328185328186, "grad_norm": 0.0018285384867340326, "learning_rate": 3.3358475487062804e-05, "logits/chosen": -8.736654281616211, "logits/rejected": -8.719415664672852, "logps/chosen": -11.822049140930176, "logps/rejected": -103.92364501953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.111135959625244, "rewards/margins": 9.04571533203125, "rewards/rejected": -6.934578895568848, "step": 1166 }, { "epoch": 0.8192348192348192, "grad_norm": 0.0013552444288507104, "learning_rate": 3.3183685149774654e-05, "logits/chosen": -8.368753433227539, "logits/rejected": -8.355241775512695, "logps/chosen": -2.3574438095092773, "logps/rejected": -108.7406997680664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.952833890914917, "rewards/margins": 10.562349319458008, "rewards/rejected": -7.609515190124512, "step": 1167 }, { "epoch": 0.8199368199368199, "grad_norm": 0.0006530763930641115, "learning_rate": 3.3008445671936286e-05, "logits/chosen": -8.586087226867676, "logits/rejected": -8.601491928100586, "logps/chosen": -5.250150203704834, "logps/rejected": -107.41549682617188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.746385097503662, "rewards/margins": 10.198663711547852, "rewards/rejected": -7.452278137207031, "step": 1168 }, { "epoch": 0.8206388206388207, "grad_norm": 0.0011477094376459718, "learning_rate": 3.2832766674055486e-05, "logits/chosen": -8.415979385375977, "logits/rejected": -8.415353775024414, "logps/chosen": -6.353991508483887, "logps/rejected": -106.29944610595703, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7244534492492676, "rewards/margins": 10.114562034606934, "rewards/rejected": -7.390108585357666, "step": 1169 }, { "epoch": 0.8213408213408213, "grad_norm": 0.0011169170029461384, "learning_rate": 3.265665780076936e-05, "logits/chosen": -8.667694091796875, "logits/rejected": -8.661803245544434, "logps/chosen": -8.414141654968262, "logps/rejected": -103.88633728027344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4066755771636963, "rewards/margins": 9.52923583984375, "rewards/rejected": -7.122560501098633, "step": 1170 }, { "epoch": 0.8220428220428221, "grad_norm": 0.001176942139863968, "learning_rate": 3.248012872031482e-05, "logits/chosen": -8.447854995727539, "logits/rejected": -8.439651489257812, "logps/chosen": -1.598428726196289, "logps/rejected": -108.16616821289062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.025397777557373, "rewards/margins": 10.595235824584961, "rewards/rejected": -7.569838523864746, "step": 1171 }, { "epoch": 0.8227448227448227, "grad_norm": 0.002398432232439518, "learning_rate": 3.2303189123997806e-05, "logits/chosen": -7.935904502868652, "logits/rejected": -7.974882125854492, "logps/chosen": -2.9474880695343018, "logps/rejected": -108.25299835205078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.946068286895752, "rewards/margins": 10.542990684509277, "rewards/rejected": -7.596921920776367, "step": 1172 }, { "epoch": 0.8234468234468234, "grad_norm": 0.0006526427459903061, "learning_rate": 3.2125848725661265e-05, "logits/chosen": -8.57013988494873, "logits/rejected": -8.566339492797852, "logps/chosen": -9.410482406616211, "logps/rejected": -103.40252685546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.4802513122558594, "rewards/margins": 9.48982048034668, "rewards/rejected": -7.00956916809082, "step": 1173 }, { "epoch": 0.8241488241488242, "grad_norm": 0.002876420971006155, "learning_rate": 3.1948117261151865e-05, "logits/chosen": -7.708915710449219, "logits/rejected": -7.697588920593262, "logps/chosen": -5.635673522949219, "logps/rejected": -102.68199157714844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5587143898010254, "rewards/margins": 9.651151657104492, "rewards/rejected": -7.092437267303467, "step": 1174 }, { "epoch": 0.8248508248508248, "grad_norm": 0.003940102178603411, "learning_rate": 3.177000448778548e-05, "logits/chosen": -7.993518829345703, "logits/rejected": -8.000510215759277, "logps/chosen": -1.593322515487671, "logps/rejected": -109.12364196777344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9841408729553223, "rewards/margins": 10.63460922241211, "rewards/rejected": -7.650467395782471, "step": 1175 }, { "epoch": 0.8255528255528255, "grad_norm": 0.001072462066076696, "learning_rate": 3.159152018381152e-05, "logits/chosen": -8.238995552062988, "logits/rejected": -8.248946189880371, "logps/chosen": -6.309897422790527, "logps/rejected": -106.89346313476562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.559079170227051, "rewards/margins": 10.062664985656738, "rewards/rejected": -7.503585338592529, "step": 1176 }, { "epoch": 0.8262548262548263, "grad_norm": 0.014521673321723938, "learning_rate": 3.141267414787618e-05, "logits/chosen": -8.736196517944336, "logits/rejected": -8.738880157470703, "logps/chosen": -6.158729553222656, "logps/rejected": -104.0630874633789, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.489530563354492, "rewards/margins": 9.700488090515137, "rewards/rejected": -7.2109575271606445, "step": 1177 }, { "epoch": 0.8269568269568269, "grad_norm": 0.001009514438919723, "learning_rate": 3.1233476198484396e-05, "logits/chosen": -8.753530502319336, "logits/rejected": -8.762626647949219, "logps/chosen": -6.648387908935547, "logps/rejected": -104.84100341796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.393824577331543, "rewards/margins": 9.708300590515137, "rewards/rejected": -7.314476013183594, "step": 1178 }, { "epoch": 0.8276588276588277, "grad_norm": 0.0036514638923108578, "learning_rate": 3.105393617346091e-05, "logits/chosen": -7.997472763061523, "logits/rejected": -7.993504047393799, "logps/chosen": -8.637478828430176, "logps/rejected": -104.22158813476562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.425157070159912, "rewards/margins": 9.647236824035645, "rewards/rejected": -7.222079753875732, "step": 1179 }, { "epoch": 0.8283608283608284, "grad_norm": 0.0073530483059585094, "learning_rate": 3.087406392941015e-05, "logits/chosen": -9.345161437988281, "logits/rejected": -9.33112907409668, "logps/chosen": -5.5905046463012695, "logps/rejected": -107.66500854492188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.52694034576416, "rewards/margins": 10.118446350097656, "rewards/rejected": -7.591505527496338, "step": 1180 }, { "epoch": 0.829062829062829, "grad_norm": 0.001276265480555594, "learning_rate": 3.0693869341175055e-05, "logits/chosen": -8.353940963745117, "logits/rejected": -8.344083786010742, "logps/chosen": -2.0187110900878906, "logps/rejected": -108.38638305664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.954796314239502, "rewards/margins": 10.65703010559082, "rewards/rejected": -7.70223331451416, "step": 1181 }, { "epoch": 0.8297648297648298, "grad_norm": 0.0008984481100924313, "learning_rate": 3.051336230129504e-05, "logits/chosen": -8.601770401000977, "logits/rejected": -8.559646606445312, "logps/chosen": -5.873355865478516, "logps/rejected": -108.12852478027344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7316970825195312, "rewards/margins": 10.185609817504883, "rewards/rejected": -7.45391321182251, "step": 1182 }, { "epoch": 0.8304668304668305, "grad_norm": 0.0012532897526398301, "learning_rate": 3.033255271946287e-05, "logits/chosen": -8.640771865844727, "logits/rejected": -8.626787185668945, "logps/chosen": -9.439936637878418, "logps/rejected": -105.48048400878906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.257746696472168, "rewards/margins": 9.467132568359375, "rewards/rejected": -7.209386825561523, "step": 1183 }, { "epoch": 0.8311688311688312, "grad_norm": 0.005843191407620907, "learning_rate": 3.0151450521980612e-05, "logits/chosen": -8.540750503540039, "logits/rejected": -8.512933731079102, "logps/chosen": -2.2164793014526367, "logps/rejected": -108.32998657226562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.029231548309326, "rewards/margins": 10.6826753616333, "rewards/rejected": -7.653443813323975, "step": 1184 }, { "epoch": 0.8318708318708319, "grad_norm": 1.0639640092849731, "learning_rate": 2.9970065651214692e-05, "logits/chosen": -8.628721237182617, "logits/rejected": -8.60246467590332, "logps/chosen": -2.034855365753174, "logps/rejected": -107.1603012084961, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 3.039252281188965, "rewards/margins": 10.5960111618042, "rewards/rejected": -7.556758880615234, "step": 1185 }, { "epoch": 0.8325728325728325, "grad_norm": 0.0009352019405923784, "learning_rate": 2.97884080650501e-05, "logits/chosen": -9.479155540466309, "logits/rejected": -9.46733570098877, "logps/chosen": -5.195695877075195, "logps/rejected": -106.89241027832031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5833187103271484, "rewards/margins": 10.156149864196777, "rewards/rejected": -7.572830677032471, "step": 1186 }, { "epoch": 0.8332748332748333, "grad_norm": 0.002927920315414667, "learning_rate": 2.9606487736343637e-05, "logits/chosen": -7.830811023712158, "logits/rejected": -7.842894077301025, "logps/chosen": -6.999289035797119, "logps/rejected": -105.73290252685547, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6649091243743896, "rewards/margins": 9.944011688232422, "rewards/rejected": -7.279102802276611, "step": 1187 }, { "epoch": 0.833976833976834, "grad_norm": 0.002657047240063548, "learning_rate": 2.9424314652376516e-05, "logits/chosen": -8.353063583374023, "logits/rejected": -8.361416816711426, "logps/chosen": -12.659375190734863, "logps/rejected": -101.72699737548828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.111927032470703, "rewards/margins": 9.016449928283691, "rewards/rejected": -6.904522895812988, "step": 1188 }, { "epoch": 0.8346788346788346, "grad_norm": 0.000622598803602159, "learning_rate": 2.924189881430598e-05, "logits/chosen": -8.062978744506836, "logits/rejected": -8.078102111816406, "logps/chosen": -5.090824127197266, "logps/rejected": -106.67057037353516, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.688504695892334, "rewards/margins": 10.085550308227539, "rewards/rejected": -7.397045612335205, "step": 1189 }, { "epoch": 0.8353808353808354, "grad_norm": 0.0007593166083097458, "learning_rate": 2.905925023661628e-05, "logits/chosen": -8.718997955322266, "logits/rejected": -8.709877014160156, "logps/chosen": -1.0430711507797241, "logps/rejected": -109.8599853515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.06384539604187, "rewards/margins": 10.883148193359375, "rewards/rejected": -7.819303035736084, "step": 1190 }, { "epoch": 0.8360828360828361, "grad_norm": 0.001035619992762804, "learning_rate": 2.8876378946568893e-05, "logits/chosen": -9.682802200317383, "logits/rejected": -9.680885314941406, "logps/chosen": -1.0955421924591064, "logps/rejected": -109.20149230957031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.961482048034668, "rewards/margins": 10.828336715698242, "rewards/rejected": -7.866854667663574, "step": 1191 }, { "epoch": 0.8367848367848368, "grad_norm": 0.000985639519058168, "learning_rate": 2.8693294983652032e-05, "logits/chosen": -8.860179901123047, "logits/rejected": -8.850549697875977, "logps/chosen": -10.990979194641113, "logps/rejected": -101.57192993164062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.2544984817504883, "rewards/margins": 9.032644271850586, "rewards/rejected": -6.778146266937256, "step": 1192 }, { "epoch": 0.8374868374868375, "grad_norm": 0.0015101439785212278, "learning_rate": 2.8510008399029458e-05, "logits/chosen": -8.430940628051758, "logits/rejected": -8.441725730895996, "logps/chosen": -1.7527976036071777, "logps/rejected": -109.73804473876953, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9474289417266846, "rewards/margins": 10.772005081176758, "rewards/rejected": -7.824576377868652, "step": 1193 }, { "epoch": 0.8381888381888382, "grad_norm": 0.0022755491081625223, "learning_rate": 2.8326529254988742e-05, "logits/chosen": -8.94415283203125, "logits/rejected": -8.933286666870117, "logps/chosen": -2.2464821338653564, "logps/rejected": -108.50090026855469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.048964023590088, "rewards/margins": 10.67065715789795, "rewards/rejected": -7.621693134307861, "step": 1194 }, { "epoch": 0.8388908388908389, "grad_norm": 0.0006584928487427533, "learning_rate": 2.814286762438878e-05, "logits/chosen": -7.972149848937988, "logits/rejected": -7.951933860778809, "logps/chosen": -2.3604869842529297, "logps/rejected": -107.88679504394531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9277234077453613, "rewards/margins": 10.639301300048828, "rewards/rejected": -7.711578369140625, "step": 1195 }, { "epoch": 0.8395928395928396, "grad_norm": 0.02191859856247902, "learning_rate": 2.795903359010685e-05, "logits/chosen": -8.769462585449219, "logits/rejected": -8.747955322265625, "logps/chosen": -1.1608439683914185, "logps/rejected": -109.65982055664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.072549343109131, "rewards/margins": 10.878808975219727, "rewards/rejected": -7.8062591552734375, "step": 1196 }, { "epoch": 0.8402948402948403, "grad_norm": 0.0008409542497247458, "learning_rate": 2.777503724448504e-05, "logits/chosen": -8.842636108398438, "logits/rejected": -8.837570190429688, "logps/chosen": -1.2868478298187256, "logps/rejected": -109.87708282470703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.988581418991089, "rewards/margins": 10.920427322387695, "rewards/rejected": -7.931845188140869, "step": 1197 }, { "epoch": 0.840996840996841, "grad_norm": 0.0011043788399547338, "learning_rate": 2.7590888688776223e-05, "logits/chosen": -8.730733871459961, "logits/rejected": -8.722343444824219, "logps/chosen": -1.20115327835083, "logps/rejected": -109.0754623413086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.045437812805176, "rewards/margins": 10.752058982849121, "rewards/rejected": -7.706621170043945, "step": 1198 }, { "epoch": 0.8416988416988417, "grad_norm": 0.0563134104013443, "learning_rate": 2.7406598032589475e-05, "logits/chosen": -8.601165771484375, "logits/rejected": -8.574976921081543, "logps/chosen": -1.9979604482650757, "logps/rejected": -109.19078063964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.955220937728882, "rewards/margins": 10.706363677978516, "rewards/rejected": -7.751143455505371, "step": 1199 }, { "epoch": 0.8424008424008425, "grad_norm": 0.0019811622332781553, "learning_rate": 2.7222175393335063e-05, "logits/chosen": -9.532644271850586, "logits/rejected": -9.540487289428711, "logps/chosen": -1.6505721807479858, "logps/rejected": -109.13703918457031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.890127658843994, "rewards/margins": 10.770923614501953, "rewards/rejected": -7.880795001983643, "step": 1200 }, { "epoch": 0.8431028431028431, "grad_norm": 0.007779865060001612, "learning_rate": 2.7037630895669043e-05, "logits/chosen": -8.680746078491211, "logits/rejected": -8.65829086303711, "logps/chosen": -1.722928762435913, "logps/rejected": -109.15435791015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.962484836578369, "rewards/margins": 10.758201599121094, "rewards/rejected": -7.795716762542725, "step": 1201 }, { "epoch": 0.8438048438048438, "grad_norm": 0.004124638624489307, "learning_rate": 2.685297467093738e-05, "logits/chosen": -8.644124984741211, "logits/rejected": -8.641702651977539, "logps/chosen": -4.183496475219727, "logps/rejected": -107.79667663574219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.797823667526245, "rewards/margins": 10.45920181274414, "rewards/rejected": -7.661377906799316, "step": 1202 }, { "epoch": 0.8445068445068445, "grad_norm": 0.0009964058408513665, "learning_rate": 2.6668216856619775e-05, "logits/chosen": -8.382566452026367, "logits/rejected": -8.376777648925781, "logps/chosen": -6.476559162139893, "logps/rejected": -102.23320007324219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5657458305358887, "rewards/margins": 9.522676467895508, "rewards/rejected": -6.956930160522461, "step": 1203 }, { "epoch": 0.8452088452088452, "grad_norm": 0.001112490426748991, "learning_rate": 2.6483367595773123e-05, "logits/chosen": -8.675224304199219, "logits/rejected": -8.643209457397461, "logps/chosen": -1.5027074813842773, "logps/rejected": -109.55130767822266, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0378715991973877, "rewards/margins": 10.760534286499023, "rewards/rejected": -7.722663402557373, "step": 1204 }, { "epoch": 0.8459108459108459, "grad_norm": 0.008820192888379097, "learning_rate": 2.6298437036474648e-05, "logits/chosen": -7.896261692047119, "logits/rejected": -7.910872459411621, "logps/chosen": -3.787903070449829, "logps/rejected": -107.49882507324219, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.875505208969116, "rewards/margins": 10.3445405960083, "rewards/rejected": -7.4690351486206055, "step": 1205 }, { "epoch": 0.8466128466128466, "grad_norm": 0.000986919621936977, "learning_rate": 2.611343533126479e-05, "logits/chosen": -8.821889877319336, "logits/rejected": -8.807708740234375, "logps/chosen": -6.939428806304932, "logps/rejected": -105.08375549316406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.533771514892578, "rewards/margins": 9.7374267578125, "rewards/rejected": -7.2036542892456055, "step": 1206 }, { "epoch": 0.8473148473148473, "grad_norm": 0.0010904970113188028, "learning_rate": 2.5928372636589865e-05, "logits/chosen": -9.239355087280273, "logits/rejected": -9.24566650390625, "logps/chosen": -10.182151794433594, "logps/rejected": -102.89804077148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.290341377258301, "rewards/margins": 9.213329315185547, "rewards/rejected": -6.922988414764404, "step": 1207 }, { "epoch": 0.8480168480168481, "grad_norm": 0.0010792450048029423, "learning_rate": 2.5743259112244435e-05, "logits/chosen": -9.233744621276855, "logits/rejected": -9.253595352172852, "logps/chosen": -5.118257999420166, "logps/rejected": -107.17361450195312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.698098659515381, "rewards/margins": 10.292855262756348, "rewards/rejected": -7.594756126403809, "step": 1208 }, { "epoch": 0.8487188487188487, "grad_norm": 0.014753241091966629, "learning_rate": 2.5558104920813602e-05, "logits/chosen": -9.426855087280273, "logits/rejected": -9.41734504699707, "logps/chosen": -0.7882328033447266, "logps/rejected": -109.46620178222656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.96976375579834, "rewards/margins": 10.788919448852539, "rewards/rejected": -7.819156169891357, "step": 1209 }, { "epoch": 0.8494208494208494, "grad_norm": 0.0016842473996803164, "learning_rate": 2.5372920227115033e-05, "logits/chosen": -8.626487731933594, "logits/rejected": -8.607500076293945, "logps/chosen": -4.179539680480957, "logps/rejected": -107.27989196777344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7197585105895996, "rewards/margins": 10.117040634155273, "rewards/rejected": -7.397282600402832, "step": 1210 }, { "epoch": 0.8501228501228502, "grad_norm": 0.0007665818557143211, "learning_rate": 2.518771519764095e-05, "logits/chosen": -7.934453010559082, "logits/rejected": -7.9262237548828125, "logps/chosen": -5.798423767089844, "logps/rejected": -107.44478607177734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7980902194976807, "rewards/margins": 10.123339653015137, "rewards/rejected": -7.325248718261719, "step": 1211 }, { "epoch": 0.8508248508248508, "grad_norm": 2.437621593475342, "learning_rate": 2.50025e-05, "logits/chosen": -8.087020874023438, "logits/rejected": -8.093412399291992, "logps/chosen": -8.519051551818848, "logps/rejected": -104.49250793457031, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 2.4363980293273926, "rewards/margins": 9.529647827148438, "rewards/rejected": -7.093250274658203, "step": 1212 }, { "epoch": 0.8515268515268515, "grad_norm": 0.0008369561983272433, "learning_rate": 2.4817284802359054e-05, "logits/chosen": -8.734540939331055, "logits/rejected": -8.716699600219727, "logps/chosen": -3.704385757446289, "logps/rejected": -106.46615600585938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.776251792907715, "rewards/margins": 10.254964828491211, "rewards/rejected": -7.478713035583496, "step": 1213 }, { "epoch": 0.8522288522288523, "grad_norm": 0.002350985538214445, "learning_rate": 2.463207977288497e-05, "logits/chosen": -8.44132137298584, "logits/rejected": -8.440208435058594, "logps/chosen": -4.662827491760254, "logps/rejected": -107.64930725097656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.779980182647705, "rewards/margins": 10.24943733215332, "rewards/rejected": -7.469457149505615, "step": 1214 }, { "epoch": 0.8529308529308529, "grad_norm": 0.0006885849288664758, "learning_rate": 2.4446895079186404e-05, "logits/chosen": -8.453218460083008, "logits/rejected": -8.442018508911133, "logps/chosen": -1.9623427391052246, "logps/rejected": -108.87480163574219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.137820243835449, "rewards/margins": 10.703155517578125, "rewards/rejected": -7.565335273742676, "step": 1215 }, { "epoch": 0.8536328536328537, "grad_norm": 0.0009959401795640588, "learning_rate": 2.426174088775557e-05, "logits/chosen": -8.123708724975586, "logits/rejected": -8.141703605651855, "logps/chosen": -4.358355522155762, "logps/rejected": -107.23694610595703, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.837948799133301, "rewards/margins": 10.227558135986328, "rewards/rejected": -7.389609336853027, "step": 1216 }, { "epoch": 0.8543348543348543, "grad_norm": 0.0005009585875086486, "learning_rate": 2.4076627363410145e-05, "logits/chosen": -9.496185302734375, "logits/rejected": -9.493828773498535, "logps/chosen": -4.464618682861328, "logps/rejected": -107.30406188964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.559971809387207, "rewards/margins": 10.23196029663086, "rewards/rejected": -7.671988487243652, "step": 1217 }, { "epoch": 0.855036855036855, "grad_norm": 0.0008091468480415642, "learning_rate": 2.389156466873522e-05, "logits/chosen": -8.898111343383789, "logits/rejected": -8.866989135742188, "logps/chosen": -1.9139058589935303, "logps/rejected": -108.78119659423828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.012216567993164, "rewards/margins": 10.745220184326172, "rewards/rejected": -7.733003616333008, "step": 1218 }, { "epoch": 0.8557388557388558, "grad_norm": 0.001265364815481007, "learning_rate": 2.3706562963525355e-05, "logits/chosen": -8.794822692871094, "logits/rejected": -8.7637357711792, "logps/chosen": -1.4974956512451172, "logps/rejected": -109.46664428710938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0315752029418945, "rewards/margins": 10.778640747070312, "rewards/rejected": -7.747065544128418, "step": 1219 }, { "epoch": 0.8564408564408564, "grad_norm": 2.742652177810669, "learning_rate": 2.3521632404226873e-05, "logits/chosen": -9.620079040527344, "logits/rejected": -9.627813339233398, "logps/chosen": -3.7621750831604004, "logps/rejected": -106.98023986816406, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 2.763563871383667, "rewards/margins": 10.167561531066895, "rewards/rejected": -7.403997421264648, "step": 1220 }, { "epoch": 0.8571428571428571, "grad_norm": 0.0010476458119228482, "learning_rate": 2.3336783143380218e-05, "logits/chosen": -8.01786994934082, "logits/rejected": -8.008479118347168, "logps/chosen": -4.900718688964844, "logps/rejected": -106.96231079101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.708427906036377, "rewards/margins": 10.110029220581055, "rewards/rejected": -7.401600360870361, "step": 1221 }, { "epoch": 0.8578448578448579, "grad_norm": 0.001027215039357543, "learning_rate": 2.315202532906262e-05, "logits/chosen": -9.183183670043945, "logits/rejected": -9.171947479248047, "logps/chosen": -1.576202392578125, "logps/rejected": -109.0619888305664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.876185417175293, "rewards/margins": 10.713031768798828, "rewards/rejected": -7.836847305297852, "step": 1222 }, { "epoch": 0.8585468585468585, "grad_norm": 0.0009035562397912145, "learning_rate": 2.2967369104330957e-05, "logits/chosen": -8.494355201721191, "logits/rejected": -8.508207321166992, "logps/chosen": -1.4257099628448486, "logps/rejected": -109.49837493896484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.029081344604492, "rewards/margins": 10.804412841796875, "rewards/rejected": -7.775330543518066, "step": 1223 }, { "epoch": 0.8592488592488593, "grad_norm": 0.0013778702123090625, "learning_rate": 2.278282460666494e-05, "logits/chosen": -8.415583610534668, "logits/rejected": -8.43008804321289, "logps/chosen": -8.312522888183594, "logps/rejected": -102.55836486816406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.346808433532715, "rewards/margins": 9.417179107666016, "rewards/rejected": -7.070370674133301, "step": 1224 }, { "epoch": 0.85995085995086, "grad_norm": 0.001276767929084599, "learning_rate": 2.2598401967410532e-05, "logits/chosen": -8.976722717285156, "logits/rejected": -8.981964111328125, "logps/chosen": -1.8036725521087646, "logps/rejected": -107.8381576538086, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.854938507080078, "rewards/margins": 10.561038970947266, "rewards/rejected": -7.706101417541504, "step": 1225 }, { "epoch": 0.8606528606528606, "grad_norm": 0.0009006437612697482, "learning_rate": 2.2414111311223784e-05, "logits/chosen": -8.784846305847168, "logits/rejected": -8.768135070800781, "logps/chosen": -13.251181602478027, "logps/rejected": -101.37394714355469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1885125637054443, "rewards/margins": 8.90777587890625, "rewards/rejected": -6.719264030456543, "step": 1226 }, { "epoch": 0.8613548613548614, "grad_norm": 0.0013800504384562373, "learning_rate": 2.222996275551497e-05, "logits/chosen": -8.513870239257812, "logits/rejected": -8.509892463684082, "logps/chosen": -5.779995918273926, "logps/rejected": -105.5468521118164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6434974670410156, "rewards/margins": 10.019076347351074, "rewards/rejected": -7.375579357147217, "step": 1227 }, { "epoch": 0.862056862056862, "grad_norm": 0.0015023270389065146, "learning_rate": 2.2045966409893164e-05, "logits/chosen": -8.712145805358887, "logits/rejected": -8.722943305969238, "logps/chosen": -6.661454677581787, "logps/rejected": -106.80265045166016, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5842251777648926, "rewards/margins": 10.003642082214355, "rewards/rejected": -7.419416427612305, "step": 1228 }, { "epoch": 0.8627588627588627, "grad_norm": 0.002079087309539318, "learning_rate": 2.1862132375611227e-05, "logits/chosen": -9.089815139770508, "logits/rejected": -9.089385986328125, "logps/chosen": -4.601965427398682, "logps/rejected": -101.88638305664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.628274917602539, "rewards/margins": 9.782937049865723, "rewards/rejected": -7.154661655426025, "step": 1229 }, { "epoch": 0.8634608634608635, "grad_norm": 0.035026613622903824, "learning_rate": 2.1678470745011258e-05, "logits/chosen": -9.154406547546387, "logits/rejected": -9.151511192321777, "logps/chosen": -2.5061140060424805, "logps/rejected": -106.4264907836914, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 3.0007870197296143, "rewards/margins": 10.422513008117676, "rewards/rejected": -7.421726226806641, "step": 1230 }, { "epoch": 0.8641628641628641, "grad_norm": 0.0019342175219208002, "learning_rate": 2.1494991600970542e-05, "logits/chosen": -8.387529373168945, "logits/rejected": -8.369026184082031, "logps/chosen": -3.0424537658691406, "logps/rejected": -101.73632049560547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9244706630706787, "rewards/margins": 9.963985443115234, "rewards/rejected": -7.039514064788818, "step": 1231 }, { "epoch": 0.8648648648648649, "grad_norm": 0.0023036175407469273, "learning_rate": 2.131170501634798e-05, "logits/chosen": -8.58215618133545, "logits/rejected": -8.560561180114746, "logps/chosen": -2.7159852981567383, "logps/rejected": -108.063720703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9445552825927734, "rewards/margins": 10.515043258666992, "rewards/rejected": -7.570488929748535, "step": 1232 }, { "epoch": 0.8655668655668656, "grad_norm": 0.0017017022473737597, "learning_rate": 2.1128621053431113e-05, "logits/chosen": -8.052677154541016, "logits/rejected": -8.02469539642334, "logps/chosen": -9.785301208496094, "logps/rejected": -102.79230499267578, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.28617525100708, "rewards/margins": 9.323041915893555, "rewards/rejected": -7.036867141723633, "step": 1233 }, { "epoch": 0.8662688662688662, "grad_norm": 0.0027640461921691895, "learning_rate": 2.0945749763383727e-05, "logits/chosen": -9.239410400390625, "logits/rejected": -9.254966735839844, "logps/chosen": -11.735651969909668, "logps/rejected": -98.05657196044922, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.227485179901123, "rewards/margins": 8.753405570983887, "rewards/rejected": -6.5259199142456055, "step": 1234 }, { "epoch": 0.866970866970867, "grad_norm": 0.003991521429270506, "learning_rate": 2.0763101185694027e-05, "logits/chosen": -8.346491813659668, "logits/rejected": -8.34819507598877, "logps/chosen": -5.634230613708496, "logps/rejected": -103.85299682617188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6752800941467285, "rewards/margins": 9.85637378692627, "rewards/rejected": -7.181093692779541, "step": 1235 }, { "epoch": 0.8676728676728677, "grad_norm": 0.0022289028856903315, "learning_rate": 2.0580685347623487e-05, "logits/chosen": -8.408845901489258, "logits/rejected": -8.392350196838379, "logps/chosen": -2.758223295211792, "logps/rejected": -106.64476776123047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8206405639648438, "rewards/margins": 10.327140808105469, "rewards/rejected": -7.506500244140625, "step": 1236 }, { "epoch": 0.8683748683748683, "grad_norm": 0.004807933699339628, "learning_rate": 2.039851226365636e-05, "logits/chosen": -8.948944091796875, "logits/rejected": -8.957950592041016, "logps/chosen": -4.623434066772461, "logps/rejected": -105.6165542602539, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.627633571624756, "rewards/margins": 10.067699432373047, "rewards/rejected": -7.440066337585449, "step": 1237 }, { "epoch": 0.8690768690768691, "grad_norm": 0.0029296192806214094, "learning_rate": 2.021659193494991e-05, "logits/chosen": -8.761343002319336, "logits/rejected": -8.762407302856445, "logps/chosen": -1.660165786743164, "logps/rejected": -107.70576477050781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9285285472869873, "rewards/margins": 10.537637710571289, "rewards/rejected": -7.609107971191406, "step": 1238 }, { "epoch": 0.8697788697788698, "grad_norm": 0.008608028292655945, "learning_rate": 2.0034934348785308e-05, "logits/chosen": -8.253036499023438, "logits/rejected": -8.227376937866211, "logps/chosen": -1.9594814777374268, "logps/rejected": -108.36688995361328, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8300280570983887, "rewards/margins": 10.52147102355957, "rewards/rejected": -7.691442966461182, "step": 1239 }, { "epoch": 0.8704808704808705, "grad_norm": 0.021717412397265434, "learning_rate": 1.9853549478019398e-05, "logits/chosen": -8.193425178527832, "logits/rejected": -8.210287094116211, "logps/chosen": -8.455682754516602, "logps/rejected": -103.98543548583984, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.3634822368621826, "rewards/margins": 9.435497283935547, "rewards/rejected": -7.072015285491943, "step": 1240 }, { "epoch": 0.8711828711828712, "grad_norm": 0.004926466848701239, "learning_rate": 1.9672447280537136e-05, "logits/chosen": -8.825228691101074, "logits/rejected": -8.80565071105957, "logps/chosen": -2.272050619125366, "logps/rejected": -106.35115051269531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8812246322631836, "rewards/margins": 10.35318374633789, "rewards/rejected": -7.471958637237549, "step": 1241 }, { "epoch": 0.8718848718848718, "grad_norm": 0.14472560584545135, "learning_rate": 1.9491637698704965e-05, "logits/chosen": -8.721050262451172, "logits/rejected": -8.725676536560059, "logps/chosen": -2.6590967178344727, "logps/rejected": -106.06716918945312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0153751373291016, "rewards/margins": 10.313032150268555, "rewards/rejected": -7.297656059265137, "step": 1242 }, { "epoch": 0.8725868725868726, "grad_norm": 0.10505974292755127, "learning_rate": 1.9311130658824958e-05, "logits/chosen": -8.576640129089355, "logits/rejected": -8.572357177734375, "logps/chosen": -2.14046049118042, "logps/rejected": -105.63801574707031, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8565587997436523, "rewards/margins": 10.311447143554688, "rewards/rejected": -7.454888343811035, "step": 1243 }, { "epoch": 0.8732888732888733, "grad_norm": 0.13413165509700775, "learning_rate": 1.9130936070589864e-05, "logits/chosen": -8.23952865600586, "logits/rejected": -8.251585006713867, "logps/chosen": -5.608049392700195, "logps/rejected": -104.43209838867188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.667518138885498, "rewards/margins": 9.881176948547363, "rewards/rejected": -7.213658809661865, "step": 1244 }, { "epoch": 0.8739908739908739, "grad_norm": 0.43438655138015747, "learning_rate": 1.8951063826539088e-05, "logits/chosen": -7.443093299865723, "logits/rejected": -7.41811466217041, "logps/chosen": -6.952947616577148, "logps/rejected": -105.4383316040039, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 2.6072566509246826, "rewards/margins": 9.70396614074707, "rewards/rejected": -7.096710205078125, "step": 1245 }, { "epoch": 0.8746928746928747, "grad_norm": 0.03528987988829613, "learning_rate": 1.877152380151561e-05, "logits/chosen": -7.843338489532471, "logits/rejected": -7.842550754547119, "logps/chosen": -9.30016040802002, "logps/rejected": -102.76448059082031, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 2.48775053024292, "rewards/margins": 9.480889320373535, "rewards/rejected": -6.993138790130615, "step": 1246 }, { "epoch": 0.8753948753948754, "grad_norm": 0.0010906473034992814, "learning_rate": 1.8592325852123832e-05, "logits/chosen": -8.125028610229492, "logits/rejected": -8.136687278747559, "logps/chosen": -5.776184558868408, "logps/rejected": -105.82830810546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.679532766342163, "rewards/margins": 10.005071640014648, "rewards/rejected": -7.3255391120910645, "step": 1247 }, { "epoch": 0.8760968760968761, "grad_norm": 0.0012086580973118544, "learning_rate": 1.8413479816188488e-05, "logits/chosen": -8.368091583251953, "logits/rejected": -8.361098289489746, "logps/chosen": -9.399272918701172, "logps/rejected": -99.25886535644531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.354912281036377, "rewards/margins": 9.158188819885254, "rewards/rejected": -6.803276538848877, "step": 1248 }, { "epoch": 0.8767988767988768, "grad_norm": 0.002273781690746546, "learning_rate": 1.8234995512214535e-05, "logits/chosen": -8.616498947143555, "logits/rejected": -8.607499122619629, "logps/chosen": -13.114978790283203, "logps/rejected": -101.92565155029297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.052143096923828, "rewards/margins": 8.856334686279297, "rewards/rejected": -6.8041911125183105, "step": 1249 }, { "epoch": 0.8775008775008775, "grad_norm": 0.0030579131562262774, "learning_rate": 1.8056882738848145e-05, "logits/chosen": -8.571322441101074, "logits/rejected": -8.56582260131836, "logps/chosen": -12.61426067352295, "logps/rejected": -104.17913818359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.151594638824463, "rewards/margins": 9.280364036560059, "rewards/rejected": -7.1287689208984375, "step": 1250 }, { "epoch": 0.8782028782028782, "grad_norm": 0.0011680658208206296, "learning_rate": 1.7879151274338734e-05, "logits/chosen": -7.85494327545166, "logits/rejected": -7.821000099182129, "logps/chosen": -4.470645427703857, "logps/rejected": -105.67247009277344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7496213912963867, "rewards/margins": 10.09865665435791, "rewards/rejected": -7.349035739898682, "step": 1251 }, { "epoch": 0.8789048789048789, "grad_norm": 0.0010666021844372153, "learning_rate": 1.77018108760022e-05, "logits/chosen": -7.957059860229492, "logits/rejected": -7.98109245300293, "logps/chosen": -4.815074920654297, "logps/rejected": -106.40544128417969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7865381240844727, "rewards/margins": 10.225422859191895, "rewards/rejected": -7.43888521194458, "step": 1252 }, { "epoch": 0.8796068796068796, "grad_norm": 0.0010056665632873774, "learning_rate": 1.752487127968518e-05, "logits/chosen": -7.7455902099609375, "logits/rejected": -7.725481986999512, "logps/chosen": -7.293032169342041, "logps/rejected": -105.2080078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.549931049346924, "rewards/margins": 9.667806625366211, "rewards/rejected": -7.117875099182129, "step": 1253 }, { "epoch": 0.8803088803088803, "grad_norm": 0.001244557905010879, "learning_rate": 1.734834219923064e-05, "logits/chosen": -8.735845565795898, "logits/rejected": -8.718122482299805, "logps/chosen": -1.8690556287765503, "logps/rejected": -109.13664245605469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9506289958953857, "rewards/margins": 10.741467475891113, "rewards/rejected": -7.790838241577148, "step": 1254 }, { "epoch": 0.881010881010881, "grad_norm": 0.0006919351289980114, "learning_rate": 1.717223332594451e-05, "logits/chosen": -8.745235443115234, "logits/rejected": -8.732337951660156, "logps/chosen": -2.129647970199585, "logps/rejected": -109.99577331542969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.961146593093872, "rewards/margins": 10.786500930786133, "rewards/rejected": -7.82535457611084, "step": 1255 }, { "epoch": 0.8817128817128818, "grad_norm": 0.0009045847691595554, "learning_rate": 1.6996554328063714e-05, "logits/chosen": -8.620550155639648, "logits/rejected": -8.626530647277832, "logps/chosen": -1.043769359588623, "logps/rejected": -109.48824310302734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.990415573120117, "rewards/margins": 10.753686904907227, "rewards/rejected": -7.763270378112793, "step": 1256 }, { "epoch": 0.8824148824148824, "grad_norm": 0.0008528335019946098, "learning_rate": 1.6821314850225346e-05, "logits/chosen": -8.58163833618164, "logits/rejected": -8.559918403625488, "logps/chosen": -4.635007381439209, "logps/rejected": -105.99449157714844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7106218338012695, "rewards/margins": 10.104970932006836, "rewards/rejected": -7.394349098205566, "step": 1257 }, { "epoch": 0.8831168831168831, "grad_norm": 0.012201395817101002, "learning_rate": 1.6646524512937193e-05, "logits/chosen": -7.89655065536499, "logits/rejected": -7.881433486938477, "logps/chosen": -7.798669338226318, "logps/rejected": -103.92996215820312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.591871738433838, "rewards/margins": 9.577096939086914, "rewards/rejected": -6.985224723815918, "step": 1258 }, { "epoch": 0.8838188838188838, "grad_norm": 0.000597925391048193, "learning_rate": 1.6472192912049593e-05, "logits/chosen": -8.639892578125, "logits/rejected": -8.631200790405273, "logps/chosen": -4.54150390625, "logps/rejected": -106.201171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.700418472290039, "rewards/margins": 10.235286712646484, "rewards/rejected": -7.534868240356445, "step": 1259 }, { "epoch": 0.8845208845208845, "grad_norm": 0.00081235496327281, "learning_rate": 1.629832961822862e-05, "logits/chosen": -8.782458305358887, "logits/rejected": -8.797161102294922, "logps/chosen": -6.0087103843688965, "logps/rejected": -103.87130737304688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.52376127243042, "rewards/margins": 9.805895805358887, "rewards/rejected": -7.282134532928467, "step": 1260 }, { "epoch": 0.8852228852228852, "grad_norm": 0.009807555936276913, "learning_rate": 1.6124944176430657e-05, "logits/chosen": -8.585845947265625, "logits/rejected": -8.568099975585938, "logps/chosen": -2.0837302207946777, "logps/rejected": -109.32158660888672, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9672694206237793, "rewards/margins": 10.742944717407227, "rewards/rejected": -7.7756757736206055, "step": 1261 }, { "epoch": 0.8859248859248859, "grad_norm": 0.001575667061842978, "learning_rate": 1.5952046105378398e-05, "logits/chosen": -7.446864128112793, "logits/rejected": -7.4275641441345215, "logps/chosen": -5.255207061767578, "logps/rejected": -104.7430419921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.601886749267578, "rewards/margins": 9.934188842773438, "rewards/rejected": -7.332302093505859, "step": 1262 }, { "epoch": 0.8866268866268866, "grad_norm": 0.0008826297707855701, "learning_rate": 1.5779644897038273e-05, "logits/chosen": -7.873759746551514, "logits/rejected": -7.8559064865112305, "logps/chosen": -5.757717132568359, "logps/rejected": -107.48277282714844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.762403964996338, "rewards/margins": 10.258515357971191, "rewards/rejected": -7.496110916137695, "step": 1263 }, { "epoch": 0.8873288873288874, "grad_norm": 0.00172676972579211, "learning_rate": 1.5607750016099335e-05, "logits/chosen": -8.335768699645996, "logits/rejected": -8.36262035369873, "logps/chosen": -8.958843231201172, "logps/rejected": -107.1718521118164, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3773865699768066, "rewards/margins": 9.692756652832031, "rewards/rejected": -7.315369606018066, "step": 1264 }, { "epoch": 0.888030888030888, "grad_norm": 0.0010760447476059198, "learning_rate": 1.543637089945367e-05, "logits/chosen": -8.345953941345215, "logits/rejected": -8.341130256652832, "logps/chosen": -5.551570415496826, "logps/rejected": -108.0345458984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6761012077331543, "rewards/margins": 10.220257759094238, "rewards/rejected": -7.544157028198242, "step": 1265 }, { "epoch": 0.8887328887328887, "grad_norm": 0.0006859139539301395, "learning_rate": 1.526551695567832e-05, "logits/chosen": -8.722663879394531, "logits/rejected": -8.725658416748047, "logps/chosen": -5.52739953994751, "logps/rejected": -107.7158203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6798157691955566, "rewards/margins": 10.191633224487305, "rewards/rejected": -7.51181697845459, "step": 1266 }, { "epoch": 0.8894348894348895, "grad_norm": 0.0009959982708096504, "learning_rate": 1.5095197564518754e-05, "logits/chosen": -8.561532974243164, "logits/rejected": -8.549699783325195, "logps/chosen": -1.3824695348739624, "logps/rejected": -109.4637222290039, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0329253673553467, "rewards/margins": 10.780387878417969, "rewards/rejected": -7.747462272644043, "step": 1267 }, { "epoch": 0.8901368901368901, "grad_norm": 0.0012174872681498528, "learning_rate": 1.4925422076373918e-05, "logits/chosen": -8.055453300476074, "logits/rejected": -8.056498527526855, "logps/chosen": -10.891861915588379, "logps/rejected": -101.60443878173828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1739280223846436, "rewards/margins": 9.111115455627441, "rewards/rejected": -6.937188148498535, "step": 1268 }, { "epoch": 0.8908388908388908, "grad_norm": 0.000648445391561836, "learning_rate": 1.4756199811782927e-05, "logits/chosen": -8.811588287353516, "logits/rejected": -8.813054084777832, "logps/chosen": -10.596795082092285, "logps/rejected": -102.70051574707031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.2252893447875977, "rewards/margins": 9.167279243469238, "rewards/rejected": -6.941989898681641, "step": 1269 }, { "epoch": 0.8915408915408916, "grad_norm": 0.0013158777728676796, "learning_rate": 1.4587540060913358e-05, "logits/chosen": -8.942485809326172, "logits/rejected": -8.925799369812012, "logps/chosen": -4.7110090255737305, "logps/rejected": -107.00120544433594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6784467697143555, "rewards/margins": 10.262088775634766, "rewards/rejected": -7.58364200592041, "step": 1270 }, { "epoch": 0.8922428922428922, "grad_norm": 0.001071266713552177, "learning_rate": 1.4419452083051233e-05, "logits/chosen": -7.842498779296875, "logits/rejected": -7.835883140563965, "logps/chosen": -4.534109115600586, "logps/rejected": -107.9761962890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.760892629623413, "rewards/margins": 10.221918106079102, "rewards/rejected": -7.461025238037109, "step": 1271 }, { "epoch": 0.892944892944893, "grad_norm": 0.003774672979488969, "learning_rate": 1.4251945106092675e-05, "logits/chosen": -8.974028587341309, "logits/rejected": -8.988608360290527, "logps/chosen": -11.781658172607422, "logps/rejected": -102.70806884765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0835347175598145, "rewards/margins": 9.132652282714844, "rewards/rejected": -7.049117565155029, "step": 1272 }, { "epoch": 0.8936468936468936, "grad_norm": 0.0008531412458978593, "learning_rate": 1.4085028326037369e-05, "logits/chosen": -7.969056606292725, "logits/rejected": -7.95396614074707, "logps/chosen": -2.328019618988037, "logps/rejected": -109.76594543457031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9377880096435547, "rewards/margins": 10.683707237243652, "rewards/rejected": -7.745919227600098, "step": 1273 }, { "epoch": 0.8943488943488943, "grad_norm": 0.003144212067127228, "learning_rate": 1.3918710906483615e-05, "logits/chosen": -8.449762344360352, "logits/rejected": -8.449277877807617, "logps/chosen": -4.92262077331543, "logps/rejected": -107.41757202148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.630138874053955, "rewards/margins": 10.2784423828125, "rewards/rejected": -7.648303031921387, "step": 1274 }, { "epoch": 0.8950508950508951, "grad_norm": 0.010613837279379368, "learning_rate": 1.3753001978125304e-05, "logits/chosen": -9.004507064819336, "logits/rejected": -8.998716354370117, "logps/chosen": -1.2716526985168457, "logps/rejected": -110.19546508789062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.91123366355896, "rewards/margins": 10.849135398864746, "rewards/rejected": -7.937901973724365, "step": 1275 }, { "epoch": 0.8957528957528957, "grad_norm": 0.0006684979307465255, "learning_rate": 1.3587910638250659e-05, "logits/chosen": -8.736082077026367, "logits/rejected": -8.735871315002441, "logps/chosen": -1.2867088317871094, "logps/rejected": -110.04776000976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.8306097984313965, "rewards/margins": 10.790863990783691, "rewards/rejected": -7.960254669189453, "step": 1276 }, { "epoch": 0.8964548964548964, "grad_norm": 0.0014232783578336239, "learning_rate": 1.3423445950242778e-05, "logits/chosen": -8.469112396240234, "logits/rejected": -8.460882186889648, "logps/chosen": -5.587289810180664, "logps/rejected": -106.82342529296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.736537218093872, "rewards/margins": 10.152556419372559, "rewards/rejected": -7.416019439697266, "step": 1277 }, { "epoch": 0.8971568971568972, "grad_norm": 0.001094828243367374, "learning_rate": 1.325961694308207e-05, "logits/chosen": -8.320627212524414, "logits/rejected": -8.306355476379395, "logps/chosen": -1.310408592224121, "logps/rejected": -109.3930892944336, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.977360963821411, "rewards/margins": 10.78483772277832, "rewards/rejected": -7.807476043701172, "step": 1278 }, { "epoch": 0.8978588978588978, "grad_norm": 0.0016485550440847874, "learning_rate": 1.3096432610850581e-05, "logits/chosen": -8.552375793457031, "logits/rejected": -8.53586196899414, "logps/chosen": -7.942533016204834, "logps/rejected": -104.3192138671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.559443712234497, "rewards/margins": 9.6982421875, "rewards/rejected": -7.138798236846924, "step": 1279 }, { "epoch": 0.8985608985608986, "grad_norm": 0.0010979290818795562, "learning_rate": 1.2933901912238209e-05, "logits/chosen": -9.357513427734375, "logits/rejected": -9.359016418457031, "logps/chosen": -4.336092472076416, "logps/rejected": -108.64254760742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.679408311843872, "rewards/margins": 10.280670166015625, "rewards/rejected": -7.601262092590332, "step": 1280 }, { "epoch": 0.8992628992628993, "grad_norm": 0.0034718511160463095, "learning_rate": 1.2772033770050884e-05, "logits/chosen": -8.537410736083984, "logits/rejected": -8.518280982971191, "logps/chosen": -6.755331993103027, "logps/rejected": -104.46603393554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.497157096862793, "rewards/margins": 9.786581039428711, "rewards/rejected": -7.289424896240234, "step": 1281 }, { "epoch": 0.8999648999648999, "grad_norm": 0.0010733898961916566, "learning_rate": 1.2610837070720721e-05, "logits/chosen": -8.471172332763672, "logits/rejected": -8.475676536560059, "logps/chosen": -6.0736403465271, "logps/rejected": -107.83891296386719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7425589561462402, "rewards/margins": 10.216113090515137, "rewards/rejected": -7.473554611206055, "step": 1282 }, { "epoch": 0.9006669006669007, "grad_norm": 0.004145217593759298, "learning_rate": 1.2450320663818148e-05, "logits/chosen": -8.700780868530273, "logits/rejected": -8.693852424621582, "logps/chosen": -1.3173558712005615, "logps/rejected": -109.97650146484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.959014415740967, "rewards/margins": 10.7584228515625, "rewards/rejected": -7.79940938949585, "step": 1283 }, { "epoch": 0.9013689013689014, "grad_norm": 0.0010791915701702237, "learning_rate": 1.2290493361566108e-05, "logits/chosen": -7.969590187072754, "logits/rejected": -7.957367420196533, "logps/chosen": -1.5828628540039062, "logps/rejected": -109.61286926269531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1331050395965576, "rewards/margins": 10.843240737915039, "rewards/rejected": -7.710135459899902, "step": 1284 }, { "epoch": 0.9020709020709021, "grad_norm": 0.0009980095783248544, "learning_rate": 1.2131363938356214e-05, "logits/chosen": -7.880501747131348, "logits/rejected": -7.863944053649902, "logps/chosen": -4.159817695617676, "logps/rejected": -108.07408142089844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6891255378723145, "rewards/margins": 10.265695571899414, "rewards/rejected": -7.576569080352783, "step": 1285 }, { "epoch": 0.9027729027729028, "grad_norm": 0.006651091389358044, "learning_rate": 1.1972941130267093e-05, "logits/chosen": -8.600348472595215, "logits/rejected": -8.593501091003418, "logps/chosen": -9.117119789123535, "logps/rejected": -105.2166748046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.310835361480713, "rewards/margins": 9.616218566894531, "rewards/rejected": -7.30538272857666, "step": 1286 }, { "epoch": 0.9034749034749034, "grad_norm": 0.0009589525288902223, "learning_rate": 1.1815233634584758e-05, "logits/chosen": -8.463611602783203, "logits/rejected": -8.431506156921387, "logps/chosen": -5.118930816650391, "logps/rejected": -105.67381286621094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8237645626068115, "rewards/margins": 10.10389518737793, "rewards/rejected": -7.280130386352539, "step": 1287 }, { "epoch": 0.9041769041769042, "grad_norm": 0.0008964380831457675, "learning_rate": 1.1658250109325143e-05, "logits/chosen": -7.851476669311523, "logits/rejected": -7.850248336791992, "logps/chosen": -6.924668312072754, "logps/rejected": -108.08796691894531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.655489444732666, "rewards/margins": 10.196502685546875, "rewards/rejected": -7.541012763977051, "step": 1288 }, { "epoch": 0.9048789048789049, "grad_norm": 0.0007521786610595882, "learning_rate": 1.1501999172758785e-05, "logits/chosen": -8.72783088684082, "logits/rejected": -8.736533164978027, "logps/chosen": -6.735645294189453, "logps/rejected": -106.57862091064453, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.494208335876465, "rewards/margins": 9.82510757446289, "rewards/rejected": -7.330899238586426, "step": 1289 }, { "epoch": 0.9055809055809055, "grad_norm": 0.000932000286411494, "learning_rate": 1.1346489402937678e-05, "logits/chosen": -8.45029354095459, "logits/rejected": -8.467422485351562, "logps/chosen": -1.8219645023345947, "logps/rejected": -110.25953674316406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.085789203643799, "rewards/margins": 10.82404899597168, "rewards/rejected": -7.738259315490723, "step": 1290 }, { "epoch": 0.9062829062829063, "grad_norm": 0.0014587647747248411, "learning_rate": 1.1191729337224358e-05, "logits/chosen": -8.98193359375, "logits/rejected": -8.946524620056152, "logps/chosen": -1.4226102828979492, "logps/rejected": -110.07638549804688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9332690238952637, "rewards/margins": 10.82443904876709, "rewards/rejected": -7.891170501708984, "step": 1291 }, { "epoch": 0.906984906984907, "grad_norm": 0.0012786689912900329, "learning_rate": 1.1037727471823201e-05, "logits/chosen": -8.21888542175293, "logits/rejected": -8.210409164428711, "logps/chosen": -2.0635437965393066, "logps/rejected": -110.2564697265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.935737133026123, "rewards/margins": 10.674229621887207, "rewards/rejected": -7.738492965698242, "step": 1292 }, { "epoch": 0.9076869076869077, "grad_norm": 0.0008428652654401958, "learning_rate": 1.0884492261313986e-05, "logits/chosen": -8.418020248413086, "logits/rejected": -8.386247634887695, "logps/chosen": -4.997727394104004, "logps/rejected": -107.39418029785156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6712121963500977, "rewards/margins": 10.179450988769531, "rewards/rejected": -7.508238792419434, "step": 1293 }, { "epoch": 0.9083889083889084, "grad_norm": 0.0007107398705556989, "learning_rate": 1.0732032118187756e-05, "logits/chosen": -7.988824844360352, "logits/rejected": -7.98453426361084, "logps/chosen": -1.4122154712677002, "logps/rejected": -109.81723022460938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.087057113647461, "rewards/margins": 10.806413650512695, "rewards/rejected": -7.719356536865234, "step": 1294 }, { "epoch": 0.9090909090909091, "grad_norm": 0.0007117378409020603, "learning_rate": 1.0580355412384968e-05, "logits/chosen": -8.16519546508789, "logits/rejected": -8.194843292236328, "logps/chosen": -1.933718204498291, "logps/rejected": -109.95780181884766, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.894320249557495, "rewards/margins": 10.79971981048584, "rewards/rejected": -7.905399322509766, "step": 1295 }, { "epoch": 0.9097929097929098, "grad_norm": 0.0011267218505963683, "learning_rate": 1.042947047083599e-05, "logits/chosen": -8.586791038513184, "logits/rejected": -8.57319450378418, "logps/chosen": -1.3725144863128662, "logps/rejected": -109.46417236328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.989774703979492, "rewards/margins": 10.720812797546387, "rewards/rejected": -7.7310380935668945, "step": 1296 }, { "epoch": 0.9104949104949105, "grad_norm": 0.0009183151996694505, "learning_rate": 1.0279385577003959e-05, "logits/chosen": -8.424694061279297, "logits/rejected": -8.419496536254883, "logps/chosen": -6.120013236999512, "logps/rejected": -108.2374038696289, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.765320062637329, "rewards/margins": 10.275735855102539, "rewards/rejected": -7.510415554046631, "step": 1297 }, { "epoch": 0.9111969111969112, "grad_norm": 0.0036607736255973577, "learning_rate": 1.0130108970430057e-05, "logits/chosen": -8.681093215942383, "logits/rejected": -8.665681838989258, "logps/chosen": -4.731987953186035, "logps/rejected": -108.75108337402344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.730592727661133, "rewards/margins": 10.478432655334473, "rewards/rejected": -7.74783992767334, "step": 1298 }, { "epoch": 0.9118989118989119, "grad_norm": 0.0008486664155498147, "learning_rate": 9.981648846281106e-06, "logits/chosen": -8.203756332397461, "logits/rejected": -8.184040069580078, "logps/chosen": -2.5286097526550293, "logps/rejected": -108.5692367553711, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0248239040374756, "rewards/margins": 10.618219375610352, "rewards/rejected": -7.593395709991455, "step": 1299 }, { "epoch": 0.9126009126009126, "grad_norm": 0.011749427765607834, "learning_rate": 9.834013354899712e-06, "logits/chosen": -8.476877212524414, "logits/rejected": -8.475946426391602, "logps/chosen": -4.946405410766602, "logps/rejected": -106.57717895507812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.8267483711242676, "rewards/margins": 10.152323722839355, "rewards/rejected": -7.325575828552246, "step": 1300 }, { "epoch": 0.9133029133029134, "grad_norm": 0.0010331996018067002, "learning_rate": 9.687210601356786e-06, "logits/chosen": -9.038656234741211, "logits/rejected": -9.025406837463379, "logps/chosen": -4.814790725708008, "logps/rejected": -106.91956329345703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.722184419631958, "rewards/margins": 10.221416473388672, "rewards/rejected": -7.499232292175293, "step": 1301 }, { "epoch": 0.914004914004914, "grad_norm": 0.0008215362322516739, "learning_rate": 9.541248645006606e-06, "logits/chosen": -8.022854804992676, "logits/rejected": -8.016434669494629, "logps/chosen": -2.1604647636413574, "logps/rejected": -110.02462768554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8725504875183105, "rewards/margins": 10.698936462402344, "rewards/rejected": -7.826385974884033, "step": 1302 }, { "epoch": 0.9147069147069147, "grad_norm": 0.0007161798421293497, "learning_rate": 9.39613549904435e-06, "logits/chosen": -8.551250457763672, "logits/rejected": -8.531779289245605, "logps/chosen": -1.3208112716674805, "logps/rejected": -109.6938247680664, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.085082530975342, "rewards/margins": 10.83504867553711, "rewards/rejected": -7.749965667724609, "step": 1303 }, { "epoch": 0.9154089154089154, "grad_norm": 0.0015111392131075263, "learning_rate": 9.25187913006618e-06, "logits/chosen": -8.998818397521973, "logits/rejected": -8.998607635498047, "logps/chosen": -1.3904938697814941, "logps/rejected": -109.38788604736328, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9676496982574463, "rewards/margins": 10.902105331420898, "rewards/rejected": -7.934454917907715, "step": 1304 }, { "epoch": 0.9161109161109161, "grad_norm": 0.02809201553463936, "learning_rate": 9.108487457631881e-06, "logits/chosen": -8.498453140258789, "logits/rejected": -8.46857738494873, "logps/chosen": -7.574702262878418, "logps/rejected": -103.73487854003906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.326211452484131, "rewards/margins": 9.513104438781738, "rewards/rejected": -7.186892986297607, "step": 1305 }, { "epoch": 0.9168129168129168, "grad_norm": 0.03311585634946823, "learning_rate": 8.965968353830092e-06, "logits/chosen": -8.99714183807373, "logits/rejected": -9.007110595703125, "logps/chosen": -1.716933012008667, "logps/rejected": -110.4891128540039, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9369118213653564, "rewards/margins": 10.795838356018066, "rewards/rejected": -7.858926773071289, "step": 1306 }, { "epoch": 0.9175149175149175, "grad_norm": 0.0010764080798253417, "learning_rate": 8.82432964284613e-06, "logits/chosen": -8.676963806152344, "logits/rejected": -8.660799026489258, "logps/chosen": -1.471115231513977, "logps/rejected": -109.61740112304688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0087504386901855, "rewards/margins": 10.851719856262207, "rewards/rejected": -7.8429694175720215, "step": 1307 }, { "epoch": 0.9182169182169182, "grad_norm": 0.0008541340939700603, "learning_rate": 8.68357910053244e-06, "logits/chosen": -8.018624305725098, "logits/rejected": -8.0130615234375, "logps/chosen": -8.08074951171875, "logps/rejected": -104.69102478027344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5834245681762695, "rewards/margins": 9.65283203125, "rewards/rejected": -7.069407939910889, "step": 1308 }, { "epoch": 0.918918918918919, "grad_norm": 0.0007044263184070587, "learning_rate": 8.54372445398172e-06, "logits/chosen": -8.118324279785156, "logits/rejected": -8.103347778320312, "logps/chosen": -5.9390645027160645, "logps/rejected": -107.20732116699219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.60665225982666, "rewards/margins": 10.10989761352539, "rewards/rejected": -7.503245830535889, "step": 1309 }, { "epoch": 0.9196209196209196, "grad_norm": 0.0008724423823878169, "learning_rate": 8.404773381102702e-06, "logits/chosen": -8.194133758544922, "logits/rejected": -8.180327415466309, "logps/chosen": -1.7314724922180176, "logps/rejected": -109.22586822509766, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1108741760253906, "rewards/margins": 10.7802152633667, "rewards/rejected": -7.669341087341309, "step": 1310 }, { "epoch": 0.9203229203229203, "grad_norm": 0.0014653060352429748, "learning_rate": 8.266733510198643e-06, "logits/chosen": -8.253199577331543, "logits/rejected": -8.241212844848633, "logps/chosen": -4.008463382720947, "logps/rejected": -108.58189392089844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.775169849395752, "rewards/margins": 10.429126739501953, "rewards/rejected": -7.653956890106201, "step": 1311 }, { "epoch": 0.9210249210249211, "grad_norm": 0.0010684800799936056, "learning_rate": 8.129612419548536e-06, "logits/chosen": -8.214290618896484, "logits/rejected": -8.20888614654541, "logps/chosen": -3.463700294494629, "logps/rejected": -106.72309875488281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.780419111251831, "rewards/margins": 10.38554573059082, "rewards/rejected": -7.605126857757568, "step": 1312 }, { "epoch": 0.9217269217269217, "grad_norm": 0.0016200421378016472, "learning_rate": 7.993417636991077e-06, "logits/chosen": -9.617927551269531, "logits/rejected": -9.607985496520996, "logps/chosen": -5.586090564727783, "logps/rejected": -108.49901580810547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.665985584259033, "rewards/margins": 10.318229675292969, "rewards/rejected": -7.652243614196777, "step": 1313 }, { "epoch": 0.9224289224289224, "grad_norm": 0.007538053207099438, "learning_rate": 7.858156639511373e-06, "logits/chosen": -8.967788696289062, "logits/rejected": -8.979982376098633, "logps/chosen": -6.648526668548584, "logps/rejected": -107.92803955078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5699081420898438, "rewards/margins": 10.11436653137207, "rewards/rejected": -7.54445743560791, "step": 1314 }, { "epoch": 0.9231309231309232, "grad_norm": 0.0008645218331366777, "learning_rate": 7.723836852830477e-06, "logits/chosen": -8.422895431518555, "logits/rejected": -8.41856861114502, "logps/chosen": -1.6232863664627075, "logps/rejected": -109.48660278320312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0261757373809814, "rewards/margins": 10.782747268676758, "rewards/rejected": -7.7565717697143555, "step": 1315 }, { "epoch": 0.9238329238329238, "grad_norm": 0.0013010624097660184, "learning_rate": 7.590465650997731e-06, "logits/chosen": -8.16504955291748, "logits/rejected": -8.183126449584961, "logps/chosen": -5.398205757141113, "logps/rejected": -107.2125244140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7458696365356445, "rewards/margins": 10.136404037475586, "rewards/rejected": -7.390534400939941, "step": 1316 }, { "epoch": 0.9245349245349246, "grad_norm": 0.0005087369936518371, "learning_rate": 7.458050355985914e-06, "logits/chosen": -8.729835510253906, "logits/rejected": -8.732930183410645, "logps/chosen": -1.5664277076721191, "logps/rejected": -110.72408294677734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0440521240234375, "rewards/margins": 10.836048126220703, "rewards/rejected": -7.791996955871582, "step": 1317 }, { "epoch": 0.9252369252369252, "grad_norm": 0.0012944673653692007, "learning_rate": 7.32659823728928e-06, "logits/chosen": -8.311986923217773, "logits/rejected": -8.30197525024414, "logps/chosen": -10.9697904586792, "logps/rejected": -107.19219970703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3154845237731934, "rewards/margins": 9.479541778564453, "rewards/rejected": -7.164056777954102, "step": 1318 }, { "epoch": 0.9259389259389259, "grad_norm": 0.0013446551747620106, "learning_rate": 7.196116511524472e-06, "logits/chosen": -8.139021873474121, "logits/rejected": -8.126923561096191, "logps/chosen": -8.5431489944458, "logps/rejected": -105.7414779663086, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3773975372314453, "rewards/margins": 9.621115684509277, "rewards/rejected": -7.243718147277832, "step": 1319 }, { "epoch": 0.9266409266409267, "grad_norm": 0.0011719007743522525, "learning_rate": 7.066612342034335e-06, "logits/chosen": -8.631038665771484, "logits/rejected": -8.641002655029297, "logps/chosen": -1.6421761512756348, "logps/rejected": -109.73609924316406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0489723682403564, "rewards/margins": 10.792463302612305, "rewards/rejected": -7.743490219116211, "step": 1320 }, { "epoch": 0.9273429273429273, "grad_norm": 0.0009983550990000367, "learning_rate": 6.938092838494651e-06, "logits/chosen": -8.377056121826172, "logits/rejected": -8.373361587524414, "logps/chosen": -3.050259590148926, "logps/rejected": -106.39653778076172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.831808567047119, "rewards/margins": 10.386198997497559, "rewards/rejected": -7.554389953613281, "step": 1321 }, { "epoch": 0.928044928044928, "grad_norm": 0.0008201377931982279, "learning_rate": 6.810565056523835e-06, "logits/chosen": -8.709927558898926, "logits/rejected": -8.699626922607422, "logps/chosen": -1.7415708303451538, "logps/rejected": -109.38159942626953, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9646406173706055, "rewards/margins": 10.725493431091309, "rewards/rejected": -7.760853290557861, "step": 1322 }, { "epoch": 0.9287469287469288, "grad_norm": 0.0008687954978086054, "learning_rate": 6.6840359972955585e-06, "logits/chosen": -8.043399810791016, "logits/rejected": -8.03448486328125, "logps/chosen": -1.492735743522644, "logps/rejected": -109.45333862304688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1434080600738525, "rewards/margins": 10.868386268615723, "rewards/rejected": -7.724978446960449, "step": 1323 }, { "epoch": 0.9294489294489294, "grad_norm": 0.000819918408524245, "learning_rate": 6.558512607154426e-06, "logits/chosen": -7.538263320922852, "logits/rejected": -7.539557456970215, "logps/chosen": -5.164451599121094, "logps/rejected": -106.44467163085938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6330406665802, "rewards/margins": 10.075799942016602, "rewards/rejected": -7.4427595138549805, "step": 1324 }, { "epoch": 0.9301509301509302, "grad_norm": 0.2076633870601654, "learning_rate": 6.4340017772346075e-06, "logits/chosen": -7.8983564376831055, "logits/rejected": -7.887846946716309, "logps/chosen": -7.787734031677246, "logps/rejected": -102.94108581542969, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.5078608989715576, "rewards/margins": 9.578350067138672, "rewards/rejected": -7.070488929748535, "step": 1325 }, { "epoch": 0.9308529308529309, "grad_norm": 0.006849775556474924, "learning_rate": 6.310510343081513e-06, "logits/chosen": -8.598572731018066, "logits/rejected": -8.58547592163086, "logps/chosen": -8.234673500061035, "logps/rejected": -105.32096099853516, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.4182682037353516, "rewards/margins": 9.660238265991211, "rewards/rejected": -7.241969585418701, "step": 1326 }, { "epoch": 0.9315549315549315, "grad_norm": 0.000943706021644175, "learning_rate": 6.188045084276546e-06, "logits/chosen": -8.575488090515137, "logits/rejected": -8.563575744628906, "logps/chosen": -1.3299906253814697, "logps/rejected": -109.97315979003906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.030790328979492, "rewards/margins": 10.836997032165527, "rewards/rejected": -7.806206703186035, "step": 1327 }, { "epoch": 0.9322569322569323, "grad_norm": 0.0006311734323389828, "learning_rate": 6.0666127240649095e-06, "logits/chosen": -8.106651306152344, "logits/rejected": -8.121797561645508, "logps/chosen": -1.1781306266784668, "logps/rejected": -109.67211151123047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0779051780700684, "rewards/margins": 10.846563339233398, "rewards/rejected": -7.768658638000488, "step": 1328 }, { "epoch": 0.932958932958933, "grad_norm": 0.007514989003539085, "learning_rate": 5.946219928986463e-06, "logits/chosen": -8.595329284667969, "logits/rejected": -8.612260818481445, "logps/chosen": -3.865478038787842, "logps/rejected": -107.59101867675781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.868715286254883, "rewards/margins": 10.452127456665039, "rewards/rejected": -7.583411693572998, "step": 1329 }, { "epoch": 0.9336609336609336, "grad_norm": 0.0010299033019691706, "learning_rate": 5.82687330850982e-06, "logits/chosen": -8.234659194946289, "logits/rejected": -8.228699684143066, "logps/chosen": -1.635383129119873, "logps/rejected": -109.42439270019531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.109025239944458, "rewards/margins": 10.837347984313965, "rewards/rejected": -7.728322982788086, "step": 1330 }, { "epoch": 0.9343629343629344, "grad_norm": 0.0014099172549322248, "learning_rate": 5.708579414669377e-06, "logits/chosen": -9.284217834472656, "logits/rejected": -9.270183563232422, "logps/chosen": -7.558510780334473, "logps/rejected": -106.37902069091797, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.3155572414398193, "rewards/margins": 9.721195220947266, "rewards/rejected": -7.405637741088867, "step": 1331 }, { "epoch": 0.935064935064935, "grad_norm": 0.001319861738011241, "learning_rate": 5.59134474170573e-06, "logits/chosen": -7.681850433349609, "logits/rejected": -7.682551383972168, "logps/chosen": -1.3039374351501465, "logps/rejected": -109.23356628417969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0909175872802734, "rewards/margins": 10.797017097473145, "rewards/rejected": -7.706099510192871, "step": 1332 }, { "epoch": 0.9357669357669358, "grad_norm": 0.0007420534966513515, "learning_rate": 5.475175725709085e-06, "logits/chosen": -8.437281608581543, "logits/rejected": -8.466190338134766, "logps/chosen": -1.825585126876831, "logps/rejected": -109.51141357421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.971320390701294, "rewards/margins": 10.792817115783691, "rewards/rejected": -7.821496486663818, "step": 1333 }, { "epoch": 0.9364689364689365, "grad_norm": 0.0007423662464134395, "learning_rate": 5.360078744265891e-06, "logits/chosen": -8.695150375366211, "logits/rejected": -8.678117752075195, "logps/chosen": -1.2081584930419922, "logps/rejected": -110.17698669433594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.9336204528808594, "rewards/margins": 10.75893783569336, "rewards/rejected": -7.825317859649658, "step": 1334 }, { "epoch": 0.9371709371709371, "grad_norm": 0.009903359226882458, "learning_rate": 5.246060116108807e-06, "logits/chosen": -7.795057773590088, "logits/rejected": -7.791979789733887, "logps/chosen": -5.569721221923828, "logps/rejected": -107.84663391113281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.731764793395996, "rewards/margins": 10.21690559387207, "rewards/rejected": -7.485141754150391, "step": 1335 }, { "epoch": 0.9378729378729379, "grad_norm": 0.0011701479088515043, "learning_rate": 5.133126100769699e-06, "logits/chosen": -8.256206512451172, "logits/rejected": -8.273344039916992, "logps/chosen": -4.429319858551025, "logps/rejected": -106.42704772949219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7956624031066895, "rewards/margins": 10.125325202941895, "rewards/rejected": -7.329663276672363, "step": 1336 }, { "epoch": 0.9385749385749386, "grad_norm": 0.0007397782173939049, "learning_rate": 5.021282898236112e-06, "logits/chosen": -8.143410682678223, "logits/rejected": -8.13608169555664, "logps/chosen": -1.3900163173675537, "logps/rejected": -110.37599182128906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0235435962677, "rewards/margins": 10.746421813964844, "rewards/rejected": -7.722878456115723, "step": 1337 }, { "epoch": 0.9392769392769392, "grad_norm": 0.0006779510877095163, "learning_rate": 4.910536648610779e-06, "logits/chosen": -9.032720565795898, "logits/rejected": -9.043061256408691, "logps/chosen": -9.828482627868652, "logps/rejected": -104.15277099609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1233577728271484, "rewards/margins": 9.416131019592285, "rewards/rejected": -7.292773246765137, "step": 1338 }, { "epoch": 0.93997893997894, "grad_norm": 0.0008892252226360142, "learning_rate": 4.800893431774644e-06, "logits/chosen": -8.439183235168457, "logits/rejected": -8.407987594604492, "logps/chosen": -3.5118579864501953, "logps/rejected": -107.28913116455078, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.877462148666382, "rewards/margins": 10.365158081054688, "rewards/rejected": -7.487695693969727, "step": 1339 }, { "epoch": 0.9406809406809407, "grad_norm": 0.0007709927740506828, "learning_rate": 4.692359267052982e-06, "logits/chosen": -8.32420539855957, "logits/rejected": -8.324483871459961, "logps/chosen": -1.0457038879394531, "logps/rejected": -109.50503540039062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0315468311309814, "rewards/margins": 10.82457160949707, "rewards/rejected": -7.793024063110352, "step": 1340 }, { "epoch": 0.9413829413829414, "grad_norm": 0.001093661063350737, "learning_rate": 4.584940112885035e-06, "logits/chosen": -8.398261070251465, "logits/rejected": -8.393585205078125, "logps/chosen": -7.140786647796631, "logps/rejected": -105.20240783691406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4739136695861816, "rewards/margins": 9.72900390625, "rewards/rejected": -7.25508975982666, "step": 1341 }, { "epoch": 0.9420849420849421, "grad_norm": 0.0010362562024965882, "learning_rate": 4.478641866496808e-06, "logits/chosen": -8.964065551757812, "logits/rejected": -8.97567367553711, "logps/chosen": -8.130404472351074, "logps/rejected": -106.4964828491211, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.344956874847412, "rewards/margins": 9.720868110656738, "rewards/rejected": -7.375910758972168, "step": 1342 }, { "epoch": 0.9427869427869427, "grad_norm": 0.0020639258436858654, "learning_rate": 4.373470363577407e-06, "logits/chosen": -8.636775016784668, "logits/rejected": -8.635366439819336, "logps/chosen": -10.194795608520508, "logps/rejected": -102.59330749511719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3959271907806396, "rewards/margins": 9.26728343963623, "rewards/rejected": -6.8713555335998535, "step": 1343 }, { "epoch": 0.9434889434889435, "grad_norm": 0.001438385690562427, "learning_rate": 4.269431377958582e-06, "logits/chosen": -8.930496215820312, "logits/rejected": -8.951132774353027, "logps/chosen": -5.569061756134033, "logps/rejected": -107.0682601928711, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.687493085861206, "rewards/margins": 10.160457611083984, "rewards/rejected": -7.472964286804199, "step": 1344 }, { "epoch": 0.9441909441909442, "grad_norm": 0.003520161611959338, "learning_rate": 4.166530621297806e-06, "logits/chosen": -8.593323707580566, "logits/rejected": -8.582849502563477, "logps/chosen": -4.199839115142822, "logps/rejected": -105.57334899902344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7910079956054688, "rewards/margins": 10.201761245727539, "rewards/rejected": -7.4107537269592285, "step": 1345 }, { "epoch": 0.9448929448929448, "grad_norm": 0.09762053936719894, "learning_rate": 4.064773742764677e-06, "logits/chosen": -8.48123836517334, "logits/rejected": -8.453150749206543, "logps/chosen": -1.782731294631958, "logps/rejected": -108.56243896484375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 3.0325233936309814, "rewards/margins": 10.751246452331543, "rewards/rejected": -7.718723297119141, "step": 1346 }, { "epoch": 0.9455949455949456, "grad_norm": 4.214315891265869, "learning_rate": 3.964166328730805e-06, "logits/chosen": -8.607083320617676, "logits/rejected": -8.568113327026367, "logps/chosen": -1.8658860921859741, "logps/rejected": -108.6943130493164, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 3.0450870990753174, "rewards/margins": 10.749463081359863, "rewards/rejected": -7.704376220703125, "step": 1347 }, { "epoch": 0.9462969462969463, "grad_norm": 0.0015768385492265224, "learning_rate": 3.864713902463106e-06, "logits/chosen": -8.359550476074219, "logits/rejected": -8.361137390136719, "logps/chosen": -2.2059555053710938, "logps/rejected": -109.86083984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9328436851501465, "rewards/margins": 10.682544708251953, "rewards/rejected": -7.749701023101807, "step": 1348 }, { "epoch": 0.946998946998947, "grad_norm": 0.0009586670203134418, "learning_rate": 3.7664219238205853e-06, "logits/chosen": -7.943412780761719, "logits/rejected": -7.929924964904785, "logps/chosen": -8.087947845458984, "logps/rejected": -104.6522216796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.459927558898926, "rewards/margins": 9.540997505187988, "rewards/rejected": -7.0810699462890625, "step": 1349 }, { "epoch": 0.9477009477009477, "grad_norm": 0.0011404053075239062, "learning_rate": 3.6692957889546164e-06, "logits/chosen": -8.455657958984375, "logits/rejected": -8.445859909057617, "logps/chosen": -8.399739265441895, "logps/rejected": -103.9443130493164, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4723689556121826, "rewards/margins": 9.574026107788086, "rewards/rejected": -7.101656913757324, "step": 1350 }, { "epoch": 0.9484029484029484, "grad_norm": 0.000805564399342984, "learning_rate": 3.573340830012647e-06, "logits/chosen": -8.599723815917969, "logits/rejected": -8.590082168579102, "logps/chosen": -1.239449143409729, "logps/rejected": -109.82038879394531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.901154041290283, "rewards/margins": 10.813288688659668, "rewards/rejected": -7.912134170532227, "step": 1351 }, { "epoch": 0.9491049491049491, "grad_norm": 0.0010367871727794409, "learning_rate": 3.4785623148455357e-06, "logits/chosen": -8.570448875427246, "logits/rejected": -8.561650276184082, "logps/chosen": -1.1614799499511719, "logps/rejected": -110.04071044921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0222082138061523, "rewards/margins": 10.84203052520752, "rewards/rejected": -7.819821357727051, "step": 1352 }, { "epoch": 0.9498069498069498, "grad_norm": 0.0009633756708353758, "learning_rate": 3.3849654467182704e-06, "logits/chosen": -7.999362945556641, "logits/rejected": -7.968902587890625, "logps/chosen": -5.2691969871521, "logps/rejected": -107.75131225585938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.594954490661621, "rewards/margins": 10.206592559814453, "rewards/rejected": -7.611637115478516, "step": 1353 }, { "epoch": 0.9505089505089505, "grad_norm": 0.0007576377247460186, "learning_rate": 3.292555364024403e-06, "logits/chosen": -8.89586353302002, "logits/rejected": -8.881933212280273, "logps/chosen": -8.90414047241211, "logps/rejected": -106.306396484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4149932861328125, "rewards/margins": 9.642777442932129, "rewards/rejected": -7.227784156799316, "step": 1354 }, { "epoch": 0.9512109512109512, "grad_norm": 0.000986816012300551, "learning_rate": 3.201337140003882e-06, "logits/chosen": -9.937848091125488, "logits/rejected": -9.914661407470703, "logps/chosen": -12.848564147949219, "logps/rejected": -104.67698669433594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.099628448486328, "rewards/margins": 9.127448081970215, "rewards/rejected": -7.027819633483887, "step": 1355 }, { "epoch": 0.9519129519129519, "grad_norm": 0.0010579272639006376, "learning_rate": 3.1113157824645705e-06, "logits/chosen": -8.693473815917969, "logits/rejected": -8.710052490234375, "logps/chosen": -4.456684589385986, "logps/rejected": -108.26629638671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6762213706970215, "rewards/margins": 10.355146408081055, "rewards/rejected": -7.678925037384033, "step": 1356 }, { "epoch": 0.9526149526149527, "grad_norm": 0.00844397209584713, "learning_rate": 3.0224962335073144e-06, "logits/chosen": -7.802088737487793, "logits/rejected": -7.798039436340332, "logps/chosen": -6.625181198120117, "logps/rejected": -104.78478240966797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6924381256103516, "rewards/margins": 9.95031452178955, "rewards/rejected": -7.257876396179199, "step": 1357 }, { "epoch": 0.9533169533169533, "grad_norm": 0.0005781865329481661, "learning_rate": 2.9348833692546268e-06, "logits/chosen": -8.710966110229492, "logits/rejected": -8.706815719604492, "logps/chosen": -1.4584572315216064, "logps/rejected": -110.35606384277344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0379953384399414, "rewards/margins": 10.772367477416992, "rewards/rejected": -7.734371662139893, "step": 1358 }, { "epoch": 0.954018954018954, "grad_norm": 0.0008596231346018612, "learning_rate": 2.848481999582994e-06, "logits/chosen": -8.305456161499023, "logits/rejected": -8.317920684814453, "logps/chosen": -5.206576824188232, "logps/rejected": -106.92561340332031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.704348087310791, "rewards/margins": 10.165019989013672, "rewards/rejected": -7.460672378540039, "step": 1359 }, { "epoch": 0.9547209547209548, "grad_norm": 0.00111450778786093, "learning_rate": 2.763296867858807e-06, "logits/chosen": -8.324100494384766, "logits/rejected": -8.286888122558594, "logps/chosen": -2.0055551528930664, "logps/rejected": -109.1157455444336, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0436129570007324, "rewards/margins": 10.724103927612305, "rewards/rejected": -7.680490493774414, "step": 1360 }, { "epoch": 0.9554229554229554, "grad_norm": 0.0009778192033991218, "learning_rate": 2.679332650677969e-06, "logits/chosen": -8.412038803100586, "logits/rejected": -8.403745651245117, "logps/chosen": -4.001523017883301, "logps/rejected": -107.41685485839844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7387847900390625, "rewards/margins": 10.29752254486084, "rewards/rejected": -7.558737754821777, "step": 1361 }, { "epoch": 0.9561249561249561, "grad_norm": 0.000809256627690047, "learning_rate": 2.5965939576091444e-06, "logits/chosen": -9.07585620880127, "logits/rejected": -9.065446853637695, "logps/chosen": -4.181310653686523, "logps/rejected": -107.53427124023438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6578142642974854, "rewards/margins": 10.312820434570312, "rewards/rejected": -7.655006408691406, "step": 1362 }, { "epoch": 0.9568269568269568, "grad_norm": 0.0009498323197476566, "learning_rate": 2.5150853309406934e-06, "logits/chosen": -7.994161605834961, "logits/rejected": -7.985674858093262, "logps/chosen": -1.2756619453430176, "logps/rejected": -109.6465072631836, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.106478691101074, "rewards/margins": 10.840412139892578, "rewards/rejected": -7.73393440246582, "step": 1363 }, { "epoch": 0.9575289575289575, "grad_norm": 0.0013848667731508613, "learning_rate": 2.434811245431329e-06, "logits/chosen": -8.862785339355469, "logits/rejected": -8.85634708404541, "logps/chosen": -5.235378265380859, "logps/rejected": -107.37710571289062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.615480899810791, "rewards/margins": 10.214431762695312, "rewards/rejected": -7.598950386047363, "step": 1364 }, { "epoch": 0.9582309582309583, "grad_norm": 0.0008082491112872958, "learning_rate": 2.355776108064412e-06, "logits/chosen": -8.762730598449707, "logits/rejected": -8.747987747192383, "logps/chosen": -1.3891189098358154, "logps/rejected": -110.72883605957031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.017375946044922, "rewards/margins": 10.818310737609863, "rewards/rejected": -7.800934791564941, "step": 1365 }, { "epoch": 0.9589329589329589, "grad_norm": 0.0009757449734024704, "learning_rate": 2.277984257806064e-06, "logits/chosen": -9.259500503540039, "logits/rejected": -9.255352020263672, "logps/chosen": -8.004719734191895, "logps/rejected": -106.63639831542969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3768887519836426, "rewards/margins": 9.753847122192383, "rewards/rejected": -7.376957893371582, "step": 1366 }, { "epoch": 0.9596349596349596, "grad_norm": 0.0011709424434229732, "learning_rate": 2.2014399653669036e-06, "logits/chosen": -9.214205741882324, "logits/rejected": -9.209846496582031, "logps/chosen": -1.7855122089385986, "logps/rejected": -109.57884216308594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0440468788146973, "rewards/margins": 10.786215782165527, "rewards/rejected": -7.74216890335083, "step": 1367 }, { "epoch": 0.9603369603369604, "grad_norm": 0.0012812899658456445, "learning_rate": 2.1261474329676517e-06, "logits/chosen": -8.426538467407227, "logits/rejected": -8.440290451049805, "logps/chosen": -4.446875095367432, "logps/rejected": -108.27533721923828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6825876235961914, "rewards/margins": 10.287564277648926, "rewards/rejected": -7.604976654052734, "step": 1368 }, { "epoch": 0.961038961038961, "grad_norm": 0.0008833335596136749, "learning_rate": 2.0521107941083654e-06, "logits/chosen": -8.216203689575195, "logits/rejected": -8.20785903930664, "logps/chosen": -5.102726936340332, "logps/rejected": -108.32865142822266, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.6631052494049072, "rewards/margins": 10.157459259033203, "rewards/rejected": -7.494354248046875, "step": 1369 }, { "epoch": 0.9617409617409617, "grad_norm": 0.0011185928015038371, "learning_rate": 1.979334113341578e-06, "logits/chosen": -8.368419647216797, "logits/rejected": -8.353595733642578, "logps/chosen": -4.123073577880859, "logps/rejected": -107.33589935302734, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.675419330596924, "rewards/margins": 10.280445098876953, "rewards/rejected": -7.605025768280029, "step": 1370 }, { "epoch": 0.9624429624429625, "grad_norm": 0.0010652545606717467, "learning_rate": 1.9078213860491097e-06, "logits/chosen": -8.897151947021484, "logits/rejected": -8.884145736694336, "logps/chosen": -1.3087265491485596, "logps/rejected": -109.40617370605469, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0192761421203613, "rewards/margins": 10.797962188720703, "rewards/rejected": -7.7786865234375, "step": 1371 }, { "epoch": 0.9631449631449631, "grad_norm": 0.0013353563845157623, "learning_rate": 1.8375765382227453e-06, "logits/chosen": -8.917214393615723, "logits/rejected": -8.913795471191406, "logps/chosen": -1.297995924949646, "logps/rejected": -110.76187133789062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9349474906921387, "rewards/margins": 10.854560852050781, "rewards/rejected": -7.919612884521484, "step": 1372 }, { "epoch": 0.9638469638469639, "grad_norm": 0.0008918720996007323, "learning_rate": 1.7686034262486925e-06, "logits/chosen": -8.684749603271484, "logits/rejected": -8.65962028503418, "logps/chosen": -1.4723854064941406, "logps/rejected": -108.8482437133789, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1565513610839844, "rewards/margins": 10.82541561126709, "rewards/rejected": -7.6688642501831055, "step": 1373 }, { "epoch": 0.9645489645489645, "grad_norm": 0.0009758829255588353, "learning_rate": 1.7009058366958787e-06, "logits/chosen": -8.104373931884766, "logits/rejected": -8.072288513183594, "logps/chosen": -3.872478485107422, "logps/rejected": -108.51028442382812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7323498725891113, "rewards/margins": 10.363245010375977, "rewards/rejected": -7.630895614624023, "step": 1374 }, { "epoch": 0.9652509652509652, "grad_norm": 0.0010336707346141338, "learning_rate": 1.6344874861080682e-06, "logits/chosen": -8.557031631469727, "logits/rejected": -8.547887802124023, "logps/chosen": -4.298130512237549, "logps/rejected": -108.34857177734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7350058555603027, "rewards/margins": 10.372306823730469, "rewards/rejected": -7.637301445007324, "step": 1375 }, { "epoch": 0.965952965952966, "grad_norm": 0.0006378429243341088, "learning_rate": 1.5693520207998132e-06, "logits/chosen": -8.775436401367188, "logits/rejected": -8.763589859008789, "logps/chosen": -1.3751060962677002, "logps/rejected": -109.62013244628906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0758137702941895, "rewards/margins": 10.735538482666016, "rewards/rejected": -7.659724235534668, "step": 1376 }, { "epoch": 0.9666549666549666, "grad_norm": 0.001132626784965396, "learning_rate": 1.5055030166563016e-06, "logits/chosen": -8.34052848815918, "logits/rejected": -8.345357894897461, "logps/chosen": -1.2959495782852173, "logps/rejected": -110.04615783691406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.059206485748291, "rewards/margins": 10.795294761657715, "rewards/rejected": -7.736088275909424, "step": 1377 }, { "epoch": 0.9673569673569674, "grad_norm": 0.0015539381420239806, "learning_rate": 1.4429439789370089e-06, "logits/chosen": -7.82776403427124, "logits/rejected": -7.805758476257324, "logps/chosen": -9.986175537109375, "logps/rejected": -103.57845306396484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.601698875427246, "rewards/margins": 9.585823059082031, "rewards/rejected": -6.984124183654785, "step": 1378 }, { "epoch": 0.9680589680589681, "grad_norm": 0.0009192450088448822, "learning_rate": 1.3816783420833017e-06, "logits/chosen": -9.104162216186523, "logits/rejected": -9.073590278625488, "logps/chosen": -6.561991214752197, "logps/rejected": -109.63001251220703, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.589323043823242, "rewards/margins": 10.253145217895508, "rewards/rejected": -7.663822174072266, "step": 1379 }, { "epoch": 0.9687609687609687, "grad_norm": 0.0009257533238269389, "learning_rate": 1.3217094695298508e-06, "logits/chosen": -8.396517753601074, "logits/rejected": -8.390589714050293, "logps/chosen": -1.435081124305725, "logps/rejected": -109.972900390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0903687477111816, "rewards/margins": 10.890178680419922, "rewards/rejected": -7.79980993270874, "step": 1380 }, { "epoch": 0.9694629694629695, "grad_norm": 0.0008333427249453962, "learning_rate": 1.263040653520016e-06, "logits/chosen": -7.656696319580078, "logits/rejected": -7.659037113189697, "logps/chosen": -1.383973479270935, "logps/rejected": -109.96727752685547, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0396502017974854, "rewards/margins": 10.750529289245605, "rewards/rejected": -7.710878372192383, "step": 1381 }, { "epoch": 0.9701649701649702, "grad_norm": 0.0043931077234447, "learning_rate": 1.2056751149250679e-06, "logits/chosen": -8.994950294494629, "logits/rejected": -8.997322082519531, "logps/chosen": -1.906968593597412, "logps/rejected": -110.02405548095703, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.855865001678467, "rewards/margins": 10.699921607971191, "rewards/rejected": -7.844056129455566, "step": 1382 }, { "epoch": 0.9708669708669708, "grad_norm": 0.0008894916391000152, "learning_rate": 1.1496160030674104e-06, "logits/chosen": -8.479622840881348, "logits/rejected": -8.48927116394043, "logps/chosen": -4.699127674102783, "logps/rejected": -108.0932388305664, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.64579176902771, "rewards/margins": 10.322044372558594, "rewards/rejected": -7.676252365112305, "step": 1383 }, { "epoch": 0.9715689715689716, "grad_norm": 0.00602220231667161, "learning_rate": 1.0948663955476308e-06, "logits/chosen": -8.420933723449707, "logits/rejected": -8.427053451538086, "logps/chosen": -1.3249870538711548, "logps/rejected": -109.19840240478516, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1358463764190674, "rewards/margins": 10.880016326904297, "rewards/rejected": -7.744170188903809, "step": 1384 }, { "epoch": 0.9722709722709723, "grad_norm": 0.0007398105808533728, "learning_rate": 1.0414292980755935e-06, "logits/chosen": -9.073956489562988, "logits/rejected": -9.047508239746094, "logps/chosen": -10.774728775024414, "logps/rejected": -102.80621337890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1504552364349365, "rewards/margins": 9.130342483520508, "rewards/rejected": -6.979887008666992, "step": 1385 }, { "epoch": 0.972972972972973, "grad_norm": 0.0007099287468008697, "learning_rate": 9.893076443054085e-07, "logits/chosen": -8.950366973876953, "logits/rejected": -8.945340156555176, "logps/chosen": -9.016007423400879, "logps/rejected": -107.46222686767578, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.405845880508423, "rewards/margins": 9.710236549377441, "rewards/rejected": -7.304390907287598, "step": 1386 }, { "epoch": 0.9736749736749737, "grad_norm": 0.001524159568361938, "learning_rate": 9.385042956743443e-07, "logits/chosen": -8.343716621398926, "logits/rejected": -8.32679557800293, "logps/chosen": -1.3643734455108643, "logps/rejected": -110.1829605102539, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.055591106414795, "rewards/margins": 10.87564468383789, "rewards/rejected": -7.820053577423096, "step": 1387 }, { "epoch": 0.9743769743769743, "grad_norm": 0.001998540014028549, "learning_rate": 8.890220412458051e-07, "logits/chosen": -8.491369247436523, "logits/rejected": -8.480422973632812, "logps/chosen": -1.686583161354065, "logps/rejected": -108.98086547851562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0870842933654785, "rewards/margins": 10.700441360473633, "rewards/rejected": -7.613357067108154, "step": 1388 }, { "epoch": 0.9750789750789751, "grad_norm": 0.0009184933151118457, "learning_rate": 8.408635975561439e-07, "logits/chosen": -9.703895568847656, "logits/rejected": -9.683259963989258, "logps/chosen": -1.109130620956421, "logps/rejected": -110.560546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9208240509033203, "rewards/margins": 10.880060195922852, "rewards/rejected": -7.959236145019531, "step": 1389 }, { "epoch": 0.9757809757809758, "grad_norm": 0.0007729565841145813, "learning_rate": 7.94031608465584e-07, "logits/chosen": -8.705818176269531, "logits/rejected": -8.692220687866211, "logps/chosen": -5.399176597595215, "logps/rejected": -108.34336853027344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.564276695251465, "rewards/margins": 10.235198974609375, "rewards/rejected": -7.670921802520752, "step": 1390 }, { "epoch": 0.9764829764829764, "grad_norm": 0.0008465806022286415, "learning_rate": 7.485286450130249e-07, "logits/chosen": -8.847637176513672, "logits/rejected": -8.840566635131836, "logps/chosen": -6.120500564575195, "logps/rejected": -105.60325622558594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5977749824523926, "rewards/margins": 9.982362747192383, "rewards/rejected": -7.384587287902832, "step": 1391 }, { "epoch": 0.9771849771849772, "grad_norm": 0.32676222920417786, "learning_rate": 7.043572052749351e-07, "logits/chosen": -8.38469123840332, "logits/rejected": -8.35000991821289, "logps/chosen": -5.065084457397461, "logps/rejected": -107.26806640625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.8192195892333984, "rewards/margins": 10.262594223022461, "rewards/rejected": -7.443375110626221, "step": 1392 }, { "epoch": 0.9778869778869779, "grad_norm": 0.005745495669543743, "learning_rate": 6.615197142281737e-07, "logits/chosen": -8.697529792785645, "logits/rejected": -8.666084289550781, "logps/chosen": -3.5127766132354736, "logps/rejected": -107.18000793457031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.929967164993286, "rewards/margins": 10.352335929870605, "rewards/rejected": -7.42236852645874, "step": 1393 }, { "epoch": 0.9785889785889786, "grad_norm": 0.000995452981442213, "learning_rate": 6.200185236168985e-07, "logits/chosen": -8.812406539916992, "logits/rejected": -8.826300621032715, "logps/chosen": -1.1192336082458496, "logps/rejected": -110.20980834960938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9603703022003174, "rewards/margins": 10.867770195007324, "rewards/rejected": -7.907400131225586, "step": 1394 }, { "epoch": 0.9792909792909793, "grad_norm": 0.000910759496036917, "learning_rate": 5.798559118234185e-07, "logits/chosen": -9.127287864685059, "logits/rejected": -9.128271102905273, "logps/chosen": -3.8209915161132812, "logps/rejected": -106.27529907226562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.858938694000244, "rewards/margins": 10.243873596191406, "rewards/rejected": -7.38493537902832, "step": 1395 }, { "epoch": 0.97999297999298, "grad_norm": 0.001697339117527008, "learning_rate": 5.410340837431479e-07, "logits/chosen": -9.14105224609375, "logits/rejected": -9.136428833007812, "logps/chosen": -4.452010631561279, "logps/rejected": -108.05095672607422, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5789873600006104, "rewards/margins": 10.358994483947754, "rewards/rejected": -7.780006408691406, "step": 1396 }, { "epoch": 0.9806949806949807, "grad_norm": 0.001068031182512641, "learning_rate": 5.035551706635352e-07, "logits/chosen": -8.002464294433594, "logits/rejected": -7.984198093414307, "logps/chosen": -5.344208717346191, "logps/rejected": -105.95576477050781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8294143676757812, "rewards/margins": 10.1746244430542, "rewards/rejected": -7.345210075378418, "step": 1397 }, { "epoch": 0.9813969813969814, "grad_norm": 0.0009021079749800265, "learning_rate": 4.6742123014705895e-07, "logits/chosen": -7.717872619628906, "logits/rejected": -7.715564727783203, "logps/chosen": -5.45743989944458, "logps/rejected": -108.74736022949219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8772363662719727, "rewards/margins": 10.377097129821777, "rewards/rejected": -7.499860763549805, "step": 1398 }, { "epoch": 0.982098982098982, "grad_norm": 0.0011520996922627091, "learning_rate": 4.3263424591828644e-07, "logits/chosen": -8.586402893066406, "logits/rejected": -8.559966087341309, "logps/chosen": -1.7647615671157837, "logps/rejected": -109.28076934814453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.081559181213379, "rewards/margins": 10.855531692504883, "rewards/rejected": -7.773972988128662, "step": 1399 }, { "epoch": 0.9828009828009828, "grad_norm": 0.004355923738330603, "learning_rate": 3.9919612775494824e-07, "logits/chosen": -8.030903816223145, "logits/rejected": -8.029336929321289, "logps/chosen": -3.7077322006225586, "logps/rejected": -107.20805358886719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.8173413276672363, "rewards/margins": 10.296930313110352, "rewards/rejected": -7.479588508605957, "step": 1400 }, { "epoch": 0.9835029835029835, "grad_norm": 0.0008911702316254377, "learning_rate": 3.6710871138310834e-07, "logits/chosen": -8.612415313720703, "logits/rejected": -8.621355056762695, "logps/chosen": -1.3219208717346191, "logps/rejected": -110.02371215820312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.9648609161376953, "rewards/margins": 10.81867790222168, "rewards/rejected": -7.853817939758301, "step": 1401 }, { "epoch": 0.9842049842049843, "grad_norm": 0.005406058859080076, "learning_rate": 3.3637375837637116e-07, "logits/chosen": -7.752336502075195, "logits/rejected": -7.750354766845703, "logps/chosen": -2.3224079608917236, "logps/rejected": -108.8493881225586, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.0400071144104004, "rewards/margins": 10.647034645080566, "rewards/rejected": -7.607027053833008, "step": 1402 }, { "epoch": 0.9849069849069849, "grad_norm": 0.001077372464351356, "learning_rate": 3.0699295605918796e-07, "logits/chosen": -8.718467712402344, "logits/rejected": -8.723697662353516, "logps/chosen": -5.528104782104492, "logps/rejected": -108.12460327148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.783285617828369, "rewards/margins": 10.256248474121094, "rewards/rejected": -7.472963809967041, "step": 1403 }, { "epoch": 0.9856089856089856, "grad_norm": 0.003422106383368373, "learning_rate": 2.789679174142046e-07, "logits/chosen": -7.870953559875488, "logits/rejected": -7.8755645751953125, "logps/chosen": -7.510532379150391, "logps/rejected": -103.64038848876953, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.691546678543091, "rewards/margins": 9.644139289855957, "rewards/rejected": -6.952591896057129, "step": 1404 }, { "epoch": 0.9863109863109863, "grad_norm": 0.0009597139433026314, "learning_rate": 2.523001809937351e-07, "logits/chosen": -8.108116149902344, "logits/rejected": -8.102728843688965, "logps/chosen": -5.385380744934082, "logps/rejected": -108.15605163574219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.737410545349121, "rewards/margins": 10.251585006713867, "rewards/rejected": -7.514174461364746, "step": 1405 }, { "epoch": 0.987012987012987, "grad_norm": 0.0014216918498277664, "learning_rate": 2.2699121083526567e-07, "logits/chosen": -8.785545349121094, "logits/rejected": -8.773576736450195, "logps/chosen": -1.5797131061553955, "logps/rejected": -109.85289001464844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.097390651702881, "rewards/margins": 10.830490112304688, "rewards/rejected": -7.733099937438965, "step": 1406 }, { "epoch": 0.9877149877149877, "grad_norm": 0.0008149555651471019, "learning_rate": 2.0304239638111052e-07, "logits/chosen": -9.553266525268555, "logits/rejected": -9.535876274108887, "logps/chosen": -4.4854607582092285, "logps/rejected": -107.17082977294922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.744561195373535, "rewards/margins": 10.229267120361328, "rewards/rejected": -7.484705924987793, "step": 1407 }, { "epoch": 0.9884169884169884, "grad_norm": 0.0014582007424905896, "learning_rate": 1.8045505240211085e-07, "logits/chosen": -8.906880378723145, "logits/rejected": -8.910831451416016, "logps/chosen": -7.031953811645508, "logps/rejected": -104.76141357421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.473134994506836, "rewards/margins": 9.788056373596191, "rewards/rejected": -7.3149213790893555, "step": 1408 }, { "epoch": 0.9891189891189891, "grad_norm": 0.0009197979816235602, "learning_rate": 1.5923041892546387e-07, "logits/chosen": -8.604863166809082, "logits/rejected": -8.595357894897461, "logps/chosen": -7.734543800354004, "logps/rejected": -106.64581298828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.5301923751831055, "rewards/margins": 9.91776180267334, "rewards/rejected": -7.387569427490234, "step": 1409 }, { "epoch": 0.9898209898209899, "grad_norm": 0.0008254371932707727, "learning_rate": 1.3936966116665055e-07, "logits/chosen": -7.7137861251831055, "logits/rejected": -7.686235427856445, "logps/chosen": -4.099059104919434, "logps/rejected": -106.56480407714844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.787278652191162, "rewards/margins": 10.311885833740234, "rewards/rejected": -7.524607181549072, "step": 1410 }, { "epoch": 0.9905229905229905, "grad_norm": 0.0014439182123169303, "learning_rate": 1.2087386946545723e-07, "logits/chosen": -8.912240028381348, "logits/rejected": -8.935575485229492, "logps/chosen": -1.2195773124694824, "logps/rejected": -110.20440673828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9453012943267822, "rewards/margins": 10.842637062072754, "rewards/rejected": -7.897335529327393, "step": 1411 }, { "epoch": 0.9912249912249912, "grad_norm": 0.0008528974140062928, "learning_rate": 1.0374405922611564e-07, "logits/chosen": -9.014580726623535, "logits/rejected": -8.999217987060547, "logps/chosen": -1.1618787050247192, "logps/rejected": -109.8916015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0626063346862793, "rewards/margins": 10.89974308013916, "rewards/rejected": -7.837136745452881, "step": 1412 }, { "epoch": 0.991926991926992, "grad_norm": 0.0007645180448889732, "learning_rate": 8.79811708615723e-08, "logits/chosen": -10.149192810058594, "logits/rejected": -10.152084350585938, "logps/chosen": -7.629578113555908, "logps/rejected": -107.06379699707031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4278054237365723, "rewards/margins": 9.871381759643555, "rewards/rejected": -7.443576812744141, "step": 1413 }, { "epoch": 0.9926289926289926, "grad_norm": 0.0017322616185992956, "learning_rate": 7.358606974184355e-08, "logits/chosen": -8.60546875, "logits/rejected": -8.599081993103027, "logps/chosen": -4.437937259674072, "logps/rejected": -106.9256362915039, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.705294370651245, "rewards/margins": 10.237358093261719, "rewards/rejected": -7.5320634841918945, "step": 1414 }, { "epoch": 0.9933309933309933, "grad_norm": 0.008058445528149605, "learning_rate": 6.055954614652204e-08, "logits/chosen": -7.761528968811035, "logits/rejected": -7.776362419128418, "logps/chosen": -5.001325607299805, "logps/rejected": -107.15058898925781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.770045280456543, "rewards/margins": 10.141345024108887, "rewards/rejected": -7.371299743652344, "step": 1415 }, { "epoch": 0.994032994032994, "grad_norm": 0.047873686999082565, "learning_rate": 4.890231522137424e-08, "logits/chosen": -8.199563980102539, "logits/rejected": -8.175474166870117, "logps/chosen": -1.719151496887207, "logps/rejected": -109.98907470703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.0499801635742188, "rewards/margins": 10.84958553314209, "rewards/rejected": -7.799605369567871, "step": 1416 }, { "epoch": 0.9947349947349947, "grad_norm": 0.0005756942555308342, "learning_rate": 3.8615016939103454e-08, "logits/chosen": -8.539419174194336, "logits/rejected": -8.56144905090332, "logps/chosen": -1.3194249868392944, "logps/rejected": -109.16705322265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.029693126678467, "rewards/margins": 10.835699081420898, "rewards/rejected": -7.80600643157959, "step": 1417 }, { "epoch": 0.9954369954369955, "grad_norm": 0.0008088778122328222, "learning_rate": 2.9698216064189796e-08, "logits/chosen": -8.039783477783203, "logits/rejected": -8.013481140136719, "logps/chosen": -1.2731865644454956, "logps/rejected": -109.12992858886719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1442883014678955, "rewards/margins": 10.816617965698242, "rewards/rejected": -7.672328948974609, "step": 1418 }, { "epoch": 0.9961389961389961, "grad_norm": 0.0010148739675059915, "learning_rate": 2.2152402121901446e-08, "logits/chosen": -8.121295928955078, "logits/rejected": -8.119210243225098, "logps/chosen": -3.827897548675537, "logps/rejected": -107.1634750366211, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.901045322418213, "rewards/margins": 10.272342681884766, "rewards/rejected": -7.3712968826293945, "step": 1419 }, { "epoch": 0.9968409968409968, "grad_norm": 0.001405224553309381, "learning_rate": 1.5977989371413286e-08, "logits/chosen": -8.060805320739746, "logits/rejected": -8.047975540161133, "logps/chosen": -12.697122573852539, "logps/rejected": -103.67720794677734, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2113723754882812, "rewards/margins": 9.125361442565918, "rewards/rejected": -6.913989067077637, "step": 1420 }, { "epoch": 0.9975429975429976, "grad_norm": 0.001439022016711533, "learning_rate": 1.1175316783071778e-08, "logits/chosen": -8.59253215789795, "logits/rejected": -8.575397491455078, "logps/chosen": -4.919360637664795, "logps/rejected": -107.837890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.726996421813965, "rewards/margins": 10.381292343139648, "rewards/rejected": -7.654296875, "step": 1421 }, { "epoch": 0.9982449982449982, "grad_norm": 0.0007503292290493846, "learning_rate": 7.744648019775646e-09, "logits/chosen": -9.507061004638672, "logits/rejected": -9.497603416442871, "logps/chosen": -4.065119743347168, "logps/rejected": -108.08378601074219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.7375082969665527, "rewards/margins": 10.316261291503906, "rewards/rejected": -7.5787529945373535, "step": 1422 }, { "epoch": 0.9989469989469989, "grad_norm": 0.0025854387786239386, "learning_rate": 5.686171422511089e-09, "logits/chosen": -8.802715301513672, "logits/rejected": -8.821380615234375, "logps/chosen": -1.671375036239624, "logps/rejected": -109.6783447265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.1092538833618164, "rewards/margins": 10.848894119262695, "rewards/rejected": -7.739640235900879, "step": 1423 }, { "epoch": 0.9996489996489997, "grad_norm": 0.0008484012214466929, "learning_rate": 5e-09, "logits/chosen": -8.537653923034668, "logits/rejected": -8.536896705627441, "logps/chosen": -4.497082710266113, "logps/rejected": -107.5028305053711, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6521730422973633, "rewards/margins": 10.198890686035156, "rewards/rejected": -7.546717166900635, "step": 1424 }, { "epoch": 0.9996489996489997, "step": 1424, "total_flos": 3.8407553376195707e+18, "train_loss": 0.020223459395160053, "train_runtime": 6022.9119, "train_samples_per_second": 30.267, "train_steps_per_second": 0.236 } ], "logging_steps": 1.0, "max_steps": 1424, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8407553376195707e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }