diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,18170 +10,17036 @@ "log_history": [ { "epoch": 0.002676032781401572, - "grad_norm": 6.1621459865713515, + "grad_norm": 5.944779453760548, "learning_rate": 8.9126559714795e-09, - "logits/chosen": -0.06070180982351303, - "logits/rejected": 0.14738903939723969, - "logps/chosen": -1.716059684753418, - "logps/rejected": -1.8892710208892822, - "loss": 1.0429, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.716059684753418, - "rewards/margins": 0.1732112467288971, - "rewards/rejected": -1.8892710208892822, - "semantic_entropy": 0.6584457159042358, + "logits/chosen": -0.06503242254257202, + "logits/rejected": 0.1413465142250061, + "logps/chosen": -1.7161604166030884, + "logps/rejected": -1.8896411657333374, + "loss": 0.7138, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7161604166030884, + "rewards/margins": 0.17348076403141022, + "rewards/rejected": -1.8896411657333374, "step": 5 }, { "epoch": 0.005352065562803144, - "grad_norm": 9.137033794779027, + "grad_norm": 10.10720294319382, "learning_rate": 1.7825311942959e-08, - "logits/chosen": -0.0036977827548980713, - "logits/rejected": 0.11409668624401093, - "logps/chosen": -1.8028045892715454, - "logps/rejected": -1.8464124202728271, - "loss": 1.1233, + "logits/chosen": -0.0033793686889111996, + "logits/rejected": 0.11422939598560333, + "logps/chosen": -1.8027198314666748, + "logps/rejected": -1.8457956314086914, + "loss": 0.8041, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.8028045892715454, - "rewards/margins": 0.0436079278588295, - "rewards/rejected": -1.8464124202728271, - "semantic_entropy": 0.6394152641296387, + "rewards/chosen": -1.8027198314666748, + "rewards/margins": 0.04307572543621063, + "rewards/rejected": -1.8457956314086914, "step": 10 }, { "epoch": 0.008028098344204716, - "grad_norm": 9.22389226014171, + "grad_norm": 10.801841084704561, "learning_rate": 2.67379679144385e-08, - "logits/chosen": -0.029309600591659546, - "logits/rejected": 0.06751412898302078, - "logps/chosen": -1.6355518102645874, - "logps/rejected": -1.7657592296600342, - "loss": 1.1344, + "logits/chosen": -0.03136734291911125, + "logits/rejected": 0.06487132608890533, + "logps/chosen": -1.634657621383667, + "logps/rejected": -1.76498544216156, + "loss": 0.7875, "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.6355518102645874, - "rewards/margins": 0.13020756840705872, - "rewards/rejected": -1.7657592296600342, - "semantic_entropy": 0.6930069923400879, + "rewards/chosen": -1.634657621383667, + "rewards/margins": 0.13032770156860352, + "rewards/rejected": -1.76498544216156, "step": 15 }, { "epoch": 0.010704131125606288, - "grad_norm": 6.704632465419751, + "grad_norm": 6.1633070273081705, "learning_rate": 3.5650623885918e-08, - "logits/chosen": -0.03660174086689949, - "logits/rejected": 0.049360670149326324, - "logps/chosen": -1.724509596824646, - "logps/rejected": -1.8065202236175537, - "loss": 1.145, + "logits/chosen": -0.043427206575870514, + "logits/rejected": 0.039137102663517, + "logps/chosen": -1.7245880365371704, + "logps/rejected": -1.8060840368270874, + "loss": 0.8112, "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.724509596824646, - "rewards/margins": 0.08201076835393906, - "rewards/rejected": -1.8065202236175537, - "semantic_entropy": 0.6685421466827393, + "rewards/chosen": -1.7245880365371704, + "rewards/margins": 0.08149626106023788, + "rewards/rejected": -1.8060840368270874, "step": 20 }, { "epoch": 0.013380163907007862, - "grad_norm": 13.950567091423647, + "grad_norm": 16.532475392387802, "learning_rate": 4.45632798573975e-08, - "logits/chosen": -0.04136265441775322, - "logits/rejected": 0.044629622250795364, - "logps/chosen": -1.869329810142517, - "logps/rejected": -1.7786051034927368, - "loss": 1.2712, + "logits/chosen": -0.05956069380044937, + "logits/rejected": 0.02474214881658554, + "logps/chosen": -1.8682079315185547, + "logps/rejected": -1.777593970298767, + "loss": 0.9486, "rewards/accuracies": 0.375, - "rewards/chosen": -1.869329810142517, - "rewards/margins": -0.09072484076023102, - "rewards/rejected": -1.7786051034927368, - "semantic_entropy": 0.6433960795402527, + "rewards/chosen": -1.8682079315185547, + "rewards/margins": -0.09061405807733536, + "rewards/rejected": -1.777593970298767, "step": 25 }, { "epoch": 0.016056196688409432, - "grad_norm": 7.520127719976578, + "grad_norm": 8.947371657351983, "learning_rate": 5.3475935828877e-08, - "logits/chosen": -0.07225209474563599, - "logits/rejected": 0.020951146259903908, - "logps/chosen": -1.9089466333389282, - "logps/rejected": -1.832271933555603, - "loss": 1.1721, - "rewards/accuracies": 0.4437499940395355, - "rewards/chosen": -1.9089466333389282, - "rewards/margins": -0.07667465507984161, - "rewards/rejected": -1.832271933555603, - "semantic_entropy": 0.6176777482032776, + "logits/chosen": -0.08522985130548477, + "logits/rejected": 0.004247778560966253, + "logps/chosen": -1.9091031551361084, + "logps/rejected": -1.832916259765625, + "loss": 0.8629, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9091031551361084, + "rewards/margins": -0.07618677616119385, + "rewards/rejected": -1.832916259765625, "step": 30 }, { "epoch": 0.018732229469811006, - "grad_norm": 8.288075347283838, + "grad_norm": 9.999646585977542, "learning_rate": 6.23885918003565e-08, - "logits/chosen": -0.05746116489171982, - "logits/rejected": 0.10160557925701141, - "logps/chosen": -1.845741629600525, - "logps/rejected": -1.9970605373382568, - "loss": 1.1629, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.845741629600525, - "rewards/margins": 0.1513189673423767, - "rewards/rejected": -1.9970605373382568, - "semantic_entropy": 0.6350187063217163, + "logits/chosen": -0.06489699333906174, + "logits/rejected": 0.0926300436258316, + "logps/chosen": -1.8453718423843384, + "logps/rejected": -1.9969125986099243, + "loss": 0.8457, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8453718423843384, + "rewards/margins": 0.15154069662094116, + "rewards/rejected": -1.9969125986099243, "step": 35 }, { "epoch": 0.021408262251212576, - "grad_norm": 7.5458186716671465, + "grad_norm": 9.568940663975653, "learning_rate": 7.1301247771836e-08, - "logits/chosen": 0.05770735815167427, - "logits/rejected": 0.23583391308784485, - "logps/chosen": -1.880816102027893, - "logps/rejected": -1.743043303489685, - "loss": 1.2132, + "logits/chosen": 0.03302387148141861, + "logits/rejected": 0.20599150657653809, + "logps/chosen": -1.880316972732544, + "logps/rejected": -1.7426525354385376, + "loss": 0.8913, "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -1.880816102027893, - "rewards/margins": -0.1377728283405304, - "rewards/rejected": -1.743043303489685, - "semantic_entropy": 0.6431102752685547, + "rewards/chosen": -1.880316972732544, + "rewards/margins": -0.13766419887542725, + "rewards/rejected": -1.7426525354385376, "step": 40 }, { "epoch": 0.02408429503261415, - "grad_norm": 12.928036650171752, + "grad_norm": 15.181692252538227, "learning_rate": 8.021390374331551e-08, - "logits/chosen": 0.049303993582725525, - "logits/rejected": 0.25262051820755005, - "logps/chosen": -1.837459921836853, - "logps/rejected": -1.8713966608047485, - "loss": 1.1798, + "logits/chosen": 0.03328787535429001, + "logits/rejected": 0.23170337080955505, + "logps/chosen": -1.836920976638794, + "logps/rejected": -1.8731199502944946, + "loss": 0.8539, "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.837459921836853, - "rewards/margins": 0.03393695876002312, - "rewards/rejected": -1.8713966608047485, - "semantic_entropy": 0.649166464805603, + "rewards/chosen": -1.836920976638794, + "rewards/margins": 0.03619895130395889, + "rewards/rejected": -1.8731199502944946, "step": 45 }, { "epoch": 0.026760327814015723, - "grad_norm": 10.160669036683966, + "grad_norm": 12.296909869173225, "learning_rate": 8.9126559714795e-08, - "logits/chosen": -0.027670959010720253, - "logits/rejected": 0.1239209994673729, - "logps/chosen": -1.8993823528289795, - "logps/rejected": -1.7789846658706665, - "loss": 1.2256, + "logits/chosen": -0.04849355295300484, + "logits/rejected": 0.10151131451129913, + "logps/chosen": -1.8959643840789795, + "logps/rejected": -1.7764707803726196, + "loss": 0.9076, "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.8993823528289795, - "rewards/margins": -0.1203979030251503, - "rewards/rejected": -1.7789846658706665, - "semantic_entropy": 0.6335883140563965, + "rewards/chosen": -1.8959643840789795, + "rewards/margins": -0.11949358135461807, + "rewards/rejected": -1.7764707803726196, "step": 50 }, { "epoch": 0.029436360595417294, - "grad_norm": 7.047012193835533, + "grad_norm": 8.019334661639341, "learning_rate": 9.80392156862745e-08, - "logits/chosen": -0.10063391923904419, - "logits/rejected": 0.12058229744434357, - "logps/chosen": -1.8336282968521118, - "logps/rejected": -1.8673959970474243, - "loss": 1.1935, + "logits/chosen": -0.09500084817409515, + "logits/rejected": 0.12824514508247375, + "logps/chosen": -1.8301572799682617, + "logps/rejected": -1.8652454614639282, + "loss": 0.8699, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.8336282968521118, - "rewards/margins": 0.03376791998744011, - "rewards/rejected": -1.8673959970474243, - "semantic_entropy": 0.6438094973564148, + "rewards/chosen": -1.8301572799682617, + "rewards/margins": 0.035088278353214264, + "rewards/rejected": -1.8652454614639282, "step": 55 }, { "epoch": 0.032112393376818864, - "grad_norm": 7.199053435790905, + "grad_norm": 7.707208974595881, "learning_rate": 1.06951871657754e-07, - "logits/chosen": -0.08423934876918793, - "logits/rejected": 0.10448728501796722, - "logps/chosen": -1.789345145225525, - "logps/rejected": -1.894176721572876, - "loss": 1.1008, + "logits/chosen": -0.08524443209171295, + "logits/rejected": 0.10578173398971558, + "logps/chosen": -1.7843726873397827, + "logps/rejected": -1.889954924583435, + "loss": 0.7814, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.789345145225525, - "rewards/margins": 0.10483156144618988, - "rewards/rejected": -1.894176721572876, - "semantic_entropy": 0.6360429525375366, + "rewards/chosen": -1.7843726873397827, + "rewards/margins": 0.1055823415517807, + "rewards/rejected": -1.889954924583435, "step": 60 }, { "epoch": 0.03478842615822044, - "grad_norm": 5.878839162191842, + "grad_norm": 6.6437645055292105, "learning_rate": 1.158645276292335e-07, - "logits/chosen": -0.04232923686504364, - "logits/rejected": 0.10366680473089218, - "logps/chosen": -1.6381199359893799, - "logps/rejected": -1.7684608697891235, - "loss": 1.0888, + "logits/chosen": -0.010567838326096535, + "logits/rejected": 0.1390911042690277, + "logps/chosen": -1.6323703527450562, + "logps/rejected": -1.7623701095581055, + "loss": 0.7401, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.6381199359893799, - "rewards/margins": 0.13034099340438843, - "rewards/rejected": -1.7684608697891235, - "semantic_entropy": 0.6962206959724426, + "rewards/chosen": -1.6323703527450562, + "rewards/margins": 0.12999966740608215, + "rewards/rejected": -1.7623701095581055, "step": 65 }, { "epoch": 0.03746445893962201, - "grad_norm": 11.097796193507412, + "grad_norm": 12.073995266380342, "learning_rate": 1.24777183600713e-07, - "logits/chosen": -0.07627397030591965, - "logits/rejected": 0.07312844693660736, - "logps/chosen": -1.766296148300171, - "logps/rejected": -1.8135309219360352, - "loss": 1.1905, + "logits/chosen": -0.08409874886274338, + "logits/rejected": 0.06362706422805786, + "logps/chosen": -1.7632910013198853, + "logps/rejected": -1.808368444442749, + "loss": 0.8637, "rewards/accuracies": 0.42500001192092896, - "rewards/chosen": -1.766296148300171, - "rewards/margins": 0.047234609723091125, - "rewards/rejected": -1.8135309219360352, - "semantic_entropy": 0.6539437770843506, + "rewards/chosen": -1.7632910013198853, + "rewards/margins": 0.04507749527692795, + "rewards/rejected": -1.808368444442749, "step": 70 }, { "epoch": 0.04014049172102358, - "grad_norm": 11.180823699806128, + "grad_norm": 13.144499074530275, "learning_rate": 1.3368983957219251e-07, - "logits/chosen": -0.043935492634773254, - "logits/rejected": 0.1390921175479889, - "logps/chosen": -1.7772403955459595, - "logps/rejected": -2.038160562515259, - "loss": 1.0594, + "logits/chosen": -0.058572955429553986, + "logits/rejected": 0.12240276485681534, + "logps/chosen": -1.7725986242294312, + "logps/rejected": -2.032947540283203, + "loss": 0.7417, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.7772403955459595, - "rewards/margins": 0.2609199583530426, - "rewards/rejected": -2.038160562515259, - "semantic_entropy": 0.6338866353034973, + "rewards/chosen": -1.7725986242294312, + "rewards/margins": 0.26034852862358093, + "rewards/rejected": -2.032947540283203, "step": 75 }, { "epoch": 0.04281652450242515, - "grad_norm": 7.729614400854603, + "grad_norm": 9.063540848789689, "learning_rate": 1.42602495543672e-07, - "logits/chosen": 0.009521784260869026, - "logits/rejected": 0.11359156668186188, - "logps/chosen": -1.7183939218521118, - "logps/rejected": -1.7508172988891602, - "loss": 1.1522, + "logits/chosen": 0.003893436398357153, + "logits/rejected": 0.10652987658977509, + "logps/chosen": -1.7095410823822021, + "logps/rejected": -1.741156816482544, + "loss": 0.816, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.7183939218521118, - "rewards/margins": 0.0324234738945961, - "rewards/rejected": -1.7508172988891602, - "semantic_entropy": 0.6691663265228271, + "rewards/chosen": -1.7095410823822021, + "rewards/margins": 0.031615592539310455, + "rewards/rejected": -1.741156816482544, "step": 80 }, { "epoch": 0.04549255728382673, - "grad_norm": 5.774164895526498, + "grad_norm": 6.23931661297619, "learning_rate": 1.5151515151515152e-07, - "logits/chosen": -0.16618943214416504, - "logits/rejected": 0.07412171363830566, - "logps/chosen": -1.7912899255752563, - "logps/rejected": -1.9684991836547852, - "loss": 1.1099, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.7912899255752563, - "rewards/margins": 0.17720915377140045, - "rewards/rejected": -1.9684991836547852, - "semantic_entropy": 0.6479779481887817, + "logits/chosen": -0.14125987887382507, + "logits/rejected": 0.10873997211456299, + "logps/chosen": -1.773114800453186, + "logps/rejected": -1.9469287395477295, + "loss": 0.7821, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.773114800453186, + "rewards/margins": 0.17381396889686584, + "rewards/rejected": -1.9469287395477295, "step": 85 }, { "epoch": 0.0481685900652283, - "grad_norm": 13.994171190985876, + "grad_norm": 15.569072969938869, "learning_rate": 1.6042780748663102e-07, - "logits/chosen": 0.08484308421611786, - "logits/rejected": 0.04691457375884056, - "logps/chosen": -1.750454306602478, - "logps/rejected": -1.7775003910064697, - "loss": 1.1925, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.750454306602478, - "rewards/margins": 0.027046024799346924, - "rewards/rejected": -1.7775003910064697, - "semantic_entropy": 0.668484091758728, + "logits/chosen": 0.09540364891290665, + "logits/rejected": 0.05513875558972359, + "logps/chosen": -1.7274713516235352, + "logps/rejected": -1.759783148765564, + "loss": 0.8489, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.7274713516235352, + "rewards/margins": 0.032311733812093735, + "rewards/rejected": -1.759783148765564, "step": 90 }, { "epoch": 0.05084462284662987, - "grad_norm": 5.179734454416302, + "grad_norm": 6.731057420673302, "learning_rate": 1.693404634581105e-07, - "logits/chosen": -0.0784115418791771, - "logits/rejected": 0.06837181746959686, - "logps/chosen": -1.805314302444458, - "logps/rejected": -1.9120498895645142, - "loss": 1.1394, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.805314302444458, - "rewards/margins": 0.10673556476831436, - "rewards/rejected": -1.9120498895645142, - "semantic_entropy": 0.6409928202629089, + "logits/chosen": -0.07308058440685272, + "logits/rejected": 0.07406400889158249, + "logps/chosen": -1.7728599309921265, + "logps/rejected": -1.8886535167694092, + "loss": 0.8067, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7728599309921265, + "rewards/margins": 0.1157936081290245, + "rewards/rejected": -1.8886535167694092, "step": 95 }, { "epoch": 0.05352065562803145, - "grad_norm": 6.303816361495141, + "grad_norm": 5.499688214176538, "learning_rate": 1.7825311942959e-07, - "logits/chosen": -0.0438729003071785, - "logits/rejected": 0.01818550005555153, - "logps/chosen": -1.6925382614135742, - "logps/rejected": -1.8010832071304321, - "loss": 1.104, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.6925382614135742, - "rewards/margins": 0.10854510962963104, - "rewards/rejected": -1.8010832071304321, - "semantic_entropy": 0.6733208298683167, + "logits/chosen": -0.03358002379536629, + "logits/rejected": 0.028250019997358322, + "logps/chosen": -1.6723474264144897, + "logps/rejected": -1.7788429260253906, + "loss": 0.7634, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6723474264144897, + "rewards/margins": 0.10649553686380386, + "rewards/rejected": -1.7788429260253906, "step": 100 }, { "epoch": 0.05619668840943302, - "grad_norm": 8.250362407815302, + "grad_norm": 9.778528043269088, "learning_rate": 1.8716577540106952e-07, - "logits/chosen": 0.04747066646814346, - "logits/rejected": 0.07233314961194992, - "logps/chosen": -1.6426517963409424, - "logps/rejected": -1.8100935220718384, - "loss": 1.0833, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.6426517963409424, - "rewards/margins": 0.16744166612625122, - "rewards/rejected": -1.8100935220718384, - "semantic_entropy": 0.6844531297683716, + "logits/chosen": 0.030319351702928543, + "logits/rejected": 0.05664747208356857, + "logps/chosen": -1.6236553192138672, + "logps/rejected": -1.7927690744400024, + "loss": 0.7352, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6236553192138672, + "rewards/margins": 0.16911372542381287, + "rewards/rejected": -1.7927690744400024, "step": 105 }, { "epoch": 0.05887272119083459, - "grad_norm": 6.750093995633937, + "grad_norm": 6.811684213959745, "learning_rate": 1.96078431372549e-07, - "logits/chosen": 0.0031594126485288143, - "logits/rejected": 0.09811054170131683, - "logps/chosen": -1.6814390420913696, - "logps/rejected": -1.7384357452392578, - "loss": 1.1586, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.6814390420913696, - "rewards/margins": 0.05699686333537102, - "rewards/rejected": -1.7384357452392578, - "semantic_entropy": 0.6790895462036133, + "logits/chosen": 0.00945661123842001, + "logits/rejected": 0.10386445373296738, + "logps/chosen": -1.6445382833480835, + "logps/rejected": -1.7022031545639038, + "loss": 0.8083, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6445382833480835, + "rewards/margins": 0.0576648935675621, + "rewards/rejected": -1.7022031545639038, "step": 110 }, { "epoch": 0.06154875397223616, - "grad_norm": 8.991359972391313, + "grad_norm": 11.531968435296026, "learning_rate": 2.049910873440285e-07, - "logits/chosen": 0.024249624460935593, - "logits/rejected": 0.23187024891376495, - "logps/chosen": -1.6709773540496826, - "logps/rejected": -1.9569326639175415, - "loss": 1.0366, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.6709773540496826, - "rewards/margins": 0.28595516085624695, - "rewards/rejected": -1.9569326639175415, - "semantic_entropy": 0.6562684178352356, + "logits/chosen": 0.03109263814985752, + "logits/rejected": 0.24077387154102325, + "logps/chosen": -1.6186809539794922, + "logps/rejected": -1.8899390697479248, + "loss": 0.6996, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6186809539794922, + "rewards/margins": 0.27125832438468933, + "rewards/rejected": -1.8899390697479248, "step": 115 }, { "epoch": 0.06422478675363773, - "grad_norm": 5.808107193049869, + "grad_norm": 6.84320496493739, "learning_rate": 2.13903743315508e-07, - "logits/chosen": -0.07225940376520157, - "logits/rejected": 0.10119612514972687, - "logps/chosen": -1.7596435546875, - "logps/rejected": -1.8809823989868164, - "loss": 1.0991, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.7596435546875, - "rewards/margins": 0.12133894115686417, - "rewards/rejected": -1.8809823989868164, - "semantic_entropy": 0.6526800990104675, + "logits/chosen": -0.06487534940242767, + "logits/rejected": 0.1105581521987915, + "logps/chosen": -1.6760625839233398, + "logps/rejected": -1.7911736965179443, + "loss": 0.7613, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6760625839233398, + "rewards/margins": 0.11511099338531494, + "rewards/rejected": -1.7911736965179443, "step": 120 }, { "epoch": 0.0669008195350393, - "grad_norm": 6.833917796547652, + "grad_norm": 6.140573145214282, "learning_rate": 2.2281639928698751e-07, - "logits/chosen": -0.07721801102161407, - "logits/rejected": 0.05250721424818039, - "logps/chosen": -1.6813856363296509, - "logps/rejected": -1.6302525997161865, - "loss": 1.1922, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.6813856363296509, - "rewards/margins": -0.051132846623659134, - "rewards/rejected": -1.6302525997161865, - "semantic_entropy": 0.6917638778686523, + "logits/chosen": -0.066864013671875, + "logits/rejected": 0.0663563460111618, + "logps/chosen": -1.606372594833374, + "logps/rejected": -1.5677980184555054, + "loss": 0.8292, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.606372594833374, + "rewards/margins": -0.03857450932264328, + "rewards/rejected": -1.5677980184555054, "step": 125 }, { "epoch": 0.06957685231644088, - "grad_norm": 8.414614080726308, + "grad_norm": 9.767909175438842, "learning_rate": 2.31729055258467e-07, - "logits/chosen": 0.02672005072236061, - "logits/rejected": 0.15939494967460632, - "logps/chosen": -1.7269341945648193, - "logps/rejected": -1.847845435142517, - "loss": 1.0565, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.7269341945648193, - "rewards/margins": 0.12091119587421417, - "rewards/rejected": -1.847845435142517, - "semantic_entropy": 0.6518223881721497, + "logits/chosen": 0.05265479162335396, + "logits/rejected": 0.1912578046321869, + "logps/chosen": -1.6436245441436768, + "logps/rejected": -1.7642732858657837, + "loss": 0.7164, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.6436245441436768, + "rewards/margins": 0.1206488385796547, + "rewards/rejected": -1.7642732858657837, "step": 130 }, { "epoch": 0.07225288509784245, - "grad_norm": 15.502747011489308, + "grad_norm": 16.509017635882987, "learning_rate": 2.406417112299465e-07, - "logits/chosen": -0.045921992510557175, - "logits/rejected": 0.06921950727701187, - "logps/chosen": -1.7797927856445312, - "logps/rejected": -1.7936254739761353, - "loss": 1.1683, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.7797927856445312, - "rewards/margins": 0.013832822442054749, - "rewards/rejected": -1.7936254739761353, - "semantic_entropy": 0.6467072367668152, + "logits/chosen": -0.04632113501429558, + "logits/rejected": 0.07392780482769012, + "logps/chosen": -1.6936384439468384, + "logps/rejected": -1.7223297357559204, + "loss": 0.8144, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6936384439468384, + "rewards/margins": 0.0286913700401783, + "rewards/rejected": -1.7223297357559204, "step": 135 }, { "epoch": 0.07492891787924402, - "grad_norm": 10.934153827167037, + "grad_norm": 10.736922972762205, "learning_rate": 2.49554367201426e-07, - "logits/chosen": -0.029972439631819725, - "logits/rejected": 0.13952571153640747, - "logps/chosen": -1.7365680932998657, - "logps/rejected": -1.8929036855697632, - "loss": 1.0664, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.7365680932998657, - "rewards/margins": 0.15633563697338104, - "rewards/rejected": -1.8929036855697632, - "semantic_entropy": 0.6409581899642944, + "logits/chosen": -0.053708963096141815, + "logits/rejected": 0.10864436626434326, + "logps/chosen": -1.6563133001327515, + "logps/rejected": -1.7850277423858643, + "loss": 0.7439, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6563133001327515, + "rewards/margins": 0.12871433794498444, + "rewards/rejected": -1.7850277423858643, "step": 140 }, { "epoch": 0.0776049506606456, - "grad_norm": 9.72307827733719, + "grad_norm": 10.508807924847892, "learning_rate": 2.5846702317290554e-07, - "logits/chosen": -0.0310811810195446, - "logits/rejected": 0.11993386596441269, - "logps/chosen": -1.651533842086792, - "logps/rejected": -1.7689011096954346, - "loss": 1.0902, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.651533842086792, - "rewards/margins": 0.1173669844865799, - "rewards/rejected": -1.7689011096954346, - "semantic_entropy": 0.6721662282943726, + "logits/chosen": -0.029666831716895103, + "logits/rejected": 0.12364047765731812, + "logps/chosen": -1.5674049854278564, + "logps/rejected": -1.6781294345855713, + "loss": 0.7398, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.5674049854278564, + "rewards/margins": 0.11072440445423126, + "rewards/rejected": -1.6781294345855713, "step": 145 }, { "epoch": 0.08028098344204716, - "grad_norm": 10.946409292553673, + "grad_norm": 11.904665511204097, "learning_rate": 2.6737967914438503e-07, - "logits/chosen": -0.06234356015920639, - "logits/rejected": 0.0975189134478569, - "logps/chosen": -1.6039183139801025, - "logps/rejected": -1.6026118993759155, - "loss": 1.1726, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.6039183139801025, - "rewards/margins": -0.0013063341611996293, - "rewards/rejected": -1.6026118993759155, - "semantic_entropy": 0.7188035249710083, + "logits/chosen": -0.05004817247390747, + "logits/rejected": 0.11277272552251816, + "logps/chosen": -1.5183589458465576, + "logps/rejected": -1.5164936780929565, + "loss": 0.7991, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5183589458465576, + "rewards/margins": -0.0018654048908501863, + "rewards/rejected": -1.5164936780929565, "step": 150 }, { "epoch": 0.08295701622344874, - "grad_norm": 8.966979965135215, + "grad_norm": 9.572574101414995, "learning_rate": 2.762923351158645e-07, - "logits/chosen": -0.053622614592313766, - "logits/rejected": -0.006728078238666058, - "logps/chosen": -1.6239417791366577, - "logps/rejected": -1.7132408618927002, - "loss": 1.1056, + "logits/chosen": -0.07051060348749161, + "logits/rejected": -0.020061589777469635, + "logps/chosen": -1.5284699201583862, + "logps/rejected": -1.625427007675171, + "loss": 0.7426, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.6239417791366577, - "rewards/margins": 0.0892990455031395, - "rewards/rejected": -1.7132408618927002, - "semantic_entropy": 0.6902952790260315, + "rewards/chosen": -1.5284699201583862, + "rewards/margins": 0.09695716202259064, + "rewards/rejected": -1.625427007675171, "step": 155 }, { "epoch": 0.0856330490048503, - "grad_norm": 7.433737051457635, + "grad_norm": 9.924795334187923, "learning_rate": 2.85204991087344e-07, - "logits/chosen": -0.1430046111345291, - "logits/rejected": -0.0034906647633761168, - "logps/chosen": -1.7533985376358032, - "logps/rejected": -1.7312949895858765, - "loss": 1.1909, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.7533985376358032, - "rewards/margins": -0.022103413939476013, - "rewards/rejected": -1.7312949895858765, - "semantic_entropy": 0.6651071310043335, + "logits/chosen": -0.15549620985984802, + "logits/rejected": -0.017335455864667892, + "logps/chosen": -1.639505386352539, + "logps/rejected": -1.6174213886260986, + "loss": 0.8376, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.639505386352539, + "rewards/margins": -0.022084157913923264, + "rewards/rejected": -1.6174213886260986, "step": 160 }, { "epoch": 0.08830908178625188, - "grad_norm": 8.024498031822322, + "grad_norm": 8.962623478343406, "learning_rate": 2.941176470588235e-07, - "logits/chosen": -0.05941913276910782, - "logits/rejected": 0.11122976243495941, - "logps/chosen": -1.5744872093200684, - "logps/rejected": -1.7276694774627686, - "loss": 1.1042, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.5744872093200684, - "rewards/margins": 0.1531822681427002, - "rewards/rejected": -1.7276694774627686, - "semantic_entropy": 0.6984173059463501, + "logits/chosen": -0.05273245647549629, + "logits/rejected": 0.1209317222237587, + "logps/chosen": -1.4794989824295044, + "logps/rejected": -1.6011667251586914, + "loss": 0.7491, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4794989824295044, + "rewards/margins": 0.12166787683963776, + "rewards/rejected": -1.6011667251586914, "step": 165 }, { "epoch": 0.09098511456765346, - "grad_norm": 12.469922507489153, + "grad_norm": 17.67045068909263, "learning_rate": 3.0303030303030305e-07, - "logits/chosen": -0.0928221344947815, - "logits/rejected": -0.041338033974170685, - "logps/chosen": -1.7328227758407593, - "logps/rejected": -1.7776778936386108, - "loss": 1.1493, - "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.7328227758407593, - "rewards/margins": 0.04485485702753067, - "rewards/rejected": -1.7776778936386108, - "semantic_entropy": 0.6588774919509888, + "logits/chosen": -0.09730833023786545, + "logits/rejected": -0.04521063342690468, + "logps/chosen": -1.5999524593353271, + "logps/rejected": -1.6559072732925415, + "loss": 0.7865, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.5999524593353271, + "rewards/margins": 0.055954743176698685, + "rewards/rejected": -1.6559072732925415, "step": 170 }, { "epoch": 0.09366114734905502, - "grad_norm": 10.037232387665194, + "grad_norm": 9.490355203637604, "learning_rate": 3.1194295900178254e-07, - "logits/chosen": 0.05204933136701584, - "logits/rejected": 0.04736893251538277, - "logps/chosen": -1.6078169345855713, - "logps/rejected": -1.7068147659301758, - "loss": 1.135, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.6078169345855713, - "rewards/margins": 0.09899773448705673, - "rewards/rejected": -1.7068147659301758, - "semantic_entropy": 0.6981975436210632, + "logits/chosen": 0.031183790415525436, + "logits/rejected": 0.025438766926527023, + "logps/chosen": -1.4530776739120483, + "logps/rejected": -1.555862545967102, + "loss": 0.7523, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4530776739120483, + "rewards/margins": 0.1027846708893776, + "rewards/rejected": -1.555862545967102, "step": 175 }, { "epoch": 0.0963371801304566, - "grad_norm": 8.220031140397653, + "grad_norm": 9.17414365437009, "learning_rate": 3.2085561497326203e-07, - "logits/chosen": 0.015208420343697071, - "logits/rejected": 0.01307359803467989, - "logps/chosen": -1.633329153060913, - "logps/rejected": -1.777130365371704, - "loss": 1.1206, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.633329153060913, - "rewards/margins": 0.14380115270614624, - "rewards/rejected": -1.777130365371704, - "semantic_entropy": 0.6907540559768677, + "logits/chosen": -0.0127823231741786, + "logits/rejected": -0.016824591904878616, + "logps/chosen": -1.4412308931350708, + "logps/rejected": -1.6311981678009033, + "loss": 0.732, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4412308931350708, + "rewards/margins": 0.1899670660495758, + "rewards/rejected": -1.6311981678009033, "step": 180 }, { "epoch": 0.09901321291185818, - "grad_norm": 8.44277458445029, + "grad_norm": 8.445844697747436, "learning_rate": 3.297682709447415e-07, - "logits/chosen": -0.12780144810676575, - "logits/rejected": -0.040175847709178925, - "logps/chosen": -1.5936999320983887, - "logps/rejected": -1.6630268096923828, - "loss": 1.1614, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.5936999320983887, - "rewards/margins": 0.06932689249515533, - "rewards/rejected": -1.6630268096923828, - "semantic_entropy": 0.7057312726974487, + "logits/chosen": -0.1640012562274933, + "logits/rejected": -0.08039584010839462, + "logps/chosen": -1.40049147605896, + "logps/rejected": -1.463126540184021, + "loss": 0.7853, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.40049147605896, + "rewards/margins": 0.06263527274131775, + "rewards/rejected": -1.463126540184021, "step": 185 }, { "epoch": 0.10168924569325974, - "grad_norm": 8.214598410169415, + "grad_norm": 10.92568766572086, "learning_rate": 3.38680926916221e-07, - "logits/chosen": -0.06513194739818573, - "logits/rejected": 0.05364646762609482, - "logps/chosen": -1.6519644260406494, - "logps/rejected": -1.7357133626937866, - "loss": 1.0944, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.6519644260406494, - "rewards/margins": 0.08374904841184616, - "rewards/rejected": -1.7357133626937866, - "semantic_entropy": 0.6681785583496094, + "logits/chosen": -0.0818137377500534, + "logits/rejected": 0.036564864218235016, + "logps/chosen": -1.342758297920227, + "logps/rejected": -1.477901577949524, + "loss": 0.7183, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.342758297920227, + "rewards/margins": 0.13514335453510284, + "rewards/rejected": -1.477901577949524, "step": 190 }, { "epoch": 0.10436527847466132, - "grad_norm": 5.81056993834838, + "grad_norm": 6.396934705681357, "learning_rate": 3.475935828877005e-07, - "logits/chosen": 0.021021168678998947, - "logits/rejected": 0.17753520607948303, - "logps/chosen": -1.4594143629074097, - "logps/rejected": -1.6115144491195679, - "loss": 1.0913, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4594143629074097, - "rewards/margins": 0.15210004150867462, - "rewards/rejected": -1.6115144491195679, - "semantic_entropy": 0.7464505434036255, + "logits/chosen": -0.04009098559617996, + "logits/rejected": 0.10541429370641708, + "logps/chosen": -1.2911746501922607, + "logps/rejected": -1.4600231647491455, + "loss": 0.6906, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2911746501922607, + "rewards/margins": 0.16884845495224, + "rewards/rejected": -1.4600231647491455, "step": 195 }, { "epoch": 0.1070413112560629, - "grad_norm": 12.873598411605691, + "grad_norm": 16.241358898012777, "learning_rate": 3.5650623885918e-07, - "logits/chosen": -0.06861015409231186, - "logits/rejected": 0.07104112207889557, - "logps/chosen": -1.5893186330795288, - "logps/rejected": -1.601284384727478, - "loss": 1.1435, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.5893186330795288, - "rewards/margins": 0.011965674348175526, - "rewards/rejected": -1.601284384727478, - "semantic_entropy": 0.7116156816482544, + "logits/chosen": -0.08804573863744736, + "logits/rejected": 0.04634449630975723, + "logps/chosen": -1.415854811668396, + "logps/rejected": -1.4563789367675781, + "loss": 0.7593, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.415854811668396, + "rewards/margins": 0.04052383080124855, + "rewards/rejected": -1.4563789367675781, "step": 200 }, { "epoch": 0.10971734403746446, - "grad_norm": 13.584024992116543, + "grad_norm": 14.182627717143983, "learning_rate": 3.654188948306595e-07, - "logits/chosen": -0.05508657544851303, - "logits/rejected": 0.08913681656122208, - "logps/chosen": -1.518781065940857, - "logps/rejected": -1.5591602325439453, - "loss": 1.1321, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.518781065940857, - "rewards/margins": 0.04037924110889435, - "rewards/rejected": -1.5591602325439453, - "semantic_entropy": 0.7272243499755859, + "logits/chosen": -0.07371653616428375, + "logits/rejected": 0.06543730199337006, + "logps/chosen": -1.3307875394821167, + "logps/rejected": -1.4017726182937622, + "loss": 0.7463, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3307875394821167, + "rewards/margins": 0.07098503410816193, + "rewards/rejected": -1.4017726182937622, "step": 205 }, { "epoch": 0.11239337681886603, - "grad_norm": 12.81406377653034, + "grad_norm": 10.575891753990744, "learning_rate": 3.7433155080213904e-07, - "logits/chosen": -0.135735422372818, - "logits/rejected": 0.05884036421775818, - "logps/chosen": -1.5534254312515259, - "logps/rejected": -1.7356328964233398, - "loss": 1.0671, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.5534254312515259, - "rewards/margins": 0.1822076290845871, - "rewards/rejected": -1.7356328964233398, - "semantic_entropy": 0.7095759510993958, + "logits/chosen": -0.1559882014989853, + "logits/rejected": 0.032074861228466034, + "logps/chosen": -1.4107450246810913, + "logps/rejected": -1.5542168617248535, + "loss": 0.7285, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4107450246810913, + "rewards/margins": 0.143471822142601, + "rewards/rejected": -1.5542168617248535, "step": 210 }, { "epoch": 0.1150694096002676, - "grad_norm": 7.357321246011275, + "grad_norm": 9.422107470627992, "learning_rate": 3.8324420677361853e-07, - "logits/chosen": -0.18377165496349335, - "logits/rejected": 0.060975439846515656, - "logps/chosen": -1.5331767797470093, - "logps/rejected": -1.627393126487732, - "loss": 1.0754, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.5331767797470093, - "rewards/margins": 0.09421636164188385, - "rewards/rejected": -1.627393126487732, - "semantic_entropy": 0.7281553149223328, + "logits/chosen": -0.18807239830493927, + "logits/rejected": 0.05366736650466919, + "logps/chosen": -1.4291021823883057, + "logps/rejected": -1.5053478479385376, + "loss": 0.717, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4291021823883057, + "rewards/margins": 0.07624554634094238, + "rewards/rejected": -1.5053478479385376, "step": 215 }, { "epoch": 0.11774544238166917, - "grad_norm": 16.502921098288432, + "grad_norm": 17.049270506697756, "learning_rate": 3.92156862745098e-07, - "logits/chosen": 0.04538556560873985, - "logits/rejected": 0.14347299933433533, - "logps/chosen": -1.5174219608306885, - "logps/rejected": -1.7298427820205688, - "loss": 1.0482, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.5174219608306885, - "rewards/margins": 0.21242070198059082, - "rewards/rejected": -1.7298427820205688, - "semantic_entropy": 0.7120253443717957, + "logits/chosen": 0.03968513011932373, + "logits/rejected": 0.1367587298154831, + "logps/chosen": -1.3826735019683838, + "logps/rejected": -1.5626559257507324, + "loss": 0.6985, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3826735019683838, + "rewards/margins": 0.1799825131893158, + "rewards/rejected": -1.5626559257507324, "step": 220 }, { "epoch": 0.12042147516307075, - "grad_norm": 6.370433761880968, + "grad_norm": 6.551096739071047, "learning_rate": 4.010695187165775e-07, - "logits/chosen": -0.0893501490354538, - "logits/rejected": 0.08341099321842194, - "logps/chosen": -1.4963688850402832, - "logps/rejected": -1.64535391330719, - "loss": 1.0625, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4963688850402832, - "rewards/margins": 0.14898499846458435, - "rewards/rejected": -1.64535391330719, - "semantic_entropy": 0.7224361300468445, + "logits/chosen": -0.12207405269145966, + "logits/rejected": 0.03914044424891472, + "logps/chosen": -1.3879001140594482, + "logps/rejected": -1.532904863357544, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3879001140594482, + "rewards/margins": 0.1450047791004181, + "rewards/rejected": -1.532904863357544, "step": 225 }, { "epoch": 0.12309750794447231, - "grad_norm": 6.037012104027165, + "grad_norm": 6.748237519575699, "learning_rate": 4.09982174688057e-07, - "logits/chosen": -0.012658950872719288, - "logits/rejected": 0.06327076256275177, - "logps/chosen": -1.5707639455795288, - "logps/rejected": -1.736476182937622, - "loss": 1.0823, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.5707639455795288, - "rewards/margins": 0.16571208834648132, - "rewards/rejected": -1.736476182937622, - "semantic_entropy": 0.7066926956176758, + "logits/chosen": -0.02085573971271515, + "logits/rejected": 0.05412941053509712, + "logps/chosen": -1.4305589199066162, + "logps/rejected": -1.6017751693725586, + "loss": 0.7035, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4305589199066162, + "rewards/margins": 0.17121610045433044, + "rewards/rejected": -1.6017751693725586, "step": 230 }, { "epoch": 0.1257735407258739, - "grad_norm": 10.864711575015095, + "grad_norm": 13.006091098841727, "learning_rate": 4.188948306595365e-07, - "logits/chosen": 0.02206835150718689, - "logits/rejected": 0.1628737896680832, - "logps/chosen": -1.5150396823883057, - "logps/rejected": -1.692229986190796, - "loss": 1.0396, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.5150396823883057, - "rewards/margins": 0.17719021439552307, - "rewards/rejected": -1.692229986190796, - "semantic_entropy": 0.7193215489387512, + "logits/chosen": -0.0014473020564764738, + "logits/rejected": 0.1337365210056305, + "logps/chosen": -1.385797381401062, + "logps/rejected": -1.5564935207366943, + "loss": 0.6754, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.385797381401062, + "rewards/margins": 0.1706962138414383, + "rewards/rejected": -1.5564935207366943, "step": 235 }, { "epoch": 0.12844957350727546, - "grad_norm": 7.074774903340499, + "grad_norm": 5.544352081498345, "learning_rate": 4.27807486631016e-07, - "logits/chosen": -0.02384335733950138, - "logits/rejected": 0.1055992841720581, - "logps/chosen": -1.5265928506851196, - "logps/rejected": -1.719167947769165, - "loss": 1.0547, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.5265928506851196, - "rewards/margins": 0.19257517158985138, - "rewards/rejected": -1.719167947769165, - "semantic_entropy": 0.7045563459396362, + "logits/chosen": -0.038391418755054474, + "logits/rejected": 0.0862513929605484, + "logps/chosen": -1.3868850469589233, + "logps/rejected": -1.5840628147125244, + "loss": 0.6954, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3868850469589233, + "rewards/margins": 0.19717755913734436, + "rewards/rejected": -1.5840628147125244, "step": 240 }, { "epoch": 0.13112560628867703, - "grad_norm": 7.662272971348944, + "grad_norm": 8.12080523446333, "learning_rate": 4.3672014260249554e-07, - "logits/chosen": 0.036410488188266754, - "logits/rejected": 0.1562315970659256, - "logps/chosen": -1.6148598194122314, - "logps/rejected": -1.7143230438232422, - "loss": 1.0858, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.6148598194122314, - "rewards/margins": 0.09946312010288239, - "rewards/rejected": -1.7143230438232422, - "semantic_entropy": 0.6894456744194031, + "logits/chosen": 0.014441674575209618, + "logits/rejected": 0.12915469706058502, + "logps/chosen": -1.5010141134262085, + "logps/rejected": -1.5602171421051025, + "loss": 0.7536, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5010141134262085, + "rewards/margins": 0.05920302867889404, + "rewards/rejected": -1.5602171421051025, "step": 245 }, { "epoch": 0.1338016390700786, - "grad_norm": 11.864125002258264, + "grad_norm": 15.042577026564823, "learning_rate": 4.4563279857397503e-07, - "logits/chosen": -0.048241887241601944, - "logits/rejected": 0.11782636493444443, - "logps/chosen": -1.6349153518676758, - "logps/rejected": -1.6976579427719116, - "loss": 1.1406, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.6349153518676758, - "rewards/margins": 0.06274263560771942, - "rewards/rejected": -1.6976579427719116, - "semantic_entropy": 0.6956798434257507, + "logits/chosen": -0.07014378160238266, + "logits/rejected": 0.08847087621688843, + "logps/chosen": -1.427731990814209, + "logps/rejected": -1.4958961009979248, + "loss": 0.7697, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.427731990814209, + "rewards/margins": 0.0681641548871994, + "rewards/rejected": -1.4958961009979248, "step": 250 }, { "epoch": 0.1364776718514802, - "grad_norm": 7.796195965000263, + "grad_norm": 10.528139574262203, "learning_rate": 4.545454545454545e-07, - "logits/chosen": -0.020806463435292244, - "logits/rejected": 0.12357542663812637, - "logps/chosen": -1.4866034984588623, - "logps/rejected": -1.6658039093017578, - "loss": 1.056, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4866034984588623, - "rewards/margins": 0.17920050024986267, - "rewards/rejected": -1.6658039093017578, - "semantic_entropy": 0.7214481830596924, + "logits/chosen": -0.026789207011461258, + "logits/rejected": 0.11618626117706299, + "logps/chosen": -1.3601758480072021, + "logps/rejected": -1.497462511062622, + "loss": 0.7018, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3601758480072021, + "rewards/margins": 0.13728663325309753, + "rewards/rejected": -1.497462511062622, "step": 255 }, { "epoch": 0.13915370463288176, - "grad_norm": 7.563331790101729, + "grad_norm": 7.06011701786399, "learning_rate": 4.63458110516934e-07, - "logits/chosen": -0.21241986751556396, - "logits/rejected": -0.10534496605396271, - "logps/chosen": -1.653738260269165, - "logps/rejected": -1.7724952697753906, - "loss": 1.0457, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.653738260269165, - "rewards/margins": 0.1187569871544838, - "rewards/rejected": -1.7724952697753906, - "semantic_entropy": 0.6744239330291748, + "logits/chosen": -0.22546398639678955, + "logits/rejected": -0.12003403902053833, + "logps/chosen": -1.4767396450042725, + "logps/rejected": -1.6295922994613647, + "loss": 0.6824, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4767396450042725, + "rewards/margins": 0.15285278856754303, + "rewards/rejected": -1.6295922994613647, "step": 260 }, { "epoch": 0.1418297374142833, - "grad_norm": 16.160498491276236, + "grad_norm": 11.060461245272897, "learning_rate": 4.723707664884135e-07, - "logits/chosen": -0.07287711650133133, - "logits/rejected": 0.009031775407493114, - "logps/chosen": -1.675157904624939, - "logps/rejected": -1.7961061000823975, - "loss": 1.0988, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.675157904624939, - "rewards/margins": 0.12094844877719879, - "rewards/rejected": -1.7961061000823975, - "semantic_entropy": 0.6546419262886047, + "logits/chosen": -0.08746234327554703, + "logits/rejected": -0.0033986233174800873, + "logps/chosen": -1.4708541631698608, + "logps/rejected": -1.6343562602996826, + "loss": 0.7277, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4708541631698608, + "rewards/margins": 0.16350200772285461, + "rewards/rejected": -1.6343562602996826, "step": 265 }, { "epoch": 0.1445057701956849, - "grad_norm": 5.654659760480054, + "grad_norm": 5.6937508313989, "learning_rate": 4.81283422459893e-07, - "logits/chosen": -0.08118149638175964, - "logits/rejected": 0.0527944378554821, - "logps/chosen": -1.5596070289611816, - "logps/rejected": -1.6867377758026123, - "loss": 1.0573, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.5596070289611816, - "rewards/margins": 0.12713071703910828, - "rewards/rejected": -1.6867377758026123, - "semantic_entropy": 0.6952215433120728, + "logits/chosen": -0.09768249094486237, + "logits/rejected": 0.031185900792479515, + "logps/chosen": -1.4019721746444702, + "logps/rejected": -1.5090265274047852, + "loss": 0.7177, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4019721746444702, + "rewards/margins": 0.10705439746379852, + "rewards/rejected": -1.5090265274047852, "step": 270 }, { "epoch": 0.14718180297708647, - "grad_norm": 7.923835022429222, + "grad_norm": 9.286504683135158, "learning_rate": 4.901960784313725e-07, - "logits/chosen": -0.02389593794941902, - "logits/rejected": 0.07080022990703583, - "logps/chosen": -1.520407795906067, - "logps/rejected": -1.754201889038086, - "loss": 1.0602, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.520407795906067, - "rewards/margins": 0.23379412293434143, - "rewards/rejected": -1.754201889038086, - "semantic_entropy": 0.7215244770050049, + "logits/chosen": -0.05452440306544304, + "logits/rejected": 0.037936486303806305, + "logps/chosen": -1.3463284969329834, + "logps/rejected": -1.5438971519470215, + "loss": 0.6977, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3463284969329834, + "rewards/margins": 0.19756855070590973, + "rewards/rejected": -1.5438971519470215, "step": 275 }, { "epoch": 0.14985783575848804, - "grad_norm": 14.712512665656517, + "grad_norm": 12.113058398451965, "learning_rate": 4.99108734402852e-07, - "logits/chosen": -0.10531296581029892, - "logits/rejected": 0.0524088516831398, - "logps/chosen": -1.6230707168579102, - "logps/rejected": -1.7438873052597046, - "loss": 1.0823, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.6230707168579102, - "rewards/margins": 0.12081663310527802, - "rewards/rejected": -1.7438873052597046, - "semantic_entropy": 0.679011344909668, + "logits/chosen": -0.10539605468511581, + "logits/rejected": 0.04941311478614807, + "logps/chosen": -1.4230097532272339, + "logps/rejected": -1.5360320806503296, + "loss": 0.7307, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4230097532272339, + "rewards/margins": 0.11302228271961212, + "rewards/rejected": -1.5360320806503296, "step": 280 }, { "epoch": 0.15253386853988962, - "grad_norm": 6.69243253514015, + "grad_norm": 9.412216810837805, "learning_rate": 5.080213903743315e-07, - "logits/chosen": -0.0718010812997818, - "logits/rejected": 0.06791789084672928, - "logps/chosen": -1.597745656967163, - "logps/rejected": -1.7144542932510376, - "loss": 1.106, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.597745656967163, - "rewards/margins": 0.11670851707458496, - "rewards/rejected": -1.7144542932510376, - "semantic_entropy": 0.6957116723060608, + "logits/chosen": -0.07987500727176666, + "logits/rejected": 0.05349307507276535, + "logps/chosen": -1.418097734451294, + "logps/rejected": -1.524569034576416, + "loss": 0.7378, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.418097734451294, + "rewards/margins": 0.10647116601467133, + "rewards/rejected": -1.524569034576416, "step": 285 }, { "epoch": 0.1552099013212912, - "grad_norm": 7.7782211257781055, + "grad_norm": 7.7550351681633165, "learning_rate": 5.169340463458111e-07, - "logits/chosen": -0.11950629949569702, - "logits/rejected": 0.17797723412513733, - "logps/chosen": -1.5724703073501587, - "logps/rejected": -1.766248345375061, - "loss": 1.0197, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.5724703073501587, - "rewards/margins": 0.1937781125307083, - "rewards/rejected": -1.766248345375061, - "semantic_entropy": 0.6906196475028992, + "logits/chosen": -0.12368433177471161, + "logits/rejected": 0.17039503157138824, + "logps/chosen": -1.4334779977798462, + "logps/rejected": -1.583195686340332, + "loss": 0.6867, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4334779977798462, + "rewards/margins": 0.14971765875816345, + "rewards/rejected": -1.583195686340332, "step": 290 }, { "epoch": 0.15788593410269275, - "grad_norm": 11.79027110371697, + "grad_norm": 10.732634599709597, "learning_rate": 5.258467023172905e-07, - "logits/chosen": -0.04648825526237488, - "logits/rejected": 0.012335294857621193, - "logps/chosen": -1.5410258769989014, - "logps/rejected": -1.663569688796997, - "loss": 1.063, + "logits/chosen": -0.056438885629177094, + "logits/rejected": -0.002274793339893222, + "logps/chosen": -1.357804536819458, + "logps/rejected": -1.4930763244628906, + "loss": 0.6937, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.5410258769989014, - "rewards/margins": 0.12254378944635391, - "rewards/rejected": -1.663569688796997, - "semantic_entropy": 0.7017509341239929, + "rewards/chosen": -1.357804536819458, + "rewards/margins": 0.135271817445755, + "rewards/rejected": -1.4930763244628906, "step": 295 }, { "epoch": 0.16056196688409433, - "grad_norm": 8.835849573199795, + "grad_norm": 8.241167599366566, "learning_rate": 5.347593582887701e-07, - "logits/chosen": -0.07654620707035065, - "logits/rejected": 0.09046686440706253, - "logps/chosen": -1.5949294567108154, - "logps/rejected": -1.71002197265625, - "loss": 1.0829, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.5949294567108154, - "rewards/margins": 0.11509259045124054, - "rewards/rejected": -1.71002197265625, - "semantic_entropy": 0.6955488920211792, + "logits/chosen": -0.08898478001356125, + "logits/rejected": 0.07272998988628387, + "logps/chosen": -1.3931163549423218, + "logps/rejected": -1.4954721927642822, + "loss": 0.7261, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3931163549423218, + "rewards/margins": 0.10235557705163956, + "rewards/rejected": -1.4954721927642822, "step": 300 }, { "epoch": 0.1632379996654959, - "grad_norm": 6.983366890384154, + "grad_norm": 7.472276488796815, "learning_rate": 5.436720142602496e-07, - "logits/chosen": -0.02877117693424225, - "logits/rejected": 0.04091879725456238, - "logps/chosen": -1.6932131052017212, - "logps/rejected": -1.6878843307495117, - "loss": 1.1315, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -1.6932131052017212, - "rewards/margins": -0.005328828003257513, - "rewards/rejected": -1.6878843307495117, - "semantic_entropy": 0.6642959713935852, + "logits/chosen": -0.024565458297729492, + "logits/rejected": 0.046417467296123505, + "logps/chosen": -1.5031818151474, + "logps/rejected": -1.5077327489852905, + "loss": 0.7873, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.5031818151474, + "rewards/margins": 0.004550829529762268, + "rewards/rejected": -1.5077327489852905, "step": 305 }, { "epoch": 0.16591403244689748, - "grad_norm": 9.615112381569704, + "grad_norm": 10.591619633980644, "learning_rate": 5.52584670231729e-07, - "logits/chosen": -0.2014521062374115, - "logits/rejected": -0.11008793115615845, - "logps/chosen": -1.6740272045135498, - "logps/rejected": -1.7929702997207642, - "loss": 1.0864, + "logits/chosen": -0.2066633403301239, + "logits/rejected": -0.11681792885065079, + "logps/chosen": -1.4645190238952637, + "logps/rejected": -1.5705251693725586, + "loss": 0.7536, "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.6740272045135498, - "rewards/margins": 0.11894307285547256, - "rewards/rejected": -1.7929702997207642, - "semantic_entropy": 0.6629990339279175, + "rewards/chosen": -1.4645190238952637, + "rewards/margins": 0.10600608587265015, + "rewards/rejected": -1.5705251693725586, "step": 310 }, { "epoch": 0.16859006522829906, - "grad_norm": 12.292656026710143, + "grad_norm": 12.078058550781307, "learning_rate": 5.614973262032086e-07, - "logits/chosen": -0.01841166988015175, - "logits/rejected": 0.14304211735725403, - "logps/chosen": -1.6711599826812744, - "logps/rejected": -1.8580175638198853, - "loss": 1.0557, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.6711599826812744, - "rewards/margins": 0.186857670545578, - "rewards/rejected": -1.8580175638198853, - "semantic_entropy": 0.6592516899108887, + "logits/chosen": -0.014210551977157593, + "logits/rejected": 0.14244139194488525, + "logps/chosen": -1.4675190448760986, + "logps/rejected": -1.6418001651763916, + "loss": 0.7171, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4675190448760986, + "rewards/margins": 0.17428097128868103, + "rewards/rejected": -1.6418001651763916, "step": 315 }, { "epoch": 0.1712660980097006, - "grad_norm": 6.646670707784385, + "grad_norm": 7.310247708248856, "learning_rate": 5.70409982174688e-07, - "logits/chosen": -0.059432487934827805, - "logits/rejected": 0.0704459697008133, - "logps/chosen": -1.5968337059020996, - "logps/rejected": -1.6530053615570068, - "loss": 1.0909, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.5968337059020996, - "rewards/margins": 0.05617170408368111, - "rewards/rejected": -1.6530053615570068, - "semantic_entropy": 0.6955806612968445, + "logits/chosen": -0.06022310256958008, + "logits/rejected": 0.07212124764919281, + "logps/chosen": -1.417494297027588, + "logps/rejected": -1.478116750717163, + "loss": 0.7338, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.417494297027588, + "rewards/margins": 0.060622356832027435, + "rewards/rejected": -1.478116750717163, "step": 320 }, { "epoch": 0.17394213079110218, - "grad_norm": 9.457657582105599, + "grad_norm": 11.17858684448945, "learning_rate": 5.793226381461676e-07, - "logits/chosen": -0.10694797337055206, - "logits/rejected": 0.012209171429276466, - "logps/chosen": -1.6328935623168945, - "logps/rejected": -1.9439910650253296, - "loss": 1.0218, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.6328935623168945, - "rewards/margins": 0.3110976219177246, - "rewards/rejected": -1.9439910650253296, - "semantic_entropy": 0.6683646440505981, + "logits/chosen": -0.12095586955547333, + "logits/rejected": -0.005189047660678625, + "logps/chosen": -1.4315059185028076, + "logps/rejected": -1.7110908031463623, + "loss": 0.6808, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4315059185028076, + "rewards/margins": 0.27958500385284424, + "rewards/rejected": -1.7110908031463623, "step": 325 }, { "epoch": 0.17661816357250376, - "grad_norm": 12.926849465905892, + "grad_norm": 13.107698297279065, "learning_rate": 5.88235294117647e-07, - "logits/chosen": -0.005584185477346182, - "logits/rejected": 0.14117801189422607, - "logps/chosen": -1.6041080951690674, - "logps/rejected": -1.904754638671875, - "loss": 0.995, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.6041080951690674, - "rewards/margins": 0.30064669251441956, - "rewards/rejected": -1.904754638671875, - "semantic_entropy": 0.6687676906585693, + "logits/chosen": -0.005552935414016247, + "logits/rejected": 0.14489194750785828, + "logps/chosen": -1.4379152059555054, + "logps/rejected": -1.6690351963043213, + "loss": 0.6729, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4379152059555054, + "rewards/margins": 0.2311200201511383, + "rewards/rejected": -1.6690351963043213, "step": 330 }, { "epoch": 0.17929419635390534, - "grad_norm": 20.510474921917158, + "grad_norm": 14.610050720935291, "learning_rate": 5.971479500891266e-07, - "logits/chosen": 0.04617486149072647, - "logits/rejected": 0.151122584939003, - "logps/chosen": -1.688357949256897, - "logps/rejected": -1.7622886896133423, - "loss": 1.0828, + "logits/chosen": 0.019246840849518776, + "logits/rejected": 0.12311823666095734, + "logps/chosen": -1.4685430526733398, + "logps/rejected": -1.5264159440994263, + "loss": 0.7366, "rewards/accuracies": 0.5, - "rewards/chosen": -1.688357949256897, - "rewards/margins": 0.07393099367618561, - "rewards/rejected": -1.7622886896133423, - "semantic_entropy": 0.66343754529953, + "rewards/chosen": -1.4685430526733398, + "rewards/margins": 0.05787289887666702, + "rewards/rejected": -1.5264159440994263, "step": 335 }, { "epoch": 0.18197022913530692, - "grad_norm": 17.51474938670172, + "grad_norm": 12.947971344894437, "learning_rate": 6.060606060606061e-07, - "logits/chosen": -0.02019418589770794, - "logits/rejected": 0.12733808159828186, - "logps/chosen": -1.763425588607788, - "logps/rejected": -1.875178575515747, - "loss": 1.1057, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.763425588607788, - "rewards/margins": 0.11175310611724854, - "rewards/rejected": -1.875178575515747, - "semantic_entropy": 0.6318264007568359, + "logits/chosen": -0.037995003163814545, + "logits/rejected": 0.10654360055923462, + "logps/chosen": -1.551306962966919, + "logps/rejected": -1.6473627090454102, + "loss": 0.7619, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.551306962966919, + "rewards/margins": 0.09605582058429718, + "rewards/rejected": -1.6473627090454102, "step": 340 }, { "epoch": 0.1846462619167085, - "grad_norm": 11.458477109078752, + "grad_norm": 14.571058029571686, "learning_rate": 6.149732620320855e-07, - "logits/chosen": 0.050300829112529755, - "logits/rejected": 0.08116074651479721, - "logps/chosen": -1.6531829833984375, - "logps/rejected": -1.8452335596084595, - "loss": 1.0593, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.6531829833984375, - "rewards/margins": 0.1920507401227951, - "rewards/rejected": -1.8452335596084595, - "semantic_entropy": 0.6621311902999878, + "logits/chosen": 0.04449956491589546, + "logits/rejected": 0.07455377280712128, + "logps/chosen": -1.4586262702941895, + "logps/rejected": -1.6225944757461548, + "loss": 0.7087, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4586262702941895, + "rewards/margins": 0.1639682799577713, + "rewards/rejected": -1.6225944757461548, "step": 345 }, { "epoch": 0.18732229469811004, - "grad_norm": 11.329624892763588, + "grad_norm": 11.890857602408156, "learning_rate": 6.238859180035651e-07, - "logits/chosen": 0.015283575281500816, - "logits/rejected": 0.1078251451253891, - "logps/chosen": -1.609297752380371, - "logps/rejected": -1.7397960424423218, - "loss": 1.0848, + "logits/chosen": -0.01408949214965105, + "logits/rejected": 0.0767146572470665, + "logps/chosen": -1.398534893989563, + "logps/rejected": -1.526024580001831, + "loss": 0.7279, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.609297752380371, - "rewards/margins": 0.13049837946891785, - "rewards/rejected": -1.7397960424423218, - "semantic_entropy": 0.6860819458961487, + "rewards/chosen": -1.398534893989563, + "rewards/margins": 0.12748976051807404, + "rewards/rejected": -1.526024580001831, "step": 350 }, { "epoch": 0.18999832747951162, - "grad_norm": 9.749373970268978, + "grad_norm": 8.711286803481565, "learning_rate": 6.327985739750445e-07, - "logits/chosen": -0.07602973282337189, - "logits/rejected": 0.14425484836101532, - "logps/chosen": -1.6898406744003296, - "logps/rejected": -1.7698333263397217, - "loss": 1.0882, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.6898406744003296, - "rewards/margins": 0.07999298721551895, - "rewards/rejected": -1.7698333263397217, - "semantic_entropy": 0.6580514311790466, + "logits/chosen": -0.09813243895769119, + "logits/rejected": 0.12113466113805771, + "logps/chosen": -1.4836345911026, + "logps/rejected": -1.5430461168289185, + "loss": 0.7588, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.4836345911026, + "rewards/margins": 0.059411488473415375, + "rewards/rejected": -1.5430461168289185, "step": 355 }, { "epoch": 0.1926743602609132, - "grad_norm": 9.515555465754796, + "grad_norm": 9.704359331320719, "learning_rate": 6.417112299465241e-07, - "logits/chosen": -0.05237163230776787, - "logits/rejected": 0.025818347930908203, - "logps/chosen": -1.6769046783447266, - "logps/rejected": -1.8567724227905273, - "loss": 1.0509, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.6769046783447266, - "rewards/margins": 0.17986764013767242, - "rewards/rejected": -1.8567724227905273, - "semantic_entropy": 0.6746851205825806, + "logits/chosen": -0.07371324300765991, + "logits/rejected": 0.002655577613040805, + "logps/chosen": -1.4366960525512695, + "logps/rejected": -1.5897114276885986, + "loss": 0.7207, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4366960525512695, + "rewards/margins": 0.1530153453350067, + "rewards/rejected": -1.5897114276885986, "step": 360 }, { "epoch": 0.19535039304231477, - "grad_norm": 15.086499304257604, + "grad_norm": 15.659859148307678, "learning_rate": 6.506238859180035e-07, - "logits/chosen": 0.019944345578551292, - "logits/rejected": 0.10378506034612656, - "logps/chosen": -1.6250860691070557, - "logps/rejected": -1.7133582830429077, - "loss": 1.0988, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.6250860691070557, - "rewards/margins": 0.08827227354049683, - "rewards/rejected": -1.7133582830429077, - "semantic_entropy": 0.6908445358276367, + "logits/chosen": 0.012166516855359077, + "logits/rejected": 0.09506052732467651, + "logps/chosen": -1.4012553691864014, + "logps/rejected": -1.499685287475586, + "loss": 0.7465, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4012553691864014, + "rewards/margins": 0.09842980653047562, + "rewards/rejected": -1.499685287475586, "step": 365 }, { "epoch": 0.19802642582371635, - "grad_norm": 13.830072743833517, + "grad_norm": 11.386020984315136, "learning_rate": 6.59536541889483e-07, - "logits/chosen": -0.03349882736802101, - "logits/rejected": 0.05777007341384888, - "logps/chosen": -1.6009029150009155, - "logps/rejected": -1.7026888132095337, - "loss": 1.1096, + "logits/chosen": -0.030647719278931618, + "logits/rejected": 0.05775851756334305, + "logps/chosen": -1.3883259296417236, + "logps/rejected": -1.4623894691467285, + "loss": 0.7568, "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.6009029150009155, - "rewards/margins": 0.10178569704294205, - "rewards/rejected": -1.7026888132095337, - "semantic_entropy": 0.7001045346260071, + "rewards/chosen": -1.3883259296417236, + "rewards/margins": 0.07406359910964966, + "rewards/rejected": -1.4623894691467285, "step": 370 }, { "epoch": 0.2007024586051179, - "grad_norm": 11.699656608660698, + "grad_norm": 13.604560383607478, "learning_rate": 6.684491978609626e-07, - "logits/chosen": -0.07194206863641739, - "logits/rejected": 0.07688136398792267, - "logps/chosen": -1.6611896753311157, - "logps/rejected": -1.9261270761489868, - "loss": 1.0219, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.6611896753311157, - "rewards/margins": 0.2649373412132263, - "rewards/rejected": -1.9261270761489868, - "semantic_entropy": 0.6614036560058594, + "logits/chosen": -0.085804782807827, + "logits/rejected": 0.06725587695837021, + "logps/chosen": -1.3863632678985596, + "logps/rejected": -1.5604827404022217, + "loss": 0.7109, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3863632678985596, + "rewards/margins": 0.1741194725036621, + "rewards/rejected": -1.5604827404022217, "step": 375 }, { "epoch": 0.20337849138651948, - "grad_norm": 8.432293708205574, + "grad_norm": 6.235798891808619, "learning_rate": 6.77361853832442e-07, - "logits/chosen": -0.03905266523361206, - "logits/rejected": 0.04177533835172653, - "logps/chosen": -1.8525673151016235, - "logps/rejected": -2.073270082473755, - "loss": 1.0041, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.8525673151016235, - "rewards/margins": 0.22070245444774628, - "rewards/rejected": -2.073270082473755, - "semantic_entropy": 0.6032805442810059, + "logits/chosen": -0.04459378868341446, + "logits/rejected": 0.03797828406095505, + "logps/chosen": -1.4204928874969482, + "logps/rejected": -1.6168655157089233, + "loss": 0.6608, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4204928874969482, + "rewards/margins": 0.1963725984096527, + "rewards/rejected": -1.6168655157089233, "step": 380 }, { "epoch": 0.20605452416792105, - "grad_norm": 6.558363447038307, + "grad_norm": 5.327275881495095, "learning_rate": 6.862745098039216e-07, - "logits/chosen": -0.004520825110375881, - "logits/rejected": 0.06874732673168182, - "logps/chosen": -1.8738794326782227, - "logps/rejected": -1.971431016921997, - "loss": 1.0507, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.8738794326782227, - "rewards/margins": 0.09755153954029083, - "rewards/rejected": -1.971431016921997, - "semantic_entropy": 0.5936424732208252, + "logits/chosen": -0.008041461929678917, + "logits/rejected": 0.070212222635746, + "logps/chosen": -1.5062916278839111, + "logps/rejected": -1.5332037210464478, + "loss": 0.7662, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5062916278839111, + "rewards/margins": 0.026912039145827293, + "rewards/rejected": -1.5332037210464478, "step": 385 }, { "epoch": 0.20873055694932263, - "grad_norm": 11.9814690281164, + "grad_norm": 12.706451437731948, "learning_rate": 6.95187165775401e-07, - "logits/chosen": 0.056425292044878006, - "logits/rejected": 0.21399247646331787, - "logps/chosen": -1.9227275848388672, - "logps/rejected": -2.138357400894165, - "loss": 1.014, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.9227275848388672, - "rewards/margins": 0.21562990546226501, - "rewards/rejected": -2.138357400894165, - "semantic_entropy": 0.5690654516220093, + "logits/chosen": 0.07303877174854279, + "logits/rejected": 0.24054667353630066, + "logps/chosen": -1.5222657918930054, + "logps/rejected": -1.6287994384765625, + "loss": 0.7404, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5222657918930054, + "rewards/margins": 0.10653360188007355, + "rewards/rejected": -1.6287994384765625, "step": 390 }, { "epoch": 0.2114065897307242, - "grad_norm": 11.659848810770669, + "grad_norm": 7.524532694631373, "learning_rate": 7.040998217468806e-07, - "logits/chosen": -0.04911988228559494, - "logits/rejected": 0.10953982919454575, - "logps/chosen": -1.8240041732788086, - "logps/rejected": -1.974021553993225, - "loss": 1.0404, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.8240041732788086, - "rewards/margins": 0.1500171720981598, - "rewards/rejected": -1.974021553993225, - "semantic_entropy": 0.608680009841919, + "logits/chosen": -0.04835174232721329, + "logits/rejected": 0.11143501102924347, + "logps/chosen": -1.480359673500061, + "logps/rejected": -1.5448495149612427, + "loss": 0.7274, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.480359673500061, + "rewards/margins": 0.06448996812105179, + "rewards/rejected": -1.5448495149612427, "step": 395 }, { "epoch": 0.2140826225121258, - "grad_norm": 17.654443437190544, + "grad_norm": 7.901364620293172, "learning_rate": 7.1301247771836e-07, - "logits/chosen": 0.07092205435037613, - "logits/rejected": 0.15978960692882538, - "logps/chosen": -1.84757399559021, - "logps/rejected": -2.0060219764709473, - "loss": 1.0119, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.84757399559021, - "rewards/margins": 0.158447727560997, - "rewards/rejected": -2.0060219764709473, - "semantic_entropy": 0.5995103120803833, + "logits/chosen": 0.05830896645784378, + "logits/rejected": 0.14825519919395447, + "logps/chosen": -1.501800298690796, + "logps/rejected": -1.618944525718689, + "loss": 0.7057, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.501800298690796, + "rewards/margins": 0.11714410781860352, + "rewards/rejected": -1.618944525718689, "step": 400 }, { "epoch": 0.2140826225121258, - "eval_logits/chosen": 0.27376288175582886, - "eval_logits/rejected": 0.35996997356414795, - "eval_logps/chosen": -1.7929844856262207, - "eval_logps/rejected": -2.0279812812805176, - "eval_loss": 1.013211965560913, - "eval_rewards/accuracies": 0.5660237669944763, - "eval_rewards/chosen": -1.7929844856262207, - "eval_rewards/margins": 0.23499667644500732, - "eval_rewards/rejected": -2.0279812812805176, - "eval_runtime": 35.6332, - "eval_samples_per_second": 37.746, - "eval_semantic_entropy": 0.6131907105445862, - "eval_steps_per_second": 9.457, + "eval_logits/chosen": 0.24975009262561798, + "eval_logits/rejected": 0.3357105553150177, + "eval_logps/chosen": -1.5154058933258057, + "eval_logps/rejected": -1.6776376962661743, + "eval_loss": 0.7062471508979797, + "eval_rewards/accuracies": 0.5571216344833374, + "eval_rewards/chosen": -1.5154058933258057, + "eval_rewards/margins": 0.16223178803920746, + "eval_rewards/rejected": -1.6776376962661743, + "eval_runtime": 41.2582, + "eval_samples_per_second": 32.6, + "eval_steps_per_second": 8.168, "step": 400 }, { "epoch": 0.21675865529352734, - "grad_norm": 10.075168755240373, + "grad_norm": 7.732135825022327, "learning_rate": 7.219251336898395e-07, - "logits/chosen": -0.0024316341150552034, - "logits/rejected": 0.0858984887599945, - "logps/chosen": -1.8473918437957764, - "logps/rejected": -2.068129062652588, - "loss": 1.0621, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.8473918437957764, - "rewards/margins": 0.22073736786842346, - "rewards/rejected": -2.068129062652588, - "semantic_entropy": 0.6077993512153625, + "logits/chosen": -0.010264229960739613, + "logits/rejected": 0.07922157645225525, + "logps/chosen": -1.5177091360092163, + "logps/rejected": -1.6268842220306396, + "loss": 0.7472, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.5177091360092163, + "rewards/margins": 0.10917504131793976, + "rewards/rejected": -1.6268842220306396, "step": 405 }, { "epoch": 0.2194346880749289, - "grad_norm": 18.950538852606062, + "grad_norm": 16.610481309093924, "learning_rate": 7.30837789661319e-07, - "logits/chosen": 0.04571036621928215, - "logits/rejected": 0.16462978720664978, - "logps/chosen": -1.7759612798690796, - "logps/rejected": -1.9969593286514282, - "loss": 1.012, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.7759612798690796, - "rewards/margins": 0.22099807858467102, - "rewards/rejected": -1.9969593286514282, - "semantic_entropy": 0.6101277470588684, + "logits/chosen": 0.039059411734342575, + "logits/rejected": 0.16739651560783386, + "logps/chosen": -1.449614405632019, + "logps/rejected": -1.5890365839004517, + "loss": 0.7085, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.449614405632019, + "rewards/margins": 0.13942214846611023, + "rewards/rejected": -1.5890365839004517, "step": 410 }, { "epoch": 0.2221107208563305, - "grad_norm": 12.639518148578636, + "grad_norm": 7.224553960809832, "learning_rate": 7.397504456327985e-07, - "logits/chosen": 0.04078054428100586, - "logits/rejected": 0.08592768013477325, - "logps/chosen": -1.8478095531463623, - "logps/rejected": -2.025573492050171, - "loss": 1.0192, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.8478095531463623, - "rewards/margins": 0.17776378989219666, - "rewards/rejected": -2.025573492050171, - "semantic_entropy": 0.5999530553817749, + "logits/chosen": 0.001240618177689612, + "logits/rejected": 0.0372118279337883, + "logps/chosen": -1.4344186782836914, + "logps/rejected": -1.608679175376892, + "loss": 0.6988, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.4344186782836914, + "rewards/margins": 0.17426052689552307, + "rewards/rejected": -1.608679175376892, "step": 415 }, { "epoch": 0.22478675363773207, - "grad_norm": 10.190575297401363, + "grad_norm": 8.70380961143005, "learning_rate": 7.486631016042781e-07, - "logits/chosen": 0.0002573668898548931, - "logits/rejected": 0.18812724947929382, - "logps/chosen": -1.6988885402679443, - "logps/rejected": -1.8715341091156006, - "loss": 1.0463, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.6988885402679443, - "rewards/margins": 0.17264559864997864, - "rewards/rejected": -1.8715341091156006, - "semantic_entropy": 0.6426397562026978, + "logits/chosen": -0.01791190169751644, + "logits/rejected": 0.17258715629577637, + "logps/chosen": -1.3735711574554443, + "logps/rejected": -1.5033009052276611, + "loss": 0.7261, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3735711574554443, + "rewards/margins": 0.12972982227802277, + "rewards/rejected": -1.5033009052276611, "step": 420 }, { "epoch": 0.22746278641913364, - "grad_norm": 11.953799492318076, + "grad_norm": 9.970280911170597, "learning_rate": 7.575757575757575e-07, - "logits/chosen": 0.0010619193781167269, - "logits/rejected": 0.19794592261314392, - "logps/chosen": -1.7909456491470337, - "logps/rejected": -2.094650983810425, - "loss": 0.9602, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.7909456491470337, - "rewards/margins": 0.30370545387268066, - "rewards/rejected": -2.094650983810425, - "semantic_entropy": 0.6076642274856567, + "logits/chosen": -0.04901670292019844, + "logits/rejected": 0.14770185947418213, + "logps/chosen": -1.4214407205581665, + "logps/rejected": -1.6318986415863037, + "loss": 0.6634, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4214407205581665, + "rewards/margins": 0.21045811474323273, + "rewards/rejected": -1.6318986415863037, "step": 425 }, { "epoch": 0.2301388192005352, - "grad_norm": 10.263106298016128, + "grad_norm": 11.189676966730312, "learning_rate": 7.664884135472371e-07, - "logits/chosen": -0.02113468013703823, - "logits/rejected": 0.18052729964256287, - "logps/chosen": -1.803195595741272, - "logps/rejected": -2.2266640663146973, - "loss": 0.9559, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.803195595741272, - "rewards/margins": 0.4234686493873596, - "rewards/rejected": -2.2266640663146973, - "semantic_entropy": 0.6058921813964844, + "logits/chosen": -0.08885525166988373, + "logits/rejected": 0.10649999231100082, + "logps/chosen": -1.436267375946045, + "logps/rejected": -1.6482349634170532, + "loss": 0.6695, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.436267375946045, + "rewards/margins": 0.2119673788547516, + "rewards/rejected": -1.6482349634170532, "step": 430 }, { "epoch": 0.23281485198193677, - "grad_norm": 23.770582060225383, + "grad_norm": 9.609836662630357, "learning_rate": 7.754010695187165e-07, - "logits/chosen": 0.07859322428703308, - "logits/rejected": 0.17100855708122253, - "logps/chosen": -1.8036648035049438, - "logps/rejected": -1.972700834274292, - "loss": 1.0131, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.8036648035049438, - "rewards/margins": 0.1690361052751541, - "rewards/rejected": -1.972700834274292, - "semantic_entropy": 0.6105883717536926, + "logits/chosen": -0.016007598489522934, + "logits/rejected": 0.07220776379108429, + "logps/chosen": -1.3251539468765259, + "logps/rejected": -1.476294755935669, + "loss": 0.6798, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3251539468765259, + "rewards/margins": 0.15114082396030426, + "rewards/rejected": -1.476294755935669, "step": 435 }, { "epoch": 0.23549088476333835, - "grad_norm": 8.942531726492792, + "grad_norm": 9.740729837303004, "learning_rate": 7.84313725490196e-07, - "logits/chosen": 0.027078593149781227, - "logits/rejected": 0.11831989139318466, - "logps/chosen": -1.8217464685440063, - "logps/rejected": -2.0932116508483887, - "loss": 0.9838, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.8217464685440063, - "rewards/margins": 0.2714650630950928, - "rewards/rejected": -2.0932116508483887, - "semantic_entropy": 0.5999422669410706, + "logits/chosen": -0.03409050777554512, + "logits/rejected": 0.053927529603242874, + "logps/chosen": -1.3960790634155273, + "logps/rejected": -1.5437512397766113, + "loss": 0.6974, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3960790634155273, + "rewards/margins": 0.14767225086688995, + "rewards/rejected": -1.5437512397766113, "step": 440 }, { "epoch": 0.23816691754473993, - "grad_norm": 12.264776032319165, + "grad_norm": 10.586867151195362, "learning_rate": 7.932263814616755e-07, - "logits/chosen": 0.01442508865147829, - "logits/rejected": 0.12444069236516953, - "logps/chosen": -1.918039083480835, - "logps/rejected": -2.2938976287841797, - "loss": 0.9688, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.918039083480835, - "rewards/margins": 0.37585827708244324, - "rewards/rejected": -2.2938976287841797, - "semantic_entropy": 0.5600379705429077, + "logits/chosen": -0.08641007542610168, + "logits/rejected": 0.019953008741140366, + "logps/chosen": -1.4350497722625732, + "logps/rejected": -1.6332752704620361, + "loss": 0.7212, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4350497722625732, + "rewards/margins": 0.1982256919145584, + "rewards/rejected": -1.6332752704620361, "step": 445 }, { "epoch": 0.2408429503261415, - "grad_norm": 20.147606211927062, + "grad_norm": 12.599415472284832, "learning_rate": 8.02139037433155e-07, - "logits/chosen": 0.08424471318721771, - "logits/rejected": 0.2129439115524292, - "logps/chosen": -1.9648675918579102, - "logps/rejected": -2.2125535011291504, - "loss": 0.9685, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.9648675918579102, - "rewards/margins": 0.24768579006195068, - "rewards/rejected": -2.2125535011291504, - "semantic_entropy": 0.555503249168396, + "logits/chosen": 0.004355481825768948, + "logits/rejected": 0.12667454779148102, + "logps/chosen": -1.4702180624008179, + "logps/rejected": -1.6423380374908447, + "loss": 0.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4702180624008179, + "rewards/margins": 0.1721198558807373, + "rewards/rejected": -1.6423380374908447, "step": 450 }, { "epoch": 0.24351898310754308, - "grad_norm": 10.39177189011024, + "grad_norm": 17.386909256273288, "learning_rate": 8.110516934046346e-07, - "logits/chosen": 0.1032770648598671, - "logits/rejected": 0.18853013217449188, - "logps/chosen": -1.8158471584320068, - "logps/rejected": -2.1788058280944824, - "loss": 0.9408, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.8158471584320068, - "rewards/margins": 0.3629588484764099, - "rewards/rejected": -2.1788058280944824, - "semantic_entropy": 0.5897886157035828, + "logits/chosen": 0.0011116235982626677, + "logits/rejected": 0.08777167648077011, + "logps/chosen": -1.4536190032958984, + "logps/rejected": -1.6972615718841553, + "loss": 0.6674, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4536190032958984, + "rewards/margins": 0.24364276230335236, + "rewards/rejected": -1.6972615718841553, "step": 455 }, { "epoch": 0.24619501588894463, - "grad_norm": 9.700606050871391, + "grad_norm": 10.198363738177113, "learning_rate": 8.19964349376114e-07, - "logits/chosen": -0.019435208290815353, - "logits/rejected": 0.10268989950418472, - "logps/chosen": -1.9720776081085205, - "logps/rejected": -2.17942476272583, - "loss": 0.9789, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.9720776081085205, - "rewards/margins": 0.2073473036289215, - "rewards/rejected": -2.17942476272583, - "semantic_entropy": 0.5492539405822754, + "logits/chosen": -0.12262700498104095, + "logits/rejected": -0.00267317658290267, + "logps/chosen": -1.547154188156128, + "logps/rejected": -1.6543045043945312, + "loss": 0.7255, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.547154188156128, + "rewards/margins": 0.10715029388666153, + "rewards/rejected": -1.6543045043945312, "step": 460 }, { "epoch": 0.2488710486703462, - "grad_norm": 13.542184802712866, + "grad_norm": 12.870251671630236, "learning_rate": 8.288770053475936e-07, - "logits/chosen": 0.25124862790107727, - "logits/rejected": 0.2735592722892761, - "logps/chosen": -2.041325807571411, - "logps/rejected": -2.2813258171081543, - "loss": 0.9751, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -2.041325807571411, - "rewards/margins": 0.24000012874603271, - "rewards/rejected": -2.2813258171081543, - "semantic_entropy": 0.5254560112953186, + "logits/chosen": 0.1624952107667923, + "logits/rejected": 0.18104097247123718, + "logps/chosen": -1.5187817811965942, + "logps/rejected": -1.7125638723373413, + "loss": 0.7065, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5187817811965942, + "rewards/margins": 0.193782240152359, + "rewards/rejected": -1.7125638723373413, "step": 465 }, { "epoch": 0.2515470814517478, - "grad_norm": 9.43369236776717, + "grad_norm": 9.340252134899437, "learning_rate": 8.37789661319073e-07, - "logits/chosen": 0.28713709115982056, - "logits/rejected": 0.24178346991539001, - "logps/chosen": -2.011660575866699, - "logps/rejected": -2.2471611499786377, - "loss": 0.9972, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -2.011660575866699, - "rewards/margins": 0.2355005294084549, - "rewards/rejected": -2.2471611499786377, - "semantic_entropy": 0.5238825678825378, + "logits/chosen": 0.17645838856697083, + "logits/rejected": 0.12602797150611877, + "logps/chosen": -1.4498023986816406, + "logps/rejected": -1.6570565700531006, + "loss": 0.6895, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4498023986816406, + "rewards/margins": 0.20725421607494354, + "rewards/rejected": -1.6570565700531006, "step": 470 }, { "epoch": 0.25422311423314936, - "grad_norm": 8.440890172534203, + "grad_norm": 8.243427575523365, "learning_rate": 8.467023172905525e-07, - "logits/chosen": 0.06835935264825821, - "logits/rejected": 0.2116996943950653, - "logps/chosen": -1.9710376262664795, - "logps/rejected": -2.4928886890411377, - "loss": 0.8891, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.9710376262664795, - "rewards/margins": 0.5218510031700134, - "rewards/rejected": -2.4928886890411377, - "semantic_entropy": 0.5133862495422363, + "logits/chosen": -0.029578542336821556, + "logits/rejected": 0.11565957218408585, + "logps/chosen": -1.4557960033416748, + "logps/rejected": -1.7940890789031982, + "loss": 0.645, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4557960033416748, + "rewards/margins": 0.3382931649684906, + "rewards/rejected": -1.7940890789031982, "step": 475 }, { "epoch": 0.2568991470145509, - "grad_norm": 13.271352574157751, + "grad_norm": 18.033161144745723, "learning_rate": 8.55614973262032e-07, - "logits/chosen": 0.11900024116039276, - "logits/rejected": 0.3160308599472046, - "logps/chosen": -2.046262502670288, - "logps/rejected": -2.303492307662964, - "loss": 0.9389, - "rewards/accuracies": 0.59375, - "rewards/chosen": -2.046262502670288, - "rewards/margins": 0.25722989439964294, - "rewards/rejected": -2.303492307662964, - "semantic_entropy": 0.5193344354629517, + "logits/chosen": -0.004798990674316883, + "logits/rejected": 0.20031125843524933, + "logps/chosen": -1.437205195426941, + "logps/rejected": -1.5695266723632812, + "loss": 0.7217, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.437205195426941, + "rewards/margins": 0.13232167065143585, + "rewards/rejected": -1.5695266723632812, "step": 480 }, { "epoch": 0.2595751797959525, - "grad_norm": 17.40327979933669, + "grad_norm": 18.082909592300624, "learning_rate": 8.645276292335115e-07, - "logits/chosen": 0.11675455421209335, - "logits/rejected": 0.16688722372055054, - "logps/chosen": -2.2786717414855957, - "logps/rejected": -2.450490951538086, - "loss": 0.9805, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -2.2786717414855957, - "rewards/margins": 0.17181938886642456, - "rewards/rejected": -2.450490951538086, - "semantic_entropy": 0.45640721917152405, + "logits/chosen": 0.04143375903367996, + "logits/rejected": 0.08174540102481842, + "logps/chosen": -1.59268319606781, + "logps/rejected": -1.7031772136688232, + "loss": 0.7403, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.59268319606781, + "rewards/margins": 0.11049391329288483, + "rewards/rejected": -1.7031772136688232, "step": 485 }, { "epoch": 0.26225121257735406, - "grad_norm": 10.5637127371766, + "grad_norm": 8.336204262401544, "learning_rate": 8.734402852049911e-07, - "logits/chosen": 0.14526286721229553, - "logits/rejected": 0.21295936405658722, - "logps/chosen": -2.2720694541931152, - "logps/rejected": -2.437973737716675, - "loss": 0.9629, - "rewards/accuracies": 0.5625, - "rewards/chosen": -2.2720694541931152, - "rewards/margins": 0.16590480506420135, - "rewards/rejected": -2.437973737716675, - "semantic_entropy": 0.4487149715423584, + "logits/chosen": 0.06809569895267487, + "logits/rejected": 0.14321336150169373, + "logps/chosen": -1.5520291328430176, + "logps/rejected": -1.6646690368652344, + "loss": 0.7413, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5520291328430176, + "rewards/margins": 0.11263992637395859, + "rewards/rejected": -1.6646690368652344, "step": 490 }, { "epoch": 0.26492724535875567, - "grad_norm": 13.968933074481987, + "grad_norm": 13.842520862351547, "learning_rate": 8.823529411764705e-07, - "logits/chosen": 0.10887887328863144, - "logits/rejected": 0.1369287371635437, - "logps/chosen": -2.321854829788208, - "logps/rejected": -2.471646785736084, - "loss": 0.9461, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -2.321854829788208, - "rewards/margins": 0.14979204535484314, - "rewards/rejected": -2.471646785736084, - "semantic_entropy": 0.42514246702194214, + "logits/chosen": 0.0002840966044459492, + "logits/rejected": 0.022160915657877922, + "logps/chosen": -1.5664799213409424, + "logps/rejected": -1.6862761974334717, + "loss": 0.7266, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5664799213409424, + "rewards/margins": 0.11979645490646362, + "rewards/rejected": -1.6862761974334717, "step": 495 }, { "epoch": 0.2676032781401572, - "grad_norm": 15.91869715362306, + "grad_norm": 9.449206637699177, "learning_rate": 8.912655971479501e-07, - "logits/chosen": 0.11035114526748657, - "logits/rejected": 0.19290763139724731, - "logps/chosen": -2.324972152709961, - "logps/rejected": -2.60076642036438, - "loss": 0.8935, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -2.324972152709961, - "rewards/margins": 0.2757939100265503, - "rewards/rejected": -2.60076642036438, - "semantic_entropy": 0.4203091263771057, + "logits/chosen": -0.014049562625586987, + "logits/rejected": 0.08820073306560516, + "logps/chosen": -1.473944902420044, + "logps/rejected": -1.6701536178588867, + "loss": 0.7048, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.473944902420044, + "rewards/margins": 0.19620880484580994, + "rewards/rejected": -1.6701536178588867, "step": 500 }, { "epoch": 0.27027931092155877, - "grad_norm": 17.82851785747078, + "grad_norm": 9.643039840749609, "learning_rate": 9.001782531194295e-07, - "logits/chosen": 0.10254337638616562, - "logits/rejected": 0.23869287967681885, - "logps/chosen": -2.434830904006958, - "logps/rejected": -2.6683614253997803, - "loss": 0.88, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -2.434830904006958, - "rewards/margins": 0.23353052139282227, - "rewards/rejected": -2.6683614253997803, - "semantic_entropy": 0.38676854968070984, + "logits/chosen": -0.026906386017799377, + "logits/rejected": 0.11797040700912476, + "logps/chosen": -1.5578444004058838, + "logps/rejected": -1.6293739080429077, + "loss": 0.7341, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5578444004058838, + "rewards/margins": 0.07152950763702393, + "rewards/rejected": -1.6293739080429077, "step": 505 }, { "epoch": 0.2729553437029604, - "grad_norm": 9.222919526301315, + "grad_norm": 7.105656450336462, "learning_rate": 9.09090909090909e-07, - "logits/chosen": 0.2949961721897125, - "logits/rejected": 0.34191757440567017, - "logps/chosen": -2.5707197189331055, - "logps/rejected": -2.9306082725524902, - "loss": 0.8353, + "logits/chosen": 0.13335290551185608, + "logits/rejected": 0.19222620129585266, + "logps/chosen": -1.4883719682693481, + "logps/rejected": -1.7041199207305908, + "loss": 0.6683, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -2.5707197189331055, - "rewards/margins": 0.35988861322402954, - "rewards/rejected": -2.9306082725524902, - "semantic_entropy": 0.3533479571342468, + "rewards/chosen": -1.4883719682693481, + "rewards/margins": 0.21574795246124268, + "rewards/rejected": -1.7041199207305908, "step": 510 }, { "epoch": 0.2756313764843619, - "grad_norm": 13.125321055251398, + "grad_norm": 8.170451097737303, "learning_rate": 9.180035650623885e-07, - "logits/chosen": 0.26439735293388367, - "logits/rejected": 0.3643534779548645, - "logps/chosen": -2.6091151237487793, - "logps/rejected": -2.9702847003936768, - "loss": 0.8275, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.6091151237487793, - "rewards/margins": 0.3611697256565094, - "rewards/rejected": -2.9702847003936768, - "semantic_entropy": 0.3468519449234009, + "logits/chosen": 0.08237681537866592, + "logits/rejected": 0.18179181218147278, + "logps/chosen": -1.4099267721176147, + "logps/rejected": -1.597726583480835, + "loss": 0.6747, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4099267721176147, + "rewards/margins": 0.18779978156089783, + "rewards/rejected": -1.597726583480835, "step": 515 }, { "epoch": 0.27830740926576353, - "grad_norm": 13.837098088359745, + "grad_norm": 6.357017723509141, "learning_rate": 9.26916221033868e-07, - "logits/chosen": 0.1988576203584671, - "logits/rejected": 0.3169275224208832, - "logps/chosen": -2.997946262359619, - "logps/rejected": -3.395313262939453, - "loss": 0.824, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.997946262359619, - "rewards/margins": 0.39736661314964294, - "rewards/rejected": -3.395313262939453, - "semantic_entropy": 0.2663891315460205, + "logits/chosen": -0.05814286321401596, + "logits/rejected": 0.08064989745616913, + "logps/chosen": -1.4644676446914673, + "logps/rejected": -1.620928406715393, + "loss": 0.7077, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4644676446914673, + "rewards/margins": 0.15646079182624817, + "rewards/rejected": -1.620928406715393, "step": 520 }, { "epoch": 0.2809834420471651, - "grad_norm": 23.212683763316665, + "grad_norm": 16.248993691004035, "learning_rate": 9.358288770053476e-07, - "logits/chosen": 0.362657368183136, - "logits/rejected": 0.431486040353775, - "logps/chosen": -3.46795654296875, - "logps/rejected": -3.9383537769317627, - "loss": 0.7851, - "rewards/accuracies": 0.65625, - "rewards/chosen": -3.46795654296875, - "rewards/margins": 0.4703969359397888, - "rewards/rejected": -3.9383537769317627, - "semantic_entropy": 0.19957469403743744, + "logits/chosen": 0.13906976580619812, + "logits/rejected": 0.21467263996601105, + "logps/chosen": -1.4559953212738037, + "logps/rejected": -1.7041419744491577, + "loss": 0.6828, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4559953212738037, + "rewards/margins": 0.2481466829776764, + "rewards/rejected": -1.7041419744491577, "step": 525 }, { "epoch": 0.2836594748285666, - "grad_norm": 19.427149448565736, + "grad_norm": 8.014435049801467, "learning_rate": 9.44741532976827e-07, - "logits/chosen": 0.3255121111869812, - "logits/rejected": 0.39444050192832947, - "logps/chosen": -3.581265926361084, - "logps/rejected": -4.079986095428467, - "loss": 0.7894, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -3.581265926361084, - "rewards/margins": 0.4987207055091858, - "rewards/rejected": -4.079986095428467, - "semantic_entropy": 0.19464334845542908, + "logits/chosen": 0.11178193241357803, + "logits/rejected": 0.19584989547729492, + "logps/chosen": -1.411655306816101, + "logps/rejected": -1.5912854671478271, + "loss": 0.715, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.411655306816101, + "rewards/margins": 0.1796303391456604, + "rewards/rejected": -1.5912854671478271, "step": 530 }, { "epoch": 0.28633550760996823, - "grad_norm": 17.35259546926179, + "grad_norm": 8.75990227886059, "learning_rate": 9.536541889483066e-07, - "logits/chosen": 0.20121872425079346, - "logits/rejected": 0.4174925684928894, - "logps/chosen": -3.966810941696167, - "logps/rejected": -4.486771583557129, - "loss": 0.721, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -3.966810941696167, - "rewards/margins": 0.5199612379074097, - "rewards/rejected": -4.486771583557129, - "semantic_entropy": 0.14531609416007996, + "logits/chosen": -0.06300154328346252, + "logits/rejected": 0.20278656482696533, + "logps/chosen": -1.4264861345291138, + "logps/rejected": -1.5696561336517334, + "loss": 0.6818, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4264861345291138, + "rewards/margins": 0.14316999912261963, + "rewards/rejected": -1.5696561336517334, "step": 535 }, { "epoch": 0.2890115403913698, - "grad_norm": 26.596890402490573, + "grad_norm": 6.528353269132695, "learning_rate": 9.62566844919786e-07, - "logits/chosen": 0.2888971269130707, - "logits/rejected": 0.3459900915622711, - "logps/chosen": -4.495975971221924, - "logps/rejected": -4.9110822677612305, - "loss": 0.7492, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -4.495975971221924, - "rewards/margins": 0.4151054322719574, - "rewards/rejected": -4.9110822677612305, - "semantic_entropy": 0.09809108078479767, + "logits/chosen": 0.06240390986204147, + "logits/rejected": 0.13641305267810822, + "logps/chosen": -1.5720045566558838, + "logps/rejected": -1.7112045288085938, + "loss": 0.7099, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5720045566558838, + "rewards/margins": 0.13919982314109802, + "rewards/rejected": -1.7112045288085938, "step": 540 }, { "epoch": 0.2916875731727714, - "grad_norm": 18.02680952570327, + "grad_norm": 8.716350912794395, "learning_rate": 9.714795008912655e-07, - "logits/chosen": 0.20895743370056152, - "logits/rejected": 0.3623776435852051, - "logps/chosen": -4.741501808166504, - "logps/rejected": -5.503144264221191, - "loss": 0.6348, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -4.741501808166504, - "rewards/margins": 0.7616419196128845, - "rewards/rejected": -5.503144264221191, - "semantic_entropy": 0.08251913636922836, + "logits/chosen": -0.05930591747164726, + "logits/rejected": 0.14189715683460236, + "logps/chosen": -1.483886480331421, + "logps/rejected": -1.683929204940796, + "loss": 0.6533, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.483886480331421, + "rewards/margins": 0.200042724609375, + "rewards/rejected": -1.683929204940796, "step": 545 }, { "epoch": 0.29436360595417294, - "grad_norm": 20.727150313169535, + "grad_norm": 9.3048952658064, "learning_rate": 9.80392156862745e-07, - "logits/chosen": 0.30375415086746216, - "logits/rejected": 0.3442818522453308, - "logps/chosen": -5.456840991973877, - "logps/rejected": -6.024032115936279, - "loss": 0.7065, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -5.456840991973877, - "rewards/margins": 0.5671912431716919, - "rewards/rejected": -6.024032115936279, - "semantic_entropy": 0.06045646220445633, + "logits/chosen": 0.11191095411777496, + "logits/rejected": 0.1827966868877411, + "logps/chosen": -1.5168445110321045, + "logps/rejected": -1.700286865234375, + "loss": 0.6695, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5168445110321045, + "rewards/margins": 0.18344244360923767, + "rewards/rejected": -1.700286865234375, "step": 550 }, { "epoch": 0.2970396387355745, - "grad_norm": 27.134124904666713, + "grad_norm": 13.8553607303148, "learning_rate": 9.893048128342244e-07, - "logits/chosen": 0.28694844245910645, - "logits/rejected": 0.3676696717739105, - "logps/chosen": -5.522095680236816, - "logps/rejected": -5.709345817565918, - "loss": 0.8281, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -5.522095680236816, - "rewards/margins": 0.18725113570690155, - "rewards/rejected": -5.709345817565918, - "semantic_entropy": 0.05345703288912773, + "logits/chosen": -0.0009193867444992065, + "logits/rejected": 0.12733322381973267, + "logps/chosen": -1.6178038120269775, + "logps/rejected": -1.7281265258789062, + "loss": 0.7299, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6178038120269775, + "rewards/margins": 0.11032304912805557, + "rewards/rejected": -1.7281265258789062, "step": 555 }, { "epoch": 0.2997156715169761, - "grad_norm": 31.40474528083335, + "grad_norm": 11.882575471987293, "learning_rate": 9.98217468805704e-07, - "logits/chosen": 0.3329199552536011, - "logits/rejected": 0.3428536355495453, - "logps/chosen": -4.907380104064941, - "logps/rejected": -5.42898416519165, - "loss": 0.684, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -4.907380104064941, - "rewards/margins": 0.5216037034988403, - "rewards/rejected": -5.42898416519165, - "semantic_entropy": 0.08699695765972137, + "logits/chosen": 0.10396585613489151, + "logits/rejected": 0.12245559692382812, + "logps/chosen": -1.4712176322937012, + "logps/rejected": -1.659424066543579, + "loss": 0.6679, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4712176322937012, + "rewards/margins": 0.18820635974407196, + "rewards/rejected": -1.659424066543579, "step": 560 }, { "epoch": 0.30239170429837764, - "grad_norm": 15.731040510232038, + "grad_norm": 7.21796949009117, "learning_rate": 9.999984476788462e-07, - "logits/chosen": 0.32874903082847595, - "logits/rejected": 0.37473997473716736, - "logps/chosen": -4.960855960845947, - "logps/rejected": -5.458104610443115, - "loss": 0.6559, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -4.960855960845947, - "rewards/margins": 0.49724894762039185, - "rewards/rejected": -5.458104610443115, - "semantic_entropy": 0.07233523577451706, + "logits/chosen": 0.0884551927447319, + "logits/rejected": 0.14948171377182007, + "logps/chosen": -1.5736544132232666, + "logps/rejected": -1.7732725143432617, + "loss": 0.6801, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5736544132232666, + "rewards/margins": 0.19961810111999512, + "rewards/rejected": -1.7732725143432617, "step": 565 }, { "epoch": 0.30506773707977924, - "grad_norm": 26.591918700910675, + "grad_norm": 12.955844831007765, "learning_rate": 9.999921413906797e-07, - "logits/chosen": 0.2645338177680969, - "logits/rejected": 0.4236833453178406, - "logps/chosen": -4.993116855621338, - "logps/rejected": -5.463040828704834, - "loss": 0.6672, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -4.993116855621338, - "rewards/margins": 0.4699248671531677, - "rewards/rejected": -5.463040828704834, - "semantic_entropy": 0.06868621706962585, + "logits/chosen": -0.007973670959472656, + "logits/rejected": 0.21382908523082733, + "logps/chosen": -1.5576199293136597, + "logps/rejected": -1.7177006006240845, + "loss": 0.6995, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5576199293136597, + "rewards/margins": 0.1600807160139084, + "rewards/rejected": -1.7177006006240845, "step": 570 }, { "epoch": 0.3077437698611808, - "grad_norm": 17.123203800580065, + "grad_norm": 6.852727571620492, "learning_rate": 9.999809841765644e-07, - "logits/chosen": 0.26640480756759644, - "logits/rejected": 0.3024447560310364, - "logps/chosen": -4.769078731536865, - "logps/rejected": -5.275615692138672, - "loss": 0.6851, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -4.769078731536865, - "rewards/margins": 0.5065367817878723, - "rewards/rejected": -5.275615692138672, - "semantic_entropy": 0.07678450644016266, + "logits/chosen": 0.035446397960186005, + "logits/rejected": 0.09382332861423492, + "logps/chosen": -1.4970608949661255, + "logps/rejected": -1.6978238821029663, + "loss": 0.6847, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4970608949661255, + "rewards/margins": 0.20076322555541992, + "rewards/rejected": -1.6978238821029663, "step": 575 }, { "epoch": 0.3104198026425824, - "grad_norm": 16.862219340555285, + "grad_norm": 7.07115685356617, "learning_rate": 9.999649761447477e-07, - "logits/chosen": 0.2403547316789627, - "logits/rejected": 0.35913315415382385, - "logps/chosen": -4.844521999359131, - "logps/rejected": -5.310414791107178, - "loss": 0.6817, - "rewards/accuracies": 0.65625, - "rewards/chosen": -4.844521999359131, - "rewards/margins": 0.4658929407596588, - "rewards/rejected": -5.310414791107178, - "semantic_entropy": 0.07107989490032196, + "logits/chosen": 0.029823005199432373, + "logits/rejected": 0.19235572218894958, + "logps/chosen": -1.5113080739974976, + "logps/rejected": -1.786739706993103, + "loss": 0.6374, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5113080739974976, + "rewards/margins": 0.2754315733909607, + "rewards/rejected": -1.786739706993103, "step": 580 }, { "epoch": 0.31309583542398395, - "grad_norm": 20.48877444397547, + "grad_norm": 8.405177882047731, "learning_rate": 9.999441174505398e-07, - "logits/chosen": 0.2463439702987671, - "logits/rejected": 0.3041590750217438, - "logps/chosen": -5.344332218170166, - "logps/rejected": -5.5950727462768555, - "loss": 0.7775, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -5.344332218170166, - "rewards/margins": 0.250741183757782, - "rewards/rejected": -5.5950727462768555, - "semantic_entropy": 0.05236155912280083, + "logits/chosen": -0.015560868196189404, + "logits/rejected": 0.10073105245828629, + "logps/chosen": -1.7266113758087158, + "logps/rejected": -1.8543450832366943, + "loss": 0.7372, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.7266113758087158, + "rewards/margins": 0.1277337670326233, + "rewards/rejected": -1.8543450832366943, "step": 585 }, { "epoch": 0.3157718682053855, - "grad_norm": 33.45307601229143, + "grad_norm": 20.173327844824414, "learning_rate": 9.999184082963116e-07, - "logits/chosen": 0.2851886749267578, - "logits/rejected": 0.37152618169784546, - "logps/chosen": -5.112926006317139, - "logps/rejected": -5.441379547119141, - "loss": 0.7305, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -5.112926006317139, - "rewards/margins": 0.3284529149532318, - "rewards/rejected": -5.441379547119141, - "semantic_entropy": 0.05966230109333992, + "logits/chosen": 0.03434785455465317, + "logits/rejected": 0.16322410106658936, + "logps/chosen": -1.655646562576294, + "logps/rejected": -1.7503175735473633, + "loss": 0.719, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.655646562576294, + "rewards/margins": 0.09467087686061859, + "rewards/rejected": -1.7503175735473633, "step": 590 }, { "epoch": 0.3184479009867871, - "grad_norm": 28.772414597800683, + "grad_norm": 12.570050024431763, "learning_rate": 9.998878489314937e-07, - "logits/chosen": 0.3128214478492737, - "logits/rejected": 0.4057738184928894, - "logps/chosen": -5.047120094299316, - "logps/rejected": -5.475900173187256, - "loss": 0.6587, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -5.047120094299316, - "rewards/margins": 0.4287797510623932, - "rewards/rejected": -5.475900173187256, - "semantic_entropy": 0.056463856250047684, + "logits/chosen": 0.07566434144973755, + "logits/rejected": 0.20687690377235413, + "logps/chosen": -1.5775829553604126, + "logps/rejected": -1.7780683040618896, + "loss": 0.6736, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5775829553604126, + "rewards/margins": 0.20048511028289795, + "rewards/rejected": -1.7780683040618896, "step": 595 }, { "epoch": 0.32112393376818865, - "grad_norm": 21.595817825496415, + "grad_norm": 7.892583907880249, "learning_rate": 9.99852439652573e-07, - "logits/chosen": 0.27063342928886414, - "logits/rejected": 0.3820621371269226, - "logps/chosen": -5.453131198883057, - "logps/rejected": -5.779126167297363, - "loss": 0.6993, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -5.453131198883057, - "rewards/margins": 0.32599514722824097, - "rewards/rejected": -5.779126167297363, - "semantic_entropy": 0.04167807847261429, + "logits/chosen": 0.02129637636244297, + "logits/rejected": 0.17136108875274658, + "logps/chosen": -1.624513864517212, + "logps/rejected": -1.7846864461898804, + "loss": 0.689, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.624513864517212, + "rewards/margins": 0.1601727306842804, + "rewards/rejected": -1.7846864461898804, "step": 600 }, { "epoch": 0.32379996654959026, - "grad_norm": 26.050724529880945, + "grad_norm": 13.36593167443576, "learning_rate": 9.998121808030904e-07, - "logits/chosen": 0.24278345704078674, - "logits/rejected": 0.3093962073326111, - "logps/chosen": -5.513918876647949, - "logps/rejected": -5.7660112380981445, - "loss": 0.8, - "rewards/accuracies": 0.59375, - "rewards/chosen": -5.513918876647949, - "rewards/margins": 0.25209134817123413, - "rewards/rejected": -5.7660112380981445, - "semantic_entropy": 0.04349964112043381, + "logits/chosen": -0.006542189512401819, + "logits/rejected": 0.07912351936101913, + "logps/chosen": -1.7766975164413452, + "logps/rejected": -1.9879192113876343, + "loss": 0.6821, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7766975164413452, + "rewards/margins": 0.2112216055393219, + "rewards/rejected": -1.9879192113876343, "step": 605 }, { "epoch": 0.3264759993309918, - "grad_norm": 45.051043777132236, + "grad_norm": 20.243093227193167, "learning_rate": 9.997670727736379e-07, - "logits/chosen": 0.31496500968933105, - "logits/rejected": 0.47544389963150024, - "logps/chosen": -5.2426862716674805, - "logps/rejected": -5.6988115310668945, - "loss": 0.6803, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -5.2426862716674805, - "rewards/margins": 0.4561251699924469, - "rewards/rejected": -5.6988115310668945, - "semantic_entropy": 0.05658901482820511, + "logits/chosen": 0.1253727376461029, + "logits/rejected": 0.29659098386764526, + "logps/chosen": -1.69742751121521, + "logps/rejected": -1.9165337085723877, + "loss": 0.6777, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.69742751121521, + "rewards/margins": 0.21910591423511505, + "rewards/rejected": -1.9165337085723877, "step": 610 }, { "epoch": 0.32915203211239336, - "grad_norm": 23.856980522226188, + "grad_norm": 6.616506582179463, "learning_rate": 9.99717116001853e-07, - "logits/chosen": 0.32243964076042175, - "logits/rejected": 0.39280059933662415, - "logps/chosen": -5.77672815322876, - "logps/rejected": -6.442985534667969, - "loss": 0.5973, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -5.77672815322876, - "rewards/margins": 0.6662576794624329, - "rewards/rejected": -6.442985534667969, - "semantic_entropy": 0.03704090788960457, + "logits/chosen": 0.02734069898724556, + "logits/rejected": 0.13534528017044067, + "logps/chosen": -1.659375786781311, + "logps/rejected": -1.956472396850586, + "loss": 0.6439, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.659375786781311, + "rewards/margins": 0.2970966696739197, + "rewards/rejected": -1.956472396850586, "step": 615 }, { "epoch": 0.33182806489379496, - "grad_norm": 15.346139192682116, + "grad_norm": 8.629197467128938, "learning_rate": 9.996623109724173e-07, - "logits/chosen": 0.4094429016113281, - "logits/rejected": 0.44762665033340454, - "logps/chosen": -6.212830543518066, - "logps/rejected": -6.7759881019592285, - "loss": 0.6226, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -6.212830543518066, - "rewards/margins": 0.5631579160690308, - "rewards/rejected": -6.7759881019592285, - "semantic_entropy": 0.03154679387807846, + "logits/chosen": 0.12442900985479355, + "logits/rejected": 0.19420668482780457, + "logps/chosen": -1.7717794179916382, + "logps/rejected": -1.9653661251068115, + "loss": 0.6856, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7717794179916382, + "rewards/margins": 0.19358675181865692, + "rewards/rejected": -1.9653661251068115, "step": 620 }, { "epoch": 0.3345040976751965, - "grad_norm": 18.787286491671612, + "grad_norm": 11.310526734168743, "learning_rate": 9.996026582170488e-07, - "logits/chosen": 0.41264209151268005, - "logits/rejected": 0.5199421048164368, - "logps/chosen": -6.266766548156738, - "logps/rejected": -6.7597222328186035, - "loss": 0.6553, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -6.266766548156738, - "rewards/margins": 0.49295586347579956, - "rewards/rejected": -6.7597222328186035, - "semantic_entropy": 0.03314858675003052, + "logits/chosen": 0.12832635641098022, + "logits/rejected": 0.250538170337677, + "logps/chosen": -1.6486222743988037, + "logps/rejected": -1.9326112270355225, + "loss": 0.6209, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6486222743988037, + "rewards/margins": 0.2839890420436859, + "rewards/rejected": -1.9326112270355225, "step": 625 }, { "epoch": 0.3371801304565981, - "grad_norm": 23.697424686130283, + "grad_norm": 11.01534836804931, "learning_rate": 9.995381583144996e-07, - "logits/chosen": 0.3286336362361908, - "logits/rejected": 0.416952908039093, - "logps/chosen": -6.489420413970947, - "logps/rejected": -7.091695308685303, - "loss": 0.6025, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -6.489420413970947, - "rewards/margins": 0.6022747755050659, - "rewards/rejected": -7.091695308685303, - "semantic_entropy": 0.019754167646169662, + "logits/chosen": 0.04354455694556236, + "logits/rejected": 0.15197351574897766, + "logps/chosen": -1.7005122900009155, + "logps/rejected": -1.985790491104126, + "loss": 0.6262, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7005122900009155, + "rewards/margins": 0.2852782905101776, + "rewards/rejected": -1.985790491104126, "step": 630 }, { "epoch": 0.33985616323799966, - "grad_norm": 23.346401547821337, + "grad_norm": 7.153627272171035, "learning_rate": 9.994688118905471e-07, - "logits/chosen": 0.39607900381088257, - "logits/rejected": 0.5640990138053894, - "logps/chosen": -6.5967607498168945, - "logps/rejected": -7.059272766113281, - "loss": 0.674, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -6.5967607498168945, - "rewards/margins": 0.4625115990638733, - "rewards/rejected": -7.059272766113281, - "semantic_entropy": 0.023461516946554184, + "logits/chosen": 0.05143728852272034, + "logits/rejected": 0.29082220792770386, + "logps/chosen": -1.8001759052276611, + "logps/rejected": -2.029228687286377, + "loss": 0.6788, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.8001759052276611, + "rewards/margins": 0.22905273735523224, + "rewards/rejected": -2.029228687286377, "step": 635 }, { "epoch": 0.3425321960194012, - "grad_norm": 28.41310343878819, + "grad_norm": 17.858181050727644, "learning_rate": 9.993946196179912e-07, - "logits/chosen": 0.37280526757240295, - "logits/rejected": 0.5077700018882751, - "logps/chosen": -6.618893623352051, - "logps/rejected": -6.993128776550293, - "loss": 0.6847, - "rewards/accuracies": 0.625, - "rewards/chosen": -6.618893623352051, - "rewards/margins": 0.3742350935935974, - "rewards/rejected": -6.993128776550293, - "semantic_entropy": 0.016319947317242622, + "logits/chosen": -0.013154381886124611, + "logits/rejected": 0.19423320889472961, + "logps/chosen": -1.7882518768310547, + "logps/rejected": -2.0224525928497314, + "loss": 0.6813, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.7882518768310547, + "rewards/margins": 0.23420067131519318, + "rewards/rejected": -2.0224525928497314, "step": 640 }, { "epoch": 0.3452082288008028, - "grad_norm": 17.075034812283786, + "grad_norm": 9.121406718086735, "learning_rate": 9.993155822166455e-07, - "logits/chosen": 0.4683307707309723, - "logits/rejected": 0.5001148581504822, - "logps/chosen": -6.166420936584473, - "logps/rejected": -6.613181114196777, - "loss": 0.6588, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -6.166420936584473, - "rewards/margins": 0.4467601776123047, - "rewards/rejected": -6.613181114196777, - "semantic_entropy": 0.023232873529195786, + "logits/chosen": -0.00975788850337267, + "logits/rejected": 0.07375514507293701, + "logps/chosen": -1.725366234779358, + "logps/rejected": -2.0122714042663574, + "loss": 0.6396, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.725366234779358, + "rewards/margins": 0.2869051694869995, + "rewards/rejected": -2.0122714042663574, "step": 645 }, { "epoch": 0.34788426158220437, - "grad_norm": 22.115212411816152, + "grad_norm": 14.334675560465596, "learning_rate": 9.992317004533313e-07, - "logits/chosen": 0.49777716398239136, - "logits/rejected": 0.5633991360664368, - "logps/chosen": -6.14475154876709, - "logps/rejected": -6.610726833343506, - "loss": 0.6416, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -6.14475154876709, - "rewards/margins": 0.4659750461578369, - "rewards/rejected": -6.610726833343506, - "semantic_entropy": 0.023257287219166756, + "logits/chosen": 0.05272717401385307, + "logits/rejected": 0.19426897168159485, + "logps/chosen": -1.8792060613632202, + "logps/rejected": -2.190314292907715, + "loss": 0.6404, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.8792060613632202, + "rewards/margins": 0.31110823154449463, + "rewards/rejected": -2.190314292907715, "step": 650 }, { "epoch": 0.350560294363606, - "grad_norm": 20.090310906010423, + "grad_norm": 14.588385778087009, "learning_rate": 9.991429751418696e-07, - "logits/chosen": 0.4952174127101898, - "logits/rejected": 0.5142993927001953, - "logps/chosen": -5.922252655029297, - "logps/rejected": -6.414206504821777, - "loss": 0.6695, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -5.922252655029297, - "rewards/margins": 0.49195390939712524, - "rewards/rejected": -6.414206504821777, - "semantic_entropy": 0.027981286868453026, + "logits/chosen": 0.11557390540838242, + "logits/rejected": 0.12611190974712372, + "logps/chosen": -1.853022575378418, + "logps/rejected": -2.151285409927368, + "loss": 0.6803, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.853022575378418, + "rewards/margins": 0.2982625961303711, + "rewards/rejected": -2.151285409927368, "step": 655 }, { "epoch": 0.3532363271450075, - "grad_norm": 19.79614358859752, + "grad_norm": 12.462123185299712, "learning_rate": 9.99049407143074e-07, - "logits/chosen": 0.5208943486213684, - "logits/rejected": 0.5800861120223999, - "logps/chosen": -6.255806922912598, - "logps/rejected": -6.602316856384277, - "loss": 0.697, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -6.255806922912598, - "rewards/margins": 0.34651073813438416, - "rewards/rejected": -6.602316856384277, - "semantic_entropy": 0.02483288012444973, + "logits/chosen": 0.08674181997776031, + "logits/rejected": 0.21128582954406738, + "logps/chosen": -1.844439148902893, + "logps/rejected": -2.024085283279419, + "loss": 0.6843, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.844439148902893, + "rewards/margins": 0.17964598536491394, + "rewards/rejected": -2.024085283279419, "step": 660 }, { "epoch": 0.35591235992640907, - "grad_norm": 19.579221986465477, + "grad_norm": 7.043749697215875, "learning_rate": 9.989509973647416e-07, - "logits/chosen": 0.5133857727050781, - "logits/rejected": 0.58699631690979, - "logps/chosen": -6.382502555847168, - "logps/rejected": -6.7586350440979, - "loss": 0.6752, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -6.382502555847168, - "rewards/margins": 0.3761317729949951, - "rewards/rejected": -6.7586350440979, - "semantic_entropy": 0.019113317131996155, + "logits/chosen": 0.06778523325920105, + "logits/rejected": 0.20547974109649658, + "logps/chosen": -1.7933094501495361, + "logps/rejected": -2.0701348781585693, + "loss": 0.6431, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.7933094501495361, + "rewards/margins": 0.2768252193927765, + "rewards/rejected": -2.0701348781585693, "step": 665 }, { "epoch": 0.3585883927078107, - "grad_norm": 19.24827550379443, + "grad_norm": 9.961015636314245, "learning_rate": 9.988477467616445e-07, - "logits/chosen": 0.5319421291351318, - "logits/rejected": 0.6193274259567261, - "logps/chosen": -6.479439735412598, - "logps/rejected": -6.937412261962891, - "loss": 0.5944, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -6.479439735412598, - "rewards/margins": 0.4579733908176422, - "rewards/rejected": -6.937412261962891, - "semantic_entropy": 0.016540968790650368, + "logits/chosen": 0.048078346997499466, + "logits/rejected": 0.24679461121559143, + "logps/chosen": -1.844089150428772, + "logps/rejected": -2.0876410007476807, + "loss": 0.6281, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.844089150428772, + "rewards/margins": 0.24355196952819824, + "rewards/rejected": -2.0876410007476807, "step": 670 }, { "epoch": 0.3612644254892122, - "grad_norm": 20.36938516339038, + "grad_norm": 13.658304110692146, "learning_rate": 9.987396563355205e-07, - "logits/chosen": 0.5054916143417358, - "logits/rejected": 0.5468543171882629, - "logps/chosen": -6.396731376647949, - "logps/rejected": -6.9033098220825195, - "loss": 0.5913, - "rewards/accuracies": 0.71875, - "rewards/chosen": -6.396731376647949, - "rewards/margins": 0.5065786242485046, - "rewards/rejected": -6.9033098220825195, - "semantic_entropy": 0.019933702424168587, + "logits/chosen": 0.039140813052654266, + "logits/rejected": 0.11539199203252792, + "logps/chosen": -1.850952386856079, + "logps/rejected": -2.20514178276062, + "loss": 0.6196, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.850952386856079, + "rewards/margins": 0.354189395904541, + "rewards/rejected": -2.20514178276062, "step": 675 }, { "epoch": 0.36394045827061383, - "grad_norm": 25.694663813660224, + "grad_norm": 14.921282465901704, "learning_rate": 9.986267271350631e-07, - "logits/chosen": 0.4442412257194519, - "logits/rejected": 0.5351762771606445, - "logps/chosen": -6.232950687408447, - "logps/rejected": -6.665299892425537, - "loss": 0.6833, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -6.232950687408447, - "rewards/margins": 0.4323497414588928, - "rewards/rejected": -6.665299892425537, - "semantic_entropy": 0.02159653976559639, + "logits/chosen": 0.15032705664634705, + "logits/rejected": 0.31885138154029846, + "logps/chosen": -1.9729877710342407, + "logps/rejected": -2.2173333168029785, + "loss": 0.7223, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.9729877710342407, + "rewards/margins": 0.24434557557106018, + "rewards/rejected": -2.2173333168029785, "step": 680 }, { "epoch": 0.3666164910520154, - "grad_norm": 26.644324005748285, + "grad_norm": 26.261115758282084, "learning_rate": 9.985089602559123e-07, - "logits/chosen": 0.49901971220970154, - "logits/rejected": 0.5869329571723938, - "logps/chosen": -6.33712911605835, - "logps/rejected": -6.925673007965088, - "loss": 0.5808, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -6.33712911605835, - "rewards/margins": 0.588544487953186, - "rewards/rejected": -6.925673007965088, - "semantic_entropy": 0.01862289011478424, + "logits/chosen": 0.10252350568771362, + "logits/rejected": 0.2622830271720886, + "logps/chosen": -1.9864047765731812, + "logps/rejected": -2.2703216075897217, + "loss": 0.6591, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.9864047765731812, + "rewards/margins": 0.2839171290397644, + "rewards/rejected": -2.2703216075897217, "step": 685 }, { "epoch": 0.369292523833417, - "grad_norm": 20.404114370621034, + "grad_norm": 9.671905082641993, "learning_rate": 9.983863568406428e-07, - "logits/chosen": 0.5496017336845398, - "logits/rejected": 0.5704872012138367, - "logps/chosen": -6.48660135269165, - "logps/rejected": -6.933077812194824, - "loss": 0.6603, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -6.48660135269165, - "rewards/margins": 0.44647669792175293, - "rewards/rejected": -6.933077812194824, - "semantic_entropy": 0.017888184636831284, + "logits/chosen": 0.12365067005157471, + "logits/rejected": 0.1564207822084427, + "logps/chosen": -2.0155513286590576, + "logps/rejected": -2.286001205444336, + "loss": 0.6606, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0155513286590576, + "rewards/margins": 0.27045008540153503, + "rewards/rejected": -2.286001205444336, "step": 690 }, { "epoch": 0.37196855661481854, - "grad_norm": 20.958217633474742, + "grad_norm": 8.264756994707858, "learning_rate": 9.982589180787532e-07, - "logits/chosen": 0.4869377613067627, - "logits/rejected": 0.5502743721008301, - "logps/chosen": -6.533112525939941, - "logps/rejected": -6.993691921234131, - "loss": 0.6216, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -6.533112525939941, - "rewards/margins": 0.4605790674686432, - "rewards/rejected": -6.993691921234131, - "semantic_entropy": 0.01869816519320011, + "logits/chosen": 0.08439230918884277, + "logits/rejected": 0.17201441526412964, + "logps/chosen": -1.8809579610824585, + "logps/rejected": -2.214841604232788, + "loss": 0.6194, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8809579610824585, + "rewards/margins": 0.33388373255729675, + "rewards/rejected": -2.214841604232788, "step": 695 }, { "epoch": 0.3746445893962201, - "grad_norm": 28.100366278994386, + "grad_norm": 12.956960838611854, "learning_rate": 9.981266452066553e-07, - "logits/chosen": 0.38128662109375, - "logits/rejected": 0.45251068472862244, - "logps/chosen": -6.801139831542969, - "logps/rejected": -7.043248176574707, - "loss": 0.6849, - "rewards/accuracies": 0.59375, - "rewards/chosen": -6.801139831542969, - "rewards/margins": 0.2421083003282547, - "rewards/rejected": -7.043248176574707, - "semantic_entropy": 0.012132355943322182, + "logits/chosen": -0.0224702600389719, + "logits/rejected": 0.11274166405200958, + "logps/chosen": -2.0886378288269043, + "logps/rejected": -2.337368965148926, + "loss": 0.6444, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0886378288269043, + "rewards/margins": 0.24873118102550507, + "rewards/rejected": -2.337368965148926, "step": 700 }, { "epoch": 0.3773206221776217, - "grad_norm": 20.82013187037515, + "grad_norm": 10.528717033207455, "learning_rate": 9.979895395076608e-07, - "logits/chosen": 0.30956459045410156, - "logits/rejected": 0.44323819875717163, - "logps/chosen": -6.558831691741943, - "logps/rejected": -7.164409637451172, - "loss": 0.575, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -6.558831691741943, - "rewards/margins": 0.6055777668952942, - "rewards/rejected": -7.164409637451172, - "semantic_entropy": 0.01754785142838955, + "logits/chosen": 0.025121700018644333, + "logits/rejected": 0.1942935436964035, + "logps/chosen": -2.0611321926116943, + "logps/rejected": -2.4596261978149414, + "loss": 0.6074, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0611321926116943, + "rewards/margins": 0.39849406480789185, + "rewards/rejected": -2.4596261978149414, "step": 705 }, { "epoch": 0.37999665495902324, - "grad_norm": 21.30045872363481, + "grad_norm": 10.778191011143726, "learning_rate": 9.9784760231197e-07, - "logits/chosen": 0.3447558283805847, - "logits/rejected": 0.4274185299873352, - "logps/chosen": -6.733677864074707, - "logps/rejected": -7.2757744789123535, - "loss": 0.6037, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -6.733677864074707, - "rewards/margins": 0.5420972108840942, - "rewards/rejected": -7.2757744789123535, - "semantic_entropy": 0.015751570463180542, + "logits/chosen": 0.12061578035354614, + "logits/rejected": 0.2114303857088089, + "logps/chosen": -2.0864968299865723, + "logps/rejected": -2.408721446990967, + "loss": 0.6142, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0864968299865723, + "rewards/margins": 0.32222452759742737, + "rewards/rejected": -2.408721446990967, "step": 710 }, { "epoch": 0.38267268774042484, - "grad_norm": 30.288720792247716, + "grad_norm": 12.181566557142107, "learning_rate": 9.97700834996658e-07, - "logits/chosen": 0.3281204402446747, - "logits/rejected": 0.449833482503891, - "logps/chosen": -7.066946983337402, - "logps/rejected": -7.522922515869141, - "loss": 0.6432, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -7.066946983337402, - "rewards/margins": 0.4559754431247711, - "rewards/rejected": -7.522922515869141, - "semantic_entropy": 0.010822773911058903, + "logits/chosen": 0.03635063022375107, + "logits/rejected": 0.19632598757743835, + "logps/chosen": -2.239739179611206, + "logps/rejected": -2.5304043292999268, + "loss": 0.6423, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.239739179611206, + "rewards/margins": 0.2906648814678192, + "rewards/rejected": -2.5304043292999268, "step": 715 }, { "epoch": 0.3853487205218264, - "grad_norm": 21.97533323049887, + "grad_norm": 19.675937794866623, "learning_rate": 9.97549238985662e-07, - "logits/chosen": 0.4738125205039978, - "logits/rejected": 0.6201906204223633, - "logps/chosen": -6.687346458435059, - "logps/rejected": -7.375749111175537, - "loss": 0.5906, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -6.687346458435059, - "rewards/margins": 0.6884029507637024, - "rewards/rejected": -7.375749111175537, - "semantic_entropy": 0.01416093111038208, + "logits/chosen": 0.10846034437417984, + "logits/rejected": 0.2928066849708557, + "logps/chosen": -2.332794666290283, + "logps/rejected": -2.6422171592712402, + "loss": 0.6707, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.332794666290283, + "rewards/margins": 0.30942273139953613, + "rewards/rejected": -2.6422171592712402, "step": 720 }, { "epoch": 0.38802475330322794, - "grad_norm": 16.154939819122443, + "grad_norm": 11.958220823628832, "learning_rate": 9.973928157497674e-07, - "logits/chosen": 0.397294819355011, - "logits/rejected": 0.4953377842903137, - "logps/chosen": -6.555293083190918, - "logps/rejected": -7.133930206298828, - "loss": 0.5751, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -6.555293083190918, - "rewards/margins": 0.5786372423171997, - "rewards/rejected": -7.133930206298828, - "semantic_entropy": 0.016166144981980324, + "logits/chosen": -0.017281439155340195, + "logits/rejected": 0.11997995525598526, + "logps/chosen": -2.119809627532959, + "logps/rejected": -2.5231869220733643, + "loss": 0.5885, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.119809627532959, + "rewards/margins": 0.40337705612182617, + "rewards/rejected": -2.5231869220733643, "step": 725 }, { "epoch": 0.39070078608462955, - "grad_norm": 18.66427550708316, + "grad_norm": 18.231430185742685, "learning_rate": 9.972315668065927e-07, - "logits/chosen": 0.39967241883277893, - "logits/rejected": 0.4871141314506531, - "logps/chosen": -6.588616371154785, - "logps/rejected": -7.018074989318848, - "loss": 0.6501, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -6.588616371154785, - "rewards/margins": 0.4294595718383789, - "rewards/rejected": -7.018074989318848, - "semantic_entropy": 0.015043877065181732, + "logits/chosen": -0.05263448879122734, + "logits/rejected": 0.09009256213903427, + "logps/chosen": -2.3031411170959473, + "logps/rejected": -2.593540668487549, + "loss": 0.663, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3031411170959473, + "rewards/margins": 0.29039958119392395, + "rewards/rejected": -2.593540668487549, "step": 730 }, { "epoch": 0.3933768188660311, - "grad_norm": 19.58743135959893, + "grad_norm": 10.916247784991578, "learning_rate": 9.97065493720576e-07, - "logits/chosen": 0.424797385931015, - "logits/rejected": 0.5106293559074402, - "logps/chosen": -6.42412805557251, - "logps/rejected": -6.823407173156738, - "loss": 0.661, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -6.42412805557251, - "rewards/margins": 0.3992784023284912, - "rewards/rejected": -6.823407173156738, - "semantic_entropy": 0.017190445214509964, + "logits/chosen": -0.009555116295814514, + "logits/rejected": 0.08649361878633499, + "logps/chosen": -2.2827672958374023, + "logps/rejected": -2.5299229621887207, + "loss": 0.6443, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.2827672958374023, + "rewards/margins": 0.24715574085712433, + "rewards/rejected": -2.5299229621887207, "step": 735 }, { "epoch": 0.3960528516474327, - "grad_norm": 19.710530643393298, + "grad_norm": 16.88651993519724, "learning_rate": 9.968945981029594e-07, - "logits/chosen": 0.5481308102607727, - "logits/rejected": 0.6441117525100708, - "logps/chosen": -6.604589939117432, - "logps/rejected": -7.156881809234619, - "loss": 0.5894, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -6.604589939117432, - "rewards/margins": 0.5522912740707397, - "rewards/rejected": -7.156881809234619, - "semantic_entropy": 0.014214654453098774, + "logits/chosen": -0.014693480916321278, + "logits/rejected": 0.14500722289085388, + "logps/chosen": -2.329932689666748, + "logps/rejected": -2.6660799980163574, + "loss": 0.6239, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.329932689666748, + "rewards/margins": 0.3361472487449646, + "rewards/rejected": -2.6660799980163574, "step": 740 }, { "epoch": 0.39872888442883425, - "grad_norm": 17.871273849433326, + "grad_norm": 7.683154120299283, "learning_rate": 9.967188816117726e-07, - "logits/chosen": 0.6293722987174988, - "logits/rejected": 0.6717933416366577, - "logps/chosen": -6.835976600646973, - "logps/rejected": -7.2494215965271, - "loss": 0.6631, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -6.835976600646973, - "rewards/margins": 0.4134441316127777, - "rewards/rejected": -7.2494215965271, - "semantic_entropy": 0.01153610274195671, + "logits/chosen": 0.09473200142383575, + "logits/rejected": 0.16610334813594818, + "logps/chosen": -2.338489055633545, + "logps/rejected": -2.7219491004943848, + "loss": 0.6582, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.338489055633545, + "rewards/margins": 0.38346007466316223, + "rewards/rejected": -2.7219491004943848, "step": 745 }, { "epoch": 0.4014049172102358, - "grad_norm": 16.865148721700457, + "grad_norm": 20.004661141391402, "learning_rate": 9.965383459518179e-07, - "logits/chosen": 0.541202962398529, - "logits/rejected": 0.6529287695884705, - "logps/chosen": -6.675736904144287, - "logps/rejected": -7.145176887512207, - "loss": 0.6284, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -6.675736904144287, - "rewards/margins": 0.4694399833679199, - "rewards/rejected": -7.145176887512207, - "semantic_entropy": 0.013248731382191181, + "logits/chosen": 0.02197187952697277, + "logits/rejected": 0.17875012755393982, + "logps/chosen": -2.32258939743042, + "logps/rejected": -2.714125871658325, + "loss": 0.6219, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.32258939743042, + "rewards/margins": 0.39153629541397095, + "rewards/rejected": -2.714125871658325, "step": 750 }, { "epoch": 0.4040809499916374, - "grad_norm": 23.498295974070185, + "grad_norm": 10.487567295309734, "learning_rate": 9.963529928746533e-07, - "logits/chosen": 0.5743650197982788, - "logits/rejected": 0.663760781288147, - "logps/chosen": -6.699901580810547, - "logps/rejected": -7.10396671295166, - "loss": 0.6749, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -6.699901580810547, - "rewards/margins": 0.4040653109550476, - "rewards/rejected": -7.10396671295166, - "semantic_entropy": 0.014644038863480091, + "logits/chosen": 0.06035762280225754, + "logits/rejected": 0.1866520494222641, + "logps/chosen": -2.3090062141418457, + "logps/rejected": -2.61854887008667, + "loss": 0.6644, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3090062141418457, + "rewards/margins": 0.3095424771308899, + "rewards/rejected": -2.61854887008667, "step": 755 }, { "epoch": 0.40675698277303896, - "grad_norm": 15.10343166970609, + "grad_norm": 7.751545509076353, "learning_rate": 9.961628241785746e-07, - "logits/chosen": 0.4620290696620941, - "logits/rejected": 0.5227762460708618, - "logps/chosen": -6.713381767272949, - "logps/rejected": -7.118601322174072, - "loss": 0.6779, + "logits/chosen": -0.02450668253004551, + "logits/rejected": 0.0534825325012207, + "logps/chosen": -2.343313694000244, + "logps/rejected": -2.6583704948425293, + "loss": 0.6429, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -6.713381767272949, - "rewards/margins": 0.4052188992500305, - "rewards/rejected": -7.118601322174072, - "semantic_entropy": 0.014408141374588013, + "rewards/chosen": -2.343313694000244, + "rewards/margins": 0.31505703926086426, + "rewards/rejected": -2.6583704948425293, "step": 760 }, { "epoch": 0.40943301555444056, - "grad_norm": 16.82158650124028, + "grad_norm": 9.221767580875369, "learning_rate": 9.959678417085998e-07, - "logits/chosen": 0.4263577461242676, - "logits/rejected": 0.4895492494106293, - "logps/chosen": -6.6601409912109375, - "logps/rejected": -7.117767333984375, - "loss": 0.6173, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -6.6601409912109375, - "rewards/margins": 0.45762643218040466, - "rewards/rejected": -7.117767333984375, - "semantic_entropy": 0.014361525885760784, + "logits/chosen": 0.018906479701399803, + "logits/rejected": 0.11153922230005264, + "logps/chosen": -2.21673583984375, + "logps/rejected": -2.500614643096924, + "loss": 0.6536, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.21673583984375, + "rewards/margins": 0.2838789224624634, + "rewards/rejected": -2.500614643096924, "step": 765 }, { "epoch": 0.4121090483358421, - "grad_norm": 13.610647877557357, + "grad_norm": 10.709173064457636, "learning_rate": 9.957680473564493e-07, - "logits/chosen": 0.5320577621459961, - "logits/rejected": 0.6060940027236938, - "logps/chosen": -6.88693904876709, - "logps/rejected": -7.462642669677734, - "loss": 0.5833, - "rewards/accuracies": 0.6875, - "rewards/chosen": -6.88693904876709, - "rewards/margins": 0.5757043957710266, - "rewards/rejected": -7.462642669677734, - "semantic_entropy": 0.011815531179308891, + "logits/chosen": 0.12333840131759644, + "logits/rejected": 0.2510780990123749, + "logps/chosen": -2.1899380683898926, + "logps/rejected": -2.658698558807373, + "loss": 0.5861, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1899380683898926, + "rewards/margins": 0.4687604010105133, + "rewards/rejected": -2.658698558807373, "step": 770 }, { "epoch": 0.41478508111724366, - "grad_norm": 10.55729867477025, + "grad_norm": 7.585037947229681, "learning_rate": 9.95563443060529e-07, - "logits/chosen": 0.4426754415035248, - "logits/rejected": 0.527428150177002, - "logps/chosen": -6.9573655128479, - "logps/rejected": -7.31267786026001, - "loss": 0.6804, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -6.9573655128479, - "rewards/margins": 0.35531362891197205, - "rewards/rejected": -7.31267786026001, - "semantic_entropy": 0.011011673137545586, + "logits/chosen": -0.06295964866876602, + "logits/rejected": 0.10656394064426422, + "logps/chosen": -2.194547176361084, + "logps/rejected": -2.5272059440612793, + "loss": 0.6591, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.194547176361084, + "rewards/margins": 0.33265867829322815, + "rewards/rejected": -2.5272059440612793, "step": 775 }, { "epoch": 0.41746111389864526, - "grad_norm": 20.225405830042792, + "grad_norm": 10.768220564604052, "learning_rate": 9.95354030805911e-07, - "logits/chosen": 0.38837724924087524, - "logits/rejected": 0.4707297384738922, - "logps/chosen": -6.923590660095215, - "logps/rejected": -7.26629638671875, - "loss": 0.6241, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -6.923590660095215, - "rewards/margins": 0.34270578622817993, - "rewards/rejected": -7.26629638671875, - "semantic_entropy": 0.010305705480277538, + "logits/chosen": -0.11507584154605865, + "logits/rejected": 0.029914315789937973, + "logps/chosen": -2.2199347019195557, + "logps/rejected": -2.531000852584839, + "loss": 0.6378, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.2199347019195557, + "rewards/margins": 0.31106656789779663, + "rewards/rejected": -2.531000852584839, "step": 780 }, { "epoch": 0.4201371466800468, - "grad_norm": 24.361936444306224, + "grad_norm": 9.706019985454642, "learning_rate": 9.951398126243133e-07, - "logits/chosen": 0.49288463592529297, - "logits/rejected": 0.5485498309135437, - "logps/chosen": -6.930272102355957, - "logps/rejected": -7.437797546386719, - "loss": 0.6253, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -6.930272102355957, - "rewards/margins": 0.5075257420539856, - "rewards/rejected": -7.437797546386719, - "semantic_entropy": 0.0111698554828763, + "logits/chosen": 0.018134308978915215, + "logits/rejected": 0.1439289152622223, + "logps/chosen": -2.1604928970336914, + "logps/rejected": -2.574357271194458, + "loss": 0.6065, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1604928970336914, + "rewards/margins": 0.4138646125793457, + "rewards/rejected": -2.574357271194458, "step": 785 }, { "epoch": 0.4228131794614484, - "grad_norm": 17.05301230734048, + "grad_norm": 10.605182403876867, "learning_rate": 9.94920790594082e-07, - "logits/chosen": 0.3991442918777466, - "logits/rejected": 0.45411986112594604, - "logps/chosen": -6.633962154388428, - "logps/rejected": -7.100059509277344, - "loss": 0.6083, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -6.633962154388428, - "rewards/margins": 0.4660969376564026, - "rewards/rejected": -7.100059509277344, - "semantic_entropy": 0.014079605229198933, + "logits/chosen": -0.035647179931402206, + "logits/rejected": 0.09288085997104645, + "logps/chosen": -2.138533592224121, + "logps/rejected": -2.5365071296691895, + "loss": 0.594, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.138533592224121, + "rewards/margins": 0.3979737162590027, + "rewards/rejected": -2.5365071296691895, "step": 790 }, { "epoch": 0.42548921224284997, - "grad_norm": 16.409030978097405, + "grad_norm": 12.589036676415915, "learning_rate": 9.946969668401696e-07, - "logits/chosen": 0.2830341160297394, - "logits/rejected": 0.3877353072166443, - "logps/chosen": -6.539282321929932, - "logps/rejected": -7.096798896789551, - "loss": 0.6047, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -6.539282321929932, - "rewards/margins": 0.5575160384178162, - "rewards/rejected": -7.096798896789551, - "semantic_entropy": 0.015337374992668629, + "logits/chosen": -0.04490866884589195, + "logits/rejected": 0.14527466893196106, + "logps/chosen": -2.234137773513794, + "logps/rejected": -2.721458673477173, + "loss": 0.6025, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.234137773513794, + "rewards/margins": 0.4873208999633789, + "rewards/rejected": -2.721458673477173, "step": 795 }, { "epoch": 0.4281652450242516, - "grad_norm": 13.519739236635578, + "grad_norm": 9.852757865938282, "learning_rate": 9.944683435341155e-07, - "logits/chosen": 0.30631715059280396, - "logits/rejected": 0.35199958086013794, - "logps/chosen": -6.5635480880737305, - "logps/rejected": -7.057257175445557, - "loss": 0.5924, - "rewards/accuracies": 0.6875, - "rewards/chosen": -6.5635480880737305, - "rewards/margins": 0.49370861053466797, - "rewards/rejected": -7.057257175445557, - "semantic_entropy": 0.015060871839523315, + "logits/chosen": -0.0007214114302769303, + "logits/rejected": 0.07458268105983734, + "logps/chosen": -2.2955057621002197, + "logps/rejected": -2.6941685676574707, + "loss": 0.597, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2955057621002197, + "rewards/margins": 0.39866283535957336, + "rewards/rejected": -2.6941685676574707, "step": 800 }, { "epoch": 0.4281652450242516, - "eval_logits/chosen": 0.4883604347705841, - "eval_logits/rejected": 0.5385698676109314, - "eval_logps/chosen": -6.740940093994141, - "eval_logps/rejected": -7.298737049102783, - "eval_loss": 0.5850783586502075, - "eval_rewards/accuracies": 0.6810088753700256, - "eval_rewards/chosen": -6.740940093994141, - "eval_rewards/margins": 0.5577963590621948, - "eval_rewards/rejected": -7.298737049102783, - "eval_runtime": 34.813, - "eval_samples_per_second": 38.635, - "eval_semantic_entropy": 0.013112816959619522, - "eval_steps_per_second": 9.68, + "eval_logits/chosen": 0.3584064841270447, + "eval_logits/rejected": 0.4528006315231323, + "eval_logps/chosen": -2.337101697921753, + "eval_logps/rejected": -2.7821595668792725, + "eval_loss": 0.5955701470375061, + "eval_rewards/accuracies": 0.6669139266014099, + "eval_rewards/chosen": -2.337101697921753, + "eval_rewards/margins": 0.44505763053894043, + "eval_rewards/rejected": -2.7821595668792725, + "eval_runtime": 40.0601, + "eval_samples_per_second": 33.575, + "eval_steps_per_second": 8.412, "step": 800 }, { "epoch": 0.4308412778056531, - "grad_norm": 19.237636205467812, + "grad_norm": 11.672502709879994, "learning_rate": 9.942349228940236e-07, - "logits/chosen": 0.30846095085144043, - "logits/rejected": 0.3973791003227234, - "logps/chosen": -6.796361446380615, - "logps/rejected": -7.3480072021484375, - "loss": 0.5769, - "rewards/accuracies": 0.71875, - "rewards/chosen": -6.796361446380615, - "rewards/margins": 0.551645040512085, - "rewards/rejected": -7.3480072021484375, - "semantic_entropy": 0.01173459179699421, + "logits/chosen": -0.041388239711523056, + "logits/rejected": 0.1269700974225998, + "logps/chosen": -2.3135457038879395, + "logps/rejected": -2.894125461578369, + "loss": 0.5564, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3135457038879395, + "rewards/margins": 0.5805795788764954, + "rewards/rejected": -2.894125461578369, "step": 805 }, { "epoch": 0.43351731058705467, - "grad_norm": 17.157117885795127, + "grad_norm": 10.198121878652486, "learning_rate": 9.939967071845424e-07, - "logits/chosen": 0.30237749218940735, - "logits/rejected": 0.3430071473121643, - "logps/chosen": -6.8641839027404785, - "logps/rejected": -7.332343101501465, - "loss": 0.6095, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -6.8641839027404785, - "rewards/margins": 0.4681592583656311, - "rewards/rejected": -7.332343101501465, - "semantic_entropy": 0.013705052435398102, + "logits/chosen": 0.06244518607854843, + "logits/rejected": 0.1340337097644806, + "logps/chosen": -2.4297807216644287, + "logps/rejected": -2.772531270980835, + "loss": 0.6292, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4297807216644287, + "rewards/margins": 0.342750608921051, + "rewards/rejected": -2.772531270980835, "step": 810 }, { "epoch": 0.4361933433684563, - "grad_norm": 17.86668130034969, + "grad_norm": 12.365230033945252, "learning_rate": 9.937536987168413e-07, - "logits/chosen": 0.31386134028434753, - "logits/rejected": 0.3876408636569977, - "logps/chosen": -6.719006538391113, - "logps/rejected": -7.485579490661621, - "loss": 0.5959, - "rewards/accuracies": 0.71875, - "rewards/chosen": -6.719006538391113, - "rewards/margins": 0.7665729522705078, - "rewards/rejected": -7.485579490661621, - "semantic_entropy": 0.016674160957336426, + "logits/chosen": 0.08241907507181168, + "logits/rejected": 0.21835966408252716, + "logps/chosen": -2.3917410373687744, + "logps/rejected": -2.9638357162475586, + "loss": 0.589, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3917410373687744, + "rewards/margins": 0.5720947980880737, + "rewards/rejected": -2.9638357162475586, "step": 815 }, { "epoch": 0.4388693761498578, - "grad_norm": 14.614768670805427, + "grad_norm": 10.727567664713462, "learning_rate": 9.935058998485896e-07, - "logits/chosen": 0.39349859952926636, - "logits/rejected": 0.4051267206668854, - "logps/chosen": -7.036497592926025, - "logps/rejected": -7.662347316741943, - "loss": 0.5724, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -7.036497592926025, - "rewards/margins": 0.6258499622344971, - "rewards/rejected": -7.662347316741943, - "semantic_entropy": 0.012881157919764519, + "logits/chosen": 0.08931861072778702, + "logits/rejected": 0.1313246190547943, + "logps/chosen": -2.4762635231018066, + "logps/rejected": -2.900010585784912, + "loss": 0.6367, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.4762635231018066, + "rewards/margins": 0.42374682426452637, + "rewards/rejected": -2.900010585784912, "step": 820 }, { "epoch": 0.44154540893125943, - "grad_norm": 30.403962010078292, + "grad_norm": 16.904251297025134, "learning_rate": 9.932533129839333e-07, - "logits/chosen": 0.3851960599422455, - "logits/rejected": 0.45596402883529663, - "logps/chosen": -7.236742973327637, - "logps/rejected": -7.798255920410156, - "loss": 0.6115, - "rewards/accuracies": 0.71875, - "rewards/chosen": -7.236742973327637, - "rewards/margins": 0.5615121126174927, - "rewards/rejected": -7.798255920410156, - "semantic_entropy": 0.013946113176643848, + "logits/chosen": 0.06125732511281967, + "logits/rejected": 0.17693720757961273, + "logps/chosen": -2.2941484451293945, + "logps/rejected": -2.8171210289001465, + "loss": 0.5819, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2941484451293945, + "rewards/margins": 0.5229721665382385, + "rewards/rejected": -2.8171210289001465, "step": 825 }, { "epoch": 0.444221441712661, - "grad_norm": 20.869180068401533, + "grad_norm": 10.347489583655758, "learning_rate": 9.929959405734711e-07, - "logits/chosen": 0.45160192251205444, - "logits/rejected": 0.5339521765708923, - "logps/chosen": -7.302011966705322, - "logps/rejected": -7.983066558837891, - "loss": 0.5467, + "logits/chosen": 0.10931508243083954, + "logits/rejected": 0.27648285031318665, + "logps/chosen": -2.341714859008789, + "logps/rejected": -2.702512741088867, + "loss": 0.6248, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -7.302011966705322, - "rewards/margins": 0.6810555458068848, - "rewards/rejected": -7.983066558837891, - "semantic_entropy": 0.010290712118148804, + "rewards/chosen": -2.341714859008789, + "rewards/margins": 0.36079803109169006, + "rewards/rejected": -2.702512741088867, "step": 830 }, { "epoch": 0.44689747449406253, - "grad_norm": 19.2525609808125, + "grad_norm": 12.377574879026888, "learning_rate": 9.927337851142314e-07, - "logits/chosen": 0.5376949906349182, - "logits/rejected": 0.588058590888977, - "logps/chosen": -7.635756492614746, - "logps/rejected": -8.161395072937012, - "loss": 0.6001, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -7.635756492614746, - "rewards/margins": 0.5256373286247253, - "rewards/rejected": -8.161395072937012, - "semantic_entropy": 0.008404644206166267, + "logits/chosen": 0.06265640258789062, + "logits/rejected": 0.18200336396694183, + "logps/chosen": -2.283501148223877, + "logps/rejected": -2.677382707595825, + "loss": 0.6225, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.283501148223877, + "rewards/margins": 0.393881618976593, + "rewards/rejected": -2.677382707595825, "step": 835 }, { "epoch": 0.44957350727546413, - "grad_norm": 19.454601896686906, + "grad_norm": 13.63932537105377, "learning_rate": 9.924668491496474e-07, - "logits/chosen": 0.5822176933288574, - "logits/rejected": 0.6964151263237, - "logps/chosen": -7.75359582901001, - "logps/rejected": -8.28437614440918, - "loss": 0.6008, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.75359582901001, - "rewards/margins": 0.5307798981666565, - "rewards/rejected": -8.28437614440918, - "semantic_entropy": 0.006352287717163563, + "logits/chosen": 0.031840045005083084, + "logits/rejected": 0.20358750224113464, + "logps/chosen": -2.4968299865722656, + "logps/rejected": -2.868072986602783, + "loss": 0.6577, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.4968299865722656, + "rewards/margins": 0.3712424635887146, + "rewards/rejected": -2.868072986602783, "step": 840 }, { "epoch": 0.4522495400568657, - "grad_norm": 16.509437642167754, + "grad_norm": 7.538364517176485, "learning_rate": 9.92195135269533e-07, - "logits/chosen": 0.6624695658683777, - "logits/rejected": 0.6998416185379028, - "logps/chosen": -7.631247043609619, - "logps/rejected": -8.026775360107422, - "loss": 0.6553, - "rewards/accuracies": 0.65625, - "rewards/chosen": -7.631247043609619, - "rewards/margins": 0.39552828669548035, - "rewards/rejected": -8.026775360107422, - "semantic_entropy": 0.00823633000254631, + "logits/chosen": 0.11281611770391464, + "logits/rejected": 0.17632463574409485, + "logps/chosen": -2.4762721061706543, + "logps/rejected": -2.756732940673828, + "loss": 0.6735, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.4762721061706543, + "rewards/margins": 0.2804607152938843, + "rewards/rejected": -2.756732940673828, "step": 845 }, { "epoch": 0.4549255728382673, - "grad_norm": 18.796921800518653, + "grad_norm": 10.381343609795504, "learning_rate": 9.919186461100574e-07, - "logits/chosen": 0.6312128305435181, - "logits/rejected": 0.6908556222915649, - "logps/chosen": -7.528157711029053, - "logps/rejected": -8.073545455932617, - "loss": 0.565, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.528157711029053, - "rewards/margins": 0.545387864112854, - "rewards/rejected": -8.073545455932617, - "semantic_entropy": 0.007035645190626383, + "logits/chosen": 0.0654638484120369, + "logits/rejected": 0.14173004031181335, + "logps/chosen": -2.526261329650879, + "logps/rejected": -2.8466367721557617, + "loss": 0.6276, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.526261329650879, + "rewards/margins": 0.3203752934932709, + "rewards/rejected": -2.8466367721557617, "step": 850 }, { "epoch": 0.45760160561966884, - "grad_norm": 27.677336964086486, + "grad_norm": 15.478168343331205, "learning_rate": 9.9163738435372e-07, - "logits/chosen": 0.5352962017059326, - "logits/rejected": 0.6097812056541443, - "logps/chosen": -7.2810773849487305, - "logps/rejected": -7.931620121002197, - "loss": 0.6214, - "rewards/accuracies": 0.71875, - "rewards/chosen": -7.2810773849487305, - "rewards/margins": 0.6505423188209534, - "rewards/rejected": -7.931620121002197, - "semantic_entropy": 0.009111289866268635, + "logits/chosen": 0.05029426887631416, + "logits/rejected": 0.20643608272075653, + "logps/chosen": -2.5773580074310303, + "logps/rejected": -3.0046603679656982, + "loss": 0.656, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5773580074310303, + "rewards/margins": 0.4273025393486023, + "rewards/rejected": -3.0046603679656982, "step": 855 }, { "epoch": 0.4602776384010704, - "grad_norm": 14.272226362354068, + "grad_norm": 7.591781893620831, "learning_rate": 9.913513527293234e-07, - "logits/chosen": 0.34996968507766724, - "logits/rejected": 0.4666077494621277, - "logps/chosen": -7.2218828201293945, - "logps/rejected": -7.859616756439209, - "loss": 0.6001, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -7.2218828201293945, - "rewards/margins": 0.6377342939376831, - "rewards/rejected": -7.859616756439209, - "semantic_entropy": 0.009801121428608894, + "logits/chosen": -0.028552144765853882, + "logits/rejected": 0.1452280580997467, + "logps/chosen": -2.7550814151763916, + "logps/rejected": -3.3258566856384277, + "loss": 0.5731, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.7550814151763916, + "rewards/margins": 0.5707749128341675, + "rewards/rejected": -3.3258566856384277, "step": 860 }, { "epoch": 0.462953671182472, - "grad_norm": 26.063412987584922, + "grad_norm": 19.969105182665107, "learning_rate": 9.910605540119474e-07, - "logits/chosen": 0.33111342787742615, - "logits/rejected": 0.40587443113327026, - "logps/chosen": -7.022481441497803, - "logps/rejected": -7.678803443908691, - "loss": 0.605, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -7.022481441497803, - "rewards/margins": 0.6563228368759155, - "rewards/rejected": -7.678803443908691, - "semantic_entropy": 0.013630586676299572, + "logits/chosen": 0.06832808256149292, + "logits/rejected": 0.1697092354297638, + "logps/chosen": -2.650777578353882, + "logps/rejected": -3.1261699199676514, + "loss": 0.6337, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.650777578353882, + "rewards/margins": 0.4753924310207367, + "rewards/rejected": -3.1261699199676514, "step": 865 }, { "epoch": 0.46562970396387354, - "grad_norm": 13.43476713833555, + "grad_norm": 9.18275264293642, "learning_rate": 9.907649910229227e-07, - "logits/chosen": 0.22445161640644073, - "logits/rejected": 0.39778199791908264, - "logps/chosen": -6.874536037445068, - "logps/rejected": -7.5805511474609375, - "loss": 0.5545, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -6.874536037445068, - "rewards/margins": 0.7060148119926453, - "rewards/rejected": -7.5805511474609375, - "semantic_entropy": 0.014564545825123787, + "logits/chosen": -0.06474676728248596, + "logits/rejected": 0.20468583703041077, + "logps/chosen": -2.647327423095703, + "logps/rejected": -3.1647019386291504, + "loss": 0.5723, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.647327423095703, + "rewards/margins": 0.5173742175102234, + "rewards/rejected": -3.1647019386291504, "step": 870 }, { "epoch": 0.46830573674527515, - "grad_norm": 20.657905051442476, + "grad_norm": 13.735380245326121, "learning_rate": 9.90464666629803e-07, - "logits/chosen": 0.3672102391719818, - "logits/rejected": 0.41505926847457886, - "logps/chosen": -7.1059699058532715, - "logps/rejected": -7.5586981773376465, - "loss": 0.6699, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -7.1059699058532715, - "rewards/margins": 0.4527283310890198, - "rewards/rejected": -7.5586981773376465, - "semantic_entropy": 0.01175951398909092, + "logits/chosen": 0.04960073530673981, + "logits/rejected": 0.12968887388706207, + "logps/chosen": -2.697988986968994, + "logps/rejected": -3.064722776412964, + "loss": 0.7008, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.697988986968994, + "rewards/margins": 0.36673375964164734, + "rewards/rejected": -3.064722776412964, "step": 875 }, { "epoch": 0.4709817695266767, - "grad_norm": 10.963959193632688, + "grad_norm": 13.81392103483724, "learning_rate": 9.901595837463363e-07, - "logits/chosen": 0.3955201506614685, - "logits/rejected": 0.5143112540245056, - "logps/chosen": -7.272347927093506, - "logps/rejected": -7.984310150146484, - "loss": 0.5331, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -7.272347927093506, - "rewards/margins": 0.7119626998901367, - "rewards/rejected": -7.984310150146484, - "semantic_entropy": 0.009277241304516792, + "logits/chosen": 0.016006212681531906, + "logits/rejected": 0.2064128816127777, + "logps/chosen": -2.763716697692871, + "logps/rejected": -3.2392704486846924, + "loss": 0.5962, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.763716697692871, + "rewards/margins": 0.4755537509918213, + "rewards/rejected": -3.2392704486846924, "step": 880 }, { "epoch": 0.47365780230807825, - "grad_norm": 17.54527652296308, + "grad_norm": 12.172514185753123, "learning_rate": 9.898497453324384e-07, - "logits/chosen": 0.330959290266037, - "logits/rejected": 0.38617414236068726, - "logps/chosen": -7.402396202087402, - "logps/rejected": -7.947201728820801, - "loss": 0.5931, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -7.402396202087402, - "rewards/margins": 0.5448045134544373, - "rewards/rejected": -7.947201728820801, - "semantic_entropy": 0.008298173546791077, + "logits/chosen": -0.08251271396875381, + "logits/rejected": 0.01790446601808071, + "logps/chosen": -2.677971363067627, + "logps/rejected": -3.161337375640869, + "loss": 0.5679, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.677971363067627, + "rewards/margins": 0.48336631059646606, + "rewards/rejected": -3.161337375640869, "step": 885 }, { "epoch": 0.47633383508947985, - "grad_norm": 18.303499456390117, + "grad_norm": 11.973949288442336, "learning_rate": 9.895351543941628e-07, - "logits/chosen": 0.2591246962547302, - "logits/rejected": 0.33385053277015686, - "logps/chosen": -7.262864589691162, - "logps/rejected": -7.755476951599121, - "loss": 0.6183, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.262864589691162, - "rewards/margins": 0.49261218309402466, - "rewards/rejected": -7.755476951599121, - "semantic_entropy": 0.012104134075343609, + "logits/chosen": -0.14440613985061646, + "logits/rejected": -0.005009172949939966, + "logps/chosen": -2.5764148235321045, + "logps/rejected": -2.99174427986145, + "loss": 0.611, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.5764148235321045, + "rewards/margins": 0.4153295159339905, + "rewards/rejected": -2.99174427986145, "step": 890 }, { "epoch": 0.4790098678708814, - "grad_norm": 17.671561073979735, + "grad_norm": 11.87230204902039, "learning_rate": 9.892158139836724e-07, - "logits/chosen": 0.348507821559906, - "logits/rejected": 0.4041469991207123, - "logps/chosen": -7.320245265960693, - "logps/rejected": -7.844791412353516, - "loss": 0.6116, + "logits/chosen": 0.05665450543165207, + "logits/rejected": 0.16414842009544373, + "logps/chosen": -2.4439568519592285, + "logps/rejected": -2.7829558849334717, + "loss": 0.6333, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -7.320245265960693, - "rewards/margins": 0.5245463848114014, - "rewards/rejected": -7.844791412353516, - "semantic_entropy": 0.012385739013552666, + "rewards/chosen": -2.4439568519592285, + "rewards/margins": 0.3389991223812103, + "rewards/rejected": -2.7829558849334717, "step": 895 }, { "epoch": 0.481685900652283, - "grad_norm": 19.73543179549833, + "grad_norm": 12.202856349620255, "learning_rate": 9.88891727199209e-07, - "logits/chosen": 0.2475900948047638, - "logits/rejected": 0.30757012963294983, - "logps/chosen": -7.321754455566406, - "logps/rejected": -7.892062187194824, - "loss": 0.6304, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -7.321754455566406, - "rewards/margins": 0.5703079104423523, - "rewards/rejected": -7.892062187194824, - "semantic_entropy": 0.011291766539216042, + "logits/chosen": -0.02834293805062771, + "logits/rejected": 0.05204206705093384, + "logps/chosen": -2.3923208713531494, + "logps/rejected": -2.8424270153045654, + "loss": 0.6197, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3923208713531494, + "rewards/margins": 0.4501059055328369, + "rewards/rejected": -2.8424270153045654, "step": 900 }, { "epoch": 0.48436193343368455, - "grad_norm": 23.01329046657235, + "grad_norm": 12.24747430718298, "learning_rate": 9.885628971850641e-07, - "logits/chosen": 0.3327587842941284, - "logits/rejected": 0.4450058043003082, - "logps/chosen": -7.170090675354004, - "logps/rejected": -7.9321088790893555, - "loss": 0.5644, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.170090675354004, - "rewards/margins": 0.7620194554328918, - "rewards/rejected": -7.9321088790893555, - "semantic_entropy": 0.013015474192798138, + "logits/chosen": 0.05989367887377739, + "logits/rejected": 0.2608010172843933, + "logps/chosen": -2.491574764251709, + "logps/rejected": -2.979733943939209, + "loss": 0.6188, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.491574764251709, + "rewards/margins": 0.4881592392921448, + "rewards/rejected": -2.979733943939209, "step": 905 }, { "epoch": 0.48703796621508616, - "grad_norm": 13.383948275630102, + "grad_norm": 8.196017133455076, "learning_rate": 9.882293271315481e-07, - "logits/chosen": 0.3371312916278839, - "logits/rejected": 0.39455828070640564, - "logps/chosen": -7.155638217926025, - "logps/rejected": -7.695284843444824, - "loss": 0.6107, + "logits/chosen": 0.027694154530763626, + "logits/rejected": 0.11828331649303436, + "logps/chosen": -2.565986394882202, + "logps/rejected": -2.9436612129211426, + "loss": 0.6448, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.155638217926025, - "rewards/margins": 0.5396467447280884, - "rewards/rejected": -7.695284843444824, - "semantic_entropy": 0.010182186029851437, + "rewards/chosen": -2.565986394882202, + "rewards/margins": 0.3776748776435852, + "rewards/rejected": -2.9436612129211426, "step": 910 }, { "epoch": 0.4897139989964877, - "grad_norm": 17.28977001865095, + "grad_norm": 9.107659092968914, "learning_rate": 9.878910202749589e-07, - "logits/chosen": 0.3446193337440491, - "logits/rejected": 0.4533708095550537, - "logps/chosen": -7.136691093444824, - "logps/rejected": -7.7163190841674805, - "loss": 0.6031, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.136691093444824, - "rewards/margins": 0.5796278715133667, - "rewards/rejected": -7.7163190841674805, - "semantic_entropy": 0.01082690805196762, + "logits/chosen": 0.002064554486423731, + "logits/rejected": 0.19788119196891785, + "logps/chosen": -2.5119967460632324, + "logps/rejected": -2.986353635787964, + "loss": 0.5856, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5119967460632324, + "rewards/margins": 0.4743567407131195, + "rewards/rejected": -2.986353635787964, "step": 915 }, { "epoch": 0.49239003177788926, - "grad_norm": 16.51391506460225, + "grad_norm": 11.327454451541835, "learning_rate": 9.875479798975512e-07, - "logits/chosen": 0.3450031876564026, - "logits/rejected": 0.4413267970085144, - "logps/chosen": -6.917219638824463, - "logps/rejected": -7.611997127532959, - "loss": 0.5868, - "rewards/accuracies": 0.71875, - "rewards/chosen": -6.917219638824463, - "rewards/margins": 0.6947778463363647, - "rewards/rejected": -7.611997127532959, - "semantic_entropy": 0.013024079613387585, + "logits/chosen": 0.09237170219421387, + "logits/rejected": 0.2540927827358246, + "logps/chosen": -2.5075395107269287, + "logps/rejected": -3.0558876991271973, + "loss": 0.5886, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.5075395107269287, + "rewards/margins": 0.5483481287956238, + "rewards/rejected": -3.0558876991271973, "step": 920 }, { "epoch": 0.49506606455929086, - "grad_norm": 22.468621981222572, + "grad_norm": 12.983640991976607, "learning_rate": 9.87200209327504e-07, - "logits/chosen": 0.30431827902793884, - "logits/rejected": 0.4026539921760559, - "logps/chosen": -7.283698081970215, - "logps/rejected": -7.741362571716309, - "loss": 0.6606, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -7.283698081970215, - "rewards/margins": 0.4576646387577057, - "rewards/rejected": -7.741362571716309, - "semantic_entropy": 0.009632373228669167, + "logits/chosen": -0.02536569908261299, + "logits/rejected": 0.14128804206848145, + "logps/chosen": -2.794076681137085, + "logps/rejected": -3.209535598754883, + "loss": 0.6013, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.794076681137085, + "rewards/margins": 0.4154588580131531, + "rewards/rejected": -3.209535598754883, "step": 925 }, { "epoch": 0.4977420973406924, - "grad_norm": 22.170244658612646, + "grad_norm": 12.507300678860842, "learning_rate": 9.868477119388894e-07, - "logits/chosen": 0.29118505120277405, - "logits/rejected": 0.34077757596969604, - "logps/chosen": -7.040729522705078, - "logps/rejected": -7.742720127105713, - "loss": 0.5839, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -7.040729522705078, - "rewards/margins": 0.7019898295402527, - "rewards/rejected": -7.742720127105713, - "semantic_entropy": 0.01227110717445612, + "logits/chosen": -0.05326130986213684, + "logits/rejected": 0.05450112372636795, + "logps/chosen": -2.668976306915283, + "logps/rejected": -3.233686923980713, + "loss": 0.6119, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.668976306915283, + "rewards/margins": 0.5647104978561401, + "rewards/rejected": -3.233686923980713, "step": 930 }, { "epoch": 0.500418130122094, - "grad_norm": 17.301303488197902, + "grad_norm": 10.07656148157406, "learning_rate": 9.864904911516383e-07, - "logits/chosen": 0.285023957490921, - "logits/rejected": 0.3279130458831787, - "logps/chosen": -7.310843467712402, - "logps/rejected": -7.921121120452881, - "loss": 0.5644, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -7.310843467712402, - "rewards/margins": 0.6102767586708069, - "rewards/rejected": -7.921121120452881, - "semantic_entropy": 0.011266985908150673, + "logits/chosen": 0.036895230412483215, + "logits/rejected": 0.08204253017902374, + "logps/chosen": -2.8191280364990234, + "logps/rejected": -3.234853744506836, + "loss": 0.6237, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.8191280364990234, + "rewards/margins": 0.4157254695892334, + "rewards/rejected": -3.234853744506836, "step": 935 }, { "epoch": 0.5030941629034956, - "grad_norm": 17.308136534374302, + "grad_norm": 10.686022315815544, "learning_rate": 9.861285504315084e-07, - "logits/chosen": 0.2767130434513092, - "logits/rejected": 0.3372814357280731, - "logps/chosen": -7.225625038146973, - "logps/rejected": -7.890871524810791, - "loss": 0.5472, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.225625038146973, - "rewards/margins": 0.6652467846870422, - "rewards/rejected": -7.890871524810791, - "semantic_entropy": 0.011087710037827492, + "logits/chosen": -0.008217873051762581, + "logits/rejected": 0.10149233043193817, + "logps/chosen": -2.7078254222869873, + "logps/rejected": -3.1586432456970215, + "loss": 0.5898, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.7078254222869873, + "rewards/margins": 0.45081788301467896, + "rewards/rejected": -3.1586432456970215, "step": 940 }, { "epoch": 0.5057701956848971, - "grad_norm": 18.83655138445813, + "grad_norm": 10.846661384143271, "learning_rate": 9.857618932900502e-07, - "logits/chosen": 0.25278183817863464, - "logits/rejected": 0.37249675393104553, - "logps/chosen": -7.409969329833984, - "logps/rejected": -7.930933475494385, - "loss": 0.6172, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -7.409969329833984, - "rewards/margins": 0.5209641456604004, - "rewards/rejected": -7.930933475494385, - "semantic_entropy": 0.012186022475361824, + "logits/chosen": -0.040483418852090836, + "logits/rejected": 0.09846405684947968, + "logps/chosen": -2.683884859085083, + "logps/rejected": -3.2226176261901855, + "loss": 0.5548, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.683884859085083, + "rewards/margins": 0.538733184337616, + "rewards/rejected": -3.2226176261901855, "step": 945 }, { "epoch": 0.5084462284662987, - "grad_norm": 22.919603952974256, + "grad_norm": 10.342696952874242, "learning_rate": 9.853905232845727e-07, - "logits/chosen": 0.22130601108074188, - "logits/rejected": 0.3324377238750458, - "logps/chosen": -7.2493391036987305, - "logps/rejected": -7.837095737457275, - "loss": 0.614, - "rewards/accuracies": 0.71875, - "rewards/chosen": -7.2493391036987305, - "rewards/margins": 0.5877568125724792, - "rewards/rejected": -7.837095737457275, - "semantic_entropy": 0.013909459114074707, + "logits/chosen": -0.044281408190727234, + "logits/rejected": 0.11830176413059235, + "logps/chosen": -2.7899413108825684, + "logps/rejected": -3.2054734230041504, + "loss": 0.6244, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.7899413108825684, + "rewards/margins": 0.41553187370300293, + "rewards/rejected": -3.2054734230041504, "step": 950 }, { "epoch": 0.5111222612477003, - "grad_norm": 20.299888419343265, + "grad_norm": 10.433648143564472, "learning_rate": 9.850144440181095e-07, - "logits/chosen": 0.2346227467060089, - "logits/rejected": 0.38050609827041626, - "logps/chosen": -7.475827217102051, - "logps/rejected": -8.115577697753906, - "loss": 0.5694, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -7.475827217102051, - "rewards/margins": 0.639750599861145, - "rewards/rejected": -8.115577697753906, - "semantic_entropy": 0.011748342774808407, + "logits/chosen": -0.00742561649531126, + "logits/rejected": 0.1885843575000763, + "logps/chosen": -2.9425978660583496, + "logps/rejected": -3.395627498626709, + "loss": 0.5951, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9425978660583496, + "rewards/margins": 0.45302996039390564, + "rewards/rejected": -3.395627498626709, "step": 955 }, { "epoch": 0.5137982940291018, - "grad_norm": 26.80684693873056, + "grad_norm": 9.113901546964737, "learning_rate": 9.846336591393832e-07, - "logits/chosen": 0.26571953296661377, - "logits/rejected": 0.3704363703727722, - "logps/chosen": -7.530020713806152, - "logps/rejected": -8.231939315795898, - "loss": 0.5966, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -7.530020713806152, - "rewards/margins": 0.7019174098968506, - "rewards/rejected": -8.231939315795898, - "semantic_entropy": 0.010797923430800438, + "logits/chosen": -0.045781467109918594, + "logits/rejected": 0.08921016752719879, + "logps/chosen": -2.939887523651123, + "logps/rejected": -3.3899879455566406, + "loss": 0.6214, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.939887523651123, + "rewards/margins": 0.4501004219055176, + "rewards/rejected": -3.3899879455566406, "step": 960 }, { "epoch": 0.5164743268105034, - "grad_norm": 22.781706412422338, + "grad_norm": 11.536622553504339, "learning_rate": 9.842481723427704e-07, - "logits/chosen": 0.3323562741279602, - "logits/rejected": 0.3556436598300934, - "logps/chosen": -7.796743869781494, - "logps/rejected": -8.327234268188477, - "loss": 0.6751, - "rewards/accuracies": 0.65625, - "rewards/chosen": -7.796743869781494, - "rewards/margins": 0.5304909944534302, - "rewards/rejected": -8.327234268188477, - "semantic_entropy": 0.0087089529260993, + "logits/chosen": 0.0495973564684391, + "logits/rejected": 0.0575166717171669, + "logps/chosen": -3.074889659881592, + "logps/rejected": -3.558596134185791, + "loss": 0.6444, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -3.074889659881592, + "rewards/margins": 0.4837065637111664, + "rewards/rejected": -3.558596134185791, "step": 965 }, { "epoch": 0.519150359591905, - "grad_norm": 15.79393005727687, + "grad_norm": 11.280320933300699, "learning_rate": 9.838579873682658e-07, - "logits/chosen": 0.37312960624694824, - "logits/rejected": 0.370185911655426, - "logps/chosen": -7.5613603591918945, - "logps/rejected": -8.011152267456055, - "loss": 0.6467, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -7.5613603591918945, - "rewards/margins": 0.4497918486595154, - "rewards/rejected": -8.011152267456055, - "semantic_entropy": 0.008952843025326729, + "logits/chosen": 0.02022400125861168, + "logits/rejected": 0.025910425931215286, + "logps/chosen": -2.8878684043884277, + "logps/rejected": -3.254673480987549, + "loss": 0.6392, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.8878684043884277, + "rewards/margins": 0.3668050467967987, + "rewards/rejected": -3.254673480987549, "step": 970 }, { "epoch": 0.5218263923733065, - "grad_norm": 11.274721147056358, + "grad_norm": 9.301001183677757, "learning_rate": 9.834631080014457e-07, - "logits/chosen": 0.34255561232566833, - "logits/rejected": 0.4821571409702301, - "logps/chosen": -7.382586479187012, - "logps/rejected": -7.989335060119629, - "loss": 0.5608, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.382586479187012, - "rewards/margins": 0.6067487001419067, - "rewards/rejected": -7.989335060119629, - "semantic_entropy": 0.00937967374920845, + "logits/chosen": -0.10016496479511261, + "logits/rejected": 0.08622417598962784, + "logps/chosen": -2.8813564777374268, + "logps/rejected": -3.380495071411133, + "loss": 0.5618, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8813564777374268, + "rewards/margins": 0.4991387724876404, + "rewards/rejected": -3.380495071411133, "step": 975 }, { "epoch": 0.5245024251547081, - "grad_norm": 18.0683203101528, + "grad_norm": 18.01238890394644, "learning_rate": 9.830635380734312e-07, - "logits/chosen": 0.3637096583843231, - "logits/rejected": 0.4652346074581146, - "logps/chosen": -7.228192329406738, - "logps/rejected": -7.7642951011657715, - "loss": 0.5798, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -7.228192329406738, - "rewards/margins": 0.536103367805481, - "rewards/rejected": -7.7642951011657715, - "semantic_entropy": 0.010401034727692604, + "logits/chosen": -0.07457484304904938, + "logits/rejected": 0.10912604629993439, + "logps/chosen": -2.985914707183838, + "logps/rejected": -3.3950748443603516, + "loss": 0.6253, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.985914707183838, + "rewards/margins": 0.4091602861881256, + "rewards/rejected": -3.3950748443603516, "step": 980 }, { "epoch": 0.5271784579361097, - "grad_norm": 15.686670446006769, + "grad_norm": 10.660246382660587, "learning_rate": 9.826592814608517e-07, - "logits/chosen": 0.508113443851471, - "logits/rejected": 0.6232683062553406, - "logps/chosen": -7.083076477050781, - "logps/rejected": -7.62778377532959, - "loss": 0.5897, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -7.083076477050781, - "rewards/margins": 0.5447085499763489, - "rewards/rejected": -7.62778377532959, - "semantic_entropy": 0.010786814615130424, + "logits/chosen": 0.016102924942970276, + "logits/rejected": 0.21320465207099915, + "logps/chosen": -2.8388538360595703, + "logps/rejected": -3.301830768585205, + "loss": 0.5887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8388538360595703, + "rewards/margins": 0.46297687292099, + "rewards/rejected": -3.301830768585205, "step": 985 }, { "epoch": 0.5298544907175113, - "grad_norm": 11.8792829377083, + "grad_norm": 10.502621576823326, "learning_rate": 9.822503420858067e-07, - "logits/chosen": 0.5830814242362976, - "logits/rejected": 0.602809488773346, - "logps/chosen": -7.003039360046387, - "logps/rejected": -7.556340217590332, - "loss": 0.5887, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -7.003039360046387, - "rewards/margins": 0.5533004999160767, - "rewards/rejected": -7.556340217590332, - "semantic_entropy": 0.01125816348940134, + "logits/chosen": 0.07925257086753845, + "logits/rejected": 0.10265052318572998, + "logps/chosen": -2.716010332107544, + "logps/rejected": -3.263709306716919, + "loss": 0.5776, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.716010332107544, + "rewards/margins": 0.5476993918418884, + "rewards/rejected": -3.263709306716919, "step": 990 }, { "epoch": 0.5325305234989128, - "grad_norm": 13.602049737746205, + "grad_norm": 13.630124204544165, "learning_rate": 9.818367239158277e-07, - "logits/chosen": 0.5771272778511047, - "logits/rejected": 0.6160858869552612, - "logps/chosen": -7.035143852233887, - "logps/rejected": -7.554785251617432, - "loss": 0.6097, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -7.035143852233887, - "rewards/margins": 0.5196409225463867, - "rewards/rejected": -7.554785251617432, - "semantic_entropy": 0.010585736483335495, + "logits/chosen": 0.07809169590473175, + "logits/rejected": 0.15364494919776917, + "logps/chosen": -2.888439655303955, + "logps/rejected": -3.3436026573181152, + "loss": 0.6352, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.888439655303955, + "rewards/margins": 0.45516282320022583, + "rewards/rejected": -3.3436026573181152, "step": 995 }, { "epoch": 0.5352065562803144, - "grad_norm": 13.516147889237553, + "grad_norm": 15.46111677789521, "learning_rate": 9.8141843096384e-07, - "logits/chosen": 0.594735860824585, - "logits/rejected": 0.6816811561584473, - "logps/chosen": -7.359915256500244, - "logps/rejected": -7.887757778167725, - "loss": 0.5661, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -7.359915256500244, - "rewards/margins": 0.52784264087677, - "rewards/rejected": -7.887757778167725, - "semantic_entropy": 0.00826399214565754, + "logits/chosen": 0.07574556767940521, + "logits/rejected": 0.20494620501995087, + "logps/chosen": -3.1541690826416016, + "logps/rejected": -3.7323715686798096, + "loss": 0.5554, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1541690826416016, + "rewards/margins": 0.5782027244567871, + "rewards/rejected": -3.7323715686798096, "step": 1000 }, { "epoch": 0.537882589061716, - "grad_norm": 21.398826323428153, + "grad_norm": 15.766242089611916, "learning_rate": 9.809954672881237e-07, - "logits/chosen": 0.5744519829750061, - "logits/rejected": 0.676822304725647, - "logps/chosen": -7.203065395355225, - "logps/rejected": -7.726864814758301, - "loss": 0.5962, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -7.203065395355225, - "rewards/margins": 0.5237992405891418, - "rewards/rejected": -7.726864814758301, - "semantic_entropy": 0.01075592078268528, + "logits/chosen": 0.050642527639865875, + "logits/rejected": 0.21202464401721954, + "logps/chosen": -3.2888782024383545, + "logps/rejected": -3.7921385765075684, + "loss": 0.637, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.2888782024383545, + "rewards/margins": 0.50326007604599, + "rewards/rejected": -3.7921385765075684, "step": 1005 }, { "epoch": 0.5405586218431175, - "grad_norm": 13.803390343561437, + "grad_norm": 14.260365220145145, "learning_rate": 9.80567836992274e-07, - "logits/chosen": 0.5930423140525818, - "logits/rejected": 0.6964749693870544, - "logps/chosen": -6.960592746734619, - "logps/rejected": -7.653602600097656, - "loss": 0.5627, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -6.960592746734619, - "rewards/margins": 0.6930093765258789, - "rewards/rejected": -7.653602600097656, - "semantic_entropy": 0.012550493702292442, + "logits/chosen": 0.024269647896289825, + "logits/rejected": 0.21209308505058289, + "logps/chosen": -2.97652006149292, + "logps/rejected": -3.6514039039611816, + "loss": 0.561, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.97652006149292, + "rewards/margins": 0.6748837232589722, + "rewards/rejected": -3.6514039039611816, "step": 1010 }, { "epoch": 0.5432346546245191, - "grad_norm": 17.08463390545894, + "grad_norm": 10.715897210901726, "learning_rate": 9.801355442251625e-07, - "logits/chosen": 0.5665202140808105, - "logits/rejected": 0.6539164781570435, - "logps/chosen": -6.99094295501709, - "logps/rejected": -7.582823276519775, - "loss": 0.5839, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -6.99094295501709, - "rewards/margins": 0.5918795466423035, - "rewards/rejected": -7.582823276519775, - "semantic_entropy": 0.011810271069407463, + "logits/chosen": 0.0067981332540512085, + "logits/rejected": 0.16358794271945953, + "logps/chosen": -2.9791476726531982, + "logps/rejected": -3.5057830810546875, + "loss": 0.6043, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9791476726531982, + "rewards/margins": 0.5266350507736206, + "rewards/rejected": -3.5057830810546875, "step": 1015 }, { "epoch": 0.5459106874059207, - "grad_norm": 16.08279102674936, + "grad_norm": 15.052140933399171, "learning_rate": 9.796985931808949e-07, - "logits/chosen": 0.5616046786308289, - "logits/rejected": 0.6401379704475403, - "logps/chosen": -6.898770809173584, - "logps/rejected": -7.535937309265137, - "loss": 0.5637, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -6.898770809173584, - "rewards/margins": 0.6371673941612244, - "rewards/rejected": -7.535937309265137, - "semantic_entropy": 0.012174823321402073, + "logits/chosen": 0.006623516790568829, + "logits/rejected": 0.15501698851585388, + "logps/chosen": -3.0031588077545166, + "logps/rejected": -3.54280424118042, + "loss": 0.5631, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.0031588077545166, + "rewards/margins": 0.5396460294723511, + "rewards/rejected": -3.54280424118042, "step": 1020 }, { "epoch": 0.5485867201873222, - "grad_norm": 17.72669785551067, + "grad_norm": 15.845875698524438, "learning_rate": 9.792569880987724e-07, - "logits/chosen": 0.5178264379501343, - "logits/rejected": 0.5918501019477844, - "logps/chosen": -7.055424690246582, - "logps/rejected": -7.835641384124756, - "loss": 0.5346, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -7.055424690246582, - "rewards/margins": 0.7802165150642395, - "rewards/rejected": -7.835641384124756, - "semantic_entropy": 0.012361900880932808, + "logits/chosen": -0.06385553628206253, + "logits/rejected": 0.057505108416080475, + "logps/chosen": -3.065153121948242, + "logps/rejected": -3.6488163471221924, + "loss": 0.5819, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.065153121948242, + "rewards/margins": 0.5836632251739502, + "rewards/rejected": -3.6488163471221924, "step": 1025 }, { "epoch": 0.5512627529687238, - "grad_norm": 23.667023672408078, + "grad_norm": 25.677327698832478, "learning_rate": 9.788107332632493e-07, - "logits/chosen": 0.5643856525421143, - "logits/rejected": 0.615861713886261, - "logps/chosen": -7.197871208190918, - "logps/rejected": -7.710868835449219, - "loss": 0.6595, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -7.197871208190918, - "rewards/margins": 0.512997031211853, - "rewards/rejected": -7.710868835449219, - "semantic_entropy": 0.010254869237542152, + "logits/chosen": -0.028798198327422142, + "logits/rejected": 0.05488016456365585, + "logps/chosen": -3.0705292224884033, + "logps/rejected": -3.5016632080078125, + "loss": 0.6467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0705292224884033, + "rewards/margins": 0.4311339259147644, + "rewards/rejected": -3.5016632080078125, "step": 1030 }, { "epoch": 0.5539387857501255, - "grad_norm": 18.12568347350743, + "grad_norm": 11.878687747816457, "learning_rate": 9.783598330038924e-07, - "logits/chosen": 0.607509434223175, - "logits/rejected": 0.6717751622200012, - "logps/chosen": -7.640361785888672, - "logps/rejected": -8.166301727294922, - "loss": 0.5919, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.640361785888672, - "rewards/margins": 0.525938868522644, - "rewards/rejected": -8.166301727294922, - "semantic_entropy": 0.0061937421560287476, + "logits/chosen": -0.0441121943295002, + "logits/rejected": 0.07024122774600983, + "logps/chosen": -2.9722323417663574, + "logps/rejected": -3.3824875354766846, + "loss": 0.6102, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.9722323417663574, + "rewards/margins": 0.41025543212890625, + "rewards/rejected": -3.3824875354766846, "step": 1035 }, { "epoch": 0.5566148185315271, - "grad_norm": 16.521524018519518, + "grad_norm": 16.07958563269316, "learning_rate": 9.779042916953376e-07, - "logits/chosen": 0.6564599871635437, - "logits/rejected": 0.7573956251144409, - "logps/chosen": -7.6685791015625, - "logps/rejected": -8.591699600219727, - "loss": 0.4835, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -7.6685791015625, - "rewards/margins": 0.923120379447937, - "rewards/rejected": -8.591699600219727, - "semantic_entropy": 0.006848378572613001, + "logits/chosen": -0.017382394522428513, + "logits/rejected": 0.153041809797287, + "logps/chosen": -2.656935214996338, + "logps/rejected": -3.2748610973358154, + "loss": 0.5611, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.656935214996338, + "rewards/margins": 0.6179259419441223, + "rewards/rejected": -3.2748610973358154, "step": 1040 }, { "epoch": 0.5592908513129285, - "grad_norm": 23.54333369460797, + "grad_norm": 10.881468485762895, "learning_rate": 9.774441137572487e-07, - "logits/chosen": 0.6087485551834106, - "logits/rejected": 0.704781711101532, - "logps/chosen": -8.048932075500488, - "logps/rejected": -8.719579696655273, - "loss": 0.5672, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.048932075500488, - "rewards/margins": 0.6706476211547852, - "rewards/rejected": -8.719579696655273, - "semantic_entropy": 0.005030449479818344, + "logits/chosen": -0.11188797652721405, + "logits/rejected": 0.033138252794742584, + "logps/chosen": -2.985079526901245, + "logps/rejected": -3.5826873779296875, + "loss": 0.5561, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.985079526901245, + "rewards/margins": 0.5976080298423767, + "rewards/rejected": -3.5826873779296875, "step": 1045 }, { "epoch": 0.5619668840943302, - "grad_norm": 18.562047799738053, + "grad_norm": 14.43406342800708, "learning_rate": 9.76979303654274e-07, - "logits/chosen": 0.531555712223053, - "logits/rejected": 0.5957599878311157, - "logps/chosen": -8.234556198120117, - "logps/rejected": -8.946748733520508, - "loss": 0.5617, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.234556198120117, - "rewards/margins": 0.7121928930282593, - "rewards/rejected": -8.946748733520508, - "semantic_entropy": 0.003957569133490324, + "logits/chosen": -0.12570540606975555, + "logits/rejected": -0.025706101208925247, + "logps/chosen": -3.1821255683898926, + "logps/rejected": -3.7612223625183105, + "loss": 0.5618, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.1821255683898926, + "rewards/margins": 0.5790969133377075, + "rewards/rejected": -3.7612223625183105, "step": 1050 }, { "epoch": 0.5646429168757318, - "grad_norm": 20.633835966884693, + "grad_norm": 18.368626800390786, "learning_rate": 9.765098658960035e-07, - "logits/chosen": 0.5291253924369812, - "logits/rejected": 0.5597686767578125, - "logps/chosen": -8.142400741577148, - "logps/rejected": -8.771413803100586, - "loss": 0.5741, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.142400741577148, - "rewards/margins": 0.6290136575698853, - "rewards/rejected": -8.771413803100586, - "semantic_entropy": 0.004258748609572649, + "logits/chosen": -0.037792421877384186, + "logits/rejected": 0.023655090481042862, + "logps/chosen": -3.1505191326141357, + "logps/rejected": -3.7224929332733154, + "loss": 0.5625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1505191326141357, + "rewards/margins": 0.5719733834266663, + "rewards/rejected": -3.7224929332733154, "step": 1055 }, { "epoch": 0.5673189496571333, - "grad_norm": 34.54645017218859, + "grad_norm": 16.620923723011963, "learning_rate": 9.76035805036924e-07, - "logits/chosen": 0.5386477112770081, - "logits/rejected": 0.6480933427810669, - "logps/chosen": -8.24083137512207, - "logps/rejected": -8.844766616821289, - "loss": 0.5895, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.24083137512207, - "rewards/margins": 0.6039354801177979, - "rewards/rejected": -8.844766616821289, - "semantic_entropy": 0.004295586608350277, + "logits/chosen": 0.024298612028360367, + "logits/rejected": 0.20108160376548767, + "logps/chosen": -3.452113389968872, + "logps/rejected": -3.9429848194122314, + "loss": 0.6146, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.452113389968872, + "rewards/margins": 0.4908713400363922, + "rewards/rejected": -3.9429848194122314, "step": 1060 }, { "epoch": 0.5699949824385349, - "grad_norm": 18.991335444449422, + "grad_norm": 14.13909880042195, "learning_rate": 9.755571256763764e-07, - "logits/chosen": 0.5871809124946594, - "logits/rejected": 0.6752435564994812, - "logps/chosen": -8.063031196594238, - "logps/rejected": -8.696462631225586, - "loss": 0.5889, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -8.063031196594238, - "rewards/margins": 0.6334304809570312, - "rewards/rejected": -8.696462631225586, - "semantic_entropy": 0.005001295357942581, + "logits/chosen": 0.01513664424419403, + "logits/rejected": 0.14771047234535217, + "logps/chosen": -3.361903429031372, + "logps/rejected": -3.9949944019317627, + "loss": 0.5613, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.361903429031372, + "rewards/margins": 0.6330903768539429, + "rewards/rejected": -3.9949944019317627, "step": 1065 }, { "epoch": 0.5726710152199365, - "grad_norm": 13.986507130299717, + "grad_norm": 10.133871947228824, "learning_rate": 9.750738324585097e-07, - "logits/chosen": 0.5075832605361938, - "logits/rejected": 0.65854811668396, - "logps/chosen": -8.004261016845703, - "logps/rejected": -8.6107177734375, - "loss": 0.5822, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.004261016845703, - "rewards/margins": 0.6064566373825073, - "rewards/rejected": -8.6107177734375, - "semantic_entropy": 0.004056010395288467, + "logits/chosen": -0.12645608186721802, + "logits/rejected": 0.11110260337591171, + "logps/chosen": -3.464323043823242, + "logps/rejected": -4.075308799743652, + "loss": 0.5594, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.464323043823242, + "rewards/margins": 0.6109856367111206, + "rewards/rejected": -4.075308799743652, "step": 1070 }, { "epoch": 0.5753470480013381, - "grad_norm": 13.26346189390983, + "grad_norm": 8.19280822371788, "learning_rate": 9.74585930072237e-07, - "logits/chosen": 0.5564194917678833, - "logits/rejected": 0.6411615014076233, - "logps/chosen": -7.78427791595459, - "logps/rejected": -8.535164833068848, - "loss": 0.5486, - "rewards/accuracies": 0.75, - "rewards/chosen": -7.78427791595459, - "rewards/margins": 0.7508861422538757, - "rewards/rejected": -8.535164833068848, - "semantic_entropy": 0.00624846201390028, + "logits/chosen": -0.016082096844911575, + "logits/rejected": 0.11984304338693619, + "logps/chosen": -3.111365556716919, + "logps/rejected": -3.73207426071167, + "loss": 0.5792, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.111365556716919, + "rewards/margins": 0.6207085847854614, + "rewards/rejected": -3.73207426071167, "step": 1075 }, { "epoch": 0.5780230807827396, - "grad_norm": 17.424932509279063, + "grad_norm": 11.298433677260162, "learning_rate": 9.740934232511892e-07, - "logits/chosen": 0.5387696623802185, - "logits/rejected": 0.6152251362800598, - "logps/chosen": -7.787422180175781, - "logps/rejected": -8.456579208374023, - "loss": 0.5961, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -7.787422180175781, - "rewards/margins": 0.6691574454307556, - "rewards/rejected": -8.456579208374023, - "semantic_entropy": 0.005864334292709827, + "logits/chosen": -0.12629804015159607, + "logits/rejected": -0.01740388013422489, + "logps/chosen": -3.220460891723633, + "logps/rejected": -3.7799887657165527, + "loss": 0.5742, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.220460891723633, + "rewards/margins": 0.5595277547836304, + "rewards/rejected": -3.7799887657165527, "step": 1080 }, { "epoch": 0.5806991135641412, - "grad_norm": 13.701215181738062, + "grad_norm": 13.412340126241434, "learning_rate": 9.735963167736698e-07, - "logits/chosen": 0.6226884126663208, - "logits/rejected": 0.7190333008766174, - "logps/chosen": -7.892449855804443, - "logps/rejected": -8.544393539428711, - "loss": 0.5802, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -7.892449855804443, - "rewards/margins": 0.6519426107406616, - "rewards/rejected": -8.544393539428711, - "semantic_entropy": 0.005305818282067776, + "logits/chosen": -0.027102509513497353, + "logits/rejected": 0.13758878409862518, + "logps/chosen": -3.0929787158966064, + "logps/rejected": -3.486976146697998, + "loss": 0.6322, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -3.0929787158966064, + "rewards/margins": 0.39399731159210205, + "rewards/rejected": -3.486976146697998, "step": 1085 }, { "epoch": 0.5833751463455428, - "grad_norm": 24.146957684848896, + "grad_norm": 13.779960488199876, "learning_rate": 9.730946154626078e-07, - "logits/chosen": 0.6268946528434753, - "logits/rejected": 0.6841186285018921, - "logps/chosen": -7.797842502593994, - "logps/rejected": -8.302523612976074, - "loss": 0.6651, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -7.797842502593994, - "rewards/margins": 0.5046811699867249, - "rewards/rejected": -8.302523612976074, - "semantic_entropy": 0.006395612843334675, + "logits/chosen": -0.0119629530236125, + "logits/rejected": 0.08613927662372589, + "logps/chosen": -3.3224644660949707, + "logps/rejected": -3.7729790210723877, + "loss": 0.6489, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.3224644660949707, + "rewards/margins": 0.450514018535614, + "rewards/rejected": -3.7729790210723877, "step": 1090 }, { "epoch": 0.5860511791269443, - "grad_norm": 18.196449287747534, + "grad_norm": 17.131458706643393, "learning_rate": 9.725883241855117e-07, - "logits/chosen": 0.5718734264373779, - "logits/rejected": 0.6677632331848145, - "logps/chosen": -7.862264156341553, - "logps/rejected": -8.452044486999512, - "loss": 0.5838, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -7.862264156341553, - "rewards/margins": 0.5897812843322754, - "rewards/rejected": -8.452044486999512, - "semantic_entropy": 0.005370546132326126, + "logits/chosen": -0.16275691986083984, + "logits/rejected": -0.005458747036755085, + "logps/chosen": -3.293793201446533, + "logps/rejected": -3.8579368591308594, + "loss": 0.5736, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.293793201446533, + "rewards/margins": 0.5641440153121948, + "rewards/rejected": -3.8579368591308594, "step": 1095 }, { "epoch": 0.5887272119083459, - "grad_norm": 18.197375532898803, + "grad_norm": 14.471406156127255, "learning_rate": 9.720774478544218e-07, - "logits/chosen": 0.6339142322540283, - "logits/rejected": 0.7179350852966309, - "logps/chosen": -7.516765594482422, - "logps/rejected": -8.200715065002441, - "loss": 0.5646, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.516765594482422, - "rewards/margins": 0.6839491128921509, - "rewards/rejected": -8.200715065002441, - "semantic_entropy": 0.007220913656055927, + "logits/chosen": -0.024837370961904526, + "logits/rejected": 0.10001087188720703, + "logps/chosen": -3.1233417987823486, + "logps/rejected": -3.787309169769287, + "loss": 0.553, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.1233417987823486, + "rewards/margins": 0.6639671325683594, + "rewards/rejected": -3.787309169769287, "step": 1100 }, { "epoch": 0.5914032446897475, - "grad_norm": 16.846636546681744, + "grad_norm": 13.2061941041561, "learning_rate": 9.715619914258624e-07, - "logits/chosen": 0.570271909236908, - "logits/rejected": 0.6238844990730286, - "logps/chosen": -7.6180243492126465, - "logps/rejected": -8.167299270629883, - "loss": 0.5952, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -7.6180243492126465, - "rewards/margins": 0.549274206161499, - "rewards/rejected": -8.167299270629883, - "semantic_entropy": 0.006116692908108234, + "logits/chosen": -0.13341191411018372, + "logits/rejected": -0.038438569754362106, + "logps/chosen": -3.2578506469726562, + "logps/rejected": -3.6706607341766357, + "loss": 0.6303, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.2578506469726562, + "rewards/margins": 0.41281014680862427, + "rewards/rejected": -3.6706607341766357, "step": 1105 }, { "epoch": 0.594079277471149, - "grad_norm": 26.633505176750862, + "grad_norm": 16.733042248531707, "learning_rate": 9.710419599007937e-07, - "logits/chosen": 0.638900101184845, - "logits/rejected": 0.7319290637969971, - "logps/chosen": -7.582394599914551, - "logps/rejected": -8.110502243041992, - "loss": 0.5966, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -7.582394599914551, - "rewards/margins": 0.5281090140342712, - "rewards/rejected": -8.110502243041992, - "semantic_entropy": 0.006717337761074305, + "logits/chosen": -0.06488059461116791, + "logits/rejected": 0.07974977046251297, + "logps/chosen": -3.2136306762695312, + "logps/rejected": -3.6501152515411377, + "loss": 0.6137, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.2136306762695312, + "rewards/margins": 0.436484158039093, + "rewards/rejected": -3.6501152515411377, "step": 1110 }, { "epoch": 0.5967553102525506, - "grad_norm": 27.411766077174338, + "grad_norm": 25.620392437138566, "learning_rate": 9.705173583245643e-07, - "logits/chosen": 0.6096881628036499, - "logits/rejected": 0.7123531699180603, - "logps/chosen": -7.56555700302124, - "logps/rejected": -7.999688625335693, - "loss": 0.6675, - "rewards/accuracies": 0.65625, - "rewards/chosen": -7.56555700302124, - "rewards/margins": 0.4341324269771576, - "rewards/rejected": -7.999688625335693, - "semantic_entropy": 0.0064805252477526665, + "logits/chosen": -0.03486672043800354, + "logits/rejected": 0.11669953167438507, + "logps/chosen": -2.87746524810791, + "logps/rejected": -3.3858590126037598, + "loss": 0.6095, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.87746524810791, + "rewards/margins": 0.5083936452865601, + "rewards/rejected": -3.3858590126037598, "step": 1115 }, { "epoch": 0.5994313430339522, - "grad_norm": 13.615665322183913, + "grad_norm": 9.396261321404555, "learning_rate": 9.699881917868609e-07, - "logits/chosen": 0.5667654275894165, - "logits/rejected": 0.6309981346130371, - "logps/chosen": -7.366901397705078, - "logps/rejected": -7.9114861488342285, - "loss": 0.6056, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.366901397705078, - "rewards/margins": 0.5445848703384399, - "rewards/rejected": -7.9114861488342285, - "semantic_entropy": 0.007139952387660742, + "logits/chosen": -0.18330372869968414, + "logits/rejected": -0.05816059187054634, + "logps/chosen": -2.8093981742858887, + "logps/rejected": -3.3236403465270996, + "loss": 0.5864, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8093981742858887, + "rewards/margins": 0.5142425298690796, + "rewards/rejected": -3.3236403465270996, "step": 1120 }, { "epoch": 0.6021073758153538, - "grad_norm": 15.675473698672269, + "grad_norm": 12.03145215088481, "learning_rate": 9.694544654216594e-07, - "logits/chosen": 0.5247001647949219, - "logits/rejected": 0.6292780041694641, - "logps/chosen": -7.417148590087891, - "logps/rejected": -7.97311544418335, - "loss": 0.5796, + "logits/chosen": -0.1462056040763855, + "logits/rejected": 0.051098644733428955, + "logps/chosen": -2.8837461471557617, + "logps/rejected": -3.448829174041748, + "loss": 0.5494, "rewards/accuracies": 0.71875, - "rewards/chosen": -7.417148590087891, - "rewards/margins": 0.5559675097465515, - "rewards/rejected": -7.97311544418335, - "semantic_entropy": 0.006966522429138422, + "rewards/chosen": -2.8837461471557617, + "rewards/margins": 0.5650831460952759, + "rewards/rejected": -3.448829174041748, "step": 1125 }, { "epoch": 0.6047834085967553, - "grad_norm": 16.21958228068165, + "grad_norm": 13.728481744729434, "learning_rate": 9.689161844071755e-07, - "logits/chosen": 0.6028557419776917, - "logits/rejected": 0.6466434597969055, - "logps/chosen": -7.145249366760254, - "logps/rejected": -7.668765068054199, - "loss": 0.5902, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -7.145249366760254, - "rewards/margins": 0.5235155820846558, - "rewards/rejected": -7.668765068054199, - "semantic_entropy": 0.008800549432635307, + "logits/chosen": 0.025365393608808517, + "logits/rejected": 0.10421857982873917, + "logps/chosen": -2.7121846675872803, + "logps/rejected": -3.1644644737243652, + "loss": 0.6112, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.7121846675872803, + "rewards/margins": 0.4522795081138611, + "rewards/rejected": -3.1644644737243652, "step": 1130 }, { "epoch": 0.6074594413781569, - "grad_norm": 15.585684935656099, + "grad_norm": 11.661261570051034, "learning_rate": 9.683733539658138e-07, - "logits/chosen": 0.6030339002609253, - "logits/rejected": 0.7146845459938049, - "logps/chosen": -7.388113975524902, - "logps/rejected": -7.941763401031494, - "loss": 0.5785, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.388113975524902, - "rewards/margins": 0.5536485910415649, - "rewards/rejected": -7.941763401031494, - "semantic_entropy": 0.007055189460515976, + "logits/chosen": -0.08707188069820404, + "logits/rejected": 0.08479343354701996, + "logps/chosen": -2.8947556018829346, + "logps/rejected": -3.401803493499756, + "loss": 0.5967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8947556018829346, + "rewards/margins": 0.5070478916168213, + "rewards/rejected": -3.401803493499756, "step": 1135 }, { "epoch": 0.6101354741595585, - "grad_norm": 15.602057313200888, + "grad_norm": 12.15583102153243, "learning_rate": 9.678259793641178e-07, - "logits/chosen": 0.5913205742835999, - "logits/rejected": 0.602319598197937, - "logps/chosen": -7.421736717224121, - "logps/rejected": -7.810797214508057, - "loss": 0.6116, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.421736717224121, - "rewards/margins": 0.38906151056289673, - "rewards/rejected": -7.810797214508057, - "semantic_entropy": 0.00688566267490387, + "logits/chosen": -0.04618864879012108, + "logits/rejected": -0.0021269440185278654, + "logps/chosen": -2.954184055328369, + "logps/rejected": -3.2835166454315186, + "loss": 0.6405, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.954184055328369, + "rewards/margins": 0.32933247089385986, + "rewards/rejected": -3.2835166454315186, "step": 1140 }, { "epoch": 0.61281150694096, - "grad_norm": 15.626443610629703, + "grad_norm": 10.921089433146548, "learning_rate": 9.672740659127183e-07, - "logits/chosen": 0.5252998471260071, - "logits/rejected": 0.5994977355003357, - "logps/chosen": -7.5483551025390625, - "logps/rejected": -8.214573860168457, - "loss": 0.5531, - "rewards/accuracies": 0.71875, - "rewards/chosen": -7.5483551025390625, - "rewards/margins": 0.6662176251411438, - "rewards/rejected": -8.214573860168457, - "semantic_entropy": 0.007115071173757315, + "logits/chosen": -0.18046610057353973, + "logits/rejected": -0.040287889540195465, + "logps/chosen": -2.9487435817718506, + "logps/rejected": -3.5605597496032715, + "loss": 0.5817, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.9487435817718506, + "rewards/margins": 0.61181640625, + "rewards/rejected": -3.5605597496032715, "step": 1145 }, { "epoch": 0.6154875397223616, - "grad_norm": 16.005271466438277, + "grad_norm": 13.473225014474169, "learning_rate": 9.667176189662818e-07, - "logits/chosen": 0.5813131332397461, - "logits/rejected": 0.640332818031311, - "logps/chosen": -7.900903224945068, - "logps/rejected": -8.512435913085938, - "loss": 0.5717, + "logits/chosen": -0.1571674346923828, + "logits/rejected": -0.024224860593676567, + "logps/chosen": -3.0152747631073, + "logps/rejected": -3.6306490898132324, + "loss": 0.5505, "rewards/accuracies": 0.71875, - "rewards/chosen": -7.900903224945068, - "rewards/margins": 0.6115323901176453, - "rewards/rejected": -8.512435913085938, - "semantic_entropy": 0.0051747518591582775, + "rewards/chosen": -3.0152747631073, + "rewards/margins": 0.6153741478919983, + "rewards/rejected": -3.6306490898132324, "step": 1150 }, { "epoch": 0.6181635725037632, - "grad_norm": 10.625567401564435, + "grad_norm": 8.660489609950922, "learning_rate": 9.661566439234592e-07, - "logits/chosen": 0.6257847547531128, - "logits/rejected": 0.6622999906539917, - "logps/chosen": -7.946097373962402, - "logps/rejected": -8.47764778137207, - "loss": 0.6041, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.946097373962402, - "rewards/margins": 0.5315494537353516, - "rewards/rejected": -8.47764778137207, - "semantic_entropy": 0.005170217715203762, + "logits/chosen": -0.0691768005490303, + "logits/rejected": 0.04287392646074295, + "logps/chosen": -3.0143685340881348, + "logps/rejected": -3.4788291454315186, + "loss": 0.5969, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.0143685340881348, + "rewards/margins": 0.4644607901573181, + "rewards/rejected": -3.4788291454315186, "step": 1155 }, { "epoch": 0.6208396052851648, - "grad_norm": 13.397344602674059, + "grad_norm": 13.353657501405168, "learning_rate": 9.655911462268327e-07, - "logits/chosen": 0.6595005989074707, - "logits/rejected": 0.7116304636001587, - "logps/chosen": -7.946617126464844, - "logps/rejected": -8.630485534667969, - "loss": 0.5581, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.946617126464844, - "rewards/margins": 0.6838675737380981, - "rewards/rejected": -8.630485534667969, - "semantic_entropy": 0.005823346786201, + "logits/chosen": 0.002103020204231143, + "logits/rejected": 0.10967735201120377, + "logps/chosen": -3.1064541339874268, + "logps/rejected": -3.7477428913116455, + "loss": 0.522, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1064541339874268, + "rewards/margins": 0.6412887573242188, + "rewards/rejected": -3.7477428913116455, "step": 1160 }, { "epoch": 0.6235156380665663, - "grad_norm": 15.865578560547265, + "grad_norm": 11.29388179643121, "learning_rate": 9.650211313628636e-07, - "logits/chosen": 0.5803102254867554, - "logits/rejected": 0.6273818016052246, - "logps/chosen": -7.978617191314697, - "logps/rejected": -8.42393684387207, - "loss": 0.6414, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -7.978617191314697, - "rewards/margins": 0.44531959295272827, - "rewards/rejected": -8.42393684387207, - "semantic_entropy": 0.0065127527341246605, + "logits/chosen": -0.11299815028905869, + "logits/rejected": -0.01636177860200405, + "logps/chosen": -3.2274951934814453, + "logps/rejected": -3.65303111076355, + "loss": 0.6125, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.2274951934814453, + "rewards/margins": 0.4255361557006836, + "rewards/rejected": -3.65303111076355, "step": 1165 }, { "epoch": 0.6261916708479679, - "grad_norm": 15.673714953896777, + "grad_norm": 19.523999584759952, "learning_rate": 9.644466048618386e-07, - "logits/chosen": 0.5762825608253479, - "logits/rejected": 0.6548576354980469, - "logps/chosen": -8.140003204345703, - "logps/rejected": -8.694659233093262, - "loss": 0.5987, + "logits/chosen": -0.08336153626441956, + "logits/rejected": 0.07079918682575226, + "logps/chosen": -3.558469295501709, + "logps/rejected": -4.092928886413574, + "loss": 0.6194, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.140003204345703, - "rewards/margins": 0.5546567440032959, - "rewards/rejected": -8.694659233093262, - "semantic_entropy": 0.005027764476835728, + "rewards/chosen": -3.558469295501709, + "rewards/margins": 0.5344600081443787, + "rewards/rejected": -4.092928886413574, "step": 1170 }, { "epoch": 0.6288677036293695, - "grad_norm": 14.354746897825013, + "grad_norm": 9.14672572198393, "learning_rate": 9.63867572297816e-07, - "logits/chosen": 0.5808348655700684, - "logits/rejected": 0.6876333355903625, - "logps/chosen": -7.959421634674072, - "logps/rejected": -8.607701301574707, - "loss": 0.5651, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.959421634674072, - "rewards/margins": 0.6482798457145691, - "rewards/rejected": -8.607701301574707, - "semantic_entropy": 0.006594679318368435, + "logits/chosen": -0.11991317570209503, + "logits/rejected": 0.072477787733078, + "logps/chosen": -3.2550976276397705, + "logps/rejected": -3.8207218647003174, + "loss": 0.574, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.2550976276397705, + "rewards/margins": 0.5656242966651917, + "rewards/rejected": -3.8207218647003174, "step": 1175 }, { "epoch": 0.631543736410771, - "grad_norm": 12.98455385650067, + "grad_norm": 12.893437010872578, "learning_rate": 9.632840392885727e-07, - "logits/chosen": 0.5893815755844116, - "logits/rejected": 0.661659836769104, - "logps/chosen": -7.993622779846191, - "logps/rejected": -8.655765533447266, - "loss": 0.5828, + "logits/chosen": -0.13331344723701477, + "logits/rejected": 0.030132334679365158, + "logps/chosen": -3.5457446575164795, + "logps/rejected": -4.143066883087158, + "loss": 0.5874, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -7.993622779846191, - "rewards/margins": 0.6621420979499817, - "rewards/rejected": -8.655765533447266, - "semantic_entropy": 0.00508379889652133, + "rewards/chosen": -3.5457446575164795, + "rewards/margins": 0.597322404384613, + "rewards/rejected": -4.143066883087158, "step": 1180 }, { "epoch": 0.6342197691921726, - "grad_norm": 14.115881656600934, + "grad_norm": 10.935585251672848, "learning_rate": 9.626960114955483e-07, - "logits/chosen": 0.6585602164268494, - "logits/rejected": 0.7358173131942749, - "logps/chosen": -7.811059474945068, - "logps/rejected": -8.613174438476562, - "loss": 0.5195, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -7.811059474945068, - "rewards/margins": 0.8021153211593628, - "rewards/rejected": -8.613174438476562, - "semantic_entropy": 0.006301518529653549, + "logits/chosen": -0.0833960548043251, + "logits/rejected": 0.0644834041595459, + "logps/chosen": -3.4998462200164795, + "logps/rejected": -4.1566362380981445, + "loss": 0.5526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.4998462200164795, + "rewards/margins": 0.6567894816398621, + "rewards/rejected": -4.1566362380981445, "step": 1185 }, { "epoch": 0.6368958019735742, - "grad_norm": 16.283892207062348, + "grad_norm": 13.617553886010274, "learning_rate": 9.621034946237909e-07, - "logits/chosen": 0.6387815475463867, - "logits/rejected": 0.7039491534233093, - "logps/chosen": -7.9904046058654785, - "logps/rejected": -8.646058082580566, - "loss": 0.5645, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -7.9904046058654785, - "rewards/margins": 0.655653715133667, - "rewards/rejected": -8.646058082580566, - "semantic_entropy": 0.005252276547253132, + "logits/chosen": -0.16134895384311676, + "logits/rejected": -0.015465304255485535, + "logps/chosen": -3.7392566204071045, + "logps/rejected": -4.431344032287598, + "loss": 0.551, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.7392566204071045, + "rewards/margins": 0.6920870542526245, + "rewards/rejected": -4.431344032287598, "step": 1190 }, { "epoch": 0.6395718347549757, - "grad_norm": 15.655836290539325, + "grad_norm": 10.411174279904069, "learning_rate": 9.615064944219021e-07, - "logits/chosen": 0.6491774320602417, - "logits/rejected": 0.7328025698661804, - "logps/chosen": -7.836843013763428, - "logps/rejected": -8.513358116149902, - "loss": 0.5399, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.836843013763428, - "rewards/margins": 0.6765137910842896, - "rewards/rejected": -8.513358116149902, - "semantic_entropy": 0.005473036784678698, + "logits/chosen": -0.07534292340278625, + "logits/rejected": 0.040440887212753296, + "logps/chosen": -3.4149906635284424, + "logps/rejected": -4.073335647583008, + "loss": 0.5506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.4149906635284424, + "rewards/margins": 0.6583448648452759, + "rewards/rejected": -4.073335647583008, "step": 1195 }, { "epoch": 0.6422478675363773, - "grad_norm": 22.62708626022183, + "grad_norm": 18.806358616431073, "learning_rate": 9.609050166819803e-07, - "logits/chosen": 0.5962838530540466, - "logits/rejected": 0.6391795873641968, - "logps/chosen": -8.035821914672852, - "logps/rejected": -8.608399391174316, - "loss": 0.5951, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.035821914672852, - "rewards/margins": 0.572577953338623, - "rewards/rejected": -8.608399391174316, - "semantic_entropy": 0.005160279106348753, + "logits/chosen": -0.10628336668014526, + "logits/rejected": -0.040675289928913116, + "logps/chosen": -3.6882033348083496, + "logps/rejected": -4.2382707595825195, + "loss": 0.5883, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6882033348083496, + "rewards/margins": 0.5500668287277222, + "rewards/rejected": -4.2382707595825195, "step": 1200 }, { "epoch": 0.6422478675363773, - "eval_logits/chosen": 0.7506793141365051, - "eval_logits/rejected": 0.7968686819076538, - "eval_logps/chosen": -7.988265514373779, - "eval_logps/rejected": -8.681319236755371, - "eval_loss": 0.5522213578224182, - "eval_rewards/accuracies": 0.7062314748764038, - "eval_rewards/chosen": -7.988265514373779, - "eval_rewards/margins": 0.6930533647537231, - "eval_rewards/rejected": -8.681319236755371, - "eval_runtime": 35.081, - "eval_samples_per_second": 38.34, - "eval_semantic_entropy": 0.004989789333194494, - "eval_steps_per_second": 9.606, + "eval_logits/chosen": 0.2875683605670929, + "eval_logits/rejected": 0.39231419563293457, + "eval_logps/chosen": -3.5510618686676025, + "eval_logps/rejected": -4.212265491485596, + "eval_loss": 0.5485764741897583, + "eval_rewards/accuracies": 0.721068263053894, + "eval_rewards/chosen": -3.5510618686676025, + "eval_rewards/margins": 0.6612029671669006, + "eval_rewards/rejected": -4.212265491485596, + "eval_runtime": 40.426, + "eval_samples_per_second": 33.271, + "eval_steps_per_second": 8.336, "step": 1200 }, { "epoch": 0.6449239003177789, - "grad_norm": 18.377278845630318, + "grad_norm": 14.488498841664029, "learning_rate": 9.602990672395653e-07, - "logits/chosen": 0.582940399646759, - "logits/rejected": 0.6627975106239319, - "logps/chosen": -8.004450798034668, - "logps/rejected": -8.656949043273926, - "loss": 0.5505, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.004450798034668, - "rewards/margins": 0.6524981260299683, - "rewards/rejected": -8.656949043273926, - "semantic_entropy": 0.004912947304546833, + "logits/chosen": -0.22082920372486115, + "logits/rejected": -0.030663009732961655, + "logps/chosen": -3.444598436355591, + "logps/rejected": -4.103005409240723, + "loss": 0.5437, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.444598436355591, + "rewards/margins": 0.6584070920944214, + "rewards/rejected": -4.103005409240723, "step": 1205 }, { "epoch": 0.6475999330991805, - "grad_norm": 13.700239190708754, + "grad_norm": 12.638444019779717, "learning_rate": 9.59688651973581e-07, - "logits/chosen": 0.7021932601928711, - "logits/rejected": 0.7906457185745239, - "logps/chosen": -8.091392517089844, - "logps/rejected": -8.654991149902344, - "loss": 0.587, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.091392517089844, - "rewards/margins": 0.563599169254303, - "rewards/rejected": -8.654991149902344, - "semantic_entropy": 0.004794766195118427, + "logits/chosen": -0.11059337854385376, + "logits/rejected": 0.09193927049636841, + "logps/chosen": -3.2913970947265625, + "logps/rejected": -3.854825258255005, + "loss": 0.564, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2913970947265625, + "rewards/margins": 0.5634284019470215, + "rewards/rejected": -3.854825258255005, "step": 1210 }, { "epoch": 0.650275965880582, - "grad_norm": 15.628975304693077, + "grad_norm": 12.141259134680345, "learning_rate": 9.590737768062792e-07, - "logits/chosen": 0.6097584962844849, - "logits/rejected": 0.6667622327804565, - "logps/chosen": -8.029305458068848, - "logps/rejected": -8.514669418334961, - "loss": 0.619, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.029305458068848, - "rewards/margins": 0.4853641390800476, - "rewards/rejected": -8.514669418334961, - "semantic_entropy": 0.004363791085779667, + "logits/chosen": -0.18308471143245697, + "logits/rejected": -0.07229964435100555, + "logps/chosen": -3.3822810649871826, + "logps/rejected": -3.8686976432800293, + "loss": 0.5965, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.3822810649871826, + "rewards/margins": 0.48641663789749146, + "rewards/rejected": -3.8686976432800293, "step": 1215 }, { "epoch": 0.6529519986619836, - "grad_norm": 14.141900238974408, + "grad_norm": 11.996657693966625, "learning_rate": 9.584544477031816e-07, - "logits/chosen": 0.7649446725845337, - "logits/rejected": 0.8241230249404907, - "logps/chosen": -7.659104824066162, - "logps/rejected": -8.234978675842285, - "loss": 0.5818, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -7.659104824066162, - "rewards/margins": 0.5758742094039917, - "rewards/rejected": -8.234978675842285, - "semantic_entropy": 0.006544353906065226, + "logits/chosen": 0.024976596236228943, + "logits/rejected": 0.14426210522651672, + "logps/chosen": -2.982923984527588, + "logps/rejected": -3.524927854537964, + "loss": 0.5877, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.982923984527588, + "rewards/margins": 0.5420037508010864, + "rewards/rejected": -3.524927854537964, "step": 1220 }, { "epoch": 0.6556280314433852, - "grad_norm": 17.44536516233375, + "grad_norm": 12.659797275362596, "learning_rate": 9.578306706730215e-07, - "logits/chosen": 0.6202625036239624, - "logits/rejected": 0.7067128419876099, - "logps/chosen": -7.734452724456787, - "logps/rejected": -8.231317520141602, - "loss": 0.6291, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.734452724456787, - "rewards/margins": 0.4968656599521637, - "rewards/rejected": -8.231317520141602, - "semantic_entropy": 0.006045544985681772, + "logits/chosen": -0.18757277727127075, + "logits/rejected": 0.03030409850180149, + "logps/chosen": -3.267350673675537, + "logps/rejected": -3.6926894187927246, + "loss": 0.6469, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.267350673675537, + "rewards/margins": 0.4253385066986084, + "rewards/rejected": -3.6926894187927246, "step": 1225 }, { "epoch": 0.6583040642247867, - "grad_norm": 14.058747132134709, + "grad_norm": 13.323812979935369, "learning_rate": 9.572024517676865e-07, - "logits/chosen": 0.6863638162612915, - "logits/rejected": 0.7385177612304688, - "logps/chosen": -7.626795768737793, - "logps/rejected": -8.15473461151123, - "loss": 0.6068, - "rewards/accuracies": 0.65625, - "rewards/chosen": -7.626795768737793, - "rewards/margins": 0.5279384851455688, - "rewards/rejected": -8.15473461151123, - "semantic_entropy": 0.006082098465412855, + "logits/chosen": -0.08569470793008804, + "logits/rejected": 0.014417910948395729, + "logps/chosen": -3.226412296295166, + "logps/rejected": -3.6949286460876465, + "loss": 0.6151, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.226412296295166, + "rewards/margins": 0.46851611137390137, + "rewards/rejected": -3.6949286460876465, "step": 1230 }, { "epoch": 0.6609800970061883, - "grad_norm": 15.524786458902534, + "grad_norm": 11.200687203998697, "learning_rate": 9.565697970821593e-07, - "logits/chosen": 0.6960703134536743, - "logits/rejected": 0.7752768397331238, - "logps/chosen": -7.594348907470703, - "logps/rejected": -8.13396167755127, - "loss": 0.5959, - "rewards/accuracies": 0.65625, - "rewards/chosen": -7.594348907470703, - "rewards/margins": 0.5396129488945007, - "rewards/rejected": -8.13396167755127, - "semantic_entropy": 0.0065464479848742485, + "logits/chosen": -0.08245605230331421, + "logits/rejected": 0.057003892958164215, + "logps/chosen": -3.0891146659851074, + "logps/rejected": -3.5262794494628906, + "loss": 0.6044, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.0891146659851074, + "rewards/margins": 0.4371647238731384, + "rewards/rejected": -3.5262794494628906, "step": 1235 }, { "epoch": 0.6636561297875899, - "grad_norm": 10.898687489202283, + "grad_norm": 10.954052092994113, "learning_rate": 9.559327127544585e-07, - "logits/chosen": 0.6455325484275818, - "logits/rejected": 0.7051125764846802, - "logps/chosen": -7.510709285736084, - "logps/rejected": -8.048765182495117, - "loss": 0.5766, + "logits/chosen": -0.17408405244350433, + "logits/rejected": -0.04174000769853592, + "logps/chosen": -3.022346019744873, + "logps/rejected": -3.5090103149414062, + "loss": 0.57, "rewards/accuracies": 0.65625, - "rewards/chosen": -7.510709285736084, - "rewards/margins": 0.5380562543869019, - "rewards/rejected": -8.048765182495117, - "semantic_entropy": 0.007156215608119965, + "rewards/chosen": -3.022346019744873, + "rewards/margins": 0.48666438460350037, + "rewards/rejected": -3.5090103149414062, "step": 1240 }, { "epoch": 0.6663321625689914, - "grad_norm": 18.786409391603485, + "grad_norm": 11.464628083818466, "learning_rate": 9.552912049655789e-07, - "logits/chosen": 0.6517975330352783, - "logits/rejected": 0.7252013683319092, - "logps/chosen": -7.325045108795166, - "logps/rejected": -7.9806952476501465, - "loss": 0.5695, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -7.325045108795166, - "rewards/margins": 0.6556496620178223, - "rewards/rejected": -7.9806952476501465, - "semantic_entropy": 0.00768858939409256, + "logits/chosen": -0.11672677844762802, + "logits/rejected": 0.05961567163467407, + "logps/chosen": -2.93503999710083, + "logps/rejected": -3.4622421264648438, + "loss": 0.5705, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.93503999710083, + "rewards/margins": 0.5272021293640137, + "rewards/rejected": -3.4622421264648438, "step": 1245 }, { "epoch": 0.669008195350393, - "grad_norm": 24.445945587094794, + "grad_norm": 15.357321927469696, "learning_rate": 9.546452799394315e-07, - "logits/chosen": 0.6680857539176941, - "logits/rejected": 0.7646031975746155, - "logps/chosen": -7.549722194671631, - "logps/rejected": -8.019618034362793, - "loss": 0.645, - "rewards/accuracies": 0.625, - "rewards/chosen": -7.549722194671631, - "rewards/margins": 0.46989649534225464, - "rewards/rejected": -8.019618034362793, - "semantic_entropy": 0.006873616483062506, + "logits/chosen": -0.06830787658691406, + "logits/rejected": 0.12355013191699982, + "logps/chosen": -3.153735637664795, + "logps/rejected": -3.573169231414795, + "loss": 0.6408, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.153735637664795, + "rewards/margins": 0.4194336533546448, + "rewards/rejected": -3.573169231414795, "step": 1250 }, { "epoch": 0.6716842281317946, - "grad_norm": 15.014444640765525, + "grad_norm": 11.26332296630573, "learning_rate": 9.539949439427846e-07, - "logits/chosen": 0.6218008995056152, - "logits/rejected": 0.6838528513908386, - "logps/chosen": -7.445742607116699, - "logps/rejected": -8.093426704406738, - "loss": 0.5457, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.445742607116699, - "rewards/margins": 0.6476832628250122, - "rewards/rejected": -8.093426704406738, - "semantic_entropy": 0.007571948226541281, + "logits/chosen": -0.10017766803503036, + "logits/rejected": 0.015266944654285908, + "logps/chosen": -3.0861427783966064, + "logps/rejected": -3.658721923828125, + "loss": 0.5622, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.0861427783966064, + "rewards/margins": 0.5725796818733215, + "rewards/rejected": -3.658721923828125, "step": 1255 }, { "epoch": 0.6743602609131962, - "grad_norm": 12.345959930188625, + "grad_norm": 11.463136612842302, "learning_rate": 9.533402032852002e-07, - "logits/chosen": 0.5849351286888123, - "logits/rejected": 0.6522541642189026, - "logps/chosen": -7.5591864585876465, - "logps/rejected": -8.265511512756348, - "loss": 0.5424, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.5591864585876465, - "rewards/margins": 0.7063250541687012, - "rewards/rejected": -8.265511512756348, - "semantic_entropy": 0.006167138926684856, + "logits/chosen": -0.13005469739437103, + "logits/rejected": 0.013041043654084206, + "logps/chosen": -3.217498302459717, + "logps/rejected": -3.88468861579895, + "loss": 0.5463, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.217498302459717, + "rewards/margins": 0.6671901941299438, + "rewards/rejected": -3.88468861579895, "step": 1260 }, { "epoch": 0.6770362936945977, - "grad_norm": 16.266967829326354, + "grad_norm": 12.899648302170469, "learning_rate": 9.526810643189754e-07, - "logits/chosen": 0.6240657567977905, - "logits/rejected": 0.7110647559165955, - "logps/chosen": -7.6305341720581055, - "logps/rejected": -8.264394760131836, - "loss": 0.5468, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -7.6305341720581055, - "rewards/margins": 0.6338610053062439, - "rewards/rejected": -8.264394760131836, - "semantic_entropy": 0.006204391364008188, + "logits/chosen": -0.058186911046504974, + "logits/rejected": 0.11038491874933243, + "logps/chosen": -3.2560951709747314, + "logps/rejected": -3.8879165649414062, + "loss": 0.5505, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.2560951709747314, + "rewards/margins": 0.63182133436203, + "rewards/rejected": -3.8879165649414062, "step": 1265 }, { "epoch": 0.6797123264759993, - "grad_norm": 16.69670345070714, + "grad_norm": 13.593909324299991, "learning_rate": 9.52017533439079e-07, - "logits/chosen": 0.5465856790542603, - "logits/rejected": 0.5944739580154419, - "logps/chosen": -7.692608833312988, - "logps/rejected": -8.17878532409668, - "loss": 0.6284, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -7.692608833312988, - "rewards/margins": 0.48617634177207947, - "rewards/rejected": -8.17878532409668, - "semantic_entropy": 0.0065203020349144936, + "logits/chosen": -0.12858423590660095, + "logits/rejected": -0.05160200595855713, + "logps/chosen": -3.392320156097412, + "logps/rejected": -3.895653486251831, + "loss": 0.5959, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.392320156097412, + "rewards/margins": 0.5033332109451294, + "rewards/rejected": -3.895653486251831, "step": 1270 }, { "epoch": 0.6823883592574009, - "grad_norm": 10.756532347044482, + "grad_norm": 11.187178967054068, "learning_rate": 9.513496170830909e-07, - "logits/chosen": 0.5842273235321045, - "logits/rejected": 0.6651209592819214, - "logps/chosen": -7.88360595703125, - "logps/rejected": -8.369672775268555, - "loss": 0.6449, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -7.88360595703125, - "rewards/margins": 0.4860672950744629, - "rewards/rejected": -8.369672775268555, - "semantic_entropy": 0.005015389062464237, + "logits/chosen": -0.12033028900623322, + "logits/rejected": -0.0057485452853143215, + "logps/chosen": -3.54186749458313, + "logps/rejected": -4.027322769165039, + "loss": 0.6315, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.54186749458313, + "rewards/margins": 0.48545533418655396, + "rewards/rejected": -4.027322769165039, "step": 1275 }, { "epoch": 0.6850643920388024, - "grad_norm": 21.16131993716241, + "grad_norm": 13.653365352733038, "learning_rate": 9.506773217311382e-07, - "logits/chosen": 0.6626430153846741, - "logits/rejected": 0.7511327862739563, - "logps/chosen": -7.711289882659912, - "logps/rejected": -8.375436782836914, - "loss": 0.5492, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -7.711289882659912, - "rewards/margins": 0.6641460657119751, - "rewards/rejected": -8.375436782836914, - "semantic_entropy": 0.006019088439643383, + "logits/chosen": -0.10203119367361069, + "logits/rejected": 0.06477317959070206, + "logps/chosen": -3.354496479034424, + "logps/rejected": -3.910160779953003, + "loss": 0.5751, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.354496479034424, + "rewards/margins": 0.5556651949882507, + "rewards/rejected": -3.910160779953003, "step": 1280 }, { "epoch": 0.687740424820204, - "grad_norm": 17.008433131854304, + "grad_norm": 12.450261931346308, "learning_rate": 9.500006539058334e-07, - "logits/chosen": 0.73247891664505, - "logits/rejected": 0.7920357584953308, - "logps/chosen": -7.982652187347412, - "logps/rejected": -8.343966484069824, - "loss": 0.6391, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -7.982652187347412, - "rewards/margins": 0.3613142967224121, - "rewards/rejected": -8.343966484069824, - "semantic_entropy": 0.004631609655916691, + "logits/chosen": -0.0540333166718483, + "logits/rejected": 0.09519968926906586, + "logps/chosen": -3.1939380168914795, + "logps/rejected": -3.6268506050109863, + "loss": 0.5992, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1939380168914795, + "rewards/margins": 0.4329126477241516, + "rewards/rejected": -3.6268506050109863, "step": 1285 }, { "epoch": 0.6904164576016056, - "grad_norm": 13.160928071624928, + "grad_norm": 13.707001295796587, "learning_rate": 9.493196201722109e-07, - "logits/chosen": 0.6529003977775574, - "logits/rejected": 0.7320042252540588, - "logps/chosen": -7.793301582336426, - "logps/rejected": -8.294574737548828, - "loss": 0.6074, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -7.793301582336426, - "rewards/margins": 0.5012733340263367, - "rewards/rejected": -8.294574737548828, - "semantic_entropy": 0.0051393527537584305, + "logits/chosen": -0.19386431574821472, + "logits/rejected": -0.03738199919462204, + "logps/chosen": -3.27239990234375, + "logps/rejected": -3.6509177684783936, + "loss": 0.636, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.27239990234375, + "rewards/margins": 0.378518283367157, + "rewards/rejected": -3.6509177684783936, "step": 1290 }, { "epoch": 0.6930924903830072, - "grad_norm": 14.183716262535388, + "grad_norm": 9.867611986604537, "learning_rate": 9.486342271376628e-07, - "logits/chosen": 0.6803663969039917, - "logits/rejected": 0.6954035758972168, - "logps/chosen": -7.670355796813965, - "logps/rejected": -8.38364028930664, - "loss": 0.5344, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.670355796813965, - "rewards/margins": 0.7132849097251892, - "rewards/rejected": -8.38364028930664, - "semantic_entropy": 0.006493359804153442, + "logits/chosen": -0.08256490528583527, + "logits/rejected": -0.08659453690052032, + "logps/chosen": -3.2066993713378906, + "logps/rejected": -3.812256336212158, + "loss": 0.5521, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2066993713378906, + "rewards/margins": 0.6055571436882019, + "rewards/rejected": -3.812256336212158, "step": 1295 }, { "epoch": 0.6957685231644087, - "grad_norm": 14.518556999601335, + "grad_norm": 14.865181124732603, "learning_rate": 9.479444814518755e-07, - "logits/chosen": 0.7013619542121887, - "logits/rejected": 0.8164475560188293, - "logps/chosen": -7.910555839538574, - "logps/rejected": -8.657155990600586, - "loss": 0.5453, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.910555839538574, - "rewards/margins": 0.746599555015564, - "rewards/rejected": -8.657155990600586, - "semantic_entropy": 0.004653572104871273, + "logits/chosen": -0.13809911906719208, + "logits/rejected": 0.12425445020198822, + "logps/chosen": -3.2317771911621094, + "logps/rejected": -3.9344844818115234, + "loss": 0.545, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2317771911621094, + "rewards/margins": 0.7027073502540588, + "rewards/rejected": -3.9344844818115234, "step": 1300 }, { "epoch": 0.6984445559458103, - "grad_norm": 12.861320996249733, + "grad_norm": 9.5211112794932, "learning_rate": 9.472503898067645e-07, - "logits/chosen": 0.7577477693557739, - "logits/rejected": 0.7888853549957275, - "logps/chosen": -7.878431797027588, - "logps/rejected": -8.5064058303833, - "loss": 0.5883, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -7.878431797027588, - "rewards/margins": 0.6279749870300293, - "rewards/rejected": -8.5064058303833, - "semantic_entropy": 0.004988783039152622, + "logits/chosen": 0.0020645230542868376, + "logits/rejected": 0.048176027834415436, + "logps/chosen": -3.3173155784606934, + "logps/rejected": -3.7721729278564453, + "loss": 0.6187, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.3173155784606934, + "rewards/margins": 0.4548572599887848, + "rewards/rejected": -3.7721729278564453, "step": 1305 }, { "epoch": 0.701120588727212, - "grad_norm": 14.945074518060963, + "grad_norm": 12.326119581854993, "learning_rate": 9.465519589364099e-07, - "logits/chosen": 0.8046930432319641, - "logits/rejected": 0.8452129364013672, - "logps/chosen": -7.966567039489746, - "logps/rejected": -8.664255142211914, - "loss": 0.5506, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -7.966567039489746, - "rewards/margins": 0.6976876258850098, - "rewards/rejected": -8.664255142211914, - "semantic_entropy": 0.004865294322371483, + "logits/chosen": -0.01973193883895874, + "logits/rejected": 0.06389573961496353, + "logps/chosen": -3.1816892623901367, + "logps/rejected": -3.829970598220825, + "loss": 0.5512, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.1816892623901367, + "rewards/margins": 0.6482815742492676, + "rewards/rejected": -3.829970598220825, "step": 1310 }, { "epoch": 0.7037966215086134, - "grad_norm": 18.100914575781236, + "grad_norm": 12.172913466966774, "learning_rate": 9.458491956169914e-07, - "logits/chosen": 0.8275071382522583, - "logits/rejected": 0.8813290596008301, - "logps/chosen": -8.298616409301758, - "logps/rejected": -8.859460830688477, - "loss": 0.6166, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.298616409301758, - "rewards/margins": 0.5608429312705994, - "rewards/rejected": -8.859460830688477, - "semantic_entropy": 0.003682538866996765, + "logits/chosen": -0.10095510631799698, + "logits/rejected": 0.0665215402841568, + "logps/chosen": -3.362898349761963, + "logps/rejected": -3.945164442062378, + "loss": 0.571, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.362898349761963, + "rewards/margins": 0.5822662115097046, + "rewards/rejected": -3.945164442062378, "step": 1315 }, { "epoch": 0.706472654290015, - "grad_norm": 15.788405153142321, + "grad_norm": 13.86915123064612, "learning_rate": 9.451421066667215e-07, - "logits/chosen": 0.7420376539230347, - "logits/rejected": 0.8037020564079285, - "logps/chosen": -8.253267288208008, - "logps/rejected": -8.862098693847656, - "loss": 0.5702, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.253267288208008, - "rewards/margins": 0.6088317036628723, - "rewards/rejected": -8.862098693847656, - "semantic_entropy": 0.003497874829918146, + "logits/chosen": -0.21288760006427765, + "logits/rejected": -0.026159871369600296, + "logps/chosen": -3.2552108764648438, + "logps/rejected": -3.80267333984375, + "loss": 0.5644, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2552108764648438, + "rewards/margins": 0.5474623441696167, + "rewards/rejected": -3.80267333984375, "step": 1320 }, { "epoch": 0.7091486870714167, - "grad_norm": 20.04954090545044, + "grad_norm": 16.595010001622438, "learning_rate": 9.444306989457805e-07, - "logits/chosen": 0.8426326513290405, - "logits/rejected": 0.8867173194885254, - "logps/chosen": -8.014307022094727, - "logps/rejected": -8.588244438171387, - "loss": 0.6257, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -8.014307022094727, - "rewards/margins": 0.5739374756813049, - "rewards/rejected": -8.588244438171387, - "semantic_entropy": 0.0046114143915474415, + "logits/chosen": -0.058408599346876144, + "logits/rejected": 0.06597563624382019, + "logps/chosen": -3.120912790298462, + "logps/rejected": -3.607276201248169, + "loss": 0.6531, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.120912790298462, + "rewards/margins": 0.48636364936828613, + "rewards/rejected": -3.607276201248169, "step": 1325 }, { "epoch": 0.7118247198528181, - "grad_norm": 16.08088418888514, + "grad_norm": 18.989744250299676, "learning_rate": 9.437149793562489e-07, - "logits/chosen": 0.8074777722358704, - "logits/rejected": 0.8401328921318054, - "logps/chosen": -7.99387264251709, - "logps/rejected": -8.621038436889648, - "loss": 0.5737, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.99387264251709, - "rewards/margins": 0.6271660327911377, - "rewards/rejected": -8.621038436889648, - "semantic_entropy": 0.004982014186680317, + "logits/chosen": -0.06904155761003494, + "logits/rejected": 0.03812388330698013, + "logps/chosen": -3.1645686626434326, + "logps/rejected": -3.7125766277313232, + "loss": 0.5986, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.1645686626434326, + "rewards/margins": 0.5480076670646667, + "rewards/rejected": -3.7125766277313232, "step": 1330 }, { "epoch": 0.7145007526342197, - "grad_norm": 17.648574778030703, + "grad_norm": 12.33548944229797, "learning_rate": 9.429949548420417e-07, - "logits/chosen": 0.7622288465499878, - "logits/rejected": 0.7995889782905579, - "logps/chosen": -8.03836441040039, - "logps/rejected": -8.615036010742188, - "loss": 0.5845, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.03836441040039, - "rewards/margins": 0.5766717195510864, - "rewards/rejected": -8.615036010742188, - "semantic_entropy": 0.005048284772783518, + "logits/chosen": -0.06997523456811905, + "logits/rejected": 0.026143023744225502, + "logps/chosen": -3.1095046997070312, + "logps/rejected": -3.621790647506714, + "loss": 0.5994, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.1095046997070312, + "rewards/margins": 0.5122863054275513, + "rewards/rejected": -3.621790647506714, "step": 1335 }, { "epoch": 0.7171767854156214, - "grad_norm": 13.251467288565097, + "grad_norm": 18.92825344965984, "learning_rate": 9.422706323888396e-07, - "logits/chosen": 0.7949849963188171, - "logits/rejected": 0.835627555847168, - "logps/chosen": -8.13330078125, - "logps/rejected": -8.713802337646484, - "loss": 0.5896, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -8.13330078125, - "rewards/margins": 0.580501139163971, - "rewards/rejected": -8.713802337646484, - "semantic_entropy": 0.00421832874417305, + "logits/chosen": -0.05226869508624077, + "logits/rejected": -0.005358004476875067, + "logps/chosen": -2.9178309440612793, + "logps/rejected": -3.3798935413360596, + "loss": 0.6131, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9178309440612793, + "rewards/margins": 0.4620625376701355, + "rewards/rejected": -3.3798935413360596, "step": 1340 }, { "epoch": 0.719852818197023, - "grad_norm": 12.727013846864125, + "grad_norm": 12.491808511213566, "learning_rate": 9.415420190240225e-07, - "logits/chosen": 0.8075268864631653, - "logits/rejected": 0.8839607238769531, - "logps/chosen": -8.259916305541992, - "logps/rejected": -9.057500839233398, - "loss": 0.4887, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.259916305541992, - "rewards/margins": 0.7975843548774719, - "rewards/rejected": -9.057500839233398, - "semantic_entropy": 0.003385394811630249, + "logits/chosen": -0.014972483739256859, + "logits/rejected": 0.18696525692939758, + "logps/chosen": -3.0169167518615723, + "logps/rejected": -3.6787776947021484, + "loss": 0.4967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0169167518615723, + "rewards/margins": 0.6618611812591553, + "rewards/rejected": -3.6787776947021484, "step": 1345 }, { "epoch": 0.7225288509784245, - "grad_norm": 19.33189686664916, + "grad_norm": 14.40027433421022, "learning_rate": 9.408091218166002e-07, - "logits/chosen": 0.7991722226142883, - "logits/rejected": 0.829816997051239, - "logps/chosen": -8.155218124389648, - "logps/rejected": -8.655525207519531, - "loss": 0.6021, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.155218124389648, - "rewards/margins": 0.5003066062927246, - "rewards/rejected": -8.655525207519531, - "semantic_entropy": 0.004503914155066013, + "logits/chosen": -0.05366384983062744, + "logits/rejected": 0.023615699261426926, + "logps/chosen": -3.192579746246338, + "logps/rejected": -3.5836784839630127, + "loss": 0.6277, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.192579746246338, + "rewards/margins": 0.39109936356544495, + "rewards/rejected": -3.5836784839630127, "step": 1350 }, { "epoch": 0.7252048837598261, - "grad_norm": 21.351677483542304, + "grad_norm": 14.22738878781431, "learning_rate": 9.400719478771449e-07, - "logits/chosen": 0.7611302137374878, - "logits/rejected": 0.8729363679885864, - "logps/chosen": -8.371480941772461, - "logps/rejected": -8.982285499572754, - "loss": 0.5771, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.371480941772461, - "rewards/margins": 0.6108050346374512, - "rewards/rejected": -8.982285499572754, - "semantic_entropy": 0.004243707284331322, + "logits/chosen": -0.10313792526721954, + "logits/rejected": 0.19015619158744812, + "logps/chosen": -3.4706928730010986, + "logps/rejected": -4.009016990661621, + "loss": 0.5796, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4706928730010986, + "rewards/margins": 0.5383240580558777, + "rewards/rejected": -4.009016990661621, "step": 1355 }, { "epoch": 0.7278809165412277, - "grad_norm": 15.476926041346282, + "grad_norm": 14.170745001173735, "learning_rate": 9.393305043577209e-07, - "logits/chosen": 0.7315706610679626, - "logits/rejected": 0.7855316400527954, - "logps/chosen": -8.193710327148438, - "logps/rejected": -9.018750190734863, - "loss": 0.5218, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.193710327148438, - "rewards/margins": 0.8250393867492676, - "rewards/rejected": -9.018750190734863, - "semantic_entropy": 0.004040227737277746, + "logits/chosen": -0.14026983082294464, + "logits/rejected": -0.01674916222691536, + "logps/chosen": -3.727367401123047, + "logps/rejected": -4.4143571853637695, + "loss": 0.5549, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.727367401123047, + "rewards/margins": 0.6869907379150391, + "rewards/rejected": -4.4143571853637695, "step": 1360 }, { "epoch": 0.7305569493226292, - "grad_norm": 11.327037956728795, + "grad_norm": 9.108719987910694, "learning_rate": 9.38584798451817e-07, - "logits/chosen": 0.6972507238388062, - "logits/rejected": 0.7701762318611145, - "logps/chosen": -8.05665397644043, - "logps/rejected": -8.655900955200195, - "loss": 0.5916, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.05665397644043, - "rewards/margins": 0.5992475748062134, - "rewards/rejected": -8.655900955200195, - "semantic_entropy": 0.0046203965321183205, + "logits/chosen": -0.10338069498538971, + "logits/rejected": 0.042006999254226685, + "logps/chosen": -3.547541379928589, + "logps/rejected": -4.1569318771362305, + "loss": 0.5554, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.547541379928589, + "rewards/margins": 0.6093905568122864, + "rewards/rejected": -4.1569318771362305, "step": 1365 }, { "epoch": 0.7332329821040308, - "grad_norm": 26.290588474950017, + "grad_norm": 22.520144676141488, "learning_rate": 9.37834837394275e-07, - "logits/chosen": 0.6830715537071228, - "logits/rejected": 0.7537237405776978, - "logps/chosen": -8.189208030700684, - "logps/rejected": -9.016082763671875, - "loss": 0.5602, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.189208030700684, - "rewards/margins": 0.8268746137619019, - "rewards/rejected": -9.016082763671875, - "semantic_entropy": 0.004340589977800846, + "logits/chosen": -0.09211014211177826, + "logits/rejected": 0.04549660533666611, + "logps/chosen": -3.5685601234436035, + "logps/rejected": -4.329239845275879, + "loss": 0.575, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5685601234436035, + "rewards/margins": 0.7606796026229858, + "rewards/rejected": -4.329239845275879, "step": 1370 }, { "epoch": 0.7359090148854324, - "grad_norm": 14.319339627993806, + "grad_norm": 9.155916046499202, "learning_rate": 9.370806284612203e-07, - "logits/chosen": 0.6698434352874756, - "logits/rejected": 0.7308276295661926, - "logps/chosen": -8.377470016479492, - "logps/rejected": -9.087924003601074, - "loss": 0.53, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.377470016479492, - "rewards/margins": 0.7104541063308716, - "rewards/rejected": -9.087924003601074, - "semantic_entropy": 0.0037077039014548063, + "logits/chosen": -0.10397066920995712, + "logits/rejected": 0.04061213508248329, + "logps/chosen": -3.2964344024658203, + "logps/rejected": -4.0229105949401855, + "loss": 0.5279, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.2964344024658203, + "rewards/margins": 0.7264761924743652, + "rewards/rejected": -4.0229105949401855, "step": 1375 }, { "epoch": 0.738585047666834, - "grad_norm": 14.5676067902578, + "grad_norm": 11.174325937048911, "learning_rate": 9.363221789699912e-07, - "logits/chosen": 0.6635018587112427, - "logits/rejected": 0.7242141962051392, - "logps/chosen": -8.505064010620117, - "logps/rejected": -9.010017395019531, - "loss": 0.6233, - "rewards/accuracies": 0.65625, - "rewards/chosen": -8.505064010620117, - "rewards/margins": 0.5049545168876648, - "rewards/rejected": -9.010017395019531, - "semantic_entropy": 0.003181255189701915, + "logits/chosen": -0.12654843926429749, + "logits/rejected": 0.008234462700784206, + "logps/chosen": -3.2263312339782715, + "logps/rejected": -3.643080472946167, + "loss": 0.658, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2263312339782715, + "rewards/margins": 0.41674908995628357, + "rewards/rejected": -3.643080472946167, "step": 1380 }, { "epoch": 0.7412610804482355, - "grad_norm": 16.866071551135633, + "grad_norm": 18.19312948728153, "learning_rate": 9.355594962790682e-07, - "logits/chosen": 0.6800563335418701, - "logits/rejected": 0.7338107228279114, - "logps/chosen": -8.420328140258789, - "logps/rejected": -9.087446212768555, - "loss": 0.5497, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.420328140258789, - "rewards/margins": 0.6671197414398193, - "rewards/rejected": -9.087446212768555, - "semantic_entropy": 0.0032719075679779053, + "logits/chosen": -0.12673334777355194, + "logits/rejected": 0.00641622394323349, + "logps/chosen": -2.9295339584350586, + "logps/rejected": -3.4973530769348145, + "loss": 0.5788, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.9295339584350586, + "rewards/margins": 0.5678192377090454, + "rewards/rejected": -3.4973530769348145, "step": 1385 }, { "epoch": 0.7439371132296371, - "grad_norm": 15.614567180768711, + "grad_norm": 11.299558765124523, "learning_rate": 9.34792587788002e-07, - "logits/chosen": 0.737398624420166, - "logits/rejected": 0.7930010557174683, - "logps/chosen": -8.552255630493164, - "logps/rejected": -9.13463020324707, - "loss": 0.5937, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.552255630493164, - "rewards/margins": 0.5823749899864197, - "rewards/rejected": -9.13463020324707, - "semantic_entropy": 0.002839865395799279, + "logits/chosen": -0.015995724126696587, + "logits/rejected": 0.1160619705915451, + "logps/chosen": -3.048121929168701, + "logps/rejected": -3.523608684539795, + "loss": 0.5923, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.048121929168701, + "rewards/margins": 0.475486695766449, + "rewards/rejected": -3.523608684539795, "step": 1390 }, { "epoch": 0.7466131460110387, - "grad_norm": 17.35671690242798, + "grad_norm": 12.65708748331718, "learning_rate": 9.34021460937342e-07, - "logits/chosen": 0.7323909997940063, - "logits/rejected": 0.7694789171218872, - "logps/chosen": -8.701313972473145, - "logps/rejected": -9.22875690460205, - "loss": 0.5945, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.701313972473145, - "rewards/margins": 0.5274431109428406, - "rewards/rejected": -9.22875690460205, - "semantic_entropy": 0.002377058146521449, + "logits/chosen": -0.014932510443031788, + "logits/rejected": 0.07133794575929642, + "logps/chosen": -3.171405792236328, + "logps/rejected": -3.6070709228515625, + "loss": 0.6067, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.171405792236328, + "rewards/margins": 0.4356653690338135, + "rewards/rejected": -3.6070709228515625, "step": 1395 }, { "epoch": 0.7492891787924402, - "grad_norm": 10.711809648309112, + "grad_norm": 8.471211048102994, "learning_rate": 9.332461232085646e-07, - "logits/chosen": 0.6817182302474976, - "logits/rejected": 0.7388890385627747, - "logps/chosen": -8.783699989318848, - "logps/rejected": -9.290229797363281, - "loss": 0.5884, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.783699989318848, - "rewards/margins": 0.5065295696258545, - "rewards/rejected": -9.290229797363281, - "semantic_entropy": 0.0022387620992958546, + "logits/chosen": -0.2101004421710968, + "logits/rejected": -0.06754465401172638, + "logps/chosen": -3.287217617034912, + "logps/rejected": -3.765547275543213, + "loss": 0.5825, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.287217617034912, + "rewards/margins": 0.4783296585083008, + "rewards/rejected": -3.765547275543213, "step": 1400 }, { "epoch": 0.7519652115738418, - "grad_norm": 15.614698518847275, + "grad_norm": 11.55899732642469, "learning_rate": 9.324665821239998e-07, - "logits/chosen": 0.6605618000030518, - "logits/rejected": 0.7475873827934265, - "logps/chosen": -8.55317497253418, - "logps/rejected": -9.264518737792969, - "loss": 0.5843, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.55317497253418, - "rewards/margins": 0.7113439440727234, - "rewards/rejected": -9.264518737792969, - "semantic_entropy": 0.002737089293077588, + "logits/chosen": -0.1197824701666832, + "logits/rejected": 0.07881350070238113, + "logps/chosen": -3.04571270942688, + "logps/rejected": -3.7274563312530518, + "loss": 0.5947, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.04571270942688, + "rewards/margins": 0.6817437410354614, + "rewards/rejected": -3.7274563312530518, "step": 1405 }, { "epoch": 0.7546412443552434, - "grad_norm": 13.033534778573415, + "grad_norm": 15.119928158239041, "learning_rate": 9.316828452467583e-07, - "logits/chosen": 0.6980472803115845, - "logits/rejected": 0.7700284719467163, - "logps/chosen": -8.742466926574707, - "logps/rejected": -9.366756439208984, - "loss": 0.5443, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.742466926574707, - "rewards/margins": 0.6242889165878296, - "rewards/rejected": -9.366756439208984, - "semantic_entropy": 0.0023838577326387167, + "logits/chosen": -0.11559943854808807, + "logits/rejected": 0.05540876463055611, + "logps/chosen": -3.1982269287109375, + "logps/rejected": -3.857407331466675, + "loss": 0.5267, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.1982269287109375, + "rewards/margins": 0.6591804623603821, + "rewards/rejected": -3.857407331466675, "step": 1410 }, { "epoch": 0.7573172771366449, - "grad_norm": 38.89935633989603, + "grad_norm": 17.505985084907067, "learning_rate": 9.30894920180659e-07, - "logits/chosen": 0.7200860977172852, - "logits/rejected": 0.7739099264144897, - "logps/chosen": -8.618478775024414, - "logps/rejected": -9.061718940734863, - "loss": 0.6104, - "rewards/accuracies": 0.65625, - "rewards/chosen": -8.618478775024414, - "rewards/margins": 0.4432406425476074, - "rewards/rejected": -9.061718940734863, - "semantic_entropy": 0.002750970423221588, + "logits/chosen": -0.0294291190803051, + "logits/rejected": 0.10296481847763062, + "logps/chosen": -3.116891384124756, + "logps/rejected": -3.5595526695251465, + "loss": 0.6108, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.116891384124756, + "rewards/margins": 0.44266146421432495, + "rewards/rejected": -3.5595526695251465, "step": 1415 }, { "epoch": 0.7599933099180465, - "grad_norm": 12.996110838480991, + "grad_norm": 9.540072008156027, "learning_rate": 9.301028145701543e-07, - "logits/chosen": 0.6858905553817749, - "logits/rejected": 0.7582074403762817, - "logps/chosen": -8.569108963012695, - "logps/rejected": -9.244118690490723, - "loss": 0.5774, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.569108963012695, - "rewards/margins": 0.6750102043151855, - "rewards/rejected": -9.244118690490723, - "semantic_entropy": 0.0030036987736821175, + "logits/chosen": -0.03620930761098862, + "logits/rejected": 0.10043051093816757, + "logps/chosen": -3.221766710281372, + "logps/rejected": -3.9662222862243652, + "loss": 0.563, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.221766710281372, + "rewards/margins": 0.7444559335708618, + "rewards/rejected": -3.9662222862243652, "step": 1420 }, { "epoch": 0.7626693426994481, - "grad_norm": 11.513246333141913, + "grad_norm": 9.601069203774616, "learning_rate": 9.293065361002563e-07, - "logits/chosen": 0.6833176612854004, - "logits/rejected": 0.7447593212127686, - "logps/chosen": -8.49552059173584, - "logps/rejected": -9.083105087280273, - "loss": 0.5847, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -8.49552059173584, - "rewards/margins": 0.5875846147537231, - "rewards/rejected": -9.083105087280273, - "semantic_entropy": 0.003222426865249872, + "logits/chosen": -0.03262603282928467, + "logits/rejected": 0.08074156939983368, + "logps/chosen": -3.2843852043151855, + "logps/rejected": -3.93562650680542, + "loss": 0.5772, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.2843852043151855, + "rewards/margins": 0.6512413024902344, + "rewards/rejected": -3.93562650680542, "step": 1425 }, { "epoch": 0.7653453754808497, - "grad_norm": 17.050277712672678, + "grad_norm": 14.503772753956518, "learning_rate": 9.285060924964622e-07, - "logits/chosen": 0.6484526991844177, - "logits/rejected": 0.7113555669784546, - "logps/chosen": -8.497220993041992, - "logps/rejected": -9.029525756835938, - "loss": 0.5907, + "logits/chosen": -0.13785400986671448, + "logits/rejected": -0.006715321447700262, + "logps/chosen": -3.477222442626953, + "logps/rejected": -4.046846866607666, + "loss": 0.5707, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -8.497220993041992, - "rewards/margins": 0.5323046445846558, - "rewards/rejected": -9.029525756835938, - "semantic_entropy": 0.002952256705611944, + "rewards/chosen": -3.477222442626953, + "rewards/margins": 0.5696240663528442, + "rewards/rejected": -4.046846866607666, "step": 1430 }, { "epoch": 0.7680214082622512, - "grad_norm": 14.721214698389861, + "grad_norm": 14.620275920114581, "learning_rate": 9.277014915246792e-07, - "logits/chosen": 0.7316317558288574, - "logits/rejected": 0.7572312355041504, - "logps/chosen": -8.28238296508789, - "logps/rejected": -8.961918830871582, - "loss": 0.5423, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.28238296508789, - "rewards/margins": 0.6795355677604675, - "rewards/rejected": -8.961918830871582, - "semantic_entropy": 0.004207999911159277, + "logits/chosen": -0.018372194841504097, + "logits/rejected": 0.03606110066175461, + "logps/chosen": -3.41900634765625, + "logps/rejected": -4.138044834136963, + "loss": 0.5453, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.41900634765625, + "rewards/margins": 0.7190379500389099, + "rewards/rejected": -4.138044834136963, "step": 1435 }, { "epoch": 0.7706974410436528, - "grad_norm": 12.207483167169574, + "grad_norm": 10.074074877436061, "learning_rate": 9.268927409911498e-07, - "logits/chosen": 0.703294038772583, - "logits/rejected": 0.7658201456069946, - "logps/chosen": -8.169378280639648, - "logps/rejected": -8.818994522094727, - "loss": 0.5517, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.169378280639648, - "rewards/margins": 0.6496168971061707, - "rewards/rejected": -8.818994522094727, - "semantic_entropy": 0.0045172530226409435, + "logits/chosen": -0.0776173397898674, + "logits/rejected": 0.031278226524591446, + "logps/chosen": -3.4013218879699707, + "logps/rejected": -3.978905439376831, + "loss": 0.5871, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.4013218879699707, + "rewards/margins": 0.5775834321975708, + "rewards/rejected": -3.978905439376831, "step": 1440 }, { "epoch": 0.7733734738250544, - "grad_norm": 31.132649463038923, + "grad_norm": 15.10446174100583, "learning_rate": 9.260798487423749e-07, - "logits/chosen": 0.6745550036430359, - "logits/rejected": 0.7686847448348999, - "logps/chosen": -8.215496063232422, - "logps/rejected": -8.745055198669434, - "loss": 0.5833, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.215496063232422, - "rewards/margins": 0.5295597910881042, - "rewards/rejected": -8.745055198669434, - "semantic_entropy": 0.004211473278701305, + "logits/chosen": -0.15803281962871552, + "logits/rejected": 0.035580217838287354, + "logps/chosen": -3.415907621383667, + "logps/rejected": -3.9811999797821045, + "loss": 0.5733, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.415907621383667, + "rewards/margins": 0.5652923583984375, + "rewards/rejected": -3.9811999797821045, "step": 1445 }, { "epoch": 0.7760495066064559, - "grad_norm": 20.17196061549839, + "grad_norm": 19.47585891776186, "learning_rate": 9.252628226650389e-07, - "logits/chosen": 0.68059903383255, - "logits/rejected": 0.7309106588363647, - "logps/chosen": -8.216412544250488, - "logps/rejected": -8.789416313171387, - "loss": 0.5969, - "rewards/accuracies": 0.65625, - "rewards/chosen": -8.216412544250488, - "rewards/margins": 0.5730043649673462, - "rewards/rejected": -8.789416313171387, - "semantic_entropy": 0.004476086236536503, + "logits/chosen": -0.046204663813114166, + "logits/rejected": 0.057601820677518845, + "logps/chosen": -3.4943251609802246, + "logps/rejected": -3.995941162109375, + "loss": 0.6453, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.4943251609802246, + "rewards/margins": 0.5016158223152161, + "rewards/rejected": -3.995941162109375, "step": 1450 }, { "epoch": 0.7787255393878575, - "grad_norm": 17.688599393649028, + "grad_norm": 12.780602535590704, "learning_rate": 9.244416706859321e-07, - "logits/chosen": 0.6764446496963501, - "logits/rejected": 0.74993497133255, - "logps/chosen": -8.021484375, - "logps/rejected": -8.647329330444336, - "loss": 0.5842, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.021484375, - "rewards/margins": 0.6258445978164673, - "rewards/rejected": -8.647329330444336, - "semantic_entropy": 0.005138213746249676, + "logits/chosen": -0.09628926217556, + "logits/rejected": 0.06856326758861542, + "logps/chosen": -3.345426082611084, + "logps/rejected": -3.986189365386963, + "loss": 0.5656, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.345426082611084, + "rewards/margins": 0.6407630443572998, + "rewards/rejected": -3.986189365386963, "step": 1455 }, { "epoch": 0.7814015721692591, - "grad_norm": 14.905907733509586, + "grad_norm": 10.377683403020999, "learning_rate": 9.23616400771875e-07, - "logits/chosen": 0.6466782689094543, - "logits/rejected": 0.7279826402664185, - "logps/chosen": -7.967951774597168, - "logps/rejected": -8.646588325500488, - "loss": 0.5654, - "rewards/accuracies": 0.71875, - "rewards/chosen": -7.967951774597168, - "rewards/margins": 0.6786371469497681, - "rewards/rejected": -8.646588325500488, - "semantic_entropy": 0.004834360908716917, + "logits/chosen": -0.08277436345815659, + "logits/rejected": 0.09804098308086395, + "logps/chosen": -3.4418601989746094, + "logps/rejected": -4.098714351654053, + "loss": 0.5508, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.4418601989746094, + "rewards/margins": 0.6568541526794434, + "rewards/rejected": -4.098714351654053, "step": 1460 }, { "epoch": 0.7840776049506607, - "grad_norm": 11.247618506337135, + "grad_norm": 9.441083534738105, "learning_rate": 9.227870209296395e-07, - "logits/chosen": 0.6892117857933044, - "logits/rejected": 0.7534765601158142, - "logps/chosen": -8.057080268859863, - "logps/rejected": -8.565564155578613, - "loss": 0.6136, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -8.057080268859863, - "rewards/margins": 0.5084843039512634, - "rewards/rejected": -8.565564155578613, - "semantic_entropy": 0.00426045898348093, + "logits/chosen": -0.06961619108915329, + "logits/rejected": 0.07172751426696777, + "logps/chosen": -3.596712589263916, + "logps/rejected": -4.116532325744629, + "loss": 0.5965, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.596712589263916, + "rewards/margins": 0.5198195576667786, + "rewards/rejected": -4.116532325744629, "step": 1465 }, { "epoch": 0.7867536377320622, - "grad_norm": 13.37630930170591, + "grad_norm": 9.924468534375313, "learning_rate": 9.219535392058728e-07, - "logits/chosen": 0.6549677848815918, - "logits/rejected": 0.6734327077865601, - "logps/chosen": -8.016042709350586, - "logps/rejected": -8.552094459533691, - "loss": 0.6187, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -8.016042709350586, - "rewards/margins": 0.5360512137413025, - "rewards/rejected": -8.552094459533691, - "semantic_entropy": 0.005072770640254021, + "logits/chosen": -0.117099329829216, + "logits/rejected": -0.0955801010131836, + "logps/chosen": -3.539616823196411, + "logps/rejected": -4.111162185668945, + "loss": 0.5892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.539616823196411, + "rewards/margins": 0.5715450644493103, + "rewards/rejected": -4.111162185668945, "step": 1470 }, { "epoch": 0.7894296705134638, - "grad_norm": 13.216764994602103, + "grad_norm": 12.335356271575431, "learning_rate": 9.211159636870181e-07, - "logits/chosen": 0.717325747013092, - "logits/rejected": 0.8044508695602417, - "logps/chosen": -8.261899948120117, - "logps/rejected": -8.891412734985352, - "loss": 0.5755, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.261899948120117, - "rewards/margins": 0.6295128464698792, - "rewards/rejected": -8.891412734985352, - "semantic_entropy": 0.0033580393064767122, + "logits/chosen": -0.1582588106393814, + "logits/rejected": 0.015975693240761757, + "logps/chosen": -3.5987820625305176, + "logps/rejected": -4.245301246643066, + "loss": 0.5548, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5987820625305176, + "rewards/margins": 0.6465197801589966, + "rewards/rejected": -4.245301246643066, "step": 1475 }, { "epoch": 0.7921057032948654, - "grad_norm": 15.240573495892372, + "grad_norm": 12.493730039049966, "learning_rate": 9.202743024992367e-07, - "logits/chosen": 0.8391082882881165, - "logits/rejected": 0.8751834034919739, - "logps/chosen": -8.093847274780273, - "logps/rejected": -8.829301834106445, - "loss": 0.5484, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.093847274780273, - "rewards/margins": 0.7354532480239868, - "rewards/rejected": -8.829301834106445, - "semantic_entropy": 0.004077838733792305, + "logits/chosen": -0.01800287328660488, + "logits/rejected": 0.07996153086423874, + "logps/chosen": -3.4672462940216064, + "logps/rejected": -4.137523651123047, + "loss": 0.5707, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.4672462940216064, + "rewards/margins": 0.6702775955200195, + "rewards/rejected": -4.137523651123047, "step": 1480 }, { "epoch": 0.7947817360762669, - "grad_norm": 14.604017721504034, + "grad_norm": 13.634165077102823, "learning_rate": 9.194285638083293e-07, - "logits/chosen": 0.8139607310295105, - "logits/rejected": 0.8734992742538452, - "logps/chosen": -8.397181510925293, - "logps/rejected": -9.134657859802246, - "loss": 0.5343, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.397181510925293, - "rewards/margins": 0.7374764680862427, - "rewards/rejected": -9.134657859802246, - "semantic_entropy": 0.0033265065867453814, + "logits/chosen": -0.06193091720342636, + "logits/rejected": 0.0952802449464798, + "logps/chosen": -3.7961997985839844, + "logps/rejected": -4.552024841308594, + "loss": 0.5074, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.7961997985839844, + "rewards/margins": 0.755825400352478, + "rewards/rejected": -4.552024841308594, "step": 1485 }, { "epoch": 0.7974577688576685, - "grad_norm": 17.08531394574266, + "grad_norm": 15.702461728394395, "learning_rate": 9.185787558196562e-07, - "logits/chosen": 0.8408036231994629, - "logits/rejected": 0.882840633392334, - "logps/chosen": -8.239664077758789, - "logps/rejected": -8.969578742980957, - "loss": 0.5681, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.239664077758789, - "rewards/margins": 0.7299133539199829, - "rewards/rejected": -8.969578742980957, - "semantic_entropy": 0.0042380583472549915, + "logits/chosen": -0.11942867189645767, + "logits/rejected": -0.008321581408381462, + "logps/chosen": -3.6584815979003906, + "logps/rejected": -4.335725784301758, + "loss": 0.5576, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.6584815979003906, + "rewards/margins": 0.6772439479827881, + "rewards/rejected": -4.335725784301758, "step": 1490 }, { "epoch": 0.8001338016390701, - "grad_norm": 14.477147607243182, + "grad_norm": 15.067296966882504, "learning_rate": 9.177248867780583e-07, - "logits/chosen": 0.8981844186782837, - "logits/rejected": 0.9408555030822754, - "logps/chosen": -8.414166450500488, - "logps/rejected": -8.8630952835083, - "loss": 0.6356, - "rewards/accuracies": 0.65625, - "rewards/chosen": -8.414166450500488, - "rewards/margins": 0.44892817735671997, - "rewards/rejected": -8.8630952835083, - "semantic_entropy": 0.003574197646230459, + "logits/chosen": -0.06090831011533737, + "logits/rejected": 0.037081725895404816, + "logps/chosen": -4.005630016326904, + "logps/rejected": -4.444871425628662, + "loss": 0.6197, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.005630016326904, + "rewards/margins": 0.43924078345298767, + "rewards/rejected": -4.444871425628662, "step": 1495 }, { "epoch": 0.8028098344204716, - "grad_norm": 13.661227346120718, + "grad_norm": 14.177594321896276, "learning_rate": 9.168669649677769e-07, - "logits/chosen": 0.8391574621200562, - "logits/rejected": 0.8985759019851685, - "logps/chosen": -8.088191032409668, - "logps/rejected": -8.639988899230957, - "loss": 0.6163, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.088191032409668, - "rewards/margins": 0.5517988801002502, - "rewards/rejected": -8.639988899230957, - "semantic_entropy": 0.00466513354331255, + "logits/chosen": -0.1009480208158493, + "logits/rejected": 0.017125608399510384, + "logps/chosen": -3.8164494037628174, + "logps/rejected": -4.393873691558838, + "loss": 0.6297, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.8164494037628174, + "rewards/margins": 0.5774238705635071, + "rewards/rejected": -4.393873691558838, "step": 1500 }, { "epoch": 0.8054858672018732, - "grad_norm": 14.061529510910384, + "grad_norm": 12.70241935163675, "learning_rate": 9.16004998712373e-07, - "logits/chosen": 0.8700096011161804, - "logits/rejected": 0.9046772718429565, - "logps/chosen": -8.150907516479492, - "logps/rejected": -8.664621353149414, - "loss": 0.6186, + "logits/chosen": -0.017670366913080215, + "logits/rejected": 0.06509570777416229, + "logps/chosen": -3.9049553871154785, + "logps/rejected": -4.517946243286133, + "loss": 0.5706, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -8.150907516479492, - "rewards/margins": 0.5137127637863159, - "rewards/rejected": -8.664621353149414, - "semantic_entropy": 0.004039828199893236, + "rewards/chosen": -3.9049553871154785, + "rewards/margins": 0.6129913330078125, + "rewards/rejected": -4.517946243286133, "step": 1505 }, { "epoch": 0.8081618999832748, - "grad_norm": 12.998907380952517, + "grad_norm": 16.38124311602703, "learning_rate": 9.151389963746472e-07, - "logits/chosen": 0.8303213119506836, - "logits/rejected": 0.9604493379592896, - "logps/chosen": -8.156909942626953, - "logps/rejected": -8.906213760375977, - "loss": 0.5157, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.156909942626953, - "rewards/margins": 0.7493036985397339, - "rewards/rejected": -8.906213760375977, - "semantic_entropy": 0.004079463891685009, + "logits/chosen": -0.08304581046104431, + "logits/rejected": 0.2212526500225067, + "logps/chosen": -3.7265231609344482, + "logps/rejected": -4.455516815185547, + "loss": 0.5266, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7265231609344482, + "rewards/margins": 0.7289934158325195, + "rewards/rejected": -4.455516815185547, "step": 1510 }, { "epoch": 0.8108379327646764, - "grad_norm": 11.673455076007292, + "grad_norm": 11.396827153585821, "learning_rate": 9.142689663565577e-07, - "logits/chosen": 0.8863071203231812, - "logits/rejected": 0.9234801530838013, - "logps/chosen": -8.100628852844238, - "logps/rejected": -8.736692428588867, - "loss": 0.5499, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.100628852844238, - "rewards/margins": 0.6360650062561035, - "rewards/rejected": -8.736692428588867, - "semantic_entropy": 0.0043214112520217896, + "logits/chosen": -0.020080409944057465, + "logits/rejected": 0.044258564710617065, + "logps/chosen": -3.6086630821228027, + "logps/rejected": -4.240175724029541, + "loss": 0.5508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.6086630821228027, + "rewards/margins": 0.6315125226974487, + "rewards/rejected": -4.240175724029541, "step": 1515 }, { "epoch": 0.8135139655460779, - "grad_norm": 14.906715664230957, + "grad_norm": 16.95960510484773, "learning_rate": 9.133949170991397e-07, - "logits/chosen": 0.8381370306015015, - "logits/rejected": 0.8832274675369263, - "logps/chosen": -8.123575210571289, - "logps/rejected": -8.772272109985352, - "loss": 0.5631, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.123575210571289, - "rewards/margins": 0.6486952900886536, - "rewards/rejected": -8.772272109985352, - "semantic_entropy": 0.003872636239975691, + "logits/chosen": -0.06366250663995743, + "logits/rejected": 0.03639589995145798, + "logps/chosen": -3.406628131866455, + "logps/rejected": -4.038851737976074, + "loss": 0.5529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.406628131866455, + "rewards/margins": 0.632224440574646, + "rewards/rejected": -4.038851737976074, "step": 1520 }, { "epoch": 0.8161899983274795, - "grad_norm": 13.00308584021274, + "grad_norm": 10.130789604536234, "learning_rate": 9.125168570824231e-07, - "logits/chosen": 0.826133131980896, - "logits/rejected": 0.8961697816848755, - "logps/chosen": -8.128072738647461, - "logps/rejected": -8.749670028686523, - "loss": 0.5707, - "rewards/accuracies": 0.65625, - "rewards/chosen": -8.128072738647461, - "rewards/margins": 0.621599555015564, - "rewards/rejected": -8.749670028686523, - "semantic_entropy": 0.00478363037109375, + "logits/chosen": -0.09025071561336517, + "logits/rejected": 0.08892885595560074, + "logps/chosen": -3.3867111206054688, + "logps/rejected": -3.915360927581787, + "loss": 0.5904, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.3867111206054688, + "rewards/margins": 0.5286494493484497, + "rewards/rejected": -3.915360927581787, "step": 1525 }, { "epoch": 0.8188660311088811, - "grad_norm": 22.234033697271784, + "grad_norm": 33.82610012537084, "learning_rate": 9.116347948253496e-07, - "logits/chosen": 0.7835792303085327, - "logits/rejected": 0.8497790098190308, - "logps/chosen": -8.275663375854492, - "logps/rejected": -8.82735538482666, - "loss": 0.5884, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -8.275663375854492, - "rewards/margins": 0.5516918301582336, - "rewards/rejected": -8.82735538482666, - "semantic_entropy": 0.003584084566682577, + "logits/chosen": -0.09061969071626663, + "logits/rejected": 0.056922584772109985, + "logps/chosen": -3.4702510833740234, + "logps/rejected": -3.937161922454834, + "loss": 0.6048, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.4702510833740234, + "rewards/margins": 0.46691060066223145, + "rewards/rejected": -3.937161922454834, "step": 1530 }, { "epoch": 0.8215420638902826, - "grad_norm": 13.347377014963383, + "grad_norm": 12.000129259477653, "learning_rate": 9.107487388856916e-07, - "logits/chosen": 0.7705615758895874, - "logits/rejected": 0.8761787414550781, - "logps/chosen": -8.147015571594238, - "logps/rejected": -8.853046417236328, - "loss": 0.5173, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.147015571594238, - "rewards/margins": 0.7060302495956421, - "rewards/rejected": -8.853046417236328, - "semantic_entropy": 0.004314957652240992, + "logits/chosen": -0.08950912952423096, + "logits/rejected": 0.10529811680316925, + "logps/chosen": -3.2299671173095703, + "logps/rejected": -3.774557113647461, + "loss": 0.5491, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.2299671173095703, + "rewards/margins": 0.5445898175239563, + "rewards/rejected": -3.774557113647461, "step": 1535 }, { "epoch": 0.8242180966716842, - "grad_norm": 18.743981589501008, + "grad_norm": 12.629843667720875, "learning_rate": 9.098586978599673e-07, - "logits/chosen": 0.7425702214241028, - "logits/rejected": 0.8426684141159058, - "logps/chosen": -8.143719673156738, - "logps/rejected": -8.96298599243164, - "loss": 0.5714, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.143719673156738, - "rewards/margins": 0.8192659616470337, - "rewards/rejected": -8.96298599243164, - "semantic_entropy": 0.00467184092849493, + "logits/chosen": -0.005651325918734074, + "logits/rejected": 0.15307532250881195, + "logps/chosen": -3.195488452911377, + "logps/rejected": -3.9492506980895996, + "loss": 0.529, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.195488452911377, + "rewards/margins": 0.7537627220153809, + "rewards/rejected": -3.9492506980895996, "step": 1540 }, { "epoch": 0.8268941294530858, - "grad_norm": 17.857816830477596, + "grad_norm": 9.845085113059666, "learning_rate": 9.089646803833588e-07, - "logits/chosen": 0.6718012094497681, - "logits/rejected": 0.7780871987342834, - "logps/chosen": -8.064419746398926, - "logps/rejected": -8.771439552307129, - "loss": 0.5654, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.064419746398926, - "rewards/margins": 0.7070209383964539, - "rewards/rejected": -8.771439552307129, - "semantic_entropy": 0.004769052378833294, + "logits/chosen": 0.007146684918552637, + "logits/rejected": 0.16614550352096558, + "logps/chosen": -3.0856337547302246, + "logps/rejected": -3.6454014778137207, + "loss": 0.5638, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.0856337547302246, + "rewards/margins": 0.5597675442695618, + "rewards/rejected": -3.6454014778137207, "step": 1545 }, { "epoch": 0.8295701622344873, - "grad_norm": 15.883695166387948, + "grad_norm": 10.630223618962006, "learning_rate": 9.080666951296276e-07, - "logits/chosen": 0.523202121257782, - "logits/rejected": 0.7221616506576538, - "logps/chosen": -7.929041385650635, - "logps/rejected": -8.971317291259766, - "loss": 0.4605, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -7.929041385650635, - "rewards/margins": 1.0422756671905518, - "rewards/rejected": -8.971317291259766, - "semantic_entropy": 0.0057184770703315735, + "logits/chosen": -0.1607871651649475, + "logits/rejected": 0.11934441328048706, + "logps/chosen": -3.0448296070098877, + "logps/rejected": -3.831932544708252, + "loss": 0.4986, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0448296070098877, + "rewards/margins": 0.7871026992797852, + "rewards/rejected": -3.831932544708252, "step": 1550 }, { "epoch": 0.8322461950158889, - "grad_norm": 11.62870813642443, + "grad_norm": 10.567649129904295, "learning_rate": 9.071647508110305e-07, - "logits/chosen": 0.5561312437057495, - "logits/rejected": 0.7267721891403198, - "logps/chosen": -7.7915754318237305, - "logps/rejected": -8.73208999633789, - "loss": 0.524, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.7915754318237305, - "rewards/margins": 0.9405128359794617, - "rewards/rejected": -8.73208999633789, - "semantic_entropy": 0.005991402082145214, + "logits/chosen": -0.1058889776468277, + "logits/rejected": 0.17786701023578644, + "logps/chosen": -3.169146776199341, + "logps/rejected": -3.926156520843506, + "loss": 0.5599, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.169146776199341, + "rewards/margins": 0.7570096850395203, + "rewards/rejected": -3.926156520843506, "step": 1555 }, { "epoch": 0.8349222277972905, - "grad_norm": 12.493629792798085, + "grad_norm": 12.198403470435926, "learning_rate": 9.062588561782354e-07, - "logits/chosen": 0.6039088368415833, - "logits/rejected": 0.6618218421936035, - "logps/chosen": -8.060002326965332, - "logps/rejected": -8.698019981384277, - "loss": 0.5877, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -8.060002326965332, - "rewards/margins": 0.6380175352096558, - "rewards/rejected": -8.698019981384277, - "semantic_entropy": 0.004686708562076092, + "logits/chosen": -0.024015041068196297, + "logits/rejected": 0.054738402366638184, + "logps/chosen": -3.3969712257385254, + "logps/rejected": -3.9629223346710205, + "loss": 0.5968, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.3969712257385254, + "rewards/margins": 0.5659508109092712, + "rewards/rejected": -3.9629223346710205, "step": 1560 }, { "epoch": 0.8375982605786921, - "grad_norm": 11.295471536198134, + "grad_norm": 9.99089348686597, "learning_rate": 9.053490200202358e-07, - "logits/chosen": 0.7054456472396851, - "logits/rejected": 0.763306736946106, - "logps/chosen": -8.17889404296875, - "logps/rejected": -8.809242248535156, - "loss": 0.5912, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.17889404296875, - "rewards/margins": 0.6303480863571167, - "rewards/rejected": -8.809242248535156, - "semantic_entropy": 0.004566199611872435, + "logits/chosen": 0.013892561197280884, + "logits/rejected": 0.10044942051172256, + "logps/chosen": -3.57891845703125, + "logps/rejected": -4.152596473693848, + "loss": 0.5837, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.57891845703125, + "rewards/margins": 0.5736778974533081, + "rewards/rejected": -4.152596473693848, "step": 1565 }, { "epoch": 0.8402742933600936, - "grad_norm": 18.547048805065355, + "grad_norm": 17.54555751562761, "learning_rate": 9.044352511642661e-07, - "logits/chosen": 0.7248358726501465, - "logits/rejected": 0.7658167481422424, - "logps/chosen": -8.291397094726562, - "logps/rejected": -8.84311294555664, - "loss": 0.6214, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.291397094726562, - "rewards/margins": 0.5517162680625916, - "rewards/rejected": -8.84311294555664, - "semantic_entropy": 0.003865548875182867, + "logits/chosen": -0.023263219743967056, + "logits/rejected": 0.029283057898283005, + "logps/chosen": -3.496896743774414, + "logps/rejected": -4.001120567321777, + "loss": 0.6275, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.496896743774414, + "rewards/margins": 0.5042238235473633, + "rewards/rejected": -4.001120567321777, "step": 1570 }, { "epoch": 0.8429503261414952, - "grad_norm": 12.961728802829933, + "grad_norm": 9.845384993177653, "learning_rate": 9.03517558475716e-07, - "logits/chosen": 0.721314549446106, - "logits/rejected": 0.7908953428268433, - "logps/chosen": -8.195411682128906, - "logps/rejected": -8.733312606811523, - "loss": 0.5662, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.195411682128906, - "rewards/margins": 0.5379008650779724, - "rewards/rejected": -8.733312606811523, - "semantic_entropy": 0.004040508531033993, + "logits/chosen": -0.020920494571328163, + "logits/rejected": 0.08416114002466202, + "logps/chosen": -3.300476551055908, + "logps/rejected": -3.825829267501831, + "loss": 0.5759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.300476551055908, + "rewards/margins": 0.5253530144691467, + "rewards/rejected": -3.825829267501831, "step": 1575 }, { "epoch": 0.8456263589228968, - "grad_norm": 13.388709273345874, + "grad_norm": 10.738592871521004, "learning_rate": 9.025959508580436e-07, - "logits/chosen": 0.7946035861968994, - "logits/rejected": 0.9081939458847046, - "logps/chosen": -8.505581855773926, - "logps/rejected": -9.189567565917969, - "loss": 0.5373, + "logits/chosen": 0.027344584465026855, + "logits/rejected": 0.264851838350296, + "logps/chosen": -3.5593318939208984, + "logps/rejected": -4.242374420166016, + "loss": 0.5473, "rewards/accuracies": 0.71875, - "rewards/chosen": -8.505581855773926, - "rewards/margins": 0.6839855313301086, - "rewards/rejected": -9.189567565917969, - "semantic_entropy": 0.003315441310405731, + "rewards/chosen": -3.5593318939208984, + "rewards/margins": 0.6830425262451172, + "rewards/rejected": -4.242374420166016, "step": 1580 }, { "epoch": 0.8483023917042983, - "grad_norm": 13.04328623863774, + "grad_norm": 11.53754297403796, "learning_rate": 9.016704372526905e-07, - "logits/chosen": 0.7168788313865662, - "logits/rejected": 0.8062397837638855, - "logps/chosen": -8.312705039978027, - "logps/rejected": -8.956674575805664, - "loss": 0.5598, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.312705039978027, - "rewards/margins": 0.6439692378044128, - "rewards/rejected": -8.956674575805664, - "semantic_entropy": 0.004072139970958233, + "logits/chosen": -0.0490080788731575, + "logits/rejected": 0.12134552001953125, + "logps/chosen": -3.4022674560546875, + "logps/rejected": -4.034855365753174, + "loss": 0.5519, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.4022674560546875, + "rewards/margins": 0.6325877904891968, + "rewards/rejected": -4.034855365753174, "step": 1585 }, { "epoch": 0.8509784244856999, - "grad_norm": 19.229092305162187, + "grad_norm": 18.310010960395953, "learning_rate": 9.007410266389934e-07, - "logits/chosen": 0.6322071552276611, - "logits/rejected": 0.6842302680015564, - "logps/chosen": -8.209632873535156, - "logps/rejected": -8.76137638092041, - "loss": 0.5783, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -8.209632873535156, - "rewards/margins": 0.5517433881759644, - "rewards/rejected": -8.76137638092041, - "semantic_entropy": 0.003791673108935356, + "logits/chosen": -0.0990963727235794, + "logits/rejected": -0.019897878170013428, + "logps/chosen": -3.3184802532196045, + "logps/rejected": -3.829444408416748, + "loss": 0.5855, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.3184802532196045, + "rewards/margins": 0.5109640955924988, + "rewards/rejected": -3.829444408416748, "step": 1590 }, { "epoch": 0.8536544572671015, - "grad_norm": 15.721412896614648, + "grad_norm": 13.03385752526129, "learning_rate": 8.998077280340981e-07, - "logits/chosen": 0.6889594793319702, - "logits/rejected": 0.7277365922927856, - "logps/chosen": -8.424080848693848, - "logps/rejected": -8.998977661132812, - "loss": 0.5645, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.424080848693848, - "rewards/margins": 0.5748964548110962, - "rewards/rejected": -8.998977661132812, - "semantic_entropy": 0.0032078386284410954, + "logits/chosen": 0.001789632486179471, + "logits/rejected": 0.06162182241678238, + "logps/chosen": -3.4980416297912598, + "logps/rejected": -3.936983108520508, + "loss": 0.6146, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.4980416297912598, + "rewards/margins": 0.4389413893222809, + "rewards/rejected": -3.936983108520508, "step": 1595 }, { "epoch": 0.8563304900485031, - "grad_norm": 12.71386552957631, + "grad_norm": 9.377595148289215, "learning_rate": 8.988705504928722e-07, - "logits/chosen": 0.6700653433799744, - "logits/rejected": 0.7809063196182251, - "logps/chosen": -8.39610481262207, - "logps/rejected": -9.334978103637695, - "loss": 0.4796, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.39610481262207, - "rewards/margins": 0.938875675201416, - "rewards/rejected": -9.334978103637695, - "semantic_entropy": 0.00389484572224319, + "logits/chosen": -0.10836903750896454, + "logits/rejected": 0.08811955899000168, + "logps/chosen": -3.4870657920837402, + "logps/rejected": -4.346532344818115, + "loss": 0.4794, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.4870657920837402, + "rewards/margins": 0.8594663739204407, + "rewards/rejected": -4.346532344818115, "step": 1600 }, { "epoch": 0.8563304900485031, - "eval_logits/chosen": 0.8517152070999146, - "eval_logits/rejected": 0.9157667756080627, - "eval_logps/chosen": -8.478996276855469, - "eval_logps/rejected": -9.19737434387207, - "eval_loss": 0.5405778884887695, - "eval_rewards/accuracies": 0.7047477960586548, - "eval_rewards/chosen": -8.478996276855469, - "eval_rewards/margins": 0.7183785438537598, - "eval_rewards/rejected": -9.19737434387207, - "eval_runtime": 35.1436, - "eval_samples_per_second": 38.272, - "eval_semantic_entropy": 0.0034910058602690697, - "eval_steps_per_second": 9.589, + "eval_logits/chosen": 0.2849295735359192, + "eval_logits/rejected": 0.388105571269989, + "eval_logps/chosen": -3.525459051132202, + "eval_logps/rejected": -4.217833518981934, + "eval_loss": 0.5320321321487427, + "eval_rewards/accuracies": 0.7277448177337646, + "eval_rewards/chosen": -3.525459051132202, + "eval_rewards/margins": 0.6923741102218628, + "eval_rewards/rejected": -4.217833518981934, + "eval_runtime": 40.3539, + "eval_samples_per_second": 33.33, + "eval_steps_per_second": 8.351, "step": 1600 }, { "epoch": 0.8590065228299046, - "grad_norm": 15.698469595260914, + "grad_norm": 12.362492194582563, "learning_rate": 8.979295031078157e-07, - "logits/chosen": 0.6854676008224487, - "logits/rejected": 0.8206149935722351, - "logps/chosen": -8.568761825561523, - "logps/rejected": -9.31121826171875, - "loss": 0.5156, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.568761825561523, - "rewards/margins": 0.742457389831543, - "rewards/rejected": -9.31121826171875, - "semantic_entropy": 0.003175111021846533, + "logits/chosen": -0.10155310481786728, + "logits/rejected": 0.13362309336662292, + "logps/chosen": -3.7728633880615234, + "logps/rejected": -4.504532814025879, + "loss": 0.5171, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.7728633880615234, + "rewards/margins": 0.7316694855690002, + "rewards/rejected": -4.504532814025879, "step": 1605 }, { "epoch": 0.8616825556113062, - "grad_norm": 13.477417972760096, + "grad_norm": 11.813424021982982, "learning_rate": 8.969845950089751e-07, - "logits/chosen": 0.699101448059082, - "logits/rejected": 0.8196004033088684, - "logps/chosen": -8.327180862426758, - "logps/rejected": -9.16191577911377, - "loss": 0.51, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.327180862426758, - "rewards/margins": 0.8347347974777222, - "rewards/rejected": -9.16191577911377, - "semantic_entropy": 0.0040628439746797085, + "logits/chosen": -0.13351674377918243, + "logits/rejected": 0.049798864871263504, + "logps/chosen": -3.7631313800811768, + "logps/rejected": -4.513304710388184, + "loss": 0.5422, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.7631313800811768, + "rewards/margins": 0.7501733899116516, + "rewards/rejected": -4.513304710388184, "step": 1610 }, { "epoch": 0.8643585883927078, - "grad_norm": 20.385879298528565, + "grad_norm": 16.446215875957925, "learning_rate": 8.960358353638526e-07, - "logits/chosen": 0.7844869494438171, - "logits/rejected": 0.8709976077079773, - "logps/chosen": -8.346048355102539, - "logps/rejected": -9.043893814086914, - "loss": 0.5844, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.346048355102539, - "rewards/margins": 0.6978455781936646, - "rewards/rejected": -9.043893814086914, - "semantic_entropy": 0.004238657653331757, + "logits/chosen": -0.07797622680664062, + "logits/rejected": 0.04963933676481247, + "logps/chosen": -3.6938133239746094, + "logps/rejected": -4.348758697509766, + "loss": 0.6157, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6938133239746094, + "rewards/margins": 0.6549461483955383, + "rewards/rejected": -4.348758697509766, "step": 1615 }, { "epoch": 0.8670346211741093, - "grad_norm": 16.839936129408876, + "grad_norm": 12.154917698891413, "learning_rate": 8.950832333773184e-07, - "logits/chosen": 0.8071925044059753, - "logits/rejected": 0.9016444087028503, - "logps/chosen": -8.5030517578125, - "logps/rejected": -9.184589385986328, - "loss": 0.5976, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -8.5030517578125, - "rewards/margins": 0.6815375685691833, - "rewards/rejected": -9.184589385986328, - "semantic_entropy": 0.0040657538920640945, + "logits/chosen": -0.10814926773309708, + "logits/rejected": 0.05815862491726875, + "logps/chosen": -3.725407361984253, + "logps/rejected": -4.323855400085449, + "loss": 0.6282, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -3.725407361984253, + "rewards/margins": 0.5984477996826172, + "rewards/rejected": -4.323855400085449, "step": 1620 }, { "epoch": 0.869710653955511, - "grad_norm": 16.82819858205259, + "grad_norm": 14.915878015184393, "learning_rate": 8.941267982915213e-07, - "logits/chosen": 0.8722259402275085, - "logits/rejected": 0.9098442196846008, - "logps/chosen": -8.707246780395508, - "logps/rejected": -9.02901554107666, - "loss": 0.7021, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -8.707246780395508, - "rewards/margins": 0.32176870107650757, - "rewards/rejected": -9.02901554107666, - "semantic_entropy": 0.0030822004191577435, + "logits/chosen": -0.024492016062140465, + "logits/rejected": 0.019109733402729034, + "logps/chosen": -3.630009412765503, + "logps/rejected": -3.945751667022705, + "loss": 0.7055, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.630009412765503, + "rewards/margins": 0.3157421946525574, + "rewards/rejected": -3.945751667022705, "step": 1625 }, { "epoch": 0.8723866867369126, - "grad_norm": 12.54191217498305, + "grad_norm": 10.65720610916705, "learning_rate": 8.931665393857983e-07, - "logits/chosen": 0.853954017162323, - "logits/rejected": 0.9320189356803894, - "logps/chosen": -8.5809965133667, - "logps/rejected": -9.205659866333008, - "loss": 0.5758, + "logits/chosen": -0.03980112075805664, + "logits/rejected": 0.10950887203216553, + "logps/chosen": -3.2014377117156982, + "logps/rejected": -3.8042843341827393, + "loss": 0.5748, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.5809965133667, - "rewards/margins": 0.6246632933616638, - "rewards/rejected": -9.205659866333008, - "semantic_entropy": 0.002861475106328726, + "rewards/chosen": -3.2014377117156982, + "rewards/margins": 0.6028466820716858, + "rewards/rejected": -3.8042843341827393, "step": 1630 }, { "epoch": 0.875062719518314, - "grad_norm": 13.279557320703779, + "grad_norm": 9.859687545071464, "learning_rate": 8.922024659765861e-07, - "logits/chosen": 0.830333411693573, - "logits/rejected": 0.9043375849723816, - "logps/chosen": -8.427899360656738, - "logps/rejected": -9.160634994506836, - "loss": 0.5253, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.427899360656738, - "rewards/margins": 0.7327350378036499, - "rewards/rejected": -9.160634994506836, - "semantic_entropy": 0.0032138600945472717, + "logits/chosen": -0.14248326420783997, + "logits/rejected": -0.007390056736767292, + "logps/chosen": -2.8953769207000732, + "logps/rejected": -3.584461212158203, + "loss": 0.5265, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8953769207000732, + "rewards/margins": 0.6890842914581299, + "rewards/rejected": -3.584461212158203, "step": 1635 }, { "epoch": 0.8777387522997157, - "grad_norm": 19.39467151368381, + "grad_norm": 11.331848058947434, "learning_rate": 8.912345874173288e-07, - "logits/chosen": 0.8193842172622681, - "logits/rejected": 0.8876082301139832, - "logps/chosen": -8.680830001831055, - "logps/rejected": -9.321008682250977, - "loss": 0.5834, + "logits/chosen": -0.10469019412994385, + "logits/rejected": 0.026147732511162758, + "logps/chosen": -2.997683048248291, + "logps/rejected": -3.6705849170684814, + "loss": 0.5566, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.680830001831055, - "rewards/margins": 0.6401779651641846, - "rewards/rejected": -9.321008682250977, - "semantic_entropy": 0.0026057157665491104, + "rewards/chosen": -2.997683048248291, + "rewards/margins": 0.6729012727737427, + "rewards/rejected": -3.6705849170684814, "step": 1640 }, { "epoch": 0.8804147850811173, - "grad_norm": 15.252904015477414, + "grad_norm": 10.079829240319278, "learning_rate": 8.902629130983885e-07, - "logits/chosen": 0.8152815103530884, - "logits/rejected": 0.8414192199707031, - "logps/chosen": -8.817136764526367, - "logps/rejected": -9.296814918518066, - "loss": 0.6108, + "logits/chosen": -0.07183770090341568, + "logits/rejected": -0.03483227640390396, + "logps/chosen": -3.0931262969970703, + "logps/rejected": -3.5680012702941895, + "loss": 0.5822, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.817136764526367, - "rewards/margins": 0.4796779751777649, - "rewards/rejected": -9.296814918518066, - "semantic_entropy": 0.002496039029210806, + "rewards/chosen": -3.0931262969970703, + "rewards/margins": 0.4748748242855072, + "rewards/rejected": -3.5680012702941895, "step": 1645 }, { "epoch": 0.8830908178625189, - "grad_norm": 17.953373871726004, + "grad_norm": 12.405999898580374, "learning_rate": 8.892874524469537e-07, - "logits/chosen": 0.8935707211494446, - "logits/rejected": 0.9353858232498169, - "logps/chosen": -8.69524097442627, - "logps/rejected": -9.359209060668945, - "loss": 0.5249, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.69524097442627, - "rewards/margins": 0.6639670133590698, - "rewards/rejected": -9.359209060668945, - "semantic_entropy": 0.002959498204290867, + "logits/chosen": 0.046564534306526184, + "logits/rejected": 0.0863698273897171, + "logps/chosen": -2.9996213912963867, + "logps/rejected": -3.6157379150390625, + "loss": 0.5335, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9996213912963867, + "rewards/margins": 0.6161164045333862, + "rewards/rejected": -3.6157379150390625, "step": 1650 }, { "epoch": 0.8857668506439204, - "grad_norm": 15.625644253516462, + "grad_norm": 13.786492724467509, "learning_rate": 8.883082149269478e-07, - "logits/chosen": 0.8291314840316772, - "logits/rejected": 0.8965352177619934, - "logps/chosen": -8.816374778747559, - "logps/rejected": -9.487445831298828, - "loss": 0.5349, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.816374778747559, - "rewards/margins": 0.6710702180862427, - "rewards/rejected": -9.487445831298828, - "semantic_entropy": 0.0024764954578131437, + "logits/chosen": -0.07310228794813156, + "logits/rejected": 0.04409448802471161, + "logps/chosen": -3.1030819416046143, + "logps/rejected": -3.716714382171631, + "loss": 0.536, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.1030819416046143, + "rewards/margins": 0.6136326789855957, + "rewards/rejected": -3.716714382171631, "step": 1655 }, { "epoch": 0.888442883425322, - "grad_norm": 15.866078991034088, + "grad_norm": 11.272654788190755, "learning_rate": 8.873252100389377e-07, - "logits/chosen": 0.8331910371780396, - "logits/rejected": 0.8664076924324036, - "logps/chosen": -8.80284595489502, - "logps/rejected": -9.485407829284668, - "loss": 0.5391, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.80284595489502, - "rewards/margins": 0.6825627088546753, - "rewards/rejected": -9.485407829284668, - "semantic_entropy": 0.002537056338042021, + "logits/chosen": -0.054821424186229706, + "logits/rejected": -0.005968198180198669, + "logps/chosen": -3.0350852012634277, + "logps/rejected": -3.6073086261749268, + "loss": 0.5363, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0350852012634277, + "rewards/margins": 0.5722234845161438, + "rewards/rejected": -3.6073086261749268, "step": 1660 }, { "epoch": 0.8911189162067236, - "grad_norm": 18.091028298007316, + "grad_norm": 15.073450366337543, "learning_rate": 8.863384473200411e-07, - "logits/chosen": 0.8735591769218445, - "logits/rejected": 0.8835130929946899, - "logps/chosen": -9.021527290344238, - "logps/rejected": -9.55382251739502, - "loss": 0.5901, + "logits/chosen": -0.03717409446835518, + "logits/rejected": 0.021764075383543968, + "logps/chosen": -3.486513137817383, + "logps/rejected": -3.9243502616882324, + "loss": 0.6187, "rewards/accuracies": 0.6875, - "rewards/chosen": -9.021527290344238, - "rewards/margins": 0.532294750213623, - "rewards/rejected": -9.55382251739502, - "semantic_entropy": 0.0023630578070878983, + "rewards/chosen": -3.486513137817383, + "rewards/margins": 0.4378372132778168, + "rewards/rejected": -3.9243502616882324, "step": 1665 }, { "epoch": 0.8937949489881251, - "grad_norm": 15.178387048780932, + "grad_norm": 12.846001634153197, "learning_rate": 8.853479363438342e-07, - "logits/chosen": 0.8976732492446899, - "logits/rejected": 0.9629983901977539, - "logps/chosen": -9.05078411102295, - "logps/rejected": -9.503996849060059, - "loss": 0.63, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -9.05078411102295, - "rewards/margins": 0.45321202278137207, - "rewards/rejected": -9.503996849060059, - "semantic_entropy": 0.0022809661459177732, + "logits/chosen": 0.018962472677230835, + "logits/rejected": 0.1796277016401291, + "logps/chosen": -3.4363675117492676, + "logps/rejected": -3.8784077167510986, + "loss": 0.6481, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.4363675117492676, + "rewards/margins": 0.44204002618789673, + "rewards/rejected": -3.8784077167510986, "step": 1670 }, { "epoch": 0.8964709817695267, - "grad_norm": 16.07101375873566, + "grad_norm": 11.762229306412117, "learning_rate": 8.843536867202588e-07, - "logits/chosen": 0.8819114565849304, - "logits/rejected": 0.9694005250930786, - "logps/chosen": -8.932337760925293, - "logps/rejected": -9.645790100097656, - "loss": 0.5404, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.932337760925293, - "rewards/margins": 0.7134513258934021, - "rewards/rejected": -9.645790100097656, - "semantic_entropy": 0.002526444150134921, + "logits/chosen": 0.006082400679588318, + "logits/rejected": 0.2097162902355194, + "logps/chosen": -3.4231345653533936, + "logps/rejected": -4.073763847351074, + "loss": 0.5573, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4231345653533936, + "rewards/margins": 0.6506296396255493, + "rewards/rejected": -4.073763847351074, "step": 1675 }, { "epoch": 0.8991470145509283, - "grad_norm": 22.308829335036855, + "grad_norm": 13.918436173757607, "learning_rate": 8.833557080955292e-07, - "logits/chosen": 0.8551505208015442, - "logits/rejected": 0.8981190919876099, - "logps/chosen": -8.808084487915039, - "logps/rejected": -9.255620956420898, - "loss": 0.644, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -8.808084487915039, - "rewards/margins": 0.4475362300872803, - "rewards/rejected": -9.255620956420898, - "semantic_entropy": 0.0026800683699548244, + "logits/chosen": -0.09952910989522934, + "logits/rejected": 0.017457852140069008, + "logps/chosen": -3.382654905319214, + "logps/rejected": -3.8396778106689453, + "loss": 0.6068, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.382654905319214, + "rewards/margins": 0.4570225179195404, + "rewards/rejected": -3.8396778106689453, "step": 1680 }, { "epoch": 0.9018230473323299, - "grad_norm": 20.755871823674013, + "grad_norm": 14.182613565985802, "learning_rate": 8.823540101520381e-07, - "logits/chosen": 0.8553838729858398, - "logits/rejected": 0.9643619656562805, - "logps/chosen": -8.639958381652832, - "logps/rejected": -9.357548713684082, - "loss": 0.5769, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.639958381652832, - "rewards/margins": 0.7175900340080261, - "rewards/rejected": -9.357548713684082, - "semantic_entropy": 0.002625108230859041, + "logits/chosen": -0.03426302969455719, + "logits/rejected": 0.16999930143356323, + "logps/chosen": -3.2573084831237793, + "logps/rejected": -3.861452579498291, + "loss": 0.5956, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.2573084831237793, + "rewards/margins": 0.6041439771652222, + "rewards/rejected": -3.861452579498291, "step": 1685 }, { "epoch": 0.9044990801137314, - "grad_norm": 17.716986972701864, + "grad_norm": 11.643194261674557, "learning_rate": 8.813486026082637e-07, - "logits/chosen": 0.8817728161811829, - "logits/rejected": 0.9899671673774719, - "logps/chosen": -8.551309585571289, - "logps/rejected": -9.368635177612305, - "loss": 0.5262, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.551309585571289, - "rewards/margins": 0.8173257112503052, - "rewards/rejected": -9.368635177612305, - "semantic_entropy": 0.0030341236852109432, + "logits/chosen": -0.0401470847427845, + "logits/rejected": 0.16062679886817932, + "logps/chosen": -3.208315372467041, + "logps/rejected": -3.9116744995117188, + "loss": 0.5201, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.208315372467041, + "rewards/margins": 0.7033589482307434, + "rewards/rejected": -3.9116744995117188, "step": 1690 }, { "epoch": 0.907175112895133, - "grad_norm": 26.70901823794411, + "grad_norm": 16.967312633847413, "learning_rate": 8.803394952186742e-07, - "logits/chosen": 0.7727741003036499, - "logits/rejected": 0.8659934997558594, - "logps/chosen": -8.45335578918457, - "logps/rejected": -9.134082794189453, - "loss": 0.528, + "logits/chosen": -0.20215150713920593, + "logits/rejected": -0.035997480154037476, + "logps/chosen": -3.2470271587371826, + "logps/rejected": -3.812593460083008, + "loss": 0.5572, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.45335578918457, - "rewards/margins": 0.6807276606559753, - "rewards/rejected": -9.134082794189453, - "semantic_entropy": 0.0030217047315090895, + "rewards/chosen": -3.2470271587371826, + "rewards/margins": 0.5655666589736938, + "rewards/rejected": -3.812593460083008, "step": 1695 }, { "epoch": 0.9098511456765346, - "grad_norm": 15.766523978337645, + "grad_norm": 13.585507326867159, "learning_rate": 8.793266977736342e-07, - "logits/chosen": 0.8686197996139526, - "logits/rejected": 0.8411453366279602, - "logps/chosen": -8.678540229797363, - "logps/rejected": -9.106678009033203, - "loss": 0.6239, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.678540229797363, - "rewards/margins": 0.42813801765441895, - "rewards/rejected": -9.106678009033203, - "semantic_entropy": 0.0028703988064080477, + "logits/chosen": -0.028521955013275146, + "logits/rejected": -0.08319459855556488, + "logps/chosen": -3.4853758811950684, + "logps/rejected": -3.817878007888794, + "loss": 0.6333, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.4853758811950684, + "rewards/margins": 0.3325018882751465, + "rewards/rejected": -3.817878007888794, "step": 1700 }, { "epoch": 0.9125271784579361, - "grad_norm": 18.30778845018795, + "grad_norm": 16.071374671373565, "learning_rate": 8.783102200993085e-07, - "logits/chosen": 0.8415244817733765, - "logits/rejected": 0.9102290868759155, - "logps/chosen": -8.705827713012695, - "logps/rejected": -9.459232330322266, - "loss": 0.5234, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.705827713012695, - "rewards/margins": 0.753406286239624, - "rewards/rejected": -9.459232330322266, - "semantic_entropy": 0.002563622547313571, + "logits/chosen": -0.05006355047225952, + "logits/rejected": 0.08777417242527008, + "logps/chosen": -3.5207648277282715, + "logps/rejected": -4.162658214569092, + "loss": 0.5392, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.5207648277282715, + "rewards/margins": 0.6418930888175964, + "rewards/rejected": -4.162658214569092, "step": 1705 }, { "epoch": 0.9152032112393377, - "grad_norm": 14.122552458570945, + "grad_norm": 12.268957137894239, "learning_rate": 8.772900720575683e-07, - "logits/chosen": 0.8687243461608887, - "logits/rejected": 0.9228528738021851, - "logps/chosen": -8.917773246765137, - "logps/rejected": -9.463602066040039, - "loss": 0.5956, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.917773246765137, - "rewards/margins": 0.5458282828330994, - "rewards/rejected": -9.463602066040039, - "semantic_entropy": 0.002534933853894472, + "logits/chosen": -0.1048380509018898, + "logits/rejected": 0.006399867124855518, + "logps/chosen": -3.7375035285949707, + "logps/rejected": -4.228513240814209, + "loss": 0.5928, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7375035285949707, + "rewards/margins": 0.4910096228122711, + "rewards/rejected": -4.228513240814209, "step": 1710 }, { "epoch": 0.9178792440207393, - "grad_norm": 22.1831103043282, + "grad_norm": 14.917234621309115, "learning_rate": 8.762662635458944e-07, - "logits/chosen": 0.8289508819580078, - "logits/rejected": 0.9157236218452454, - "logps/chosen": -8.972761154174805, - "logps/rejected": -9.611922264099121, - "loss": 0.6294, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.972761154174805, - "rewards/margins": 0.6391609907150269, - "rewards/rejected": -9.611922264099121, - "semantic_entropy": 0.0023491496685892344, + "logits/chosen": -0.12531444430351257, + "logits/rejected": 0.050012826919555664, + "logps/chosen": -4.011713981628418, + "logps/rejected": -4.624884605407715, + "loss": 0.5882, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.011713981628418, + "rewards/margins": 0.6131707429885864, + "rewards/rejected": -4.624884605407715, "step": 1715 }, { "epoch": 0.9205552768021408, - "grad_norm": 18.48437033985848, + "grad_norm": 14.444307268519669, "learning_rate": 8.752388044972811e-07, - "logits/chosen": 0.8286212086677551, - "logits/rejected": 0.8758748769760132, - "logps/chosen": -8.496380805969238, - "logps/rejected": -9.228724479675293, - "loss": 0.5515, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.496380805969238, - "rewards/margins": 0.7323442101478577, - "rewards/rejected": -9.228724479675293, - "semantic_entropy": 0.003955576568841934, + "logits/chosen": -0.0858539491891861, + "logits/rejected": 0.005042278673499823, + "logps/chosen": -3.905574083328247, + "logps/rejected": -4.620153903961182, + "loss": 0.5331, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.905574083328247, + "rewards/margins": 0.7145795822143555, + "rewards/rejected": -4.620153903961182, "step": 1720 }, { "epoch": 0.9232313095835424, - "grad_norm": 15.320431126329213, + "grad_norm": 11.38703612490689, "learning_rate": 8.74207704880141e-07, - "logits/chosen": 0.7788019180297852, - "logits/rejected": 0.8485990762710571, - "logps/chosen": -8.44409465789795, - "logps/rejected": -9.369488716125488, - "loss": 0.4862, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.44409465789795, - "rewards/margins": 0.9253931045532227, - "rewards/rejected": -9.369488716125488, - "semantic_entropy": 0.0033842413686215878, + "logits/chosen": -0.07306094467639923, + "logits/rejected": 0.03950928524136543, + "logps/chosen": -4.182978630065918, + "logps/rejected": -5.029630184173584, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.182978630065918, + "rewards/margins": 0.8466521501541138, + "rewards/rejected": -5.029630184173584, "step": 1725 }, { "epoch": 0.925907342364944, - "grad_norm": 13.032187521778193, + "grad_norm": 13.316690545599727, "learning_rate": 8.731729746982068e-07, - "logits/chosen": 0.7908933758735657, - "logits/rejected": 0.839871883392334, - "logps/chosen": -8.113957405090332, - "logps/rejected": -8.812549591064453, - "loss": 0.5337, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.113957405090332, - "rewards/margins": 0.6985923647880554, - "rewards/rejected": -8.812549591064453, - "semantic_entropy": 0.004230237565934658, + "logits/chosen": -0.03247485309839249, + "logits/rejected": 0.051323145627975464, + "logps/chosen": -3.9788131713867188, + "logps/rejected": -4.636987686157227, + "loss": 0.5478, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.9788131713867188, + "rewards/margins": 0.6581746339797974, + "rewards/rejected": -4.636987686157227, "step": 1730 }, { "epoch": 0.9285833751463456, - "grad_norm": 19.494387499479178, + "grad_norm": 14.864106633112458, "learning_rate": 8.721346239904355e-07, - "logits/chosen": 0.72081059217453, - "logits/rejected": 0.8257828950881958, - "logps/chosen": -8.163381576538086, - "logps/rejected": -8.904546737670898, - "loss": 0.5976, - "rewards/accuracies": 0.625, - "rewards/chosen": -8.163381576538086, - "rewards/margins": 0.7411641478538513, - "rewards/rejected": -8.904546737670898, - "semantic_entropy": 0.00452050007879734, + "logits/chosen": -0.20449359714984894, + "logits/rejected": 0.030726462602615356, + "logps/chosen": -4.114497661590576, + "logps/rejected": -4.89510440826416, + "loss": 0.5595, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.114497661590576, + "rewards/margins": 0.7806065678596497, + "rewards/rejected": -4.89510440826416, "step": 1735 }, { "epoch": 0.9312594079277471, - "grad_norm": 18.390962651772945, + "grad_norm": 17.745010683075016, "learning_rate": 8.710926628309101e-07, - "logits/chosen": 0.7455258965492249, - "logits/rejected": 0.8390616178512573, - "logps/chosen": -8.214960098266602, - "logps/rejected": -8.860664367675781, - "loss": 0.5546, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.214960098266602, - "rewards/margins": 0.6457030177116394, - "rewards/rejected": -8.860664367675781, - "semantic_entropy": 0.004450926091521978, + "logits/chosen": -0.1406875103712082, + "logits/rejected": 0.06228721886873245, + "logps/chosen": -4.319915771484375, + "logps/rejected": -4.889880180358887, + "loss": 0.5621, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.319915771484375, + "rewards/margins": 0.5699647665023804, + "rewards/rejected": -4.889880180358887, "step": 1740 }, { "epoch": 0.9339354407091487, - "grad_norm": 12.45958358529586, + "grad_norm": 9.413209504285339, "learning_rate": 8.700471013287424e-07, - "logits/chosen": 0.779666543006897, - "logits/rejected": 0.7992655038833618, - "logps/chosen": -7.90356969833374, - "logps/rejected": -8.546361923217773, - "loss": 0.5456, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.90356969833374, - "rewards/margins": 0.6427920460700989, - "rewards/rejected": -8.546361923217773, - "semantic_entropy": 0.005241268780082464, + "logits/chosen": -0.057599931955337524, + "logits/rejected": -0.014551585540175438, + "logps/chosen": -3.803553819656372, + "logps/rejected": -4.435184955596924, + "loss": 0.5434, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.803553819656372, + "rewards/margins": 0.6316312551498413, + "rewards/rejected": -4.435184955596924, "step": 1745 }, { "epoch": 0.9366114734905503, - "grad_norm": 20.787825790509054, + "grad_norm": 18.79012600154289, "learning_rate": 8.689979496279746e-07, - "logits/chosen": 0.7400572896003723, - "logits/rejected": 0.7830491065979004, - "logps/chosen": -8.003057479858398, - "logps/rejected": -8.49720573425293, - "loss": 0.6642, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -8.003057479858398, - "rewards/margins": 0.4941479563713074, - "rewards/rejected": -8.49720573425293, - "semantic_entropy": 0.005025799386203289, + "logits/chosen": -0.09850213676691055, + "logits/rejected": -0.03192386031150818, + "logps/chosen": -3.9662413597106934, + "logps/rejected": -4.427224636077881, + "loss": 0.6466, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.9662413597106934, + "rewards/margins": 0.4609832763671875, + "rewards/rejected": -4.427224636077881, "step": 1750 }, { "epoch": 0.9392875062719518, - "grad_norm": 13.730264245914015, + "grad_norm": 12.401603186784454, "learning_rate": 8.679452179074811e-07, - "logits/chosen": 0.7920068502426147, - "logits/rejected": 0.8651610612869263, - "logps/chosen": -7.900570869445801, - "logps/rejected": -8.649713516235352, - "loss": 0.5124, - "rewards/accuracies": 0.75, - "rewards/chosen": -7.900570869445801, - "rewards/margins": 0.7491430640220642, - "rewards/rejected": -8.649713516235352, - "semantic_entropy": 0.005135712679475546, + "logits/chosen": -0.08789245784282684, + "logits/rejected": 0.04082120209932327, + "logps/chosen": -3.8730685710906982, + "logps/rejected": -4.547240257263184, + "loss": 0.5148, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.8730685710906982, + "rewards/margins": 0.6741719841957092, + "rewards/rejected": -4.547240257263184, "step": 1755 }, { "epoch": 0.9419635390533534, - "grad_norm": 13.626163938056823, + "grad_norm": 14.360454487597096, "learning_rate": 8.668889163808698e-07, - "logits/chosen": 0.7864473462104797, - "logits/rejected": 0.8614629507064819, - "logps/chosen": -7.7179975509643555, - "logps/rejected": -8.315633773803711, - "loss": 0.5636, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.7179975509643555, - "rewards/margins": 0.5976354479789734, - "rewards/rejected": -8.315633773803711, - "semantic_entropy": 0.0065610273741185665, + "logits/chosen": -0.09696818143129349, + "logits/rejected": 0.052021197974681854, + "logps/chosen": -3.7483444213867188, + "logps/rejected": -4.328455448150635, + "loss": 0.5507, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.7483444213867188, + "rewards/margins": 0.580110490322113, + "rewards/rejected": -4.328455448150635, "step": 1760 }, { "epoch": 0.944639571834755, - "grad_norm": 15.396621437912136, + "grad_norm": 13.471753249947533, "learning_rate": 8.658290552963827e-07, - "logits/chosen": 0.7842726111412048, - "logits/rejected": 0.8139573335647583, - "logps/chosen": -7.729952812194824, - "logps/rejected": -8.440402030944824, - "loss": 0.5618, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -7.729952812194824, - "rewards/margins": 0.710450291633606, - "rewards/rejected": -8.440402030944824, - "semantic_entropy": 0.005991552956402302, + "logits/chosen": -0.06702379137277603, + "logits/rejected": -0.02421344444155693, + "logps/chosen": -3.7387919425964355, + "logps/rejected": -4.349204063415527, + "loss": 0.5776, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.7387919425964355, + "rewards/margins": 0.6104118227958679, + "rewards/rejected": -4.349204063415527, "step": 1765 }, { "epoch": 0.9473156046161565, - "grad_norm": 11.779338418294584, + "grad_norm": 11.099237876018693, "learning_rate": 8.647656449367966e-07, - "logits/chosen": 0.7771416306495667, - "logits/rejected": 0.8638502359390259, - "logps/chosen": -7.753976345062256, - "logps/rejected": -8.381518363952637, - "loss": 0.5732, - "rewards/accuracies": 0.71875, - "rewards/chosen": -7.753976345062256, - "rewards/margins": 0.6275419592857361, - "rewards/rejected": -8.381518363952637, - "semantic_entropy": 0.0064716823399066925, + "logits/chosen": -0.03246191889047623, + "logits/rejected": 0.12877734005451202, + "logps/chosen": -3.8278896808624268, + "logps/rejected": -4.41904354095459, + "loss": 0.5747, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.8278896808624268, + "rewards/margins": 0.5911538600921631, + "rewards/rejected": -4.41904354095459, "step": 1770 }, { "epoch": 0.9499916373975581, - "grad_norm": 11.790444019030152, + "grad_norm": 11.13984067177661, "learning_rate": 8.636986956193235e-07, - "logits/chosen": 0.7170445919036865, - "logits/rejected": 0.7984222173690796, - "logps/chosen": -7.559231758117676, - "logps/rejected": -8.215555191040039, - "loss": 0.5718, - "rewards/accuracies": 0.6875, - "rewards/chosen": -7.559231758117676, - "rewards/margins": 0.6563239097595215, - "rewards/rejected": -8.215555191040039, - "semantic_entropy": 0.007783152163028717, + "logits/chosen": -0.10657230764627457, + "logits/rejected": 0.02521834708750248, + "logps/chosen": -3.6927547454833984, + "logps/rejected": -4.317018985748291, + "loss": 0.567, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.6927547454833984, + "rewards/margins": 0.6242635846138, + "rewards/rejected": -4.317018985748291, "step": 1775 }, { "epoch": 0.9526676701789597, - "grad_norm": 13.251274223031999, + "grad_norm": 11.639910325204637, "learning_rate": 8.626282176955104e-07, - "logits/chosen": 0.7697458863258362, - "logits/rejected": 0.8472963571548462, - "logps/chosen": -7.661177635192871, - "logps/rejected": -8.409696578979492, - "loss": 0.5248, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -7.661177635192871, - "rewards/margins": 0.7485184073448181, - "rewards/rejected": -8.409696578979492, - "semantic_entropy": 0.006556454114615917, + "logits/chosen": -0.09595645219087601, + "logits/rejected": 0.03128201141953468, + "logps/chosen": -3.4674370288848877, + "logps/rejected": -4.105962753295898, + "loss": 0.5366, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4674370288848877, + "rewards/margins": 0.6385254859924316, + "rewards/rejected": -4.105962753295898, "step": 1780 }, { "epoch": 0.9553437029603613, - "grad_norm": 16.317970460951365, + "grad_norm": 17.437494394408603, "learning_rate": 8.615542215511389e-07, - "logits/chosen": 0.7925196886062622, - "logits/rejected": 0.827374279499054, - "logps/chosen": -7.7944183349609375, - "logps/rejected": -8.290719985961914, - "loss": 0.5983, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.7944183349609375, - "rewards/margins": 0.4963007867336273, - "rewards/rejected": -8.290719985961914, - "semantic_entropy": 0.006235038861632347, + "logits/chosen": -0.00845525972545147, + "logits/rejected": 0.05451008677482605, + "logps/chosen": -3.5052249431610107, + "logps/rejected": -3.8889031410217285, + "loss": 0.6299, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.5052249431610107, + "rewards/margins": 0.38367825746536255, + "rewards/rejected": -3.8889031410217285, "step": 1785 }, { "epoch": 0.9580197357417628, - "grad_norm": 17.388652579583002, + "grad_norm": 14.915485243235329, "learning_rate": 8.604767176061241e-07, - "logits/chosen": 0.7371417284011841, - "logits/rejected": 0.7948885560035706, - "logps/chosen": -7.8774542808532715, - "logps/rejected": -8.463842391967773, - "loss": 0.5767, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -7.8774542808532715, - "rewards/margins": 0.5863882303237915, - "rewards/rejected": -8.463842391967773, - "semantic_entropy": 0.0055058179423213005, + "logits/chosen": 0.05704737454652786, + "logits/rejected": 0.14632245898246765, + "logps/chosen": -3.710101366043091, + "logps/rejected": -4.216159343719482, + "loss": 0.5812, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.710101366043091, + "rewards/margins": 0.5060585141181946, + "rewards/rejected": -4.216159343719482, "step": 1790 }, { "epoch": 0.9606957685231644, - "grad_norm": 10.102522428025908, + "grad_norm": 8.50876843516854, "learning_rate": 8.593957163144141e-07, - "logits/chosen": 0.701524555683136, - "logits/rejected": 0.7919615507125854, - "logps/chosen": -7.6824631690979, - "logps/rejected": -8.461966514587402, - "loss": 0.516, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -7.6824631690979, - "rewards/margins": 0.7795030474662781, - "rewards/rejected": -8.461966514587402, - "semantic_entropy": 0.007335428148508072, + "logits/chosen": -0.11547045409679413, + "logits/rejected": 0.03303173929452896, + "logps/chosen": -3.5387675762176514, + "logps/rejected": -4.199957370758057, + "loss": 0.5394, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5387675762176514, + "rewards/margins": 0.6611896753311157, + "rewards/rejected": -4.199957370758057, "step": 1795 }, { "epoch": 0.963371801304566, - "grad_norm": 13.049955014081108, + "grad_norm": 11.289544516786496, "learning_rate": 8.58311228163888e-07, - "logits/chosen": 0.723731279373169, - "logits/rejected": 0.7662105560302734, - "logps/chosen": -7.902833461761475, - "logps/rejected": -8.512142181396484, - "loss": 0.5491, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -7.902833461761475, - "rewards/margins": 0.6093090772628784, - "rewards/rejected": -8.512142181396484, - "semantic_entropy": 0.00509651331230998, + "logits/chosen": -0.08586417138576508, + "logits/rejected": 0.01008274219930172, + "logps/chosen": -3.765727996826172, + "logps/rejected": -4.277950763702393, + "loss": 0.5745, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.765727996826172, + "rewards/margins": 0.5122228860855103, + "rewards/rejected": -4.277950763702393, "step": 1800 }, { "epoch": 0.9660478340859675, - "grad_norm": 17.055444027861736, + "grad_norm": 12.61886700092441, "learning_rate": 8.57223263676255e-07, - "logits/chosen": 0.651114821434021, - "logits/rejected": 0.7337725758552551, - "logps/chosen": -7.82892370223999, - "logps/rejected": -8.779696464538574, - "loss": 0.4542, + "logits/chosen": -0.16673758625984192, + "logits/rejected": -0.012672263197600842, + "logps/chosen": -3.5525104999542236, + "logps/rejected": -4.401576519012451, + "loss": 0.4711, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -7.82892370223999, - "rewards/margins": 0.950772762298584, - "rewards/rejected": -8.779696464538574, - "semantic_entropy": 0.0056858672760427, + "rewards/chosen": -3.5525104999542236, + "rewards/margins": 0.8490661382675171, + "rewards/rejected": -4.401576519012451, "step": 1805 }, { "epoch": 0.9687238668673691, - "grad_norm": 12.069977661137925, + "grad_norm": 11.880563351491139, "learning_rate": 8.561318334069511e-07, - "logits/chosen": 0.722413182258606, - "logits/rejected": 0.8114809989929199, - "logps/chosen": -8.02531623840332, - "logps/rejected": -8.714741706848145, - "loss": 0.5643, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.02531623840332, - "rewards/margins": 0.6894262433052063, - "rewards/rejected": -8.714741706848145, - "semantic_entropy": 0.004859632812440395, + "logits/chosen": -0.045554645359516144, + "logits/rejected": 0.0976235419511795, + "logps/chosen": -3.636070966720581, + "logps/rejected": -4.268924236297607, + "loss": 0.5285, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.636070966720581, + "rewards/margins": 0.632853627204895, + "rewards/rejected": -4.268924236297607, "step": 1810 }, { "epoch": 0.9713998996487707, - "grad_norm": 14.829212830302785, + "grad_norm": 12.436891781574658, "learning_rate": 8.550369479450375e-07, - "logits/chosen": 0.7346758842468262, - "logits/rejected": 0.8068816065788269, - "logps/chosen": -8.147111892700195, - "logps/rejected": -8.884883880615234, - "loss": 0.5406, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.147111892700195, - "rewards/margins": 0.7377720475196838, - "rewards/rejected": -8.884883880615234, - "semantic_entropy": 0.00493080448359251, + "logits/chosen": -0.08142866939306259, + "logits/rejected": 0.08259963244199753, + "logps/chosen": -3.959862470626831, + "logps/rejected": -4.709686279296875, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.959862470626831, + "rewards/margins": 0.7498234510421753, + "rewards/rejected": -4.709686279296875, "step": 1815 }, { "epoch": 0.9740759324301723, - "grad_norm": 16.49209421591941, + "grad_norm": 15.589861576810325, "learning_rate": 8.539386179130977e-07, - "logits/chosen": 0.7819596529006958, - "logits/rejected": 0.8145734667778015, - "logps/chosen": -8.00898551940918, - "logps/rejected": -8.68702507019043, - "loss": 0.5593, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.00898551940918, - "rewards/margins": 0.6780385971069336, - "rewards/rejected": -8.68702507019043, - "semantic_entropy": 0.005897555500268936, + "logits/chosen": -0.04514032602310181, + "logits/rejected": 0.0056404429487884045, + "logps/chosen": -3.985635280609131, + "logps/rejected": -4.665538787841797, + "loss": 0.5417, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.985635280609131, + "rewards/margins": 0.6799032092094421, + "rewards/rejected": -4.665538787841797, "step": 1820 }, { "epoch": 0.9767519652115738, - "grad_norm": 14.239228008160856, + "grad_norm": 12.59263964166258, "learning_rate": 8.528368539671347e-07, - "logits/chosen": 0.7752368450164795, - "logits/rejected": 0.863335132598877, - "logps/chosen": -7.959936618804932, - "logps/rejected": -9.050023078918457, - "loss": 0.4822, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -7.959936618804932, - "rewards/margins": 1.0900851488113403, - "rewards/rejected": -9.050023078918457, - "semantic_entropy": 0.005652183201164007, + "logits/chosen": -0.1496410369873047, + "logits/rejected": 0.04125867411494255, + "logps/chosen": -3.6341655254364014, + "logps/rejected": -4.633608818054199, + "loss": 0.4815, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.6341655254364014, + "rewards/margins": 0.9994437098503113, + "rewards/rejected": -4.633608818054199, "step": 1825 }, { "epoch": 0.9794279979929754, - "grad_norm": 16.436302978410623, + "grad_norm": 15.36096317676079, "learning_rate": 8.51731666796467e-07, - "logits/chosen": 0.8048080205917358, - "logits/rejected": 0.8463269472122192, - "logps/chosen": -8.244328498840332, - "logps/rejected": -8.958102226257324, - "loss": 0.5714, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.244328498840332, - "rewards/margins": 0.7137740254402161, - "rewards/rejected": -8.958102226257324, - "semantic_entropy": 0.0040356675162911415, + "logits/chosen": -0.012117327190935612, + "logits/rejected": 0.064592644572258, + "logps/chosen": -4.328290939331055, + "logps/rejected": -5.013947486877441, + "loss": 0.5643, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.328290939331055, + "rewards/margins": 0.6856558322906494, + "rewards/rejected": -5.013947486877441, "step": 1830 }, { "epoch": 0.982104030774377, - "grad_norm": 17.79215643845262, + "grad_norm": 18.693389091329323, "learning_rate": 8.506230671236254e-07, - "logits/chosen": 0.7882435917854309, - "logits/rejected": 0.8229848146438599, - "logps/chosen": -8.396721839904785, - "logps/rejected": -8.957392692565918, - "loss": 0.5884, + "logits/chosen": -0.12255791574716568, + "logits/rejected": -0.027425622567534447, + "logps/chosen": -4.359742164611816, + "logps/rejected": -4.9788031578063965, + "loss": 0.5683, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.396721839904785, - "rewards/margins": 0.5606712102890015, - "rewards/rejected": -8.957392692565918, - "semantic_entropy": 0.0036423238925635815, + "rewards/chosen": -4.359742164611816, + "rewards/margins": 0.6190616488456726, + "rewards/rejected": -4.9788031578063965, "step": 1835 }, { "epoch": 0.9847800635557785, - "grad_norm": 14.625489667473667, + "grad_norm": 12.352224148390906, "learning_rate": 8.495110657042488e-07, - "logits/chosen": 0.8869732618331909, - "logits/rejected": 0.9640114903450012, - "logps/chosen": -8.529989242553711, - "logps/rejected": -9.27656364440918, - "loss": 0.5253, + "logits/chosen": -0.06791806221008301, + "logits/rejected": 0.11598928272724152, + "logps/chosen": -4.643518924713135, + "logps/rejected": -5.392780780792236, + "loss": 0.5291, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.529989242553711, - "rewards/margins": 0.7465731501579285, - "rewards/rejected": -9.27656364440918, - "semantic_entropy": 0.0031088325195014477, + "rewards/chosen": -4.643518924713135, + "rewards/margins": 0.7492610812187195, + "rewards/rejected": -5.392780780792236, "step": 1840 }, { "epoch": 0.9874560963371801, - "grad_norm": 18.188969568475773, + "grad_norm": 20.802577862008835, "learning_rate": 8.483956733269799e-07, - "logits/chosen": 0.8915464282035828, - "logits/rejected": 0.9507058262825012, - "logps/chosen": -8.5012845993042, - "logps/rejected": -9.236323356628418, - "loss": 0.5562, + "logits/chosen": -0.09340430796146393, + "logits/rejected": 0.016976332291960716, + "logps/chosen": -4.626471519470215, + "logps/rejected": -5.375168323516846, + "loss": 0.5504, "rewards/accuracies": 0.71875, - "rewards/chosen": -8.5012845993042, - "rewards/margins": 0.7350392937660217, - "rewards/rejected": -9.236323356628418, - "semantic_entropy": 0.002896857215091586, + "rewards/chosen": -4.626471519470215, + "rewards/margins": 0.7486966848373413, + "rewards/rejected": -5.375168323516846, "step": 1845 }, { "epoch": 0.9901321291185817, - "grad_norm": 17.9922914014973, + "grad_norm": 19.547887938067618, "learning_rate": 8.472769008133602e-07, - "logits/chosen": 0.8699381947517395, - "logits/rejected": 0.953484833240509, - "logps/chosen": -8.702693939208984, - "logps/rejected": -9.392245292663574, - "loss": 0.5587, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.702693939208984, - "rewards/margins": 0.6895512342453003, - "rewards/rejected": -9.392245292663574, - "semantic_entropy": 0.00236605666577816, + "logits/chosen": -0.24401775002479553, + "logits/rejected": -0.08901788294315338, + "logps/chosen": -4.993935585021973, + "logps/rejected": -5.645054817199707, + "loss": 0.5772, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.993935585021973, + "rewards/margins": 0.6511193513870239, + "rewards/rejected": -5.645054817199707, "step": 1850 }, { "epoch": 0.9928081618999832, - "grad_norm": 15.27885989370606, + "grad_norm": 17.841588886697625, "learning_rate": 8.461547590177259e-07, - "logits/chosen": 0.9513596296310425, - "logits/rejected": 1.0096721649169922, - "logps/chosen": -8.60840129852295, - "logps/rejected": -9.362217903137207, - "loss": 0.5943, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.60840129852295, - "rewards/margins": 0.7538172006607056, - "rewards/rejected": -9.362217903137207, - "semantic_entropy": 0.0031571455765515566, + "logits/chosen": -0.1381450593471527, + "logits/rejected": 0.017343619838356972, + "logps/chosen": -4.737591743469238, + "logps/rejected": -5.526686191558838, + "loss": 0.5834, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.737591743469238, + "rewards/margins": 0.7890938520431519, + "rewards/rejected": -5.526686191558838, "step": 1855 }, { "epoch": 0.9954841946813848, - "grad_norm": 16.3301144819492, + "grad_norm": 16.578311851707827, "learning_rate": 8.450292588271014e-07, - "logits/chosen": 0.9395162463188171, - "logits/rejected": 0.9991067051887512, - "logps/chosen": -8.834760665893555, - "logps/rejected": -9.530847549438477, - "loss": 0.5574, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.834760665893555, - "rewards/margins": 0.696088433265686, - "rewards/rejected": -9.530847549438477, - "semantic_entropy": 0.0025766813196241856, + "logits/chosen": -0.11683805286884308, + "logits/rejected": 0.01585022732615471, + "logps/chosen": -5.02516508102417, + "logps/rejected": -5.725735187530518, + "loss": 0.5583, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -5.02516508102417, + "rewards/margins": 0.7005696296691895, + "rewards/rejected": -5.725735187530518, "step": 1860 }, { "epoch": 0.9981602274627864, - "grad_norm": 14.909090646149107, + "grad_norm": 17.184926705790517, "learning_rate": 8.439004111610945e-07, - "logits/chosen": 0.9531529545783997, - "logits/rejected": 0.9883922338485718, - "logps/chosen": -8.549825668334961, - "logps/rejected": -9.276273727416992, - "loss": 0.5704, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.549825668334961, - "rewards/margins": 0.7264472246170044, - "rewards/rejected": -9.276273727416992, - "semantic_entropy": 0.003057825844734907, + "logits/chosen": -0.09671823680400848, + "logits/rejected": -0.01732897385954857, + "logps/chosen": -4.431591987609863, + "logps/rejected": -5.139439105987549, + "loss": 0.5804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.431591987609863, + "rewards/margins": 0.707846999168396, + "rewards/rejected": -5.139439105987549, "step": 1865 }, { "epoch": 1.000836260244188, - "grad_norm": 14.551680179512683, + "grad_norm": 12.812556533056137, "learning_rate": 8.427682269717901e-07, - "logits/chosen": 0.918908953666687, - "logits/rejected": 0.9724335670471191, - "logps/chosen": -8.606492042541504, - "logps/rejected": -9.455463409423828, - "loss": 0.495, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.606492042541504, - "rewards/margins": 0.8489717245101929, - "rewards/rejected": -9.455463409423828, - "semantic_entropy": 0.002914209384471178, + "logits/chosen": -0.11657615005970001, + "logits/rejected": 0.018735522404313087, + "logps/chosen": -4.571199893951416, + "logps/rejected": -5.348707675933838, + "loss": 0.5166, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.571199893951416, + "rewards/margins": 0.7775079011917114, + "rewards/rejected": -5.348707675933838, "step": 1870 }, { "epoch": 1.0035122930255895, - "grad_norm": 17.126300719381895, + "grad_norm": 16.279368631994203, "learning_rate": 8.416327172436446e-07, - "logits/chosen": 0.9382045865058899, - "logits/rejected": 1.0026956796646118, - "logps/chosen": -8.67430591583252, - "logps/rejected": -9.235125541687012, - "loss": 0.5954, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -8.67430591583252, - "rewards/margins": 0.5608205795288086, - "rewards/rejected": -9.235125541687012, - "semantic_entropy": 0.0024842366110533476, + "logits/chosen": -0.16955925524234772, + "logits/rejected": -0.024608414620161057, + "logps/chosen": -4.164732933044434, + "logps/rejected": -4.702731132507324, + "loss": 0.5825, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.164732933044434, + "rewards/margins": 0.537997841835022, + "rewards/rejected": -4.702731132507324, "step": 1875 }, { "epoch": 1.0061883258069912, - "grad_norm": 12.526549153278241, + "grad_norm": 11.744136533887906, "learning_rate": 8.404938929933778e-07, - "logits/chosen": 0.9702759981155396, - "logits/rejected": 1.0298935174942017, - "logps/chosen": -8.516494750976562, - "logps/rejected": -9.477919578552246, - "loss": 0.4751, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.516494750976562, - "rewards/margins": 0.961426854133606, - "rewards/rejected": -9.477919578552246, - "semantic_entropy": 0.0030000859405845404, + "logits/chosen": -0.027989957481622696, + "logits/rejected": 0.12380681186914444, + "logps/chosen": -3.962738037109375, + "logps/rejected": -4.859511375427246, + "loss": 0.4707, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.962738037109375, + "rewards/margins": 0.8967738151550293, + "rewards/rejected": -4.859511375427246, "step": 1880 }, { "epoch": 1.0088643585883927, - "grad_norm": 13.104074462534614, + "grad_norm": 12.145547768857469, "learning_rate": 8.39351765269868e-07, - "logits/chosen": 0.9352337121963501, - "logits/rejected": 0.9791328310966492, - "logps/chosen": -8.481460571289062, - "logps/rejected": -9.137152671813965, - "loss": 0.5852, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.481460571289062, - "rewards/margins": 0.6556928157806396, - "rewards/rejected": -9.137152671813965, - "semantic_entropy": 0.0034713305067270994, + "logits/chosen": -0.08452141284942627, + "logits/rejected": 0.004247196018695831, + "logps/chosen": -3.699587345123291, + "logps/rejected": -4.298642635345459, + "loss": 0.5822, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.699587345123291, + "rewards/margins": 0.5990555882453918, + "rewards/rejected": -4.298642635345459, "step": 1885 }, { "epoch": 1.0115403913697942, - "grad_norm": 15.985094320283427, + "grad_norm": 15.428889276647734, "learning_rate": 8.382063451540431e-07, - "logits/chosen": 0.9075764417648315, - "logits/rejected": 1.0042946338653564, - "logps/chosen": -8.522588729858398, - "logps/rejected": -9.330839157104492, - "loss": 0.4979, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.522588729858398, - "rewards/margins": 0.8082510828971863, - "rewards/rejected": -9.330839157104492, - "semantic_entropy": 0.0028249945025891066, + "logits/chosen": -0.0934087410569191, + "logits/rejected": 0.1301339566707611, + "logps/chosen": -3.9712390899658203, + "logps/rejected": -4.673574924468994, + "loss": 0.5299, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.9712390899658203, + "rewards/margins": 0.7023356556892395, + "rewards/rejected": -4.673574924468994, "step": 1890 }, { "epoch": 1.014216424151196, - "grad_norm": 12.915214195374903, + "grad_norm": 13.025356945371943, "learning_rate": 8.370576437587742e-07, - "logits/chosen": 0.8950628042221069, - "logits/rejected": 0.924281120300293, - "logps/chosen": -8.34730052947998, - "logps/rejected": -9.118095397949219, - "loss": 0.5147, + "logits/chosen": -0.04328613728284836, + "logits/rejected": -0.007765733636915684, + "logps/chosen": -3.858792781829834, + "logps/rejected": -4.513943672180176, + "loss": 0.5262, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.34730052947998, - "rewards/margins": 0.7707957625389099, - "rewards/rejected": -9.118095397949219, - "semantic_entropy": 0.0034280649852007627, + "rewards/chosen": -3.858792781829834, + "rewards/margins": 0.6551502346992493, + "rewards/rejected": -4.513943672180176, "step": 1895 }, { "epoch": 1.0168924569325974, - "grad_norm": 14.453317743532807, + "grad_norm": 12.318415515469066, "learning_rate": 8.359056722287674e-07, - "logits/chosen": 0.8320645093917847, - "logits/rejected": 0.9602710604667664, - "logps/chosen": -8.307376861572266, - "logps/rejected": -9.152776718139648, - "loss": 0.5081, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.307376861572266, - "rewards/margins": 0.845400333404541, - "rewards/rejected": -9.152776718139648, - "semantic_entropy": 0.0037023150362074375, + "logits/chosen": -0.16748423874378204, + "logits/rejected": 0.11654020845890045, + "logps/chosen": -3.8393337726593018, + "logps/rejected": -4.585803031921387, + "loss": 0.5271, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.8393337726593018, + "rewards/margins": 0.7464688420295715, + "rewards/rejected": -4.585803031921387, "step": 1900 }, { "epoch": 1.019568489713999, - "grad_norm": 13.51505578799361, + "grad_norm": 13.284611357445028, "learning_rate": 8.347504417404553e-07, - "logits/chosen": 0.8122785687446594, - "logits/rejected": 0.8963130712509155, - "logps/chosen": -8.318288803100586, - "logps/rejected": -9.081267356872559, - "loss": 0.536, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.318288803100586, - "rewards/margins": 0.7629793882369995, - "rewards/rejected": -9.081267356872559, - "semantic_entropy": 0.003525532316416502, + "logits/chosen": -0.07342179864645004, + "logits/rejected": 0.08061535656452179, + "logps/chosen": -3.9948630332946777, + "logps/rejected": -4.696008682250977, + "loss": 0.5577, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.9948630332946777, + "rewards/margins": 0.7011451721191406, + "rewards/rejected": -4.696008682250977, "step": 1905 }, { "epoch": 1.0222445224954007, - "grad_norm": 11.191552343163695, + "grad_norm": 9.941147566798575, "learning_rate": 8.335919635018893e-07, - "logits/chosen": 0.7361363172531128, - "logits/rejected": 0.8019342422485352, - "logps/chosen": -8.195878982543945, - "logps/rejected": -8.886285781860352, - "loss": 0.5351, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.195878982543945, - "rewards/margins": 0.6904064416885376, - "rewards/rejected": -8.886285781860352, - "semantic_entropy": 0.004457551054656506, + "logits/chosen": -0.19735580682754517, + "logits/rejected": -0.05294167250394821, + "logps/chosen": -3.891469955444336, + "logps/rejected": -4.5032548904418945, + "loss": 0.5433, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.891469955444336, + "rewards/margins": 0.6117848753929138, + "rewards/rejected": -4.5032548904418945, "step": 1910 }, { "epoch": 1.0249205552768021, - "grad_norm": 12.647769499958207, + "grad_norm": 10.965010322796017, "learning_rate": 8.324302487526303e-07, - "logits/chosen": 0.7044271230697632, - "logits/rejected": 0.77665114402771, - "logps/chosen": -8.380681037902832, - "logps/rejected": -9.184738159179688, - "loss": 0.5034, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.380681037902832, - "rewards/margins": 0.8040567636489868, - "rewards/rejected": -9.184738159179688, - "semantic_entropy": 0.003701858688145876, + "logits/chosen": -0.1210625171661377, + "logits/rejected": 0.007019522599875927, + "logps/chosen": -4.057669639587402, + "logps/rejected": -4.687264919281006, + "loss": 0.5404, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.057669639587402, + "rewards/margins": 0.6295956373214722, + "rewards/rejected": -4.687264919281006, "step": 1915 }, { "epoch": 1.0275965880582036, - "grad_norm": 13.95644498738611, + "grad_norm": 10.835581841659222, "learning_rate": 8.312653087636398e-07, - "logits/chosen": 0.722461998462677, - "logits/rejected": 0.7603663206100464, - "logps/chosen": -8.306981086730957, - "logps/rejected": -9.107414245605469, - "loss": 0.5306, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.306981086730957, - "rewards/margins": 0.8004336357116699, - "rewards/rejected": -9.107414245605469, - "semantic_entropy": 0.004512041341513395, + "logits/chosen": -0.1651735007762909, + "logits/rejected": -0.07508811354637146, + "logps/chosen": -3.791569471359253, + "logps/rejected": -4.52608585357666, + "loss": 0.5398, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.791569471359253, + "rewards/margins": 0.7345167398452759, + "rewards/rejected": -4.52608585357666, "step": 1920 }, { "epoch": 1.0302726208396054, - "grad_norm": 18.459332801354297, + "grad_norm": 14.912175577434061, "learning_rate": 8.300971548371711e-07, - "logits/chosen": 0.5903456211090088, - "logits/rejected": 0.7183451056480408, - "logps/chosen": -8.510897636413574, - "logps/rejected": -9.23041820526123, - "loss": 0.5363, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.510897636413574, - "rewards/margins": 0.719520628452301, - "rewards/rejected": -9.23041820526123, - "semantic_entropy": 0.003166732145473361, + "logits/chosen": -0.29709625244140625, + "logits/rejected": -0.07219813764095306, + "logps/chosen": -4.107485294342041, + "logps/rejected": -4.80081844329834, + "loss": 0.5387, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.107485294342041, + "rewards/margins": 0.6933332681655884, + "rewards/rejected": -4.80081844329834, "step": 1925 }, { "epoch": 1.0329486536210069, - "grad_norm": 18.340027693624933, + "grad_norm": 15.611338911400845, "learning_rate": 8.289257983066582e-07, - "logits/chosen": 0.6703733205795288, - "logits/rejected": 0.7412980198860168, - "logps/chosen": -8.390009880065918, - "logps/rejected": -9.196538925170898, - "loss": 0.5283, + "logits/chosen": -0.19945965707302094, + "logits/rejected": -0.04205937311053276, + "logps/chosen": -3.909473419189453, + "logps/rejected": -4.581412315368652, + "loss": 0.533, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.390009880065918, - "rewards/margins": 0.8065292239189148, - "rewards/rejected": -9.196538925170898, - "semantic_entropy": 0.004317262209951878, + "rewards/chosen": -3.909473419189453, + "rewards/margins": 0.6719392538070679, + "rewards/rejected": -4.581412315368652, "step": 1930 }, { "epoch": 1.0356246864024083, - "grad_norm": 14.802116730649294, + "grad_norm": 13.646252825168181, "learning_rate": 8.277512505366077e-07, - "logits/chosen": 0.6543900966644287, - "logits/rejected": 0.7763436436653137, - "logps/chosen": -8.447009086608887, - "logps/rejected": -9.306153297424316, - "loss": 0.5238, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.447009086608887, - "rewards/margins": 0.8591440916061401, - "rewards/rejected": -9.306153297424316, - "semantic_entropy": 0.003334530396386981, + "logits/chosen": -0.2595495879650116, + "logits/rejected": -0.021945450454950333, + "logps/chosen": -3.870316982269287, + "logps/rejected": -4.622162342071533, + "loss": 0.528, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.870316982269287, + "rewards/margins": 0.7518454790115356, + "rewards/rejected": -4.622162342071533, "step": 1935 }, { "epoch": 1.03830071918381, - "grad_norm": 15.098311821125256, + "grad_norm": 12.762329439517744, "learning_rate": 8.265735229224868e-07, - "logits/chosen": 0.6818082928657532, - "logits/rejected": 0.7659986615180969, - "logps/chosen": -8.268132209777832, - "logps/rejected": -9.30879020690918, - "loss": 0.4737, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.268132209777832, - "rewards/margins": 1.040657877922058, - "rewards/rejected": -9.30879020690918, - "semantic_entropy": 0.003615723457187414, + "logits/chosen": -0.1658550202846527, + "logits/rejected": -0.026887375861406326, + "logps/chosen": -3.749594211578369, + "logps/rejected": -4.592177391052246, + "loss": 0.5096, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.749594211578369, + "rewards/margins": 0.8425830602645874, + "rewards/rejected": -4.592177391052246, "step": 1940 }, { "epoch": 1.0409767519652116, - "grad_norm": 14.723077590553624, + "grad_norm": 9.388729932952039, "learning_rate": 8.253926268906144e-07, - "logits/chosen": 0.6228159666061401, - "logits/rejected": 0.688185453414917, - "logps/chosen": -8.453712463378906, - "logps/rejected": -9.435036659240723, - "loss": 0.4647, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.453712463378906, - "rewards/margins": 0.9813230633735657, - "rewards/rejected": -9.435036659240723, - "semantic_entropy": 0.0034209657460451126, + "logits/chosen": -0.2348923236131668, + "logits/rejected": -0.06178142875432968, + "logps/chosen": -3.81074595451355, + "logps/rejected": -4.722434043884277, + "loss": 0.4651, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.81074595451355, + "rewards/margins": 0.9116878509521484, + "rewards/rejected": -4.722434043884277, "step": 1945 }, { "epoch": 1.043652784746613, - "grad_norm": 13.627077288321304, + "grad_norm": 12.144501232477452, "learning_rate": 8.242085738980487e-07, - "logits/chosen": 0.7107739448547363, - "logits/rejected": 0.8460835218429565, - "logps/chosen": -8.611701965332031, - "logps/rejected": -9.478960990905762, - "loss": 0.5379, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.611701965332031, - "rewards/margins": 0.8672583699226379, - "rewards/rejected": -9.478960990905762, - "semantic_entropy": 0.0030593627598136663, + "logits/chosen": -0.16371026635169983, + "logits/rejected": 0.07933319360017776, + "logps/chosen": -3.873079776763916, + "logps/rejected": -4.651761531829834, + "loss": 0.5165, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.873079776763916, + "rewards/margins": 0.7786819338798523, + "rewards/rejected": -4.651761531829834, "step": 1950 }, { "epoch": 1.0463288175280148, - "grad_norm": 19.757986263536445, + "grad_norm": 13.309219186601922, "learning_rate": 8.230213754324772e-07, - "logits/chosen": 0.6388633847236633, - "logits/rejected": 0.6890886425971985, - "logps/chosen": -8.633177757263184, - "logps/rejected": -9.407966613769531, - "loss": 0.51, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.633177757263184, - "rewards/margins": 0.7747882604598999, - "rewards/rejected": -9.407966613769531, - "semantic_entropy": 0.002916950499638915, + "logits/chosen": -0.1869739145040512, + "logits/rejected": -0.10564710199832916, + "logps/chosen": -3.8443291187286377, + "logps/rejected": -4.552462100982666, + "loss": 0.5054, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.8443291187286377, + "rewards/margins": 0.7081330418586731, + "rewards/rejected": -4.552462100982666, "step": 1955 }, { "epoch": 1.0490048503094163, - "grad_norm": 16.381811199534628, + "grad_norm": 13.43812416231905, "learning_rate": 8.218310430121045e-07, - "logits/chosen": 0.6950886845588684, - "logits/rejected": 0.720539927482605, - "logps/chosen": -8.707314491271973, - "logps/rejected": -9.449724197387695, - "loss": 0.558, + "logits/chosen": -0.15295056998729706, + "logits/rejected": -0.10682132095098495, + "logps/chosen": -3.765388011932373, + "logps/rejected": -4.43670654296875, + "loss": 0.5524, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.707314491271973, - "rewards/margins": 0.74241042137146, - "rewards/rejected": -9.449724197387695, - "semantic_entropy": 0.0032343785278499126, + "rewards/chosen": -3.765388011932373, + "rewards/margins": 0.6713187098503113, + "rewards/rejected": -4.43670654296875, "step": 1960 }, { "epoch": 1.051680883090818, - "grad_norm": 13.044302764643096, + "grad_norm": 11.762991646884768, "learning_rate": 8.20637588185541e-07, - "logits/chosen": 0.6273518800735474, - "logits/rejected": 0.6855801939964294, - "logps/chosen": -8.859047889709473, - "logps/rejected": -9.979398727416992, - "loss": 0.4405, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.859047889709473, - "rewards/margins": 1.1203503608703613, - "rewards/rejected": -9.979398727416992, - "semantic_entropy": 0.0030110005754977465, + "logits/chosen": -0.12886402010917664, + "logits/rejected": -0.03368421643972397, + "logps/chosen": -4.097124099731445, + "logps/rejected": -5.125328063964844, + "loss": 0.4534, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.097124099731445, + "rewards/margins": 1.0282045602798462, + "rewards/rejected": -5.125328063964844, "step": 1965 }, { "epoch": 1.0543569158722195, - "grad_norm": 16.03642878238039, + "grad_norm": 13.14102308142049, "learning_rate": 8.194410225316906e-07, - "logits/chosen": 0.5873863697052002, - "logits/rejected": 0.6804260015487671, - "logps/chosen": -8.77137565612793, - "logps/rejected": -9.576631546020508, - "loss": 0.5406, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.77137565612793, - "rewards/margins": 0.8052547574043274, - "rewards/rejected": -9.576631546020508, - "semantic_entropy": 0.0030320039950311184, + "logits/chosen": -0.1860276460647583, + "logits/rejected": -0.015146763995289803, + "logps/chosen": -3.8575377464294434, + "logps/rejected": -4.522668361663818, + "loss": 0.5442, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8575377464294434, + "rewards/margins": 0.6651301383972168, + "rewards/rejected": -4.522668361663818, "step": 1970 }, { "epoch": 1.057032948653621, - "grad_norm": 22.084325288130714, + "grad_norm": 12.545476946680903, "learning_rate": 8.182413576596385e-07, - "logits/chosen": 0.6454890370368958, - "logits/rejected": 0.6792441606521606, - "logps/chosen": -8.825540542602539, - "logps/rejected": -9.584020614624023, - "loss": 0.5592, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.825540542602539, - "rewards/margins": 0.7584813833236694, - "rewards/rejected": -9.584020614624023, - "semantic_entropy": 0.0030581161845475435, + "logits/chosen": -0.034415435045957565, + "logits/rejected": 0.03188232704997063, + "logps/chosen": -3.7823524475097656, + "logps/rejected": -4.487369537353516, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.7823524475097656, + "rewards/margins": 0.7050174474716187, + "rewards/rejected": -4.487369537353516, "step": 1975 }, { "epoch": 1.0597089814350227, - "grad_norm": 19.767902079352925, + "grad_norm": 14.854671434024349, "learning_rate": 8.170386052085389e-07, - "logits/chosen": 0.5810804963111877, - "logits/rejected": 0.6741968989372253, - "logps/chosen": -8.81079387664795, - "logps/rejected": -9.655781745910645, - "loss": 0.5413, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.81079387664795, - "rewards/margins": 0.8449875712394714, - "rewards/rejected": -9.655781745910645, - "semantic_entropy": 0.0033272195141762495, + "logits/chosen": -0.008093967102468014, + "logits/rejected": 0.11006908118724823, + "logps/chosen": -3.901071071624756, + "logps/rejected": -4.616329193115234, + "loss": 0.5611, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.901071071624756, + "rewards/margins": 0.7152583003044128, + "rewards/rejected": -4.616329193115234, "step": 1980 }, { "epoch": 1.0623850142164242, - "grad_norm": 16.00497801864881, + "grad_norm": 14.228757856264517, "learning_rate": 8.158327768475008e-07, - "logits/chosen": 0.5558470487594604, - "logits/rejected": 0.6562130451202393, - "logps/chosen": -8.712007522583008, - "logps/rejected": -9.437994003295898, - "loss": 0.566, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.712007522583008, - "rewards/margins": 0.7259871959686279, - "rewards/rejected": -9.437994003295898, - "semantic_entropy": 0.004049594048410654, + "logits/chosen": -0.08297140896320343, + "logits/rejected": 0.08276952803134918, + "logps/chosen": -3.8276724815368652, + "logps/rejected": -4.379798889160156, + "loss": 0.6079, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.8276724815368652, + "rewards/margins": 0.5521261096000671, + "rewards/rejected": -4.379798889160156, "step": 1985 }, { "epoch": 1.0650610469978257, - "grad_norm": 22.86662817638401, + "grad_norm": 16.197029600409895, "learning_rate": 8.146238842754767e-07, - "logits/chosen": 0.48547202348709106, - "logits/rejected": 0.5629103779792786, - "logps/chosen": -8.967915534973145, - "logps/rejected": -9.580000877380371, - "loss": 0.5851, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -8.967915534973145, - "rewards/margins": 0.6120861172676086, - "rewards/rejected": -9.580000877380371, - "semantic_entropy": 0.002668407978489995, + "logits/chosen": -0.14716288447380066, + "logits/rejected": -0.038800377398729324, + "logps/chosen": -3.973102569580078, + "logps/rejected": -4.57892370223999, + "loss": 0.5557, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.973102569580078, + "rewards/margins": 0.6058207750320435, + "rewards/rejected": -4.57892370223999, "step": 1990 }, { "epoch": 1.0677370797792274, - "grad_norm": 23.23762612453471, + "grad_norm": 14.276536022913021, "learning_rate": 8.134119392211476e-07, - "logits/chosen": 0.5937298536300659, - "logits/rejected": 0.7143954038619995, - "logps/chosen": -8.771955490112305, - "logps/rejected": -9.69508171081543, - "loss": 0.5095, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.771955490112305, - "rewards/margins": 0.9231254458427429, - "rewards/rejected": -9.69508171081543, - "semantic_entropy": 0.002983763115480542, + "logits/chosen": -0.07419179379940033, + "logits/rejected": 0.10331116616725922, + "logps/chosen": -3.718053102493286, + "logps/rejected": -4.622757911682129, + "loss": 0.5081, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.718053102493286, + "rewards/margins": 0.9047052264213562, + "rewards/rejected": -4.622757911682129, "step": 1995 }, { "epoch": 1.0704131125606289, - "grad_norm": 20.894254936050405, + "grad_norm": 20.444076112036456, "learning_rate": 8.121969534428094e-07, - "logits/chosen": 0.5293421745300293, - "logits/rejected": 0.6653727293014526, - "logps/chosen": -8.856660842895508, - "logps/rejected": -9.56396198272705, - "loss": 0.5834, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.856660842895508, - "rewards/margins": 0.7073008418083191, - "rewards/rejected": -9.56396198272705, - "semantic_entropy": 0.002809601603075862, + "logits/chosen": -0.1916288435459137, + "logits/rejected": -0.012524427846074104, + "logps/chosen": -3.8357975482940674, + "logps/rejected": -4.536237716674805, + "loss": 0.5765, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8357975482940674, + "rewards/margins": 0.7004397511482239, + "rewards/rejected": -4.536237716674805, "step": 2000 }, { "epoch": 1.0704131125606289, - "eval_logits/chosen": 0.7784110307693481, - "eval_logits/rejected": 0.8619949221611023, - "eval_logps/chosen": -8.725646018981934, - "eval_logps/rejected": -9.51314926147461, - "eval_loss": 0.5343691110610962, - "eval_rewards/accuracies": 0.7158753871917725, - "eval_rewards/chosen": -8.725646018981934, - "eval_rewards/margins": 0.7875038385391235, - "eval_rewards/rejected": -9.51314926147461, - "eval_runtime": 34.7505, - "eval_samples_per_second": 38.704, - "eval_semantic_entropy": 0.00273532303981483, - "eval_steps_per_second": 9.698, + "eval_logits/chosen": 0.1978289783000946, + "eval_logits/rejected": 0.3103795349597931, + "eval_logps/chosen": -3.670088529586792, + "eval_logps/rejected": -4.435221195220947, + "eval_loss": 0.5305001139640808, + "eval_rewards/accuracies": 0.7240356206893921, + "eval_rewards/chosen": -3.670088529586792, + "eval_rewards/margins": 0.7651325464248657, + "eval_rewards/rejected": -4.435221195220947, + "eval_runtime": 40.3632, + "eval_samples_per_second": 33.322, + "eval_steps_per_second": 8.349, "step": 2000 }, { "epoch": 1.0730891453420304, - "grad_norm": 16.92593879734535, + "grad_norm": 13.972623540932345, "learning_rate": 8.109789387282599e-07, - "logits/chosen": 0.5764074921607971, - "logits/rejected": 0.6182007193565369, - "logps/chosen": -8.691645622253418, - "logps/rejected": -9.37775707244873, - "loss": 0.5661, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.691645622253418, - "rewards/margins": 0.6861115097999573, - "rewards/rejected": -9.37775707244873, - "semantic_entropy": 0.003028175327926874, + "logits/chosen": -0.1030389815568924, + "logits/rejected": -0.02766525186598301, + "logps/chosen": -3.711275577545166, + "logps/rejected": -4.282042026519775, + "loss": 0.6105, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.711275577545166, + "rewards/margins": 0.5707670450210571, + "rewards/rejected": -4.282042026519775, "step": 2005 }, { "epoch": 1.075765178123432, - "grad_norm": 18.60708769092857, + "grad_norm": 16.213414908194064, "learning_rate": 8.097579068946827e-07, - "logits/chosen": 0.5846803784370422, - "logits/rejected": 0.6755378842353821, - "logps/chosen": -8.488496780395508, - "logps/rejected": -9.236639022827148, - "loss": 0.5124, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.488496780395508, - "rewards/margins": 0.7481436729431152, - "rewards/rejected": -9.236639022827148, - "semantic_entropy": 0.0031055829022079706, + "logits/chosen": -0.09503203630447388, + "logits/rejected": 0.038334041833877563, + "logps/chosen": -3.355236768722534, + "logps/rejected": -4.0225419998168945, + "loss": 0.526, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.355236768722534, + "rewards/margins": 0.667305588722229, + "rewards/rejected": -4.0225419998168945, "step": 2010 }, { "epoch": 1.0784412109048336, - "grad_norm": 16.81295395917736, + "grad_norm": 13.657557074791193, "learning_rate": 8.085338697885344e-07, - "logits/chosen": 0.5960233807563782, - "logits/rejected": 0.6982234120368958, - "logps/chosen": -8.587759017944336, - "logps/rejected": -9.308615684509277, - "loss": 0.5304, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.587759017944336, - "rewards/margins": 0.720857560634613, - "rewards/rejected": -9.308615684509277, - "semantic_entropy": 0.003149865660816431, + "logits/chosen": -0.06998047977685928, + "logits/rejected": 0.06363551318645477, + "logps/chosen": -3.416102647781372, + "logps/rejected": -4.039597988128662, + "loss": 0.545, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.416102647781372, + "rewards/margins": 0.6234956383705139, + "rewards/rejected": -4.039597988128662, "step": 2015 }, { "epoch": 1.081117243686235, - "grad_norm": 19.584362200308394, + "grad_norm": 13.15590509511544, "learning_rate": 8.073068392854282e-07, - "logits/chosen": 0.4914863705635071, - "logits/rejected": 0.6277307868003845, - "logps/chosen": -8.720789909362793, - "logps/rejected": -9.521505355834961, - "loss": 0.4904, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.720789909362793, - "rewards/margins": 0.8007165789604187, - "rewards/rejected": -9.521505355834961, - "semantic_entropy": 0.0029073634650558233, + "logits/chosen": -0.2124735414981842, + "logits/rejected": 0.014531132765114307, + "logps/chosen": -3.6042842864990234, + "logps/rejected": -4.378244400024414, + "loss": 0.478, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.6042842864990234, + "rewards/margins": 0.7739599943161011, + "rewards/rejected": -4.378244400024414, "step": 2020 }, { "epoch": 1.0837932764676368, - "grad_norm": 14.905704270703776, + "grad_norm": 12.27499504735936, "learning_rate": 8.060768272900193e-07, - "logits/chosen": 0.5698509812355042, - "logits/rejected": 0.6809700727462769, - "logps/chosen": -8.530683517456055, - "logps/rejected": -9.367597579956055, - "loss": 0.5261, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.530683517456055, - "rewards/margins": 0.8369154930114746, - "rewards/rejected": -9.367597579956055, - "semantic_entropy": 0.0036535891704261303, + "logits/chosen": -0.07546254247426987, + "logits/rejected": 0.08290650695562363, + "logps/chosen": -3.372459888458252, + "logps/rejected": -4.075785160064697, + "loss": 0.5385, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.372459888458252, + "rewards/margins": 0.7033251523971558, + "rewards/rejected": -4.075785160064697, "step": 2025 }, { "epoch": 1.0864693092490383, - "grad_norm": 11.142809207007566, + "grad_norm": 9.660990244963829, "learning_rate": 8.0484384573589e-07, - "logits/chosen": 0.4973204731941223, - "logits/rejected": 0.5573136210441589, - "logps/chosen": -8.422048568725586, - "logps/rejected": -9.212953567504883, - "loss": 0.5264, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.422048568725586, - "rewards/margins": 0.7909058332443237, - "rewards/rejected": -9.212953567504883, - "semantic_entropy": 0.003728007199242711, + "logits/chosen": -0.1517535150051117, + "logits/rejected": -0.10255370289087296, + "logps/chosen": -3.4786152839660645, + "logps/rejected": -4.14971923828125, + "loss": 0.5266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.4786152839660645, + "rewards/margins": 0.6711039543151855, + "rewards/rejected": -4.14971923828125, "step": 2030 }, { "epoch": 1.0891453420304398, - "grad_norm": 18.70125115826522, + "grad_norm": 16.9430530623428, "learning_rate": 8.03607906585432e-07, - "logits/chosen": 0.5369696617126465, - "logits/rejected": 0.6485335230827332, - "logps/chosen": -8.6622896194458, - "logps/rejected": -9.386190414428711, - "loss": 0.5708, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -8.6622896194458, - "rewards/margins": 0.7239011526107788, - "rewards/rejected": -9.386190414428711, - "semantic_entropy": 0.0035473487805575132, + "logits/chosen": -0.1517014056444168, + "logits/rejected": 0.0120916236191988, + "logps/chosen": -3.6777946949005127, + "logps/rejected": -4.302873134613037, + "loss": 0.564, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.6777946949005127, + "rewards/margins": 0.6250786781311035, + "rewards/rejected": -4.302873134613037, "step": 2035 }, { "epoch": 1.0918213748118415, - "grad_norm": 26.024840250891568, + "grad_norm": 27.175303772016154, "learning_rate": 8.023690218297329e-07, - "logits/chosen": 0.47266292572021484, - "logits/rejected": 0.526736855506897, - "logps/chosen": -8.556685447692871, - "logps/rejected": -9.491031646728516, - "loss": 0.4911, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.556685447692871, - "rewards/margins": 0.9343463778495789, - "rewards/rejected": -9.491031646728516, - "semantic_entropy": 0.0032796214800328016, + "logits/chosen": -0.2550305724143982, + "logits/rejected": -0.17679789662361145, + "logps/chosen": -3.824284076690674, + "logps/rejected": -4.610040664672852, + "loss": 0.5268, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.824284076690674, + "rewards/margins": 0.7857571840286255, + "rewards/rejected": -4.610040664672852, "step": 2040 }, { "epoch": 1.094497407593243, - "grad_norm": 18.59073371986523, + "grad_norm": 14.642197153374909, "learning_rate": 8.01127203488458e-07, - "logits/chosen": 0.5553776025772095, - "logits/rejected": 0.6106212735176086, - "logps/chosen": -8.6795015335083, - "logps/rejected": -9.444005012512207, - "loss": 0.5309, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.6795015335083, - "rewards/margins": 0.7645029425621033, - "rewards/rejected": -9.444005012512207, - "semantic_entropy": 0.0029942230321466923, + "logits/chosen": -0.12059801816940308, + "logits/rejected": -0.05259256437420845, + "logps/chosen": -4.1158952713012695, + "logps/rejected": -4.826113700866699, + "loss": 0.5149, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.1158952713012695, + "rewards/margins": 0.710218608379364, + "rewards/rejected": -4.826113700866699, "step": 2045 }, { "epoch": 1.0971734403746445, - "grad_norm": 19.661856792543684, + "grad_norm": 15.455962306139176, "learning_rate": 7.998824636097339e-07, - "logits/chosen": 0.5739470720291138, - "logits/rejected": 0.6971379518508911, - "logps/chosen": -8.599574089050293, - "logps/rejected": -9.433286666870117, - "loss": 0.5109, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.599574089050293, - "rewards/margins": 0.833710789680481, - "rewards/rejected": -9.433286666870117, - "semantic_entropy": 0.002899765968322754, + "logits/chosen": -0.19361500442028046, + "logits/rejected": -0.03223999962210655, + "logps/chosen": -4.080569744110107, + "logps/rejected": -4.8871917724609375, + "loss": 0.5034, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.080569744110107, + "rewards/margins": 0.8066216707229614, + "rewards/rejected": -4.8871917724609375, "step": 2050 }, { "epoch": 1.0998494731560462, - "grad_norm": 16.83932620346083, + "grad_norm": 17.53063349786442, "learning_rate": 7.986348142700328e-07, - "logits/chosen": 0.5915915966033936, - "logits/rejected": 0.7208577394485474, - "logps/chosen": -8.551434516906738, - "logps/rejected": -9.56495189666748, - "loss": 0.4975, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.551434516906738, - "rewards/margins": 1.0135180950164795, - "rewards/rejected": -9.56495189666748, - "semantic_entropy": 0.004057818092405796, + "logits/chosen": -0.14430803060531616, + "logits/rejected": 0.02830803394317627, + "logps/chosen": -4.427618980407715, + "logps/rejected": -5.420300483703613, + "loss": 0.4837, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.427618980407715, + "rewards/margins": 0.992681622505188, + "rewards/rejected": -5.420300483703613, "step": 2055 }, { "epoch": 1.1025255059374477, - "grad_norm": 19.759575014791515, + "grad_norm": 19.938766156870837, "learning_rate": 7.973842675740539e-07, - "logits/chosen": 0.644290566444397, - "logits/rejected": 0.7044304609298706, - "logps/chosen": -8.437231063842773, - "logps/rejected": -9.365800857543945, - "loss": 0.4995, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.437231063842773, - "rewards/margins": 0.928569495677948, - "rewards/rejected": -9.365800857543945, - "semantic_entropy": 0.00467148469761014, + "logits/chosen": -0.11626265197992325, + "logits/rejected": -0.03846440836787224, + "logps/chosen": -4.261603355407715, + "logps/rejected": -5.1196064949035645, + "loss": 0.4964, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.261603355407715, + "rewards/margins": 0.8580025434494019, + "rewards/rejected": -5.1196064949035645, "step": 2060 }, { "epoch": 1.1052015387188494, - "grad_norm": 19.3959994469509, + "grad_norm": 16.648870279479755, "learning_rate": 7.961308356546066e-07, - "logits/chosen": 0.5765253305435181, - "logits/rejected": 0.7118976712226868, - "logps/chosen": -8.473932266235352, - "logps/rejected": -9.513734817504883, - "loss": 0.4958, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.473932266235352, - "rewards/margins": 1.0398019552230835, - "rewards/rejected": -9.513734817504883, - "semantic_entropy": 0.003963841591030359, + "logits/chosen": -0.16361525654792786, + "logits/rejected": -0.0018301442032679915, + "logps/chosen": -4.5717973709106445, + "logps/rejected": -5.516513824462891, + "loss": 0.4851, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.5717973709106445, + "rewards/margins": 0.9447162747383118, + "rewards/rejected": -5.516513824462891, "step": 2065 }, { "epoch": 1.107877571500251, - "grad_norm": 19.393093668750392, + "grad_norm": 19.38099876999855, "learning_rate": 7.948745306724931e-07, - "logits/chosen": 0.6232589483261108, - "logits/rejected": 0.7551737427711487, - "logps/chosen": -8.12829875946045, - "logps/rejected": -9.182195663452148, - "loss": 0.4412, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -8.12829875946045, - "rewards/margins": 1.0538949966430664, - "rewards/rejected": -9.182195663452148, - "semantic_entropy": 0.004817788954824209, + "logits/chosen": -0.11624745279550552, + "logits/rejected": 0.05895320326089859, + "logps/chosen": -4.024140357971191, + "logps/rejected": -5.050307273864746, + "loss": 0.4425, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.024140357971191, + "rewards/margins": 1.0261671543121338, + "rewards/rejected": -5.050307273864746, "step": 2070 }, { "epoch": 1.1105536042816524, - "grad_norm": 23.64668063780086, + "grad_norm": 18.25785996422318, "learning_rate": 7.936153648163897e-07, - "logits/chosen": 0.5677531957626343, - "logits/rejected": 0.6550413966178894, - "logps/chosen": -8.326519966125488, - "logps/rejected": -9.14603042602539, - "loss": 0.5172, + "logits/chosen": -0.2025548666715622, + "logits/rejected": -0.0748022124171257, + "logps/chosen": -4.290482521057129, + "logps/rejected": -5.058211326599121, + "loss": 0.5293, "rewards/accuracies": 0.71875, - "rewards/chosen": -8.326519966125488, - "rewards/margins": 0.8195114135742188, - "rewards/rejected": -9.14603042602539, - "semantic_entropy": 0.0040381476283073425, + "rewards/chosen": -4.290482521057129, + "rewards/margins": 0.767728865146637, + "rewards/rejected": -5.058211326599121, "step": 2075 }, { "epoch": 1.1132296370630541, - "grad_norm": 19.95159503167207, + "grad_norm": 14.66828369535671, "learning_rate": 7.92353350302729e-07, - "logits/chosen": 0.5089942216873169, - "logits/rejected": 0.6331297159194946, - "logps/chosen": -8.021492004394531, - "logps/rejected": -8.944520950317383, - "loss": 0.5098, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.021492004394531, - "rewards/margins": 0.9230290651321411, - "rewards/rejected": -8.944520950317383, - "semantic_entropy": 0.005194402299821377, + "logits/chosen": -0.21613673865795135, + "logits/rejected": -0.02284763753414154, + "logps/chosen": -4.175102233886719, + "logps/rejected": -5.018824100494385, + "loss": 0.5134, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.175102233886719, + "rewards/margins": 0.8437215685844421, + "rewards/rejected": -5.018824100494385, "step": 2080 }, { "epoch": 1.1159056698444556, - "grad_norm": 25.2963455688314, + "grad_norm": 20.85213069757147, "learning_rate": 7.910884993755816e-07, - "logits/chosen": 0.6509027481079102, - "logits/rejected": 0.7161253690719604, - "logps/chosen": -8.10318660736084, - "logps/rejected": -9.13819408416748, - "loss": 0.4955, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.10318660736084, - "rewards/margins": 1.0350077152252197, - "rewards/rejected": -9.13819408416748, - "semantic_entropy": 0.004798793233931065, + "logits/chosen": -0.2087005078792572, + "logits/rejected": -0.11597265303134918, + "logps/chosen": -4.15130615234375, + "logps/rejected": -4.991881847381592, + "loss": 0.5088, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.15130615234375, + "rewards/margins": 0.8405753970146179, + "rewards/rejected": -4.991881847381592, "step": 2085 }, { "epoch": 1.118581702625857, - "grad_norm": 16.477568206890176, + "grad_norm": 13.88562837272799, "learning_rate": 7.898208243065367e-07, - "logits/chosen": 0.6596091389656067, - "logits/rejected": 0.6896553635597229, - "logps/chosen": -8.11032772064209, - "logps/rejected": -8.861970901489258, - "loss": 0.533, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.11032772064209, - "rewards/margins": 0.7516436576843262, - "rewards/rejected": -8.861970901489258, - "semantic_entropy": 0.004430143162608147, + "logits/chosen": -0.2361416518688202, + "logits/rejected": -0.22439351677894592, + "logps/chosen": -3.829306125640869, + "logps/rejected": -4.449453353881836, + "loss": 0.5588, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.829306125640869, + "rewards/margins": 0.6201468110084534, + "rewards/rejected": -4.449453353881836, "step": 2090 }, { "epoch": 1.1212577354072588, - "grad_norm": 16.367388918717808, + "grad_norm": 16.582959972697164, "learning_rate": 7.88550337394583e-07, - "logits/chosen": 0.640828013420105, - "logits/rejected": 0.7348512411117554, - "logps/chosen": -8.398119926452637, - "logps/rejected": -9.17949104309082, - "loss": 0.5304, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.398119926452637, - "rewards/margins": 0.7813706398010254, - "rewards/rejected": -9.17949104309082, - "semantic_entropy": 0.0035932317841798067, + "logits/chosen": -0.26924800872802734, + "logits/rejected": -0.12902703881263733, + "logps/chosen": -4.313819408416748, + "logps/rejected": -5.034451484680176, + "loss": 0.5439, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.313819408416748, + "rewards/margins": 0.7206311225891113, + "rewards/rejected": -5.034451484680176, "step": 2095 }, { "epoch": 1.1239337681886603, - "grad_norm": 22.724539658375086, + "grad_norm": 19.190379052800594, "learning_rate": 7.872770509659905e-07, - "logits/chosen": 0.7362472414970398, - "logits/rejected": 0.7698075771331787, - "logps/chosen": -8.4552583694458, - "logps/rejected": -9.216978073120117, - "loss": 0.5361, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.4552583694458, - "rewards/margins": 0.7617195844650269, - "rewards/rejected": -9.216978073120117, - "semantic_entropy": 0.003288673236966133, + "logits/chosen": -0.18122822046279907, + "logits/rejected": -0.12670281529426575, + "logps/chosen": -4.24225378036499, + "logps/rejected": -4.936417102813721, + "loss": 0.5564, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.24225378036499, + "rewards/margins": 0.6941635608673096, + "rewards/rejected": -4.936417102813721, "step": 2100 }, { "epoch": 1.1266098009700618, - "grad_norm": 17.17232989165224, + "grad_norm": 15.904755077561903, "learning_rate": 7.860009773741896e-07, - "logits/chosen": 0.8084769248962402, - "logits/rejected": 0.9146261215209961, - "logps/chosen": -8.417569160461426, - "logps/rejected": -9.373042106628418, - "loss": 0.4631, + "logits/chosen": -0.15964198112487793, + "logits/rejected": 0.0009016230469569564, + "logps/chosen": -3.9130923748016357, + "logps/rejected": -4.815853118896484, + "loss": 0.4729, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.417569160461426, - "rewards/margins": 0.9554733037948608, - "rewards/rejected": -9.373042106628418, - "semantic_entropy": 0.0028464009519666433, + "rewards/chosen": -3.9130923748016357, + "rewards/margins": 0.9027608036994934, + "rewards/rejected": -4.815853118896484, "step": 2105 }, { "epoch": 1.1292858337514635, - "grad_norm": 17.323667244270915, + "grad_norm": 20.92159287458505, "learning_rate": 7.84722128999652e-07, - "logits/chosen": 0.767966091632843, - "logits/rejected": 0.8326314687728882, - "logps/chosen": -8.67313003540039, - "logps/rejected": -9.694366455078125, - "loss": 0.4904, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.67313003540039, - "rewards/margins": 1.0212359428405762, - "rewards/rejected": -9.694366455078125, - "semantic_entropy": 0.0024180663749575615, + "logits/chosen": -0.22609901428222656, + "logits/rejected": -0.09971017390489578, + "logps/chosen": -4.008756637573242, + "logps/rejected": -4.922143459320068, + "loss": 0.5186, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.008756637573242, + "rewards/margins": 0.9133870005607605, + "rewards/rejected": -4.922143459320068, "step": 2110 }, { "epoch": 1.131961866532865, - "grad_norm": 18.88990514293712, + "grad_norm": 15.572115178230387, "learning_rate": 7.834405182497699e-07, - "logits/chosen": 0.8208998441696167, - "logits/rejected": 0.8627697229385376, - "logps/chosen": -8.815618515014648, - "logps/rejected": -9.641824722290039, - "loss": 0.5307, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.815618515014648, - "rewards/margins": 0.826204776763916, - "rewards/rejected": -9.641824722290039, - "semantic_entropy": 0.0024063908495008945, + "logits/chosen": -0.12319934368133545, + "logits/rejected": -0.07263253629207611, + "logps/chosen": -4.130381107330322, + "logps/rejected": -4.964022159576416, + "loss": 0.5171, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.130381107330322, + "rewards/margins": 0.833641529083252, + "rewards/rejected": -4.964022159576416, "step": 2115 }, { "epoch": 1.1346378993142665, - "grad_norm": 22.075179293885157, + "grad_norm": 15.754581140896095, "learning_rate": 7.821561575587368e-07, - "logits/chosen": 0.772208571434021, - "logits/rejected": 0.8185374140739441, - "logps/chosen": -8.663274765014648, - "logps/rejected": -9.404105186462402, - "loss": 0.5304, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.663274765014648, - "rewards/margins": 0.7408307790756226, - "rewards/rejected": -9.404105186462402, - "semantic_entropy": 0.0030980452429503202, + "logits/chosen": -0.23031914234161377, + "logits/rejected": -0.18207421898841858, + "logps/chosen": -4.073249816894531, + "logps/rejected": -4.771031379699707, + "loss": 0.5161, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.073249816894531, + "rewards/margins": 0.6977812051773071, + "rewards/rejected": -4.771031379699707, "step": 2120 }, { "epoch": 1.1373139320956682, - "grad_norm": 16.278122487967558, + "grad_norm": 13.245317956551107, "learning_rate": 7.808690593874254e-07, - "logits/chosen": 0.745190441608429, - "logits/rejected": 0.8001850247383118, - "logps/chosen": -8.882969856262207, - "logps/rejected": -9.755064010620117, - "loss": 0.5305, + "logits/chosen": -0.2058139592409134, + "logits/rejected": -0.10925110429525375, + "logps/chosen": -4.241046905517578, + "logps/rejected": -5.0685505867004395, + "loss": 0.5189, "rewards/accuracies": 0.6875, - "rewards/chosen": -8.882969856262207, - "rewards/margins": 0.872094452381134, - "rewards/rejected": -9.755064010620117, - "semantic_entropy": 0.0022930700797587633, + "rewards/chosen": -4.241046905517578, + "rewards/margins": 0.8275042772293091, + "rewards/rejected": -5.0685505867004395, "step": 2125 }, { "epoch": 1.1399899648770697, - "grad_norm": 20.293385998629976, + "grad_norm": 16.011962622060487, "learning_rate": 7.79579236223268e-07, - "logits/chosen": 0.8587775230407715, - "logits/rejected": 0.9622253179550171, - "logps/chosen": -8.695469856262207, - "logps/rejected": -9.686747550964355, - "loss": 0.4915, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.695469856262207, - "rewards/margins": 0.991279125213623, - "rewards/rejected": -9.686747550964355, - "semantic_entropy": 0.0027502470184117556, + "logits/chosen": -0.14689859747886658, + "logits/rejected": 0.10222943872213364, + "logps/chosen": -4.002467155456543, + "logps/rejected": -4.936595916748047, + "loss": 0.5007, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.002467155456543, + "rewards/margins": 0.9341288805007935, + "rewards/rejected": -4.936595916748047, "step": 2130 }, { "epoch": 1.1426659976584714, - "grad_norm": 20.703488040967667, + "grad_norm": 16.690659580199203, "learning_rate": 7.782867005801346e-07, - "logits/chosen": 0.765255868434906, - "logits/rejected": 0.8885319828987122, - "logps/chosen": -8.589404106140137, - "logps/rejected": -9.656288146972656, - "loss": 0.4855, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.589404106140137, - "rewards/margins": 1.0668823719024658, - "rewards/rejected": -9.656288146972656, - "semantic_entropy": 0.0031393137760460377, + "logits/chosen": -0.17893119156360626, + "logits/rejected": 0.028008561581373215, + "logps/chosen": -4.011521816253662, + "logps/rejected": -4.890550136566162, + "loss": 0.5168, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.011521816253662, + "rewards/margins": 0.8790282011032104, + "rewards/rejected": -4.890550136566162, "step": 2135 }, { "epoch": 1.145342030439873, - "grad_norm": 23.797314324550555, + "grad_norm": 20.97173129310675, "learning_rate": 7.769914649982117e-07, - "logits/chosen": 0.8055821657180786, - "logits/rejected": 0.8668516874313354, - "logps/chosen": -8.526637077331543, - "logps/rejected": -9.445337295532227, - "loss": 0.4964, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.526637077331543, - "rewards/margins": 0.9186998605728149, - "rewards/rejected": -9.445337295532227, - "semantic_entropy": 0.003491030540317297, + "logits/chosen": -0.1559801697731018, + "logits/rejected": -0.004843524657189846, + "logps/chosen": -3.805170774459839, + "logps/rejected": -4.627260684967041, + "loss": 0.5168, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.805170774459839, + "rewards/margins": 0.8220895528793335, + "rewards/rejected": -4.627260684967041, "step": 2140 }, { "epoch": 1.1480180632212744, - "grad_norm": 15.411161024417602, + "grad_norm": 14.200317440947485, "learning_rate": 7.756935420438803e-07, - "logits/chosen": 0.8090022206306458, - "logits/rejected": 0.8830445408821106, - "logps/chosen": -8.554264068603516, - "logps/rejected": -9.823812484741211, - "loss": 0.4472, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.554264068603516, - "rewards/margins": 1.269547462463379, - "rewards/rejected": -9.823812484741211, - "semantic_entropy": 0.0031209487933665514, + "logits/chosen": -0.14100058376789093, + "logits/rejected": -0.027155673131346703, + "logps/chosen": -3.6497740745544434, + "logps/rejected": -4.748330116271973, + "loss": 0.4641, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.6497740745544434, + "rewards/margins": 1.0985558032989502, + "rewards/rejected": -4.748330116271973, "step": 2145 }, { "epoch": 1.1506940960026761, - "grad_norm": 16.571465015989386, + "grad_norm": 11.58486123945302, "learning_rate": 7.743929443095951e-07, - "logits/chosen": 0.773921549320221, - "logits/rejected": 0.8259444236755371, - "logps/chosen": -8.577144622802734, - "logps/rejected": -9.52406120300293, - "loss": 0.4723, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.577144622802734, - "rewards/margins": 0.9469181895256042, - "rewards/rejected": -9.52406120300293, - "semantic_entropy": 0.0030519163701683283, + "logits/chosen": -0.16581018269062042, + "logits/rejected": -0.08899960666894913, + "logps/chosen": -3.9121031761169434, + "logps/rejected": -4.850736141204834, + "loss": 0.4583, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9121031761169434, + "rewards/margins": 0.9386330842971802, + "rewards/rejected": -4.850736141204834, "step": 2150 }, { "epoch": 1.1533701287840776, - "grad_norm": 19.150879536804933, + "grad_norm": 14.109730547048228, "learning_rate": 7.730896844137609e-07, - "logits/chosen": 0.7496171593666077, - "logits/rejected": 0.8101975321769714, - "logps/chosen": -8.777600288391113, - "logps/rejected": -9.443353652954102, - "loss": 0.5967, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -8.777600288391113, - "rewards/margins": 0.6657532453536987, - "rewards/rejected": -9.443353652954102, - "semantic_entropy": 0.002408596221357584, + "logits/chosen": -0.13190411031246185, + "logits/rejected": -0.03360188007354736, + "logps/chosen": -4.247917175292969, + "logps/rejected": -4.913575649261475, + "loss": 0.5814, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.247917175292969, + "rewards/margins": 0.665658175945282, + "rewards/rejected": -4.913575649261475, "step": 2155 }, { "epoch": 1.1560461615654791, - "grad_norm": 20.084289617276934, + "grad_norm": 16.811934376374936, "learning_rate": 7.717837750006106e-07, - "logits/chosen": 0.7996751666069031, - "logits/rejected": 0.8572956919670105, - "logps/chosen": -8.599963188171387, - "logps/rejected": -9.580442428588867, - "loss": 0.5205, + "logits/chosen": -0.16165809333324432, + "logits/rejected": -0.07190193235874176, + "logps/chosen": -3.847546100616455, + "logps/rejected": -4.843579292297363, + "loss": 0.4932, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.599963188171387, - "rewards/margins": 0.9804786443710327, - "rewards/rejected": -9.580442428588867, - "semantic_entropy": 0.0031619679648429155, + "rewards/chosen": -3.847546100616455, + "rewards/margins": 0.9960338473320007, + "rewards/rejected": -4.843579292297363, "step": 2160 }, { "epoch": 1.1587221943468808, - "grad_norm": 19.904342457753952, + "grad_norm": 15.237893306947743, "learning_rate": 7.704752287400832e-07, - "logits/chosen": 0.7399067282676697, - "logits/rejected": 0.8611429333686829, - "logps/chosen": -8.674205780029297, - "logps/rejected": -9.685873985290527, - "loss": 0.5023, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.674205780029297, - "rewards/margins": 1.0116674900054932, - "rewards/rejected": -9.685873985290527, - "semantic_entropy": 0.0029065976850688457, + "logits/chosen": -0.1384824961423874, + "logits/rejected": 0.05629171058535576, + "logps/chosen": -3.988621950149536, + "logps/rejected": -4.960763454437256, + "loss": 0.5142, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.988621950149536, + "rewards/margins": 0.9721416234970093, + "rewards/rejected": -4.960763454437256, "step": 2165 }, { "epoch": 1.1613982271282823, - "grad_norm": 11.68066385401199, + "grad_norm": 10.958998440094163, "learning_rate": 7.691640583277004e-07, - "logits/chosen": 0.8236852884292603, - "logits/rejected": 0.8967201113700867, - "logps/chosen": -8.778889656066895, - "logps/rejected": -9.750692367553711, - "loss": 0.5151, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.778889656066895, - "rewards/margins": 0.9718036651611328, - "rewards/rejected": -9.750692367553711, - "semantic_entropy": 0.0026112559717148542, + "logits/chosen": -0.12546658515930176, + "logits/rejected": 0.04151463881134987, + "logps/chosen": -3.8756096363067627, + "logps/rejected": -4.8096723556518555, + "loss": 0.5107, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8756096363067627, + "rewards/margins": 0.9340632557868958, + "rewards/rejected": -4.8096723556518555, "step": 2170 }, { "epoch": 1.1640742599096838, - "grad_norm": 13.781442467832825, + "grad_norm": 11.3360466086079, "learning_rate": 7.678502764844433e-07, - "logits/chosen": 0.7699551582336426, - "logits/rejected": 0.8938447833061218, - "logps/chosen": -8.977490425109863, - "logps/rejected": -9.788980484008789, - "loss": 0.5165, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.977490425109863, - "rewards/margins": 0.8114897012710571, - "rewards/rejected": -9.788980484008789, - "semantic_entropy": 0.0019960529170930386, + "logits/chosen": -0.1835601031780243, + "logits/rejected": 0.032366055995225906, + "logps/chosen": -3.997211456298828, + "logps/rejected": -4.779942512512207, + "loss": 0.5249, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.997211456298828, + "rewards/margins": 0.7827308177947998, + "rewards/rejected": -4.779942512512207, "step": 2175 }, { "epoch": 1.1667502926910855, - "grad_norm": 14.44343492374134, + "grad_norm": 15.744450575142155, "learning_rate": 7.665338959566288e-07, - "logits/chosen": 0.8235516548156738, - "logits/rejected": 0.8966760635375977, - "logps/chosen": -9.135007858276367, - "logps/rejected": -10.119426727294922, - "loss": 0.4607, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.135007858276367, - "rewards/margins": 0.9844182133674622, - "rewards/rejected": -10.119426727294922, - "semantic_entropy": 0.0018916798289865255, + "logits/chosen": -0.13790066540241241, + "logits/rejected": -0.03549501299858093, + "logps/chosen": -3.7901408672332764, + "logps/rejected": -4.743773460388184, + "loss": 0.4751, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7901408672332764, + "rewards/margins": 0.953632652759552, + "rewards/rejected": -4.743773460388184, "step": 2180 }, { "epoch": 1.169426325472487, - "grad_norm": 17.800127012280676, + "grad_norm": 16.79764363889196, "learning_rate": 7.652149295157868e-07, - "logits/chosen": 0.8629690408706665, - "logits/rejected": 0.9295064210891724, - "logps/chosen": -9.345608711242676, - "logps/rejected": -10.031997680664062, - "loss": 0.5446, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -9.345608711242676, - "rewards/margins": 0.6863887906074524, - "rewards/rejected": -10.031997680664062, - "semantic_entropy": 0.0015552560798823833, + "logits/chosen": -0.10358766466379166, + "logits/rejected": 0.06537457555532455, + "logps/chosen": -4.023680210113525, + "logps/rejected": -4.730215072631836, + "loss": 0.5318, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.023680210113525, + "rewards/margins": 0.7065349221229553, + "rewards/rejected": -4.730215072631836, "step": 2185 }, { "epoch": 1.1721023582538885, - "grad_norm": 22.042952861910784, + "grad_norm": 13.732972098563016, "learning_rate": 7.638933899585354e-07, - "logits/chosen": 0.9068318605422974, - "logits/rejected": 0.924595832824707, - "logps/chosen": -9.173646926879883, - "logps/rejected": -10.000932693481445, - "loss": 0.5156, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.173646926879883, - "rewards/margins": 0.8272865414619446, - "rewards/rejected": -10.000932693481445, - "semantic_entropy": 0.0017204980831593275, + "logits/chosen": -0.032866887748241425, + "logits/rejected": 0.0010335519909858704, + "logps/chosen": -3.808356523513794, + "logps/rejected": -4.6879377365112305, + "loss": 0.4971, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.808356523513794, + "rewards/margins": 0.879581093788147, + "rewards/rejected": -4.6879377365112305, "step": 2190 }, { "epoch": 1.1747783910352902, - "grad_norm": 18.715280386882995, + "grad_norm": 13.846635336998247, "learning_rate": 7.625692901064573e-07, - "logits/chosen": 0.8207842707633972, - "logits/rejected": 0.9047282934188843, - "logps/chosen": -9.131489753723145, - "logps/rejected": -9.951040267944336, - "loss": 0.5302, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -9.131489753723145, - "rewards/margins": 0.8195503950119019, - "rewards/rejected": -9.951040267944336, - "semantic_entropy": 0.0020564752630889416, + "logits/chosen": -0.13992778956890106, + "logits/rejected": -0.030388470739126205, + "logps/chosen": -4.271843910217285, + "logps/rejected": -5.148895740509033, + "loss": 0.5584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.271843910217285, + "rewards/margins": 0.877051830291748, + "rewards/rejected": -5.148895740509033, "step": 2195 }, { "epoch": 1.1774544238166917, - "grad_norm": 14.79847154131496, + "grad_norm": 14.027825122344895, "learning_rate": 7.61242642805975e-07, - "logits/chosen": 0.8447543382644653, - "logits/rejected": 0.8663375973701477, - "logps/chosen": -9.094636917114258, - "logps/rejected": -9.878302574157715, - "loss": 0.534, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -9.094636917114258, - "rewards/margins": 0.7836667895317078, - "rewards/rejected": -9.878302574157715, - "semantic_entropy": 0.0019203886622563004, + "logits/chosen": -0.18375355005264282, + "logits/rejected": -0.1969098001718521, + "logps/chosen": -4.013321876525879, + "logps/rejected": -4.780495643615723, + "loss": 0.5307, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.013321876525879, + "rewards/margins": 0.7671735882759094, + "rewards/rejected": -4.780495643615723, "step": 2200 }, { "epoch": 1.1801304565980932, - "grad_norm": 17.15954179734088, + "grad_norm": 16.149446549096666, "learning_rate": 7.599134609282266e-07, - "logits/chosen": 0.7871206998825073, - "logits/rejected": 0.8676565289497375, - "logps/chosen": -9.28339672088623, - "logps/rejected": -10.069405555725098, - "loss": 0.5129, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -9.28339672088623, - "rewards/margins": 0.7860093712806702, - "rewards/rejected": -10.069405555725098, - "semantic_entropy": 0.001820198493078351, + "logits/chosen": -0.24408617615699768, + "logits/rejected": -0.05820148438215256, + "logps/chosen": -4.111595630645752, + "logps/rejected": -4.874477863311768, + "loss": 0.5321, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.111595630645752, + "rewards/margins": 0.7628819346427917, + "rewards/rejected": -4.874477863311768, "step": 2205 }, { "epoch": 1.182806489379495, - "grad_norm": 24.404592657138192, + "grad_norm": 16.24369996337799, "learning_rate": 7.585817573689402e-07, - "logits/chosen": 0.7938421368598938, - "logits/rejected": 0.8801844716072083, - "logps/chosen": -8.840426445007324, - "logps/rejected": -9.785604476928711, - "loss": 0.4784, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.840426445007324, - "rewards/margins": 0.9451776742935181, - "rewards/rejected": -9.785604476928711, - "semantic_entropy": 0.002649650676175952, + "logits/chosen": -0.2631383538246155, + "logits/rejected": -0.13807713985443115, + "logps/chosen": -3.6678924560546875, + "logps/rejected": -4.618288516998291, + "loss": 0.4753, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.6678924560546875, + "rewards/margins": 0.9503962397575378, + "rewards/rejected": -4.618288516998291, "step": 2210 }, { "epoch": 1.1854825221608964, - "grad_norm": 17.132762998778745, + "grad_norm": 13.063642162059777, "learning_rate": 7.572475450483098e-07, - "logits/chosen": 0.7745561003684998, - "logits/rejected": 0.8122493624687195, - "logps/chosen": -8.980504035949707, - "logps/rejected": -9.769770622253418, - "loss": 0.5316, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.980504035949707, - "rewards/margins": 0.7892670035362244, - "rewards/rejected": -9.769770622253418, - "semantic_entropy": 0.0022570898290723562, + "logits/chosen": -0.23292243480682373, + "logits/rejected": -0.14422407746315002, + "logps/chosen": -3.8685173988342285, + "logps/rejected": -4.650615215301514, + "loss": 0.5319, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.8685173988342285, + "rewards/margins": 0.7820984125137329, + "rewards/rejected": -4.650615215301514, "step": 2215 }, { "epoch": 1.188158554942298, - "grad_norm": 20.513738139152867, + "grad_norm": 16.191176569672564, "learning_rate": 7.559108369108689e-07, - "logits/chosen": 0.7253280878067017, - "logits/rejected": 0.7848079204559326, - "logps/chosen": -8.66881275177002, - "logps/rejected": -9.506206512451172, - "loss": 0.5316, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.66881275177002, - "rewards/margins": 0.8373939394950867, - "rewards/rejected": -9.506206512451172, - "semantic_entropy": 0.0028397340793162584, + "logits/chosen": -0.2543533444404602, + "logits/rejected": -0.1253151148557663, + "logps/chosen": -3.7200026512145996, + "logps/rejected": -4.5218400955200195, + "loss": 0.5324, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.7200026512145996, + "rewards/margins": 0.8018368482589722, + "rewards/rejected": -4.5218400955200195, "step": 2220 }, { "epoch": 1.1908345877236997, - "grad_norm": 13.082835254565163, + "grad_norm": 11.466097536440817, "learning_rate": 7.54571645925366e-07, - "logits/chosen": 0.6793020367622375, - "logits/rejected": 0.8425678014755249, - "logps/chosen": -8.629182815551758, - "logps/rejected": -9.746764183044434, - "loss": 0.4487, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.629182815551758, - "rewards/margins": 1.1175806522369385, - "rewards/rejected": -9.746764183044434, - "semantic_entropy": 0.003014157759025693, + "logits/chosen": -0.2799068093299866, + "logits/rejected": -0.01841496117413044, + "logps/chosen": -3.601719617843628, + "logps/rejected": -4.537893772125244, + "loss": 0.4811, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.601719617843628, + "rewards/margins": 0.936174213886261, + "rewards/rejected": -4.537893772125244, "step": 2225 }, { "epoch": 1.1935106205051011, - "grad_norm": 15.319269039008326, + "grad_norm": 16.609319489327003, "learning_rate": 7.532299850846378e-07, - "logits/chosen": 0.6559053659439087, - "logits/rejected": 0.7742191553115845, - "logps/chosen": -8.408263206481934, - "logps/rejected": -9.492993354797363, - "loss": 0.4948, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.408263206481934, - "rewards/margins": 1.084729790687561, - "rewards/rejected": -9.492993354797363, - "semantic_entropy": 0.0036600581370294094, + "logits/chosen": -0.2313324213027954, + "logits/rejected": -0.06041599437594414, + "logps/chosen": -3.469024181365967, + "logps/rejected": -4.3954010009765625, + "loss": 0.5028, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.469024181365967, + "rewards/margins": 0.9263772964477539, + "rewards/rejected": -4.3954010009765625, "step": 2230 }, { "epoch": 1.1961866532865026, - "grad_norm": 22.81260636310479, + "grad_norm": 19.422877856286505, "learning_rate": 7.518858674054838e-07, - "logits/chosen": 0.6717875003814697, - "logits/rejected": 0.8029670715332031, - "logps/chosen": -8.644887924194336, - "logps/rejected": -9.598337173461914, - "loss": 0.5115, + "logits/chosen": -0.23634760081768036, + "logits/rejected": -0.034086667001247406, + "logps/chosen": -3.51098370552063, + "logps/rejected": -4.399679660797119, + "loss": 0.4938, "rewards/accuracies": 0.75, - "rewards/chosen": -8.644887924194336, - "rewards/margins": 0.9534481763839722, - "rewards/rejected": -9.598337173461914, - "semantic_entropy": 0.002926050452515483, + "rewards/chosen": -3.51098370552063, + "rewards/margins": 0.8886961936950684, + "rewards/rejected": -4.399679660797119, "step": 2235 }, { "epoch": 1.1988626860679044, - "grad_norm": 17.071928193449306, + "grad_norm": 13.24290962815228, "learning_rate": 7.505393059285394e-07, - "logits/chosen": 0.6294863224029541, - "logits/rejected": 0.7521852254867554, - "logps/chosen": -8.822403907775879, - "logps/rejected": -9.73637866973877, - "loss": 0.5241, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.822403907775879, - "rewards/margins": 0.9139748811721802, - "rewards/rejected": -9.73637866973877, - "semantic_entropy": 0.003058222122490406, + "logits/chosen": -0.22185154259204865, + "logits/rejected": -0.04469820857048035, + "logps/chosen": -3.6821494102478027, + "logps/rejected": -4.451887130737305, + "loss": 0.5297, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6821494102478027, + "rewards/margins": 0.769737720489502, + "rewards/rejected": -4.451887130737305, "step": 2240 }, { "epoch": 1.2015387188493059, - "grad_norm": 21.59100967195749, + "grad_norm": 16.145693884997254, "learning_rate": 7.491903137181501e-07, - "logits/chosen": 0.6673406362533569, - "logits/rejected": 0.6980730295181274, - "logps/chosen": -8.757534980773926, - "logps/rejected": -9.63608169555664, - "loss": 0.4955, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.757534980773926, - "rewards/margins": 0.8785461187362671, - "rewards/rejected": -9.63608169555664, - "semantic_entropy": 0.003113445593044162, + "logits/chosen": -0.15161992609500885, + "logits/rejected": -0.11036952584981918, + "logps/chosen": -3.4583168029785156, + "logps/rejected": -4.245116233825684, + "loss": 0.5043, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4583168029785156, + "rewards/margins": 0.7867995500564575, + "rewards/rejected": -4.245116233825684, "step": 2245 }, { "epoch": 1.2042147516307076, - "grad_norm": 17.12424676715307, + "grad_norm": 14.289044060858481, "learning_rate": 7.478389038622441e-07, - "logits/chosen": 0.6984297633171082, - "logits/rejected": 0.7338518500328064, - "logps/chosen": -8.893332481384277, - "logps/rejected": -9.793367385864258, - "loss": 0.527, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.893332481384277, - "rewards/margins": 0.9000345468521118, - "rewards/rejected": -9.793367385864258, - "semantic_entropy": 0.002758896443992853, + "logits/chosen": -0.10382457077503204, + "logits/rejected": -0.06964464485645294, + "logps/chosen": -3.6171226501464844, + "logps/rejected": -4.406100273132324, + "loss": 0.5245, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.6171226501464844, + "rewards/margins": 0.7889779806137085, + "rewards/rejected": -4.406100273132324, "step": 2250 }, { "epoch": 1.206890784412109, - "grad_norm": 26.22447421233564, + "grad_norm": 19.89108969811621, "learning_rate": 7.46485089472206e-07, - "logits/chosen": 0.6646834015846252, - "logits/rejected": 0.7228942513465881, - "logps/chosen": -8.950407028198242, - "logps/rejected": -9.782625198364258, - "loss": 0.5624, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.950407028198242, - "rewards/margins": 0.8322180509567261, - "rewards/rejected": -9.782625198364258, - "semantic_entropy": 0.0024748151190578938, + "logits/chosen": -0.16020123660564423, + "logits/rejected": -0.08688847720623016, + "logps/chosen": -3.6236777305603027, + "logps/rejected": -4.379891872406006, + "loss": 0.5439, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.6236777305603027, + "rewards/margins": 0.7562140822410583, + "rewards/rejected": -4.379891872406006, "step": 2255 }, { "epoch": 1.2095668171935106, - "grad_norm": 17.893944580761662, + "grad_norm": 14.668864342230435, "learning_rate": 7.451288836827487e-07, - "logits/chosen": 0.7343819737434387, - "logits/rejected": 0.763200044631958, - "logps/chosen": -8.684735298156738, - "logps/rejected": -9.369165420532227, - "loss": 0.5689, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.684735298156738, - "rewards/margins": 0.6844292283058167, - "rewards/rejected": -9.369165420532227, - "semantic_entropy": 0.003174789249897003, + "logits/chosen": -0.14463329315185547, + "logits/rejected": -0.14039286971092224, + "logps/chosen": -3.446162700653076, + "logps/rejected": -4.076861381530762, + "loss": 0.5478, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.446162700653076, + "rewards/margins": 0.6306983828544617, + "rewards/rejected": -4.076861381530762, "step": 2260 }, { "epoch": 1.2122428499749123, - "grad_norm": 15.807684902195147, + "grad_norm": 11.777168051930923, "learning_rate": 7.437702996517869e-07, - "logits/chosen": 0.6750258207321167, - "logits/rejected": 0.7452644109725952, - "logps/chosen": -8.592541694641113, - "logps/rejected": -9.485219955444336, - "loss": 0.5089, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.592541694641113, - "rewards/margins": 0.8926795721054077, - "rewards/rejected": -9.485219955444336, - "semantic_entropy": 0.0034456239081919193, + "logits/chosen": -0.1577952653169632, + "logits/rejected": -0.061899833381175995, + "logps/chosen": -3.5160889625549316, + "logps/rejected": -4.357062816619873, + "loss": 0.505, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.5160889625549316, + "rewards/margins": 0.8409740328788757, + "rewards/rejected": -4.357062816619873, "step": 2265 }, { "epoch": 1.2149188827563138, - "grad_norm": 18.6126390663135, + "grad_norm": 19.6563796992474, "learning_rate": 7.424093505603087e-07, - "logits/chosen": 0.6281952857971191, - "logits/rejected": 0.7401161789894104, - "logps/chosen": -8.631272315979004, - "logps/rejected": -9.659812927246094, - "loss": 0.4665, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.631272315979004, - "rewards/margins": 1.0285407304763794, - "rewards/rejected": -9.659812927246094, - "semantic_entropy": 0.0035267819184809923, + "logits/chosen": -0.2527870237827301, + "logits/rejected": -0.07215356826782227, + "logps/chosen": -3.7058396339416504, + "logps/rejected": -4.651564121246338, + "loss": 0.4631, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.7058396339416504, + "rewards/margins": 0.945724368095398, + "rewards/rejected": -4.651564121246338, "step": 2270 }, { "epoch": 1.2175949155377153, - "grad_norm": 18.495386475386976, + "grad_norm": 14.759263482172233, "learning_rate": 7.410460496122482e-07, - "logits/chosen": 0.6814571619033813, - "logits/rejected": 0.7883174419403076, - "logps/chosen": -8.451251983642578, - "logps/rejected": -9.589815139770508, - "loss": 0.4347, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.451251983642578, - "rewards/margins": 1.1385620832443237, - "rewards/rejected": -9.589815139770508, - "semantic_entropy": 0.0035903877578675747, + "logits/chosen": -0.1696304976940155, + "logits/rejected": 0.0013329103821888566, + "logps/chosen": -3.512709140777588, + "logps/rejected": -4.526141166687012, + "loss": 0.4561, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.512709140777588, + "rewards/margins": 1.013432264328003, + "rewards/rejected": -4.526141166687012, "step": 2275 }, { "epoch": 1.220270948319117, - "grad_norm": 20.866364773443532, + "grad_norm": 15.423197560845116, "learning_rate": 7.396804100343572e-07, - "logits/chosen": 0.6894387602806091, - "logits/rejected": 0.7951668500900269, - "logps/chosen": -8.350536346435547, - "logps/rejected": -9.263737678527832, - "loss": 0.492, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.350536346435547, - "rewards/margins": 0.9132000207901001, - "rewards/rejected": -9.263737678527832, - "semantic_entropy": 0.003823335049673915, + "logits/chosen": -0.2050914764404297, + "logits/rejected": 0.01497526466846466, + "logps/chosen": -3.5561916828155518, + "logps/rejected": -4.397391319274902, + "loss": 0.4934, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5561916828155518, + "rewards/margins": 0.8411990404129028, + "rewards/rejected": -4.397391319274902, "step": 2280 }, { "epoch": 1.2229469811005185, - "grad_norm": 11.808941649198584, + "grad_norm": 12.361509616423726, "learning_rate": 7.383124450760768e-07, - "logits/chosen": 0.7374765276908875, - "logits/rejected": 0.8545964956283569, - "logps/chosen": -8.481078147888184, - "logps/rejected": -9.478879928588867, - "loss": 0.4777, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.481078147888184, - "rewards/margins": 0.9978022575378418, - "rewards/rejected": -9.478879928588867, - "semantic_entropy": 0.003645769553259015, + "logits/chosen": -0.17751066386699677, + "logits/rejected": 0.03661578148603439, + "logps/chosen": -3.8919754028320312, + "logps/rejected": -4.821598052978516, + "loss": 0.4898, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.8919754028320312, + "rewards/margins": 0.9296229481697083, + "rewards/rejected": -4.821598052978516, "step": 2285 }, { "epoch": 1.22562301388192, - "grad_norm": 17.775252169557806, + "grad_norm": 17.511813357376422, "learning_rate": 7.369421680094091e-07, - "logits/chosen": 0.6624468564987183, - "logits/rejected": 0.7552576661109924, - "logps/chosen": -8.490701675415039, - "logps/rejected": -9.470184326171875, - "loss": 0.5227, + "logits/chosen": -0.2436923086643219, + "logits/rejected": -0.07352737337350845, + "logps/chosen": -3.8112850189208984, + "logps/rejected": -4.750272274017334, + "loss": 0.5236, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.490701675415039, - "rewards/margins": 0.9794837832450867, - "rewards/rejected": -9.470184326171875, - "semantic_entropy": 0.0034333504736423492, + "rewards/chosen": -3.8112850189208984, + "rewards/margins": 0.9389876127243042, + "rewards/rejected": -4.750272274017334, "step": 2290 }, { "epoch": 1.2282990466633217, - "grad_norm": 23.97776160790946, + "grad_norm": 15.509900277029512, "learning_rate": 7.355695921287881e-07, - "logits/chosen": 0.6835793256759644, - "logits/rejected": 0.7225985527038574, - "logps/chosen": -8.687114715576172, - "logps/rejected": -9.38883113861084, - "loss": 0.6041, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.687114715576172, - "rewards/margins": 0.7017166018486023, - "rewards/rejected": -9.38883113861084, - "semantic_entropy": 0.003160933731123805, + "logits/chosen": -0.21522513031959534, + "logits/rejected": -0.12015016376972198, + "logps/chosen": -3.933703899383545, + "logps/rejected": -4.7270050048828125, + "loss": 0.5638, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.933703899383545, + "rewards/margins": 0.7933009266853333, + "rewards/rejected": -4.7270050048828125, "step": 2295 }, { "epoch": 1.2309750794447232, - "grad_norm": 21.69249472023334, + "grad_norm": 17.223275738399877, "learning_rate": 7.341947307509513e-07, - "logits/chosen": 0.7158384919166565, - "logits/rejected": 0.8074569702148438, - "logps/chosen": -8.510350227355957, - "logps/rejected": -9.450441360473633, - "loss": 0.5061, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.510350227355957, - "rewards/margins": 0.940090537071228, - "rewards/rejected": -9.450441360473633, - "semantic_entropy": 0.00320886867120862, + "logits/chosen": -0.1936924159526825, + "logits/rejected": -0.04473690316081047, + "logps/chosen": -3.7253880500793457, + "logps/rejected": -4.6146321296691895, + "loss": 0.5192, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7253880500793457, + "rewards/margins": 0.8892437815666199, + "rewards/rejected": -4.6146321296691895, "step": 2300 }, { "epoch": 1.233651112226125, - "grad_norm": 17.13076671528079, + "grad_norm": 22.206684856769794, "learning_rate": 7.328175972148094e-07, - "logits/chosen": 0.7047310471534729, - "logits/rejected": 0.7689910531044006, - "logps/chosen": -8.937776565551758, - "logps/rejected": -9.842035293579102, - "loss": 0.5066, + "logits/chosen": -0.18479090929031372, + "logits/rejected": -0.05861486867070198, + "logps/chosen": -4.098392486572266, + "logps/rejected": -4.941969871520996, + "loss": 0.5279, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.937776565551758, - "rewards/margins": 0.9042595624923706, - "rewards/rejected": -9.842035293579102, - "semantic_entropy": 0.0021909018978476524, + "rewards/chosen": -4.098392486572266, + "rewards/margins": 0.8435776829719543, + "rewards/rejected": -4.941969871520996, "step": 2305 }, { "epoch": 1.2363271450075264, - "grad_norm": 22.05914195430034, + "grad_norm": 19.787017286664245, "learning_rate": 7.314382048813185e-07, - "logits/chosen": 0.7231523394584656, - "logits/rejected": 0.8367801904678345, - "logps/chosen": -8.771172523498535, - "logps/rejected": -9.783547401428223, - "loss": 0.4775, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.771172523498535, - "rewards/margins": 1.0123744010925293, - "rewards/rejected": -9.783547401428223, - "semantic_entropy": 0.0027366154827177525, + "logits/chosen": -0.1217123493552208, + "logits/rejected": 0.157485693693161, + "logps/chosen": -3.7948594093322754, + "logps/rejected": -4.761152267456055, + "loss": 0.4622, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7948594093322754, + "rewards/margins": 0.9662927389144897, + "rewards/rejected": -4.761152267456055, "step": 2310 }, { "epoch": 1.2390031777889279, - "grad_norm": 15.488052133555222, + "grad_norm": 12.711403458698207, "learning_rate": 7.300565671333486e-07, - "logits/chosen": 0.6668115854263306, - "logits/rejected": 0.7803434133529663, - "logps/chosen": -8.952492713928223, - "logps/rejected": -9.73788070678711, - "loss": 0.5417, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.952492713928223, - "rewards/margins": 0.7853885293006897, - "rewards/rejected": -9.73788070678711, - "semantic_entropy": 0.002661502454429865, + "logits/chosen": -0.20252633094787598, + "logits/rejected": 0.02651221491396427, + "logps/chosen": -4.037034511566162, + "logps/rejected": -4.946874618530273, + "loss": 0.4834, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.037034511566162, + "rewards/margins": 0.9098396301269531, + "rewards/rejected": -4.946874618530273, "step": 2315 }, { "epoch": 1.2416792105703296, - "grad_norm": 15.301361368412175, + "grad_norm": 13.91059851018264, "learning_rate": 7.286726973755554e-07, - "logits/chosen": 0.7436283826828003, - "logits/rejected": 0.783458411693573, - "logps/chosen": -8.722562789916992, - "logps/rejected": -9.623026847839355, - "loss": 0.4961, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.722562789916992, - "rewards/margins": 0.9004641771316528, - "rewards/rejected": -9.623026847839355, - "semantic_entropy": 0.0026164718437939882, + "logits/chosen": -0.0752243846654892, + "logits/rejected": -0.027604978531599045, + "logps/chosen": -3.9934284687042236, + "logps/rejected": -4.914708137512207, + "loss": 0.4662, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.9934284687042236, + "rewards/margins": 0.921279788017273, + "rewards/rejected": -4.914708137512207, "step": 2320 }, { "epoch": 1.244355243351731, - "grad_norm": 18.344895783311472, + "grad_norm": 13.225082763388388, "learning_rate": 7.272866090342493e-07, - "logits/chosen": 0.7868816256523132, - "logits/rejected": 0.8121258020401001, - "logps/chosen": -8.369720458984375, - "logps/rejected": -9.339384078979492, - "loss": 0.4349, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -8.369720458984375, - "rewards/margins": 0.9696633219718933, - "rewards/rejected": -9.339384078979492, - "semantic_entropy": 0.004334195517003536, + "logits/chosen": -0.03810407966375351, + "logits/rejected": 0.03583959490060806, + "logps/chosen": -4.031801223754883, + "logps/rejected": -4.960965156555176, + "loss": 0.4555, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.031801223754883, + "rewards/margins": 0.9291635751724243, + "rewards/rejected": -4.960965156555176, "step": 2325 }, { "epoch": 1.2470312761331326, - "grad_norm": 20.284036778511826, + "grad_norm": 17.189820916446404, "learning_rate": 7.258983155572656e-07, - "logits/chosen": 0.662312388420105, - "logits/rejected": 0.7393311262130737, - "logps/chosen": -8.260697364807129, - "logps/rejected": -9.110601425170898, - "loss": 0.5587, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.260697364807129, - "rewards/margins": 0.8499045372009277, - "rewards/rejected": -9.110601425170898, - "semantic_entropy": 0.0039521572180092335, + "logits/chosen": -0.1681860387325287, + "logits/rejected": -0.04851319268345833, + "logps/chosen": -4.049219608306885, + "logps/rejected": -4.977755069732666, + "loss": 0.5095, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.049219608306885, + "rewards/margins": 0.9285354614257812, + "rewards/rejected": -4.977755069732666, "step": 2330 }, { "epoch": 1.2497073089145343, - "grad_norm": 13.687544225959545, + "grad_norm": 14.537710726352651, "learning_rate": 7.245078304138335e-07, - "logits/chosen": 0.695865273475647, - "logits/rejected": 0.759333074092865, - "logps/chosen": -8.318536758422852, - "logps/rejected": -9.270764350891113, - "loss": 0.4915, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.318536758422852, - "rewards/margins": 0.9522277116775513, - "rewards/rejected": -9.270764350891113, - "semantic_entropy": 0.003750443458557129, + "logits/chosen": -0.09201566874980927, + "logits/rejected": 0.009965101256966591, + "logps/chosen": -4.405162811279297, + "logps/rejected": -5.2908525466918945, + "loss": 0.5195, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.405162811279297, + "rewards/margins": 0.8856894373893738, + "rewards/rejected": -5.2908525466918945, "step": 2335 }, { "epoch": 1.2523833416959358, - "grad_norm": 14.99477345835548, + "grad_norm": 15.255824846806979, "learning_rate": 7.231151670944462e-07, - "logits/chosen": 0.5629149079322815, - "logits/rejected": 0.659963846206665, - "logps/chosen": -8.367746353149414, - "logps/rejected": -9.242141723632812, - "loss": 0.5076, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.367746353149414, - "rewards/margins": 0.8743956685066223, - "rewards/rejected": -9.242141723632812, - "semantic_entropy": 0.0034209941513836384, + "logits/chosen": -0.27478939294815063, + "logits/rejected": -0.04711084067821503, + "logps/chosen": -4.467315196990967, + "logps/rejected": -5.319950580596924, + "loss": 0.5209, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.467315196990967, + "rewards/margins": 0.8526347875595093, + "rewards/rejected": -5.319950580596924, "step": 2340 }, { "epoch": 1.2550593744773373, - "grad_norm": 14.622457962908232, + "grad_norm": 15.397491483544737, "learning_rate": 7.217203391107291e-07, - "logits/chosen": 0.6555184721946716, - "logits/rejected": 0.7649224996566772, - "logps/chosen": -8.27055549621582, - "logps/rejected": -9.224145889282227, - "loss": 0.5084, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.27055549621582, - "rewards/margins": 0.9535905122756958, - "rewards/rejected": -9.224145889282227, - "semantic_entropy": 0.0038777173031121492, + "logits/chosen": -0.19195149838924408, + "logits/rejected": 0.0006151467678137124, + "logps/chosen": -4.258540153503418, + "logps/rejected": -5.2639946937561035, + "loss": 0.4893, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.258540153503418, + "rewards/margins": 1.0054543018341064, + "rewards/rejected": -5.2639946937561035, "step": 2345 }, { "epoch": 1.257735407258739, - "grad_norm": 18.63345646684996, + "grad_norm": 14.39154599039423, "learning_rate": 7.203233599953096e-07, - "logits/chosen": 0.6671181917190552, - "logits/rejected": 0.7599374651908875, - "logps/chosen": -8.387980461120605, - "logps/rejected": -9.266815185546875, - "loss": 0.4867, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.387980461120605, - "rewards/margins": 0.8788350820541382, - "rewards/rejected": -9.266815185546875, - "semantic_entropy": 0.0032208203338086605, + "logits/chosen": -0.1541978418827057, + "logits/rejected": 0.034764986485242844, + "logps/chosen": -4.571907043457031, + "logps/rejected": -5.512131690979004, + "loss": 0.499, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.571907043457031, + "rewards/margins": 0.9402249455451965, + "rewards/rejected": -5.512131690979004, "step": 2350 }, { "epoch": 1.2604114400401405, - "grad_norm": 16.89717027672504, + "grad_norm": 19.352427484128782, "learning_rate": 7.189242433016852e-07, - "logits/chosen": 0.685912013053894, - "logits/rejected": 0.7816206812858582, - "logps/chosen": -8.186650276184082, - "logps/rejected": -9.200610160827637, - "loss": 0.4687, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.186650276184082, - "rewards/margins": 1.0139598846435547, - "rewards/rejected": -9.200610160827637, - "semantic_entropy": 0.004348042421042919, + "logits/chosen": -0.1378113180398941, + "logits/rejected": 0.02001635916531086, + "logps/chosen": -4.137023448944092, + "logps/rejected": -5.235510349273682, + "loss": 0.4745, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.137023448944092, + "rewards/margins": 1.0984867811203003, + "rewards/rejected": -5.235510349273682, "step": 2355 }, { "epoch": 1.263087472821542, - "grad_norm": 16.86807483534206, + "grad_norm": 19.198925061748312, "learning_rate": 7.17523002604092e-07, - "logits/chosen": 0.6663065552711487, - "logits/rejected": 0.7573191523551941, - "logps/chosen": -8.505255699157715, - "logps/rejected": -9.44536304473877, - "loss": 0.4819, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.505255699157715, - "rewards/margins": 0.940106987953186, - "rewards/rejected": -9.44536304473877, - "semantic_entropy": 0.0034768693149089813, + "logits/chosen": -0.16778619587421417, + "logits/rejected": 0.02000817097723484, + "logps/chosen": -4.636425495147705, + "logps/rejected": -5.5533552169799805, + "loss": 0.5201, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.636425495147705, + "rewards/margins": 0.916930079460144, + "rewards/rejected": -5.5533552169799805, "step": 2360 }, { "epoch": 1.2657635056029437, - "grad_norm": 17.687247649811177, + "grad_norm": 17.387523508758527, "learning_rate": 7.161196514973734e-07, - "logits/chosen": 0.7061843276023865, - "logits/rejected": 0.7796521186828613, - "logps/chosen": -8.41321086883545, - "logps/rejected": -9.373991012573242, + "logits/chosen": -0.134668231010437, + "logits/rejected": 0.04346030205488205, + "logps/chosen": -4.236111640930176, + "logps/rejected": -5.2220282554626465, "loss": 0.5037, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.41321086883545, - "rewards/margins": 0.9607791900634766, - "rewards/rejected": -9.373991012573242, - "semantic_entropy": 0.0037853557150810957, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.236111640930176, + "rewards/margins": 0.9859166145324707, + "rewards/rejected": -5.2220282554626465, "step": 2365 }, { "epoch": 1.2684395383843452, - "grad_norm": 19.15884763927205, + "grad_norm": 19.209870116396534, "learning_rate": 7.147142035968483e-07, - "logits/chosen": 0.7049607038497925, - "logits/rejected": 0.8010439872741699, - "logps/chosen": -8.644028663635254, - "logps/rejected": -9.527328491210938, - "loss": 0.4998, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.644028663635254, - "rewards/margins": 0.8833004832267761, - "rewards/rejected": -9.527328491210938, - "semantic_entropy": 0.0030520078726112843, + "logits/chosen": -0.1428346037864685, + "logits/rejected": 0.042151451110839844, + "logps/chosen": -4.5736212730407715, + "logps/rejected": -5.45636510848999, + "loss": 0.4946, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.5736212730407715, + "rewards/margins": 0.8827441334724426, + "rewards/rejected": -5.45636510848999, "step": 2370 }, { "epoch": 1.2711155711657467, - "grad_norm": 16.73014781307649, + "grad_norm": 14.120484220670464, "learning_rate": 7.133066725381781e-07, - "logits/chosen": 0.637940526008606, - "logits/rejected": 0.7165664434432983, - "logps/chosen": -8.474000930786133, - "logps/rejected": -9.344751358032227, - "loss": 0.5156, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.474000930786133, - "rewards/margins": 0.8707484006881714, - "rewards/rejected": -9.344751358032227, - "semantic_entropy": 0.003291874658316374, + "logits/chosen": -0.28036433458328247, + "logits/rejected": -0.09433353692293167, + "logps/chosen": -4.296485900878906, + "logps/rejected": -5.19989013671875, + "loss": 0.5159, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.296485900878906, + "rewards/margins": 0.9034039378166199, + "rewards/rejected": -5.19989013671875, "step": 2375 }, { "epoch": 1.2737916039471484, - "grad_norm": 20.729681567322082, + "grad_norm": 16.321553778072253, "learning_rate": 7.118970719772354e-07, - "logits/chosen": 0.6396089792251587, - "logits/rejected": 0.747488796710968, - "logps/chosen": -8.582317352294922, - "logps/rejected": -9.629173278808594, - "loss": 0.4984, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.582317352294922, - "rewards/margins": 1.0468562841415405, - "rewards/rejected": -9.629173278808594, - "semantic_entropy": 0.0034858197905123234, + "logits/chosen": -0.19098159670829773, + "logits/rejected": -0.00930701196193695, + "logps/chosen": -4.362380504608154, + "logps/rejected": -5.396433353424072, + "loss": 0.4879, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.362380504608154, + "rewards/margins": 1.0340524911880493, + "rewards/rejected": -5.396433353424072, "step": 2380 }, { "epoch": 1.27646763672855, - "grad_norm": 19.291131856489386, + "grad_norm": 16.308969537567915, "learning_rate": 7.104854155899711e-07, - "logits/chosen": 0.6974250078201294, - "logits/rejected": 0.7831848859786987, - "logps/chosen": -8.711091041564941, - "logps/rejected": -9.662050247192383, - "loss": 0.5122, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.711091041564941, - "rewards/margins": 0.9509603381156921, - "rewards/rejected": -9.662050247192383, - "semantic_entropy": 0.0031486363150179386, + "logits/chosen": -0.15315239131450653, + "logits/rejected": -0.01740015484392643, + "logps/chosen": -4.446219444274902, + "logps/rejected": -5.332039833068848, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.446219444274902, + "rewards/margins": 0.8858197927474976, + "rewards/rejected": -5.332039833068848, "step": 2385 }, { "epoch": 1.2791436695099514, - "grad_norm": 18.66005580137364, + "grad_norm": 16.054207224273373, "learning_rate": 7.090717170722817e-07, - "logits/chosen": 0.6889894008636475, - "logits/rejected": 0.7326347231864929, - "logps/chosen": -8.706632614135742, - "logps/rejected": -9.896500587463379, - "loss": 0.4453, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -8.706632614135742, - "rewards/margins": 1.1898666620254517, - "rewards/rejected": -9.896500587463379, - "semantic_entropy": 0.002780457027256489, + "logits/chosen": -0.10979852825403214, + "logits/rejected": -0.05432642251253128, + "logps/chosen": -4.382155418395996, + "logps/rejected": -5.446093559265137, + "loss": 0.457, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.382155418395996, + "rewards/margins": 1.0639379024505615, + "rewards/rejected": -5.446093559265137, "step": 2390 }, { "epoch": 1.2818197022913531, - "grad_norm": 22.671781708487075, + "grad_norm": 18.289470966406338, "learning_rate": 7.076559901398762e-07, - "logits/chosen": 0.6582309603691101, - "logits/rejected": 0.7270200252532959, - "logps/chosen": -8.679912567138672, - "logps/rejected": -9.480433464050293, - "loss": 0.5314, + "logits/chosen": -0.29051655530929565, + "logits/rejected": -0.15174946188926697, + "logps/chosen": -4.167346000671387, + "logps/rejected": -4.969964981079102, + "loss": 0.5246, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.679912567138672, - "rewards/margins": 0.8005210161209106, - "rewards/rejected": -9.480433464050293, - "semantic_entropy": 0.002767809433862567, + "rewards/chosen": -4.167346000671387, + "rewards/margins": 0.8026192784309387, + "rewards/rejected": -4.969964981079102, "step": 2395 }, { "epoch": 1.2844957350727546, - "grad_norm": 22.234310993156075, + "grad_norm": 22.263094039776135, "learning_rate": 7.062382485281436e-07, - "logits/chosen": 0.6792951822280884, - "logits/rejected": 0.7309907674789429, - "logps/chosen": -8.538030624389648, - "logps/rejected": -9.394686698913574, - "loss": 0.5261, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.538030624389648, - "rewards/margins": 0.8566561937332153, - "rewards/rejected": -9.394686698913574, - "semantic_entropy": 0.0033909387420862913, + "logits/chosen": -0.22863061726093292, + "logits/rejected": -0.10596136748790741, + "logps/chosen": -4.243943214416504, + "logps/rejected": -4.922046661376953, + "loss": 0.5449, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.243943214416504, + "rewards/margins": 0.6781042218208313, + "rewards/rejected": -4.922046661376953, "step": 2400 }, { "epoch": 1.2844957350727546, - "eval_logits/chosen": 0.8011821508407593, - "eval_logits/rejected": 0.872314453125, - "eval_logps/chosen": -8.710298538208008, - "eval_logps/rejected": -9.651128768920898, - "eval_loss": 0.5312913060188293, - "eval_rewards/accuracies": 0.7136498689651489, - "eval_rewards/chosen": -8.710298538208008, - "eval_rewards/margins": 0.9408305883407593, - "eval_rewards/rejected": -9.651128768920898, - "eval_runtime": 34.8607, - "eval_samples_per_second": 38.582, - "eval_semantic_entropy": 0.002928712172433734, - "eval_steps_per_second": 9.667, + "eval_logits/chosen": 0.11842069774866104, + "eval_logits/rejected": 0.22465933859348297, + "eval_logps/chosen": -4.314882755279541, + "eval_logps/rejected": -5.234780788421631, + "eval_loss": 0.5197760462760925, + "eval_rewards/accuracies": 0.7351632118225098, + "eval_rewards/chosen": -4.314882755279541, + "eval_rewards/margins": 0.9198984503746033, + "eval_rewards/rejected": -5.234780788421631, + "eval_runtime": 40.4426, + "eval_samples_per_second": 33.257, + "eval_steps_per_second": 8.333, "step": 2400 }, { "epoch": 1.287171767854156, - "grad_norm": 14.920939180127396, + "grad_norm": 10.721897093439248, "learning_rate": 7.048185059920193e-07, - "logits/chosen": 0.6384707093238831, - "logits/rejected": 0.7600412368774414, - "logps/chosen": -8.579252243041992, - "logps/rejected": -9.70583724975586, - "loss": 0.4806, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.579252243041992, - "rewards/margins": 1.1265841722488403, - "rewards/rejected": -9.70583724975586, - "semantic_entropy": 0.0032008637208491564, + "logits/chosen": -0.20763030648231506, + "logits/rejected": -0.05173329636454582, + "logps/chosen": -4.146874904632568, + "logps/rejected": -5.2338762283325195, + "loss": 0.4934, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.146874904632568, + "rewards/margins": 1.0870013236999512, + "rewards/rejected": -5.2338762283325195, "step": 2405 }, { "epoch": 1.2898478006355578, - "grad_norm": 18.47727128635857, + "grad_norm": 18.486968382549335, "learning_rate": 7.033967763058516e-07, - "logits/chosen": 0.5698826313018799, - "logits/rejected": 0.6842392683029175, - "logps/chosen": -8.608453750610352, - "logps/rejected": -9.420400619506836, - "loss": 0.5163, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.608453750610352, - "rewards/margins": 0.8119487762451172, - "rewards/rejected": -9.420400619506836, - "semantic_entropy": 0.0028821511659771204, + "logits/chosen": -0.2687072455883026, + "logits/rejected": -0.08006079494953156, + "logps/chosen": -4.119261741638184, + "logps/rejected": -4.916508674621582, + "loss": 0.5084, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.119261741638184, + "rewards/margins": 0.7972471117973328, + "rewards/rejected": -4.916508674621582, "step": 2410 }, { "epoch": 1.2925238334169593, - "grad_norm": 16.15173827430261, + "grad_norm": 13.765650666419546, "learning_rate": 7.019730732632681e-07, - "logits/chosen": 0.6563664078712463, - "logits/rejected": 0.7400893568992615, - "logps/chosen": -8.490577697753906, - "logps/rejected": -9.582011222839355, - "loss": 0.4587, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.490577697753906, - "rewards/margins": 1.091435194015503, - "rewards/rejected": -9.582011222839355, - "semantic_entropy": 0.003659659530967474, + "logits/chosen": -0.10080672800540924, + "logits/rejected": 0.0009728640434332192, + "logps/chosen": -4.115511894226074, + "logps/rejected": -5.100127696990967, + "loss": 0.4957, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.115511894226074, + "rewards/margins": 0.9846161603927612, + "rewards/rejected": -5.100127696990967, "step": 2415 }, { "epoch": 1.2951998661983608, - "grad_norm": 20.007292439163773, + "grad_norm": 12.913127034105079, "learning_rate": 7.005474106770418e-07, - "logits/chosen": 0.57745361328125, - "logits/rejected": 0.6826112866401672, - "logps/chosen": -8.502618789672852, - "logps/rejected": -9.526583671569824, - "loss": 0.5005, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.502618789672852, - "rewards/margins": 1.0239640474319458, - "rewards/rejected": -9.526583671569824, - "semantic_entropy": 0.0039854454807937145, + "logits/chosen": -0.22971129417419434, + "logits/rejected": -0.09994227439165115, + "logps/chosen": -4.015557765960693, + "logps/rejected": -4.983471870422363, + "loss": 0.4687, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.015557765960693, + "rewards/margins": 0.967913806438446, + "rewards/rejected": -4.983471870422363, "step": 2420 }, { "epoch": 1.2978758989797625, - "grad_norm": 16.126086508254385, + "grad_norm": 12.689975388758963, "learning_rate": 6.991198023789577e-07, - "logits/chosen": 0.6350833177566528, - "logits/rejected": 0.7082042098045349, - "logps/chosen": -8.247810363769531, - "logps/rejected": -9.1130952835083, - "loss": 0.5063, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.247810363769531, - "rewards/margins": 0.8652847409248352, - "rewards/rejected": -9.1130952835083, - "semantic_entropy": 0.0047439588233828545, + "logits/chosen": -0.19630928337574005, + "logits/rejected": -0.0999438539147377, + "logps/chosen": -3.7357354164123535, + "logps/rejected": -4.510828971862793, + "loss": 0.4974, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.7357354164123535, + "rewards/margins": 0.7750941514968872, + "rewards/rejected": -4.510828971862793, "step": 2425 }, { "epoch": 1.300551931761164, - "grad_norm": 23.717409436049728, + "grad_norm": 18.60615090365754, "learning_rate": 6.976902622196776e-07, - "logits/chosen": 0.5765770077705383, - "logits/rejected": 0.6420444250106812, - "logps/chosen": -8.346854209899902, - "logps/rejected": -9.272150993347168, - "loss": 0.5377, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.346854209899902, - "rewards/margins": 0.9252961277961731, - "rewards/rejected": -9.272150993347168, - "semantic_entropy": 0.003574087517336011, + "logits/chosen": -0.17983554303646088, + "logits/rejected": -0.09636779874563217, + "logps/chosen": -3.998325824737549, + "logps/rejected": -4.882527828216553, + "loss": 0.5096, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.998325824737549, + "rewards/margins": 0.8842023611068726, + "rewards/rejected": -4.882527828216553, "step": 2430 }, { "epoch": 1.3032279645425655, - "grad_norm": 17.97651242704695, + "grad_norm": 13.05436275734575, "learning_rate": 6.962588040686064e-07, - "logits/chosen": 0.5552124381065369, - "logits/rejected": 0.658178448677063, - "logps/chosen": -8.291497230529785, - "logps/rejected": -9.098273277282715, - "loss": 0.5842, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.291497230529785, - "rewards/margins": 0.806775689125061, - "rewards/rejected": -9.098273277282715, - "semantic_entropy": 0.004253287799656391, + "logits/chosen": -0.19618090987205505, + "logits/rejected": -0.03529966622591019, + "logps/chosen": -3.7990849018096924, + "logps/rejected": -4.577143669128418, + "loss": 0.5481, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.7990849018096924, + "rewards/margins": 0.7780588269233704, + "rewards/rejected": -4.577143669128418, "step": 2435 }, { "epoch": 1.3059039973239672, - "grad_norm": 20.217954122529044, + "grad_norm": 16.34385016696788, "learning_rate": 6.948254418137573e-07, - "logits/chosen": 0.5669525861740112, - "logits/rejected": 0.6433640718460083, - "logps/chosen": -8.215181350708008, - "logps/rejected": -9.121828079223633, - "loss": 0.5425, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.215181350708008, - "rewards/margins": 0.9066460728645325, - "rewards/rejected": -9.121828079223633, - "semantic_entropy": 0.004315282683819532, + "logits/chosen": -0.20888130366802216, + "logits/rejected": -0.08102947473526001, + "logps/chosen": -3.852299451828003, + "logps/rejected": -4.725593090057373, + "loss": 0.5312, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.852299451828003, + "rewards/margins": 0.8732938766479492, + "rewards/rejected": -4.725593090057373, "step": 2440 }, { "epoch": 1.3085800301053687, - "grad_norm": 21.969818944329187, + "grad_norm": 21.863174467956682, "learning_rate": 6.933901893616174e-07, - "logits/chosen": 0.5023918151855469, - "logits/rejected": 0.614323616027832, - "logps/chosen": -8.214559555053711, - "logps/rejected": -9.092178344726562, - "loss": 0.5167, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.214559555053711, - "rewards/margins": 0.8776181936264038, - "rewards/rejected": -9.092178344726562, - "semantic_entropy": 0.004376448690891266, + "logits/chosen": -0.24802479147911072, + "logits/rejected": -0.09617750346660614, + "logps/chosen": -3.940642833709717, + "logps/rejected": -4.669715881347656, + "loss": 0.5496, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.940642833709717, + "rewards/margins": 0.7290723323822021, + "rewards/rejected": -4.669715881347656, "step": 2445 }, { "epoch": 1.3112560628867704, - "grad_norm": 21.680469755063655, + "grad_norm": 18.818122446573366, "learning_rate": 6.919530606370121e-07, - "logits/chosen": 0.48196372389793396, - "logits/rejected": 0.5732806921005249, - "logps/chosen": -8.17034912109375, - "logps/rejected": -9.072924613952637, - "loss": 0.5107, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.17034912109375, - "rewards/margins": 0.9025766253471375, - "rewards/rejected": -9.072924613952637, - "semantic_entropy": 0.004180104471743107, + "logits/chosen": -0.1729886382818222, + "logits/rejected": -0.01753205619752407, + "logps/chosen": -3.728374481201172, + "logps/rejected": -4.626367092132568, + "loss": 0.4819, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.728374481201172, + "rewards/margins": 0.8979929089546204, + "rewards/rejected": -4.626367092132568, "step": 2450 }, { "epoch": 1.313932095668172, - "grad_norm": 14.07946177566356, + "grad_norm": 13.280262640651967, "learning_rate": 6.905140695829706e-07, - "logits/chosen": 0.47136348485946655, - "logits/rejected": 0.6471112370491028, - "logps/chosen": -8.491350173950195, - "logps/rejected": -9.42007827758789, - "loss": 0.4935, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.491350173950195, - "rewards/margins": 0.9287282228469849, - "rewards/rejected": -9.42007827758789, - "semantic_entropy": 0.003645123215392232, + "logits/chosen": -0.2827102839946747, + "logits/rejected": -0.000569726515095681, + "logps/chosen": -4.006006240844727, + "logps/rejected": -4.881535053253174, + "loss": 0.47, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.006006240844727, + "rewards/margins": 0.8755286931991577, + "rewards/rejected": -4.881535053253174, "step": 2455 }, { "epoch": 1.3166081284495736, - "grad_norm": 23.554632267605992, + "grad_norm": 19.991696854335885, "learning_rate": 6.890732301605904e-07, - "logits/chosen": 0.5830351114273071, - "logits/rejected": 0.6560341119766235, - "logps/chosen": -8.401416778564453, - "logps/rejected": -9.3002290725708, - "loss": 0.5216, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.401416778564453, - "rewards/margins": 0.8988133668899536, - "rewards/rejected": -9.3002290725708, - "semantic_entropy": 0.0037794325035065413, + "logits/chosen": -0.20953328907489777, + "logits/rejected": -0.10398004204034805, + "logps/chosen": -3.7585856914520264, + "logps/rejected": -4.512194633483887, + "loss": 0.5458, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.7585856914520264, + "rewards/margins": 0.753609299659729, + "rewards/rejected": -4.512194633483887, "step": 2460 }, { "epoch": 1.3192841612309751, - "grad_norm": 18.146924951726284, + "grad_norm": 13.1849500052812, "learning_rate": 6.876305563489021e-07, - "logits/chosen": 0.5521279573440552, - "logits/rejected": 0.6386257410049438, - "logps/chosen": -8.719072341918945, - "logps/rejected": -9.739156723022461, - "loss": 0.4651, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.719072341918945, - "rewards/margins": 1.0200841426849365, - "rewards/rejected": -9.739156723022461, - "semantic_entropy": 0.002688236068934202, + "logits/chosen": -0.25444942712783813, + "logits/rejected": -0.14460231363773346, + "logps/chosen": -4.147862911224365, + "logps/rejected": -5.1401591300964355, + "loss": 0.4538, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.147862911224365, + "rewards/margins": 0.9922965168952942, + "rewards/rejected": -5.1401591300964355, "step": 2465 }, { "epoch": 1.3219601940123766, - "grad_norm": 20.103380904418128, + "grad_norm": 18.711260789093544, "learning_rate": 6.861860621447331e-07, - "logits/chosen": 0.5402216911315918, - "logits/rejected": 0.6326644420623779, - "logps/chosen": -8.76352596282959, - "logps/rejected": -9.569908142089844, - "loss": 0.5324, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.76352596282959, - "rewards/margins": 0.8063834309577942, - "rewards/rejected": -9.569908142089844, - "semantic_entropy": 0.0027508633211255074, + "logits/chosen": -0.33607983589172363, + "logits/rejected": -0.20735201239585876, + "logps/chosen": -4.21028995513916, + "logps/rejected": -4.893194198608398, + "loss": 0.5613, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.21028995513916, + "rewards/margins": 0.6829042434692383, + "rewards/rejected": -4.893194198608398, "step": 2470 }, { "epoch": 1.3246362267937783, - "grad_norm": 19.93100548564064, + "grad_norm": 17.399840669384467, "learning_rate": 6.847397615625725e-07, - "logits/chosen": 0.6381164789199829, - "logits/rejected": 0.6684954762458801, - "logps/chosen": -8.71910572052002, - "logps/rejected": -9.549886703491211, - "loss": 0.5264, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.71910572052002, - "rewards/margins": 0.8307819366455078, - "rewards/rejected": -9.549886703491211, - "semantic_entropy": 0.0028809071518480778, + "logits/chosen": -0.19514963030815125, + "logits/rejected": -0.12894995510578156, + "logps/chosen": -4.164662837982178, + "logps/rejected": -4.920551300048828, + "loss": 0.5299, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.164662837982178, + "rewards/margins": 0.7558885216712952, + "rewards/rejected": -4.920551300048828, "step": 2475 }, { "epoch": 1.3273122595751798, - "grad_norm": 15.7435684829995, + "grad_norm": 10.180022808251918, "learning_rate": 6.83291668634435e-07, - "logits/chosen": 0.6417192220687866, - "logits/rejected": 0.7400007843971252, - "logps/chosen": -8.722017288208008, - "logps/rejected": -9.792933464050293, - "loss": 0.4668, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.722017288208008, - "rewards/margins": 1.0709177255630493, - "rewards/rejected": -9.792933464050293, - "semantic_entropy": 0.003136158687993884, + "logits/chosen": -0.32387158274650574, + "logits/rejected": -0.12202148139476776, + "logps/chosen": -4.078675270080566, + "logps/rejected": -5.19416618347168, + "loss": 0.4508, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.078675270080566, + "rewards/margins": 1.1154905557632446, + "rewards/rejected": -5.19416618347168, "step": 2480 }, { "epoch": 1.3299882923565813, - "grad_norm": 19.570820563611054, + "grad_norm": 14.971809450773378, "learning_rate": 6.818417974097246e-07, - "logits/chosen": 0.7284759283065796, - "logits/rejected": 0.8398087620735168, - "logps/chosen": -8.61033821105957, - "logps/rejected": -9.802716255187988, - "loss": 0.4663, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.61033821105957, - "rewards/margins": 1.1923778057098389, - "rewards/rejected": -9.802716255187988, - "semantic_entropy": 0.003474020166322589, + "logits/chosen": -0.11587011814117432, + "logits/rejected": 0.03978449106216431, + "logps/chosen": -4.016814231872559, + "logps/rejected": -5.165071964263916, + "loss": 0.4472, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.016814231872559, + "rewards/margins": 1.1482579708099365, + "rewards/rejected": -5.165071964263916, "step": 2485 }, { "epoch": 1.332664325137983, - "grad_norm": 19.36403331204118, + "grad_norm": 15.885397041422925, "learning_rate": 6.803901619550981e-07, - "logits/chosen": 0.6692131757736206, - "logits/rejected": 0.7076988220214844, - "logps/chosen": -8.740633964538574, - "logps/rejected": -9.593387603759766, - "loss": 0.504, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.740633964538574, - "rewards/margins": 0.8527532815933228, - "rewards/rejected": -9.593387603759766, - "semantic_entropy": 0.0032423834782093763, + "logits/chosen": -0.24591884016990662, + "logits/rejected": -0.17632192373275757, + "logps/chosen": -4.10286808013916, + "logps/rejected": -5.040411472320557, + "loss": 0.4844, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.10286808013916, + "rewards/margins": 0.9375430345535278, + "rewards/rejected": -5.040411472320557, "step": 2490 }, { "epoch": 1.3353403579193845, - "grad_norm": 18.303427693985547, + "grad_norm": 16.859436018681723, "learning_rate": 6.789367763543292e-07, - "logits/chosen": 0.7160294651985168, - "logits/rejected": 0.7507287859916687, - "logps/chosen": -8.617566108703613, - "logps/rejected": -9.467153549194336, - "loss": 0.5475, + "logits/chosen": -0.1643955409526825, + "logits/rejected": -0.13142207264900208, + "logps/chosen": -4.022536754608154, + "logps/rejected": -4.80590295791626, + "loss": 0.5564, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.617566108703613, - "rewards/margins": 0.8495874404907227, - "rewards/rejected": -9.467153549194336, - "semantic_entropy": 0.003361668437719345, + "rewards/chosen": -4.022536754608154, + "rewards/margins": 0.7833660840988159, + "rewards/rejected": -4.80590295791626, "step": 2495 }, { "epoch": 1.338016390700786, - "grad_norm": 23.850604033392393, + "grad_norm": 16.460694317926784, "learning_rate": 6.774816547081714e-07, - "logits/chosen": 0.6444199681282043, - "logits/rejected": 0.7431300282478333, - "logps/chosen": -8.590035438537598, - "logps/rejected": -9.325902938842773, - "loss": 0.5461, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.590035438537598, - "rewards/margins": 0.7358676791191101, - "rewards/rejected": -9.325902938842773, - "semantic_entropy": 0.0030975653789937496, + "logits/chosen": -0.1877204179763794, + "logits/rejected": -0.006307822652161121, + "logps/chosen": -4.082192897796631, + "logps/rejected": -4.773401737213135, + "loss": 0.5337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.082192897796631, + "rewards/margins": 0.6912088394165039, + "rewards/rejected": -4.773401737213135, "step": 2500 }, { "epoch": 1.3406924234821878, - "grad_norm": 18.77168242134909, + "grad_norm": 14.39003093398491, "learning_rate": 6.760248111342211e-07, - "logits/chosen": 0.6908949017524719, - "logits/rejected": 0.7892520427703857, - "logps/chosen": -8.384283065795898, - "logps/rejected": -9.46776008605957, - "loss": 0.468, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.384283065795898, - "rewards/margins": 1.0834753513336182, - "rewards/rejected": -9.46776008605957, - "semantic_entropy": 0.003537180367857218, + "logits/chosen": -0.18344026803970337, + "logits/rejected": -0.008022744208574295, + "logps/chosen": -3.7875473499298096, + "logps/rejected": -4.711060047149658, + "loss": 0.4964, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.7875473499298096, + "rewards/margins": 0.9235126376152039, + "rewards/rejected": -4.711060047149658, "step": 2505 }, { "epoch": 1.3433684562635893, - "grad_norm": 18.137558397174647, + "grad_norm": 17.685400602485455, "learning_rate": 6.745662597667813e-07, - "logits/chosen": 0.6804380416870117, - "logits/rejected": 0.7819782495498657, - "logps/chosen": -8.316720962524414, - "logps/rejected": -9.37002944946289, - "loss": 0.4474, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.316720962524414, - "rewards/margins": 1.0533078908920288, - "rewards/rejected": -9.37002944946289, - "semantic_entropy": 0.0034739505499601364, + "logits/chosen": -0.2205597460269928, + "logits/rejected": -0.06239140033721924, + "logps/chosen": -3.718266248703003, + "logps/rejected": -4.625300884246826, + "loss": 0.4689, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.718266248703003, + "rewards/margins": 0.907034695148468, + "rewards/rejected": -4.625300884246826, "step": 2510 }, { "epoch": 1.3460444890449907, - "grad_norm": 15.709292621392008, + "grad_norm": 17.27560313219041, "learning_rate": 6.731060147567236e-07, - "logits/chosen": 0.7852478623390198, - "logits/rejected": 0.8401540517807007, - "logps/chosen": -8.328946113586426, - "logps/rejected": -9.2665433883667, - "loss": 0.4953, + "logits/chosen": -0.12970446050167084, + "logits/rejected": -0.03152019903063774, + "logps/chosen": -3.629901885986328, + "logps/rejected": -4.584659576416016, + "loss": 0.4662, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.328946113586426, - "rewards/margins": 0.937597393989563, - "rewards/rejected": -9.2665433883667, - "semantic_entropy": 0.003876983653753996, + "rewards/chosen": -3.629901885986328, + "rewards/margins": 0.9547576904296875, + "rewards/rejected": -4.584659576416016, "step": 2515 }, { "epoch": 1.3487205218263925, - "grad_norm": 16.260852565852623, + "grad_norm": 16.93765233142808, "learning_rate": 6.716440902713515e-07, - "logits/chosen": 0.720770001411438, - "logits/rejected": 0.7988892793655396, - "logps/chosen": -8.437568664550781, - "logps/rejected": -9.330516815185547, - "loss": 0.4806, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.437568664550781, - "rewards/margins": 0.8929487466812134, - "rewards/rejected": -9.330516815185547, - "semantic_entropy": 0.0032981105614453554, + "logits/chosen": -0.219585582613945, + "logits/rejected": -0.1354239284992218, + "logps/chosen": -3.9962940216064453, + "logps/rejected": -4.826720714569092, + "loss": 0.4811, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.9962940216064453, + "rewards/margins": 0.8304263353347778, + "rewards/rejected": -4.826720714569092, "step": 2520 }, { "epoch": 1.351396554607794, - "grad_norm": 20.213224180218113, + "grad_norm": 20.777307232750122, "learning_rate": 6.701805004942627e-07, - "logits/chosen": 0.7619292140007019, - "logits/rejected": 0.8122636079788208, - "logps/chosen": -8.559782981872559, - "logps/rejected": -9.494768142700195, - "loss": 0.5013, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.559782981872559, - "rewards/margins": 0.9349856376647949, - "rewards/rejected": -9.494768142700195, - "semantic_entropy": 0.0034507550299167633, + "logits/chosen": -0.17510934174060822, + "logits/rejected": -0.08583525568246841, + "logps/chosen": -4.107699394226074, + "logps/rejected": -4.977747440338135, + "loss": 0.5123, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.107699394226074, + "rewards/margins": 0.8700485229492188, + "rewards/rejected": -4.977747440338135, "step": 2525 }, { "epoch": 1.3540725873891954, - "grad_norm": 23.935038029395674, + "grad_norm": 24.44313523211283, "learning_rate": 6.687152596252119e-07, - "logits/chosen": 0.8029264211654663, - "logits/rejected": 0.8429144620895386, - "logps/chosen": -8.917330741882324, - "logps/rejected": -9.750707626342773, - "loss": 0.5555, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.917330741882324, - "rewards/margins": 0.8333770632743835, - "rewards/rejected": -9.750707626342773, - "semantic_entropy": 0.0024835984222590923, + "logits/chosen": -0.1518809199333191, + "logits/rejected": -0.08226124942302704, + "logps/chosen": -4.158895969390869, + "logps/rejected": -4.897714614868164, + "loss": 0.5806, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.158895969390869, + "rewards/margins": 0.738818347454071, + "rewards/rejected": -4.897714614868164, "step": 2530 }, { "epoch": 1.3567486201705972, - "grad_norm": 26.97837329749874, + "grad_norm": 23.646339854010634, "learning_rate": 6.672483818799722e-07, - "logits/chosen": 0.756155788898468, - "logits/rejected": 0.8390409350395203, - "logps/chosen": -9.110207557678223, - "logps/rejected": -9.918048858642578, - "loss": 0.5293, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.110207557678223, - "rewards/margins": 0.8078413009643555, - "rewards/rejected": -9.918048858642578, - "semantic_entropy": 0.0021289088763296604, + "logits/chosen": -0.23540440201759338, + "logits/rejected": -0.07612784206867218, + "logps/chosen": -4.173985481262207, + "logps/rejected": -4.968315601348877, + "loss": 0.5331, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.173985481262207, + "rewards/margins": 0.7943307161331177, + "rewards/rejected": -4.968315601348877, "step": 2535 }, { "epoch": 1.3594246529519987, - "grad_norm": 17.786890203549685, + "grad_norm": 19.525950299681234, "learning_rate": 6.657798814901978e-07, - "logits/chosen": 0.7632160186767578, - "logits/rejected": 0.8699263334274292, - "logps/chosen": -9.151754379272461, - "logps/rejected": -10.03592586517334, - "loss": 0.499, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.151754379272461, - "rewards/margins": 0.884171187877655, - "rewards/rejected": -10.03592586517334, - "semantic_entropy": 0.0021057447884231806, + "logits/chosen": -0.17305561900138855, + "logits/rejected": 0.023584945127367973, + "logps/chosen": -4.21708345413208, + "logps/rejected": -4.954843044281006, + "loss": 0.5351, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.21708345413208, + "rewards/margins": 0.7377598285675049, + "rewards/rejected": -4.954843044281006, "step": 2540 }, { "epoch": 1.3621006857334002, - "grad_norm": 20.119224807141038, + "grad_norm": 14.252845583027911, "learning_rate": 6.643097727032863e-07, - "logits/chosen": 0.7189488410949707, - "logits/rejected": 0.8399428129196167, - "logps/chosen": -9.119746208190918, - "logps/rejected": -10.26237964630127, - "loss": 0.4471, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.119746208190918, - "rewards/margins": 1.1426328420639038, - "rewards/rejected": -10.26237964630127, - "semantic_entropy": 0.0021095951087772846, + "logits/chosen": -0.19684430956840515, + "logits/rejected": 0.018279695883393288, + "logps/chosen": -4.030699729919434, + "logps/rejected": -4.988565444946289, + "loss": 0.4636, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.030699729919434, + "rewards/margins": 0.9578655958175659, + "rewards/rejected": -4.988565444946289, "step": 2545 }, { "epoch": 1.3647767185148019, - "grad_norm": 20.813656956825653, + "grad_norm": 15.022928567672803, "learning_rate": 6.628380697822392e-07, - "logits/chosen": 0.7267470955848694, - "logits/rejected": 0.823375403881073, - "logps/chosen": -9.255183219909668, - "logps/rejected": -9.974876403808594, - "loss": 0.5697, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -9.255183219909668, - "rewards/margins": 0.7196929454803467, - "rewards/rejected": -9.974876403808594, - "semantic_entropy": 0.0019825948402285576, + "logits/chosen": -0.16153240203857422, + "logits/rejected": 0.008358100429177284, + "logps/chosen": -4.1901750564575195, + "logps/rejected": -4.988600730895996, + "loss": 0.5124, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.1901750564575195, + "rewards/margins": 0.7984258532524109, + "rewards/rejected": -4.988600730895996, "step": 2550 }, { "epoch": 1.3674527512962034, - "grad_norm": 21.495996998491723, + "grad_norm": 16.523042819593876, "learning_rate": 6.61364787005525e-07, - "logits/chosen": 0.7436253428459167, - "logits/rejected": 0.8189195394515991, - "logps/chosen": -8.933206558227539, - "logps/rejected": -10.065174102783203, - "loss": 0.4548, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -8.933206558227539, - "rewards/margins": 1.1319692134857178, - "rewards/rejected": -10.065174102783203, - "semantic_entropy": 0.0025200708769261837, + "logits/chosen": -0.13803143799304962, + "logits/rejected": 0.004865099675953388, + "logps/chosen": -3.7624809741973877, + "logps/rejected": -4.800298690795898, + "loss": 0.487, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.7624809741973877, + "rewards/margins": 1.0378177165985107, + "rewards/rejected": -4.800298690795898, "step": 2555 }, { "epoch": 1.3701287840776049, - "grad_norm": 20.492917918741757, + "grad_norm": 19.922635567449543, "learning_rate": 6.598899386669395e-07, - "logits/chosen": 0.6491128206253052, - "logits/rejected": 0.7273428440093994, - "logps/chosen": -8.97862434387207, - "logps/rejected": -9.844133377075195, - "loss": 0.5339, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.97862434387207, - "rewards/margins": 0.8655084371566772, - "rewards/rejected": -9.844133377075195, - "semantic_entropy": 0.0024933055974543095, + "logits/chosen": -0.13467541337013245, + "logits/rejected": -0.0024763576220721006, + "logps/chosen": -4.113837718963623, + "logps/rejected": -4.890066623687744, + "loss": 0.5398, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.113837718963623, + "rewards/margins": 0.7762287259101868, + "rewards/rejected": -4.890066623687744, "step": 2560 }, { "epoch": 1.3728048168590066, - "grad_norm": 29.52052844095437, + "grad_norm": 26.413224668769875, "learning_rate": 6.584135390754679e-07, - "logits/chosen": 0.618812620639801, - "logits/rejected": 0.7136391997337341, - "logps/chosen": -8.866046905517578, - "logps/rejected": -9.910287857055664, - "loss": 0.495, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.866046905517578, - "rewards/margins": 1.04423987865448, - "rewards/rejected": -9.910287857055664, - "semantic_entropy": 0.002750510349869728, + "logits/chosen": -0.1479804813861847, + "logits/rejected": 0.0066998242400586605, + "logps/chosen": -3.9280076026916504, + "logps/rejected": -4.87653112411499, + "loss": 0.4927, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.9280076026916504, + "rewards/margins": 0.9485238194465637, + "rewards/rejected": -4.87653112411499, "step": 2565 }, { "epoch": 1.375480849640408, - "grad_norm": 14.963175375540626, + "grad_norm": 11.550050194783667, "learning_rate": 6.569356025551454e-07, - "logits/chosen": 0.6298393607139587, - "logits/rejected": 0.6999781727790833, - "logps/chosen": -8.907299995422363, - "logps/rejected": -9.864768981933594, - "loss": 0.5256, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.907299995422363, - "rewards/margins": 0.9574697613716125, - "rewards/rejected": -9.864768981933594, - "semantic_entropy": 0.002827054588124156, + "logits/chosen": -0.14004406332969666, + "logits/rejected": -0.03751618415117264, + "logps/chosen": -3.8066012859344482, + "logps/rejected": -4.75297737121582, + "loss": 0.4978, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.8066012859344482, + "rewards/margins": 0.9463754892349243, + "rewards/rejected": -4.75297737121582, "step": 2570 }, { "epoch": 1.3781568824218096, - "grad_norm": 24.51017226672691, + "grad_norm": 14.05501674560556, "learning_rate": 6.554561434449186e-07, - "logits/chosen": 0.6173363327980042, - "logits/rejected": 0.7241848111152649, - "logps/chosen": -8.95418930053711, - "logps/rejected": -9.835868835449219, - "loss": 0.523, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.95418930053711, - "rewards/margins": 0.8816791772842407, - "rewards/rejected": -9.835868835449219, - "semantic_entropy": 0.0022052470594644547, + "logits/chosen": -0.24392935633659363, + "logits/rejected": -0.0797763466835022, + "logps/chosen": -3.786935329437256, + "logps/rejected": -4.675662994384766, + "loss": 0.5107, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.786935329437256, + "rewards/margins": 0.8887276649475098, + "rewards/rejected": -4.675662994384766, "step": 2575 }, { "epoch": 1.3808329152032113, - "grad_norm": 23.404717225980484, + "grad_norm": 24.52300017210514, "learning_rate": 6.539751760985063e-07, - "logits/chosen": 0.6575708985328674, - "logits/rejected": 0.7401934266090393, - "logps/chosen": -9.168517112731934, - "logps/rejected": -9.852693557739258, - "loss": 0.5703, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -9.168517112731934, - "rewards/margins": 0.6841762661933899, - "rewards/rejected": -9.852693557739258, - "semantic_entropy": 0.0024021922145038843, + "logits/chosen": -0.1415601670742035, + "logits/rejected": -0.0478815920650959, + "logps/chosen": -4.04063606262207, + "logps/rejected": -4.686512470245361, + "loss": 0.5615, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.04063606262207, + "rewards/margins": 0.645876407623291, + "rewards/rejected": -4.686512470245361, "step": 2580 }, { "epoch": 1.3835089479846128, - "grad_norm": 20.395741155582876, + "grad_norm": 11.638534706063929, "learning_rate": 6.524927148842602e-07, - "logits/chosen": 0.6744663119316101, - "logits/rejected": 0.7450428009033203, - "logps/chosen": -9.043600082397461, - "logps/rejected": -9.901880264282227, - "loss": 0.5319, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -9.043600082397461, - "rewards/margins": 0.8582803010940552, - "rewards/rejected": -9.901880264282227, - "semantic_entropy": 0.002837617415934801, + "logits/chosen": -0.09704883396625519, + "logits/rejected": 0.06853713095188141, + "logps/chosen": -3.7049994468688965, + "logps/rejected": -4.615536689758301, + "loss": 0.4765, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7049994468688965, + "rewards/margins": 0.9105366468429565, + "rewards/rejected": -4.615536689758301, "step": 2585 }, { "epoch": 1.3861849807660143, - "grad_norm": 20.903114229317822, + "grad_norm": 19.68500516556148, "learning_rate": 6.510087741850254e-07, - "logits/chosen": 0.6738818287849426, - "logits/rejected": 0.7437289953231812, - "logps/chosen": -8.894235610961914, - "logps/rejected": -9.801753044128418, - "loss": 0.5257, + "logits/chosen": -0.1664876490831375, + "logits/rejected": -0.03002684935927391, + "logps/chosen": -3.6252174377441406, + "logps/rejected": -4.4369587898254395, + "loss": 0.5291, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.894235610961914, - "rewards/margins": 0.907518744468689, - "rewards/rejected": -9.801753044128418, - "semantic_entropy": 0.002832833444699645, + "rewards/chosen": -3.6252174377441406, + "rewards/margins": 0.8117408752441406, + "rewards/rejected": -4.4369587898254395, "step": 2590 }, { "epoch": 1.388861013547416, - "grad_norm": 25.8050758741644, + "grad_norm": 17.769533420080563, "learning_rate": 6.495233683980012e-07, - "logits/chosen": 0.6168414354324341, - "logits/rejected": 0.659568190574646, - "logps/chosen": -9.102640151977539, - "logps/rejected": -9.882707595825195, - "loss": 0.5363, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.102640151977539, - "rewards/margins": 0.780068576335907, - "rewards/rejected": -9.882707595825195, - "semantic_entropy": 0.002202157862484455, + "logits/chosen": -0.14186950027942657, + "logits/rejected": -0.07949542254209518, + "logps/chosen": -3.936850070953369, + "logps/rejected": -4.659331321716309, + "loss": 0.5282, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.936850070953369, + "rewards/margins": 0.722480833530426, + "rewards/rejected": -4.659331321716309, "step": 2595 }, { "epoch": 1.3915370463288175, - "grad_norm": 22.37455211473883, + "grad_norm": 19.084290754737555, "learning_rate": 6.480365119346011e-07, - "logits/chosen": 0.6996050477027893, - "logits/rejected": 0.7838853597640991, - "logps/chosen": -8.78913402557373, - "logps/rejected": -9.713356018066406, - "loss": 0.4855, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.78913402557373, - "rewards/margins": 0.924220085144043, - "rewards/rejected": -9.713356018066406, - "semantic_entropy": 0.003083221148699522, + "logits/chosen": -0.08699657022953033, + "logits/rejected": 0.06183575466275215, + "logps/chosen": -3.7870116233825684, + "logps/rejected": -4.614631175994873, + "loss": 0.5013, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.7870116233825684, + "rewards/margins": 0.827619194984436, + "rewards/rejected": -4.614631175994873, "step": 2600 }, { "epoch": 1.394213079110219, - "grad_norm": 15.384141820184274, + "grad_norm": 13.506781305612822, "learning_rate": 6.465482192203129e-07, - "logits/chosen": 0.705297589302063, - "logits/rejected": 0.7580437660217285, - "logps/chosen": -8.660966873168945, - "logps/rejected": -9.467788696289062, - "loss": 0.5043, + "logits/chosen": -0.05460857227444649, + "logits/rejected": 0.023105621337890625, + "logps/chosen": -3.786332368850708, + "logps/rejected": -4.6110429763793945, + "loss": 0.4909, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.660966873168945, - "rewards/margins": 0.8068218231201172, - "rewards/rejected": -9.467788696289062, - "semantic_entropy": 0.003117068437859416, + "rewards/chosen": -3.786332368850708, + "rewards/margins": 0.8247110247612, + "rewards/rejected": -4.6110429763793945, "step": 2605 }, { "epoch": 1.3968891118916207, - "grad_norm": 19.949668972130336, + "grad_norm": 23.38206236535951, "learning_rate": 6.45058504694559e-07, - "logits/chosen": 0.6838169097900391, - "logits/rejected": 0.7196789383888245, - "logps/chosen": -8.577276229858398, - "logps/rejected": -9.5936861038208, - "loss": 0.4727, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.577276229858398, - "rewards/margins": 1.01641047000885, - "rewards/rejected": -9.5936861038208, - "semantic_entropy": 0.003384160343557596, + "logits/chosen": -0.049968838691711426, + "logits/rejected": -0.0022939909249544144, + "logps/chosen": -3.90215802192688, + "logps/rejected": -4.76132345199585, + "loss": 0.502, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.90215802192688, + "rewards/margins": 0.8591658473014832, + "rewards/rejected": -4.76132345199585, "step": 2610 }, { "epoch": 1.3995651446730222, - "grad_norm": 23.363596001347148, + "grad_norm": 18.840268885604583, "learning_rate": 6.435673828105564e-07, - "logits/chosen": 0.6700709462165833, - "logits/rejected": 0.7268036007881165, - "logps/chosen": -8.639312744140625, - "logps/rejected": -9.69861888885498, - "loss": 0.4863, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.639312744140625, - "rewards/margins": 1.0593070983886719, - "rewards/rejected": -9.69861888885498, - "semantic_entropy": 0.003042886033654213, + "logits/chosen": -0.16323044896125793, + "logits/rejected": -0.015252292156219482, + "logps/chosen": -3.8421053886413574, + "logps/rejected": -4.7655816078186035, + "loss": 0.5105, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.8421053886413574, + "rewards/margins": 0.9234757423400879, + "rewards/rejected": -4.7655816078186035, "step": 2615 }, { "epoch": 1.402241177454424, - "grad_norm": 16.823068494049753, + "grad_norm": 14.522487169076495, "learning_rate": 6.420748680351763e-07, - "logits/chosen": 0.7304331064224243, - "logits/rejected": 0.7038922309875488, - "logps/chosen": -8.778966903686523, - "logps/rejected": -9.502559661865234, - "loss": 0.5525, + "logits/chosen": -0.07755996286869049, + "logits/rejected": -0.11836843192577362, + "logps/chosen": -3.9717350006103516, + "logps/rejected": -4.630441188812256, + "loss": 0.5649, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.778966903686523, - "rewards/margins": 0.7235932350158691, - "rewards/rejected": -9.502559661865234, - "semantic_entropy": 0.0028461969923228025, + "rewards/chosen": -3.9717350006103516, + "rewards/margins": 0.6587058305740356, + "rewards/rejected": -4.630441188812256, "step": 2620 }, { "epoch": 1.4049172102358254, - "grad_norm": 24.59681543759587, + "grad_norm": 22.28830015950564, "learning_rate": 6.405809748488032e-07, - "logits/chosen": 0.6792068481445312, - "logits/rejected": 0.7667452096939087, - "logps/chosen": -8.76710319519043, - "logps/rejected": -9.83530330657959, - "loss": 0.4983, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.76710319519043, - "rewards/margins": 1.0682008266448975, - "rewards/rejected": -9.83530330657959, - "semantic_entropy": 0.0029692454263567924, + "logits/chosen": -0.1548529714345932, + "logits/rejected": 0.002293407917022705, + "logps/chosen": -3.878488063812256, + "logps/rejected": -4.881241798400879, + "loss": 0.5014, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.878488063812256, + "rewards/margins": 1.0027529001235962, + "rewards/rejected": -4.881241798400879, "step": 2625 }, { "epoch": 1.4075932430172269, - "grad_norm": 19.14917858388336, + "grad_norm": 15.289975003619917, "learning_rate": 6.390857177451956e-07, - "logits/chosen": 0.5627522468566895, - "logits/rejected": 0.6816359758377075, - "logps/chosen": -8.73742389678955, - "logps/rejected": -9.567540168762207, - "loss": 0.5112, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.73742389678955, - "rewards/margins": 0.8301169276237488, - "rewards/rejected": -9.567540168762207, - "semantic_entropy": 0.0030936195980757475, + "logits/chosen": -0.2382952868938446, + "logits/rejected": -0.047243960201740265, + "logps/chosen": -4.022583484649658, + "logps/rejected": -4.875062465667725, + "loss": 0.4944, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.022583484649658, + "rewards/margins": 0.852479100227356, + "rewards/rejected": -4.875062465667725, "step": 2630 }, { "epoch": 1.4102692757986286, - "grad_norm": 17.47734554681335, + "grad_norm": 18.018907334987315, "learning_rate": 6.375891112313445e-07, - "logits/chosen": 0.6170838475227356, - "logits/rejected": 0.6742144823074341, - "logps/chosen": -8.985275268554688, - "logps/rejected": -9.946958541870117, - "loss": 0.4664, + "logits/chosen": -0.2297677993774414, + "logits/rejected": -0.11802766472101212, + "logps/chosen": -4.24521541595459, + "logps/rejected": -5.118771553039551, + "loss": 0.4961, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.985275268554688, - "rewards/margins": 0.9616818428039551, - "rewards/rejected": -9.946958541870117, - "semantic_entropy": 0.002239787485450506, + "rewards/chosen": -4.24521541595459, + "rewards/margins": 0.8735561370849609, + "rewards/rejected": -5.118771553039551, "step": 2635 }, { "epoch": 1.41294530858003, - "grad_norm": 18.087276533577075, + "grad_norm": 16.12761786217012, "learning_rate": 6.360911698273326e-07, - "logits/chosen": 0.6644759774208069, - "logits/rejected": 0.7190378904342651, - "logps/chosen": -9.075571060180664, - "logps/rejected": -9.81869888305664, - "loss": 0.5604, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -9.075571060180664, - "rewards/margins": 0.7431273460388184, - "rewards/rejected": -9.81869888305664, - "semantic_entropy": 0.002120462479069829, + "logits/chosen": -0.1403321921825409, + "logits/rejected": -0.008636483922600746, + "logps/chosen": -4.370502471923828, + "logps/rejected": -5.154816150665283, + "loss": 0.5331, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.370502471923828, + "rewards/margins": 0.7843137979507446, + "rewards/rejected": -5.154816150665283, "step": 2640 }, { "epoch": 1.4156213413614318, - "grad_norm": 17.443905406025888, + "grad_norm": 15.980521230604575, "learning_rate": 6.345919080661944e-07, - "logits/chosen": 0.6211899518966675, - "logits/rejected": 0.6783931851387024, - "logps/chosen": -8.705963134765625, - "logps/rejected": -9.728338241577148, - "loss": 0.4588, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.705963134765625, - "rewards/margins": 1.0223755836486816, - "rewards/rejected": -9.728338241577148, - "semantic_entropy": 0.0033381134271621704, + "logits/chosen": -0.1588279902935028, + "logits/rejected": -0.06751420348882675, + "logps/chosen": -4.013942241668701, + "logps/rejected": -5.0026535987854, + "loss": 0.4663, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.013942241668701, + "rewards/margins": 0.9887111783027649, + "rewards/rejected": -5.0026535987854, "step": 2645 }, { "epoch": 1.4182973741428333, - "grad_norm": 18.870609176206806, + "grad_norm": 16.55742994297145, "learning_rate": 6.330913404937737e-07, - "logits/chosen": 0.6599079370498657, - "logits/rejected": 0.736240565776825, - "logps/chosen": -8.820722579956055, - "logps/rejected": -9.812540054321289, - "loss": 0.49, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.820722579956055, - "rewards/margins": 0.9918166995048523, - "rewards/rejected": -9.812540054321289, - "semantic_entropy": 0.002813478233292699, + "logits/chosen": -0.21487262845039368, + "logits/rejected": -0.06771984696388245, + "logps/chosen": -4.20560884475708, + "logps/rejected": -5.279620170593262, + "loss": 0.4633, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.20560884475708, + "rewards/margins": 1.0740115642547607, + "rewards/rejected": -5.279620170593262, "step": 2650 }, { "epoch": 1.4209734069242348, - "grad_norm": 20.325954850202407, + "grad_norm": 18.85772663274055, "learning_rate": 6.315894816685838e-07, - "logits/chosen": 0.6205192804336548, - "logits/rejected": 0.6998498439788818, - "logps/chosen": -8.68531608581543, - "logps/rejected": -9.498059272766113, - "loss": 0.5063, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.68531608581543, - "rewards/margins": 0.8127420544624329, - "rewards/rejected": -9.498059272766113, - "semantic_entropy": 0.0029031294398009777, + "logits/chosen": -0.15051880478858948, + "logits/rejected": -0.0067515261471271515, + "logps/chosen": -4.13101053237915, + "logps/rejected": -5.050341606140137, + "loss": 0.4636, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.13101053237915, + "rewards/margins": 0.9193310737609863, + "rewards/rejected": -5.050341606140137, "step": 2655 }, { "epoch": 1.4236494397056365, - "grad_norm": 17.30912171919916, + "grad_norm": 16.132692534813696, "learning_rate": 6.300863461616657e-07, - "logits/chosen": 0.6297236680984497, - "logits/rejected": 0.6760424971580505, - "logps/chosen": -8.35009765625, - "logps/rejected": -9.166117668151855, - "loss": 0.5628, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.35009765625, - "rewards/margins": 0.816020131111145, - "rewards/rejected": -9.166117668151855, - "semantic_entropy": 0.003771452931687236, + "logits/chosen": -0.12720605731010437, + "logits/rejected": -0.03322717547416687, + "logps/chosen": -4.043786525726318, + "logps/rejected": -4.780965805053711, + "loss": 0.5772, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.043786525726318, + "rewards/margins": 0.7371792793273926, + "rewards/rejected": -4.780965805053711, "step": 2660 }, { "epoch": 1.426325472487038, - "grad_norm": 15.64812739081427, + "grad_norm": 12.882884837799947, "learning_rate": 6.285819485564465e-07, - "logits/chosen": 0.5272424817085266, - "logits/rejected": 0.5996197462081909, - "logps/chosen": -8.598016738891602, - "logps/rejected": -9.50200080871582, - "loss": 0.4918, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.598016738891602, - "rewards/margins": 0.9039848446846008, - "rewards/rejected": -9.50200080871582, - "semantic_entropy": 0.003393507096916437, + "logits/chosen": -0.2650359570980072, + "logits/rejected": -0.10435433685779572, + "logps/chosen": -4.332977294921875, + "logps/rejected": -5.244199752807617, + "loss": 0.4594, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.332977294921875, + "rewards/margins": 0.9112218022346497, + "rewards/rejected": -5.244199752807617, "step": 2665 }, { "epoch": 1.4290015052684395, - "grad_norm": 22.54029918814073, + "grad_norm": 17.909203800856503, "learning_rate": 6.270763034485986e-07, - "logits/chosen": 0.6191005706787109, - "logits/rejected": 0.6678223609924316, - "logps/chosen": -8.656926155090332, - "logps/rejected": -9.598608016967773, - "loss": 0.5089, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.656926155090332, - "rewards/margins": 0.9416826963424683, - "rewards/rejected": -9.598608016967773, - "semantic_entropy": 0.0036796010099351406, + "logits/chosen": -0.09289058297872543, + "logits/rejected": 0.007665800396353006, + "logps/chosen": -4.531368255615234, + "logps/rejected": -5.438704967498779, + "loss": 0.4941, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.531368255615234, + "rewards/margins": 0.9073368310928345, + "rewards/rejected": -5.438704967498779, "step": 2670 }, { "epoch": 1.4316775380498412, - "grad_norm": 38.66235795954998, + "grad_norm": 25.926101834020756, "learning_rate": 6.255694254458972e-07, - "logits/chosen": 0.5672577619552612, - "logits/rejected": 0.6477295160293579, - "logps/chosen": -8.749332427978516, - "logps/rejected": -9.726335525512695, - "loss": 0.5089, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.749332427978516, - "rewards/margins": 0.9770025014877319, - "rewards/rejected": -9.726335525512695, - "semantic_entropy": 0.003146649803966284, + "logits/chosen": -0.14800944924354553, + "logits/rejected": 0.017013153061270714, + "logps/chosen": -4.503047943115234, + "logps/rejected": -5.369532108306885, + "loss": 0.5534, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.503047943115234, + "rewards/margins": 0.8664838671684265, + "rewards/rejected": -5.369532108306885, "step": 2675 }, { "epoch": 1.4343535708312427, - "grad_norm": 24.587723329790435, + "grad_norm": 26.865499014276942, "learning_rate": 6.240613291680795e-07, - "logits/chosen": 0.532563328742981, - "logits/rejected": 0.6375452280044556, - "logps/chosen": -8.473979949951172, - "logps/rejected": -9.374165534973145, - "loss": 0.5416, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.473979949951172, - "rewards/margins": 0.9001848101615906, - "rewards/rejected": -9.374165534973145, - "semantic_entropy": 0.004033363424241543, + "logits/chosen": -0.2220575362443924, + "logits/rejected": -0.039562225341796875, + "logps/chosen": -4.085072040557861, + "logps/rejected": -4.88592529296875, + "loss": 0.5594, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.085072040557861, + "rewards/margins": 0.8008529543876648, + "rewards/rejected": -4.88592529296875, "step": 2680 }, { "epoch": 1.4370296036126442, - "grad_norm": 17.100229454274775, + "grad_norm": 14.623768170528443, "learning_rate": 6.225520292467021e-07, - "logits/chosen": 0.5713559985160828, - "logits/rejected": 0.6829615831375122, - "logps/chosen": -8.452000617980957, - "logps/rejected": -9.610584259033203, - "loss": 0.4244, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -8.452000617980957, - "rewards/margins": 1.1585838794708252, - "rewards/rejected": -9.610584259033203, - "semantic_entropy": 0.0038808733224868774, + "logits/chosen": -0.2240525484085083, + "logits/rejected": 0.043808113783597946, + "logps/chosen": -3.905118942260742, + "logps/rejected": -4.946297645568848, + "loss": 0.4464, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.905118942260742, + "rewards/margins": 1.0411783456802368, + "rewards/rejected": -4.946297645568848, "step": 2685 }, { "epoch": 1.439705636394046, - "grad_norm": 22.766099328641907, + "grad_norm": 23.96577537202541, "learning_rate": 6.210415403249993e-07, - "logits/chosen": 0.5507108569145203, - "logits/rejected": 0.6832348704338074, - "logps/chosen": -8.426264762878418, - "logps/rejected": -9.474390029907227, - "loss": 0.49, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.426264762878418, - "rewards/margins": 1.0481255054473877, - "rewards/rejected": -9.474390029907227, - "semantic_entropy": 0.004706330597400665, + "logits/chosen": -0.3126766085624695, + "logits/rejected": -0.03334660455584526, + "logps/chosen": -3.910989761352539, + "logps/rejected": -4.82614803314209, + "loss": 0.5248, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.910989761352539, + "rewards/margins": 0.9151586294174194, + "rewards/rejected": -4.82614803314209, "step": 2690 }, { "epoch": 1.4423816691754474, - "grad_norm": 21.28132833379913, + "grad_norm": 18.139502639513744, "learning_rate": 6.195298770577415e-07, - "logits/chosen": 0.6715400815010071, - "logits/rejected": 0.6841408610343933, - "logps/chosen": -8.57282829284668, - "logps/rejected": -9.571008682250977, - "loss": 0.5092, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.57282829284668, - "rewards/margins": 0.9981800317764282, - "rewards/rejected": -9.571008682250977, - "semantic_entropy": 0.0033726401161402464, + "logits/chosen": -0.08792294561862946, + "logits/rejected": -0.08294588327407837, + "logps/chosen": -3.9402127265930176, + "logps/rejected": -4.816552639007568, + "loss": 0.5174, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.9402127265930176, + "rewards/margins": 0.876340389251709, + "rewards/rejected": -4.816552639007568, "step": 2695 }, { "epoch": 1.445057701956849, - "grad_norm": 17.178979708018453, + "grad_norm": 12.958184378106308, "learning_rate": 6.180170541110923e-07, - "logits/chosen": 0.644763708114624, - "logits/rejected": 0.746247410774231, - "logps/chosen": -8.67873477935791, - "logps/rejected": -9.667515754699707, - "loss": 0.4689, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.67873477935791, - "rewards/margins": 0.9887820482254028, - "rewards/rejected": -9.667515754699707, - "semantic_entropy": 0.0032643512822687626, + "logits/chosen": -0.17743578553199768, + "logits/rejected": 0.009365704841911793, + "logps/chosen": -4.034564971923828, + "logps/rejected": -4.933711528778076, + "loss": 0.495, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.034564971923828, + "rewards/margins": 0.8991463780403137, + "rewards/rejected": -4.933711528778076, "step": 2700 }, { "epoch": 1.4477337347382506, - "grad_norm": 18.421052634418867, + "grad_norm": 13.423208522802895, "learning_rate": 6.165030861624663e-07, - "logits/chosen": 0.5887877345085144, - "logits/rejected": 0.7074635624885559, - "logps/chosen": -8.820067405700684, - "logps/rejected": -10.023954391479492, - "loss": 0.4421, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.820067405700684, - "rewards/margins": 1.2038882970809937, - "rewards/rejected": -10.023954391479492, - "semantic_entropy": 0.002584748435765505, + "logits/chosen": -0.26237523555755615, + "logits/rejected": -0.023884903639554977, + "logps/chosen": -4.09101676940918, + "logps/rejected": -5.211627960205078, + "loss": 0.4343, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.09101676940918, + "rewards/margins": 1.1206114292144775, + "rewards/rejected": -5.211627960205078, "step": 2705 }, { "epoch": 1.4504097675196521, - "grad_norm": 18.781234298542874, + "grad_norm": 16.342819743396124, "learning_rate": 6.149879879003876e-07, - "logits/chosen": 0.7198264598846436, - "logits/rejected": 0.7411429286003113, - "logps/chosen": -8.748977661132812, - "logps/rejected": -9.773608207702637, - "loss": 0.4872, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.748977661132812, - "rewards/margins": 1.0246312618255615, - "rewards/rejected": -9.773608207702637, - "semantic_entropy": 0.003493456868454814, + "logits/chosen": -0.1304188072681427, + "logits/rejected": -0.11706867069005966, + "logps/chosen": -4.046146392822266, + "logps/rejected": -4.988315105438232, + "loss": 0.4757, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.046146392822266, + "rewards/margins": 0.9421690106391907, + "rewards/rejected": -4.988315105438232, "step": 2710 }, { "epoch": 1.4530858003010536, - "grad_norm": 16.626832554859085, + "grad_norm": 13.23129375271714, "learning_rate": 6.13471774024346e-07, - "logits/chosen": 0.6111767292022705, - "logits/rejected": 0.6929227113723755, - "logps/chosen": -8.618370056152344, - "logps/rejected": -9.645769119262695, - "loss": 0.4634, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.618370056152344, - "rewards/margins": 1.0273983478546143, - "rewards/rejected": -9.645769119262695, - "semantic_entropy": 0.003336191177368164, + "logits/chosen": -0.28209811449050903, + "logits/rejected": -0.14693889021873474, + "logps/chosen": -4.028688430786133, + "logps/rejected": -4.9088454246521, + "loss": 0.4855, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.028688430786133, + "rewards/margins": 0.8801568746566772, + "rewards/rejected": -4.9088454246521, "step": 2715 }, { "epoch": 1.4557618330824553, - "grad_norm": 15.275820526471591, + "grad_norm": 14.046738829331373, "learning_rate": 6.119544592446551e-07, - "logits/chosen": 0.6123021841049194, - "logits/rejected": 0.6909358501434326, - "logps/chosen": -8.73341178894043, - "logps/rejected": -9.467869758605957, - "loss": 0.5598, - "rewards/accuracies": 0.6875, - "rewards/chosen": -8.73341178894043, - "rewards/margins": 0.7344561815261841, - "rewards/rejected": -9.467869758605957, - "semantic_entropy": 0.0026931720785796642, + "logits/chosen": -0.24227353930473328, + "logits/rejected": -0.10609109699726105, + "logps/chosen": -4.236247539520264, + "logps/rejected": -5.029049396514893, + "loss": 0.5129, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.236247539520264, + "rewards/margins": 0.7928018569946289, + "rewards/rejected": -5.029049396514893, "step": 2720 }, { "epoch": 1.4584378658638568, - "grad_norm": 22.157104502252846, + "grad_norm": 19.904370240114492, "learning_rate": 6.104360582823096e-07, - "logits/chosen": 0.7188630104064941, - "logits/rejected": 0.7658997774124146, - "logps/chosen": -8.619566917419434, - "logps/rejected": -9.543168067932129, - "loss": 0.4784, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.619566917419434, - "rewards/margins": 0.9236003160476685, - "rewards/rejected": -9.543168067932129, - "semantic_entropy": 0.003137335879728198, + "logits/chosen": -0.18201594054698944, + "logits/rejected": -0.0855020210146904, + "logps/chosen": -4.132702827453613, + "logps/rejected": -4.958357810974121, + "loss": 0.5056, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.132702827453613, + "rewards/margins": 0.8256546258926392, + "rewards/rejected": -4.958357810974121, "step": 2725 }, { "epoch": 1.4611138986452583, - "grad_norm": 20.352841774469262, + "grad_norm": 18.14453504650319, "learning_rate": 6.089165858688423e-07, - "logits/chosen": 0.6846107244491577, - "logits/rejected": 0.7849973440170288, - "logps/chosen": -8.482339859008789, - "logps/rejected": -9.482155799865723, - "loss": 0.5162, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.482339859008789, - "rewards/margins": 0.9998153448104858, - "rewards/rejected": -9.482155799865723, - "semantic_entropy": 0.0033546772319823503, + "logits/chosen": -0.17252039909362793, + "logits/rejected": 0.024833858013153076, + "logps/chosen": -4.125124931335449, + "logps/rejected": -5.089757442474365, + "loss": 0.5062, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.125124931335449, + "rewards/margins": 0.9646331071853638, + "rewards/rejected": -5.089757442474365, "step": 2730 }, { "epoch": 1.46378993142666, - "grad_norm": 12.410524435742113, + "grad_norm": 11.291826335425242, "learning_rate": 6.073960567461811e-07, - "logits/chosen": 0.7148826718330383, - "logits/rejected": 0.8275319933891296, - "logps/chosen": -8.265142440795898, - "logps/rejected": -9.430082321166992, - "loss": 0.417, + "logits/chosen": -0.2030678689479828, + "logits/rejected": -0.00813041627407074, + "logps/chosen": -3.82664155960083, + "logps/rejected": -4.876031398773193, + "loss": 0.4311, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.265142440795898, - "rewards/margins": 1.1649402379989624, - "rewards/rejected": -9.430082321166992, - "semantic_entropy": 0.0043460773304104805, + "rewards/chosen": -3.82664155960083, + "rewards/margins": 1.049390196800232, + "rewards/rejected": -4.876031398773193, "step": 2735 }, { "epoch": 1.4664659642080615, - "grad_norm": 19.113883533650075, + "grad_norm": 14.384423068684766, "learning_rate": 6.058744856665065e-07, - "logits/chosen": 0.63080894947052, - "logits/rejected": 0.6792045831680298, - "logps/chosen": -8.33402156829834, - "logps/rejected": -9.474609375, - "loss": 0.4612, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.33402156829834, - "rewards/margins": 1.140586256980896, - "rewards/rejected": -9.474609375, - "semantic_entropy": 0.0038649775087833405, + "logits/chosen": -0.1970929056406021, + "logits/rejected": -0.07971971482038498, + "logps/chosen": -4.0747551918029785, + "logps/rejected": -5.077336311340332, + "loss": 0.4713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.0747551918029785, + "rewards/margins": 1.0025817155838013, + "rewards/rejected": -5.077336311340332, "step": 2740 }, { "epoch": 1.469141996989463, - "grad_norm": 19.573081780536924, + "grad_norm": 19.101808732166063, "learning_rate": 6.043518873921074e-07, - "logits/chosen": 0.6687484979629517, - "logits/rejected": 0.7621025443077087, - "logps/chosen": -8.205583572387695, - "logps/rejected": -9.156460762023926, - "loss": 0.4883, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.205583572387695, - "rewards/margins": 0.9508770108222961, - "rewards/rejected": -9.156460762023926, - "semantic_entropy": 0.004209198523312807, + "logits/chosen": -0.21917016804218292, + "logits/rejected": -0.0444403775036335, + "logps/chosen": -4.0646562576293945, + "logps/rejected": -4.963902950286865, + "loss": 0.4614, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.0646562576293945, + "rewards/margins": 0.8992471694946289, + "rewards/rejected": -4.963902950286865, "step": 2745 }, { "epoch": 1.4718180297708647, - "grad_norm": 19.22285858261932, + "grad_norm": 24.32436925339396, "learning_rate": 6.028282766952393e-07, - "logits/chosen": 0.6872994303703308, - "logits/rejected": 0.7447125315666199, - "logps/chosen": -8.262472152709961, - "logps/rejected": -9.375197410583496, - "loss": 0.4668, - "rewards/accuracies": 0.8125, - "rewards/chosen": -8.262472152709961, - "rewards/margins": 1.1127252578735352, - "rewards/rejected": -9.375197410583496, - "semantic_entropy": 0.004294519778341055, + "logits/chosen": -0.1193154901266098, + "logits/rejected": 0.005037794820964336, + "logps/chosen": -4.153010368347168, + "logps/rejected": -5.114907264709473, + "loss": 0.4768, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.153010368347168, + "rewards/margins": 0.961897075176239, + "rewards/rejected": -5.114907264709473, "step": 2750 }, { "epoch": 1.4744940625522662, - "grad_norm": 28.965956806202943, + "grad_norm": 26.770451685953176, "learning_rate": 6.013036683579798e-07, - "logits/chosen": 0.7001906633377075, - "logits/rejected": 0.7653275728225708, - "logps/chosen": -8.254480361938477, - "logps/rejected": -9.233253479003906, - "loss": 0.5039, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.254480361938477, - "rewards/margins": 0.9787724614143372, - "rewards/rejected": -9.233253479003906, - "semantic_entropy": 0.00447105010971427, + "logits/chosen": -0.11503756046295166, + "logits/rejected": 0.02100886031985283, + "logps/chosen": -4.14285945892334, + "logps/rejected": -5.1375885009765625, + "loss": 0.4733, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.14285945892334, + "rewards/margins": 0.9947288632392883, + "rewards/rejected": -5.1375885009765625, "step": 2755 }, { "epoch": 1.4771700953336677, - "grad_norm": 19.46901251194481, + "grad_norm": 16.529007615908103, "learning_rate": 5.997780771720854e-07, - "logits/chosen": 0.6296931505203247, - "logits/rejected": 0.7145162224769592, - "logps/chosen": -8.382627487182617, - "logps/rejected": -9.445598602294922, - "loss": 0.4638, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.382627487182617, - "rewards/margins": 1.0629713535308838, - "rewards/rejected": -9.445598602294922, - "semantic_entropy": 0.004158531315624714, + "logits/chosen": -0.2039274275302887, + "logits/rejected": 0.0059952931478619576, + "logps/chosen": -4.310267448425293, + "logps/rejected": -5.279348373413086, + "loss": 0.4739, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.310267448425293, + "rewards/margins": 0.9690812230110168, + "rewards/rejected": -5.279348373413086, "step": 2760 }, { "epoch": 1.4798461281150694, - "grad_norm": 19.53223501670479, + "grad_norm": 18.56132573520242, "learning_rate": 5.982515179388486e-07, - "logits/chosen": 0.7034457325935364, - "logits/rejected": 0.7711877226829529, - "logps/chosen": -8.465707778930664, - "logps/rejected": -9.426530838012695, - "loss": 0.5123, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.465707778930664, - "rewards/margins": 0.9608221054077148, - "rewards/rejected": -9.426530838012695, - "semantic_entropy": 0.003798137651756406, + "logits/chosen": -0.09212259203195572, + "logits/rejected": 0.04316640645265579, + "logps/chosen": -4.247354984283447, + "logps/rejected": -5.117578029632568, + "loss": 0.513, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.247354984283447, + "rewards/margins": 0.8702225685119629, + "rewards/rejected": -5.117578029632568, "step": 2765 }, { "epoch": 1.482522160896471, - "grad_norm": 14.100323670272886, + "grad_norm": 15.194335679561014, "learning_rate": 5.967240054689541e-07, - "logits/chosen": 0.6083649396896362, - "logits/rejected": 0.6571983098983765, - "logps/chosen": -8.479659080505371, - "logps/rejected": -9.488851547241211, - "loss": 0.4886, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.479659080505371, - "rewards/margins": 1.0091919898986816, - "rewards/rejected": -9.488851547241211, - "semantic_entropy": 0.0037951588165014982, + "logits/chosen": -0.19906362891197205, + "logits/rejected": -0.11630463600158691, + "logps/chosen": -4.503647804260254, + "logps/rejected": -5.335875034332275, + "loss": 0.5397, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.503647804260254, + "rewards/margins": 0.8322272300720215, + "rewards/rejected": -5.335875034332275, "step": 2770 }, { "epoch": 1.4851981936778724, - "grad_norm": 19.799070365162166, + "grad_norm": 19.33573735843689, "learning_rate": 5.951955545823342e-07, - "logits/chosen": 0.6102844476699829, - "logits/rejected": 0.6613792181015015, - "logps/chosen": -8.798731803894043, - "logps/rejected": -9.752110481262207, - "loss": 0.5194, + "logits/chosen": -0.13369004428386688, + "logits/rejected": -0.047192804515361786, + "logps/chosen": -4.645261764526367, + "logps/rejected": -5.574832439422607, + "loss": 0.5236, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.798731803894043, - "rewards/margins": 0.9533787965774536, - "rewards/rejected": -9.752110481262207, - "semantic_entropy": 0.003143253503367305, + "rewards/chosen": -4.645261764526367, + "rewards/margins": 0.9295710325241089, + "rewards/rejected": -5.574832439422607, "step": 2775 }, { "epoch": 1.4878742264592741, - "grad_norm": 17.219558141254097, + "grad_norm": 14.046585247255592, "learning_rate": 5.936661801080263e-07, - "logits/chosen": 0.5687362551689148, - "logits/rejected": 0.634920597076416, - "logps/chosen": -8.662237167358398, - "logps/rejected": -9.508859634399414, - "loss": 0.5463, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -8.662237167358398, - "rewards/margins": 0.8466218709945679, - "rewards/rejected": -9.508859634399414, - "semantic_entropy": 0.0033071953803300858, + "logits/chosen": -0.13667592406272888, + "logits/rejected": -0.00916269700974226, + "logps/chosen": -4.604886054992676, + "logps/rejected": -5.3852715492248535, + "loss": 0.562, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.604886054992676, + "rewards/margins": 0.7803853750228882, + "rewards/rejected": -5.3852715492248535, "step": 2780 }, { "epoch": 1.4905502592406756, - "grad_norm": 20.01873928855486, + "grad_norm": 12.966098862370366, "learning_rate": 5.92135896884028e-07, - "logits/chosen": 0.6002562642097473, - "logits/rejected": 0.696466326713562, - "logps/chosen": -8.673624992370605, - "logps/rejected": -9.823293685913086, - "loss": 0.455, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.673624992370605, - "rewards/margins": 1.1496690511703491, - "rewards/rejected": -9.823293685913086, - "semantic_entropy": 0.003196306060999632, + "logits/chosen": -0.21844227612018585, + "logits/rejected": -0.05417442321777344, + "logps/chosen": -4.419846057891846, + "logps/rejected": -5.375156879425049, + "loss": 0.486, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.419846057891846, + "rewards/margins": 0.9553111791610718, + "rewards/rejected": -5.375156879425049, "step": 2785 }, { "epoch": 1.4932262920220774, - "grad_norm": 30.437982634244353, + "grad_norm": 20.519633351655134, "learning_rate": 5.906047197571541e-07, - "logits/chosen": 0.5805534720420837, - "logits/rejected": 0.5821332931518555, - "logps/chosen": -8.47557258605957, - "logps/rejected": -9.357189178466797, - "loss": 0.5343, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.47557258605957, - "rewards/margins": 0.8816182017326355, - "rewards/rejected": -9.357189178466797, - "semantic_entropy": 0.0038487245328724384, + "logits/chosen": -0.14281408488750458, + "logits/rejected": -0.16088511049747467, + "logps/chosen": -4.128978729248047, + "logps/rejected": -4.905943870544434, + "loss": 0.5317, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.128978729248047, + "rewards/margins": 0.7769646644592285, + "rewards/rejected": -4.905943870544434, "step": 2790 }, { "epoch": 1.4959023248034788, - "grad_norm": 14.831624104584504, + "grad_norm": 13.218552229359782, "learning_rate": 5.890726635828919e-07, - "logits/chosen": 0.5990924835205078, - "logits/rejected": 0.6138975024223328, - "logps/chosen": -8.312957763671875, - "logps/rejected": -9.31361198425293, - "loss": 0.5031, + "logits/chosen": -0.08836929500102997, + "logits/rejected": -0.09392032027244568, + "logps/chosen": -3.944596767425537, + "logps/rejected": -4.9046220779418945, + "loss": 0.4989, "rewards/accuracies": 0.75, - "rewards/chosen": -8.312957763671875, - "rewards/margins": 1.0006548166275024, - "rewards/rejected": -9.31361198425293, - "semantic_entropy": 0.004499537404626608, + "rewards/chosen": -3.944596767425537, + "rewards/margins": 0.9600253105163574, + "rewards/rejected": -4.9046220779418945, "step": 2795 }, { "epoch": 1.4985783575848803, - "grad_norm": 21.33910911582425, + "grad_norm": 19.36723916074196, "learning_rate": 5.875397432252569e-07, - "logits/chosen": 0.5481540560722351, - "logits/rejected": 0.6002416610717773, - "logps/chosen": -8.367044448852539, - "logps/rejected": -9.348922729492188, - "loss": 0.4879, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.367044448852539, - "rewards/margins": 0.9818779230117798, - "rewards/rejected": -9.348922729492188, - "semantic_entropy": 0.004131897818297148, + "logits/chosen": -0.17327983677387238, + "logits/rejected": -0.1026223674416542, + "logps/chosen": -3.93298077583313, + "logps/rejected": -4.787880897521973, + "loss": 0.518, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.93298077583313, + "rewards/margins": 0.8549000024795532, + "rewards/rejected": -4.787880897521973, "step": 2800 }, { "epoch": 1.4985783575848803, - "eval_logits/chosen": 0.6895690560340881, - "eval_logits/rejected": 0.749624490737915, - "eval_logps/chosen": -8.626724243164062, - "eval_logps/rejected": -9.53298282623291, - "eval_loss": 0.5264463424682617, - "eval_rewards/accuracies": 0.7218101024627686, - "eval_rewards/chosen": -8.626724243164062, - "eval_rewards/margins": 0.9062579870223999, - "eval_rewards/rejected": -9.53298282623291, - "eval_runtime": 35.1374, - "eval_samples_per_second": 38.278, - "eval_semantic_entropy": 0.0033146331552416086, - "eval_steps_per_second": 9.591, + "eval_logits/chosen": 0.21856442093849182, + "eval_logits/rejected": 0.33184969425201416, + "eval_logps/chosen": -4.243931293487549, + "eval_logps/rejected": -5.142279624938965, + "eval_loss": 0.5189453959465027, + "eval_rewards/accuracies": 0.7351632118225098, + "eval_rewards/chosen": -4.243931293487549, + "eval_rewards/margins": 0.8983485102653503, + "eval_rewards/rejected": -5.142279624938965, + "eval_runtime": 40.4782, + "eval_samples_per_second": 33.228, + "eval_steps_per_second": 8.325, "step": 2800 }, { "epoch": 1.5012543903662818, - "grad_norm": 15.846955607641894, + "grad_norm": 12.089546365672936, "learning_rate": 5.860059735566491e-07, - "logits/chosen": 0.4758935868740082, - "logits/rejected": 0.5631710290908813, - "logps/chosen": -8.500029563903809, - "logps/rejected": -9.449440002441406, - "loss": 0.5011, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.500029563903809, - "rewards/margins": 0.9494104385375977, - "rewards/rejected": -9.449440002441406, - "semantic_entropy": 0.004371006041765213, + "logits/chosen": -0.28701701760292053, + "logits/rejected": -0.10909227281808853, + "logps/chosen": -4.066670894622803, + "logps/rejected": -4.952474117279053, + "loss": 0.5021, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.066670894622803, + "rewards/margins": 0.88580322265625, + "rewards/rejected": -4.952474117279053, "step": 2805 }, { "epoch": 1.5039304231476835, - "grad_norm": 24.705548492382672, + "grad_norm": 17.55398786795427, "learning_rate": 5.844713694577087e-07, - "logits/chosen": 0.5791555643081665, - "logits/rejected": 0.6237837672233582, - "logps/chosen": -8.626651763916016, - "logps/rejected": -9.473905563354492, - "loss": 0.5144, + "logits/chosen": -0.18107187747955322, + "logits/rejected": -0.07903929799795151, + "logps/chosen": -4.007254600524902, + "logps/rejected": -4.912121295928955, + "loss": 0.4902, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.626651763916016, - "rewards/margins": 0.8472524881362915, - "rewards/rejected": -9.473905563354492, - "semantic_entropy": 0.003329088445752859, + "rewards/chosen": -4.007254600524902, + "rewards/margins": 0.9048662185668945, + "rewards/rejected": -4.912121295928955, "step": 2810 }, { "epoch": 1.5066064559290853, - "grad_norm": 14.693823746449954, + "grad_norm": 12.352426455973694, "learning_rate": 5.829359458171714e-07, - "logits/chosen": 0.5436751842498779, - "logits/rejected": 0.5992386341094971, - "logps/chosen": -8.613186836242676, - "logps/rejected": -9.71760368347168, - "loss": 0.4307, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -8.613186836242676, - "rewards/margins": 1.1044175624847412, - "rewards/rejected": -9.71760368347168, - "semantic_entropy": 0.003140996443107724, + "logits/chosen": -0.12731540203094482, + "logits/rejected": -0.005612348672002554, + "logps/chosen": -4.040764808654785, + "logps/rejected": -5.169407844543457, + "loss": 0.4127, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.040764808654785, + "rewards/margins": 1.1286427974700928, + "rewards/rejected": -5.169407844543457, "step": 2815 }, { "epoch": 1.5092824887104868, - "grad_norm": 15.499555019749941, + "grad_norm": 15.22196173080677, "learning_rate": 5.81399717531724e-07, - "logits/chosen": 0.5179445147514343, - "logits/rejected": 0.6111316084861755, - "logps/chosen": -8.66978645324707, - "logps/rejected": -9.439632415771484, - "loss": 0.5827, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.66978645324707, - "rewards/margins": 0.7698466777801514, - "rewards/rejected": -9.439632415771484, - "semantic_entropy": 0.0032921708188951015, + "logits/chosen": -0.22254343330860138, + "logits/rejected": -0.009092235937714577, + "logps/chosen": -4.34829568862915, + "logps/rejected": -5.156822204589844, + "loss": 0.5482, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.34829568862915, + "rewards/margins": 0.8085265159606934, + "rewards/rejected": -5.156822204589844, "step": 2820 }, { "epoch": 1.5119585214918883, - "grad_norm": 16.093368752669864, + "grad_norm": 14.820359785440933, "learning_rate": 5.798626995058602e-07, - "logits/chosen": 0.5145665407180786, - "logits/rejected": 0.6263571977615356, - "logps/chosen": -8.70081901550293, - "logps/rejected": -9.660847663879395, - "loss": 0.4992, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.70081901550293, - "rewards/margins": 0.9600294232368469, - "rewards/rejected": -9.660847663879395, - "semantic_entropy": 0.002892556134611368, + "logits/chosen": -0.2766607403755188, + "logits/rejected": -0.0658869668841362, + "logps/chosen": -4.2642364501953125, + "logps/rejected": -5.150816440582275, + "loss": 0.5033, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.2642364501953125, + "rewards/margins": 0.8865801095962524, + "rewards/rejected": -5.150816440582275, "step": 2825 }, { "epoch": 1.51463455427329, - "grad_norm": 13.79559161731438, + "grad_norm": 15.517342473511313, "learning_rate": 5.783249066517354e-07, - "logits/chosen": 0.5084502696990967, - "logits/rejected": 0.5790830850601196, - "logps/chosen": -8.357169151306152, - "logps/rejected": -9.426858901977539, - "loss": 0.4507, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.357169151306152, - "rewards/margins": 1.0696887969970703, - "rewards/rejected": -9.426858901977539, - "semantic_entropy": 0.0035817469470202923, + "logits/chosen": -0.20810966193675995, + "logits/rejected": -0.07054269313812256, + "logps/chosen": -3.993712902069092, + "logps/rejected": -4.975928783416748, + "loss": 0.4683, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.993712902069092, + "rewards/margins": 0.9822158813476562, + "rewards/rejected": -4.975928783416748, "step": 2830 }, { "epoch": 1.5173105870546915, - "grad_norm": 17.910427446812193, + "grad_norm": 24.800555957021622, "learning_rate": 5.767863538890228e-07, - "logits/chosen": 0.5757554769515991, - "logits/rejected": 0.6622332334518433, - "logps/chosen": -8.620783805847168, - "logps/rejected": -9.76025390625, - "loss": 0.4271, - "rewards/accuracies": 0.8125, - "rewards/chosen": -8.620783805847168, - "rewards/margins": 1.1394703388214111, - "rewards/rejected": -9.76025390625, - "semantic_entropy": 0.0031910459510982037, + "logits/chosen": -0.2221885472536087, + "logits/rejected": -0.06265170872211456, + "logps/chosen": -4.191830635070801, + "logps/rejected": -5.305400848388672, + "loss": 0.4373, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.191830635070801, + "rewards/margins": 1.113569974899292, + "rewards/rejected": -5.305400848388672, "step": 2835 }, { "epoch": 1.519986619836093, - "grad_norm": 17.705706566190077, + "grad_norm": 18.955169904961647, "learning_rate": 5.75247056144768e-07, - "logits/chosen": 0.5833605527877808, - "logits/rejected": 0.6117344498634338, - "logps/chosen": -8.490338325500488, - "logps/rejected": -9.415372848510742, - "loss": 0.5481, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.490338325500488, - "rewards/margins": 0.925035834312439, - "rewards/rejected": -9.415372848510742, - "semantic_entropy": 0.0037457395810633898, + "logits/chosen": -0.15927252173423767, + "logits/rejected": -0.08590450137853622, + "logps/chosen": -4.189095973968506, + "logps/rejected": -4.989994049072266, + "loss": 0.5548, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.189095973968506, + "rewards/margins": 0.800898551940918, + "rewards/rejected": -4.989994049072266, "step": 2840 }, { "epoch": 1.5226626526174947, - "grad_norm": 19.220022114950083, + "grad_norm": 17.05940455396654, "learning_rate": 5.737070283532444e-07, - "logits/chosen": 0.6395395994186401, - "logits/rejected": 0.6757252812385559, - "logps/chosen": -8.574124336242676, - "logps/rejected": -9.431645393371582, - "loss": 0.5835, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.574124336242676, - "rewards/margins": 0.8575227856636047, - "rewards/rejected": -9.431645393371582, - "semantic_entropy": 0.0034476309083402157, + "logits/chosen": -0.1411527395248413, + "logits/rejected": -0.05970393866300583, + "logps/chosen": -4.173077583312988, + "logps/rejected": -5.009397506713867, + "loss": 0.5847, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.173077583312988, + "rewards/margins": 0.8363205194473267, + "rewards/rejected": -5.009397506713867, "step": 2845 }, { "epoch": 1.5253386853988962, - "grad_norm": 16.469084913870834, + "grad_norm": 18.26614899573096, "learning_rate": 5.721662854558084e-07, - "logits/chosen": 0.5754357576370239, - "logits/rejected": 0.6329125165939331, - "logps/chosen": -8.597195625305176, - "logps/rejected": -9.660018920898438, - "loss": 0.4696, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.597195625305176, - "rewards/margins": 1.0628234148025513, - "rewards/rejected": -9.660018920898438, - "semantic_entropy": 0.003124454291537404, + "logits/chosen": -0.2171013057231903, + "logits/rejected": -0.10839462280273438, + "logps/chosen": -4.178244590759277, + "logps/rejected": -5.187560558319092, + "loss": 0.4555, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.178244590759277, + "rewards/margins": 1.0093164443969727, + "rewards/rejected": -5.187560558319092, "step": 2850 }, { "epoch": 1.5280147181802977, - "grad_norm": 18.533405817853968, + "grad_norm": 13.907642022002634, "learning_rate": 5.706248424007545e-07, - "logits/chosen": 0.4836948812007904, - "logits/rejected": 0.5998759269714355, - "logps/chosen": -8.543926239013672, - "logps/rejected": -9.415602684020996, - "loss": 0.5262, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.543926239013672, - "rewards/margins": 0.871677577495575, - "rewards/rejected": -9.415602684020996, - "semantic_entropy": 0.0034175370819866657, + "logits/chosen": -0.22572903335094452, + "logits/rejected": -0.036730024963617325, + "logps/chosen": -4.441842555999756, + "logps/rejected": -5.290661334991455, + "loss": 0.5243, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.441842555999756, + "rewards/margins": 0.8488187789916992, + "rewards/rejected": -5.290661334991455, "step": 2855 }, { "epoch": 1.5306907509616994, - "grad_norm": 16.66857458253813, + "grad_norm": 16.644227450663575, "learning_rate": 5.690827141431699e-07, - "logits/chosen": 0.5200189352035522, - "logits/rejected": 0.6539384126663208, - "logps/chosen": -8.513689041137695, - "logps/rejected": -9.264158248901367, - "loss": 0.5351, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.513689041137695, - "rewards/margins": 0.7504681348800659, - "rewards/rejected": -9.264158248901367, - "semantic_entropy": 0.00305316224694252, + "logits/chosen": -0.25977617502212524, + "logits/rejected": -0.04683301970362663, + "logps/chosen": -4.101646423339844, + "logps/rejected": -4.958111763000488, + "loss": 0.4837, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.101646423339844, + "rewards/margins": 0.8564653396606445, + "rewards/rejected": -4.958111763000488, "step": 2860 }, { "epoch": 1.5333667837431009, - "grad_norm": 21.309556253730577, + "grad_norm": 22.024176729768243, "learning_rate": 5.675399156447897e-07, - "logits/chosen": 0.5738528370857239, - "logits/rejected": 0.6424544453620911, - "logps/chosen": -8.373230934143066, - "logps/rejected": -9.177302360534668, - "loss": 0.5274, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.373230934143066, - "rewards/margins": 0.8040705919265747, - "rewards/rejected": -9.177302360534668, - "semantic_entropy": 0.0038657269906252623, + "logits/chosen": -0.24323506653308868, + "logits/rejected": -0.09325195848941803, + "logps/chosen": -4.163077354431152, + "logps/rejected": -4.862407684326172, + "loss": 0.5602, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.163077354431152, + "rewards/margins": 0.6993306875228882, + "rewards/rejected": -4.862407684326172, "step": 2865 }, { "epoch": 1.5360428165245024, - "grad_norm": 18.48692651527053, + "grad_norm": 17.82375297034397, "learning_rate": 5.659964618738515e-07, - "logits/chosen": 0.5925968289375305, - "logits/rejected": 0.6500064730644226, - "logps/chosen": -8.42739486694336, - "logps/rejected": -9.342714309692383, - "loss": 0.524, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.42739486694336, - "rewards/margins": 0.9153194427490234, - "rewards/rejected": -9.342714309692383, - "semantic_entropy": 0.0032528643496334553, + "logits/chosen": -0.18338052928447723, + "logits/rejected": -0.051286481320858, + "logps/chosen": -4.1060333251953125, + "logps/rejected": -4.907165050506592, + "loss": 0.5353, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.1060333251953125, + "rewards/margins": 0.8011317253112793, + "rewards/rejected": -4.907165050506592, "step": 2870 }, { "epoch": 1.538718849305904, - "grad_norm": 18.694567668937122, + "grad_norm": 19.347428281132334, "learning_rate": 5.644523678049509e-07, - "logits/chosen": 0.5311469435691833, - "logits/rejected": 0.6227244138717651, - "logps/chosen": -8.448528289794922, - "logps/rejected": -9.284102439880371, - "loss": 0.5258, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.448528289794922, - "rewards/margins": 0.8355741500854492, - "rewards/rejected": -9.284102439880371, - "semantic_entropy": 0.0036838327068835497, + "logits/chosen": -0.19409145414829254, + "logits/rejected": -0.04179038107395172, + "logps/chosen": -4.089101791381836, + "logps/rejected": -4.8999786376953125, + "loss": 0.5214, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.089101791381836, + "rewards/margins": 0.8108770251274109, + "rewards/rejected": -4.8999786376953125, "step": 2875 }, { "epoch": 1.5413948820873056, - "grad_norm": 20.341898488782054, + "grad_norm": 16.286992576548986, "learning_rate": 5.629076484188952e-07, - "logits/chosen": 0.6697776913642883, - "logits/rejected": 0.7325208187103271, - "logps/chosen": -8.417803764343262, - "logps/rejected": -9.420540809631348, - "loss": 0.4782, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.417803764343262, - "rewards/margins": 1.0027358531951904, - "rewards/rejected": -9.420540809631348, - "semantic_entropy": 0.0036022099666297436, + "logits/chosen": -0.07925917953252792, + "logits/rejected": 0.04356354475021362, + "logps/chosen": -3.6903717517852783, + "logps/rejected": -4.596569061279297, + "loss": 0.4687, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.6903717517852783, + "rewards/margins": 0.9061976671218872, + "rewards/rejected": -4.596569061279297, "step": 2880 }, { "epoch": 1.544070914868707, - "grad_norm": 17.610305324362457, + "grad_norm": 16.726394889562613, "learning_rate": 5.613623187025587e-07, - "logits/chosen": 0.5705369710922241, - "logits/rejected": 0.6492313146591187, - "logps/chosen": -8.502610206604004, - "logps/rejected": -9.509003639221191, - "loss": 0.4879, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.502610206604004, - "rewards/margins": 1.0063927173614502, - "rewards/rejected": -9.509003639221191, - "semantic_entropy": 0.003285133745521307, + "logits/chosen": -0.18165457248687744, + "logits/rejected": -0.032997775822877884, + "logps/chosen": -3.897244930267334, + "logps/rejected": -4.805023193359375, + "loss": 0.4841, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.897244930267334, + "rewards/margins": 0.907778263092041, + "rewards/rejected": -4.805023193359375, "step": 2885 }, { "epoch": 1.5467469476501088, - "grad_norm": 17.881369724017024, + "grad_norm": 15.137350779125713, "learning_rate": 5.598163936487369e-07, - "logits/chosen": 0.573552131652832, - "logits/rejected": 0.6860645413398743, - "logps/chosen": -8.498343467712402, - "logps/rejected": -9.553579330444336, - "loss": 0.4739, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.498343467712402, - "rewards/margins": 1.055237054824829, - "rewards/rejected": -9.553579330444336, - "semantic_entropy": 0.0032834571320563555, + "logits/chosen": -0.1927015483379364, + "logits/rejected": 0.013611644506454468, + "logps/chosen": -3.958200454711914, + "logps/rejected": -5.039696216583252, + "loss": 0.4571, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.958200454711914, + "rewards/margins": 1.0814956426620483, + "rewards/rejected": -5.039696216583252, "step": 2890 }, { "epoch": 1.5494229804315103, - "grad_norm": 17.628648766982945, + "grad_norm": 13.169622584126246, "learning_rate": 5.582698882560017e-07, - "logits/chosen": 0.6237468719482422, - "logits/rejected": 0.7165063619613647, - "logps/chosen": -8.482809066772461, - "logps/rejected": -9.492294311523438, - "loss": 0.4927, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.482809066772461, - "rewards/margins": 1.0094853639602661, - "rewards/rejected": -9.492294311523438, - "semantic_entropy": 0.003665131749585271, + "logits/chosen": -0.2026037871837616, + "logits/rejected": -0.033614836633205414, + "logps/chosen": -3.7065186500549316, + "logps/rejected": -4.574999809265137, + "loss": 0.5054, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.7065186500549316, + "rewards/margins": 0.8684809803962708, + "rewards/rejected": -4.574999809265137, "step": 2895 }, { "epoch": 1.5520990132129118, - "grad_norm": 17.76187822982553, + "grad_norm": 13.116686679423292, "learning_rate": 5.567228175285549e-07, - "logits/chosen": 0.6243129968643188, - "logits/rejected": 0.7127053737640381, - "logps/chosen": -8.367454528808594, - "logps/rejected": -9.463773727416992, - "loss": 0.455, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.367454528808594, - "rewards/margins": 1.0963184833526611, - "rewards/rejected": -9.463773727416992, - "semantic_entropy": 0.003789290087297559, + "logits/chosen": -0.13192197680473328, + "logits/rejected": -0.0064797429367899895, + "logps/chosen": -3.9415931701660156, + "logps/rejected": -4.972613334655762, + "loss": 0.4388, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.9415931701660156, + "rewards/margins": 1.031019926071167, + "rewards/rejected": -4.972613334655762, "step": 2900 }, { "epoch": 1.5547750459943135, - "grad_norm": 22.486981787052432, + "grad_norm": 17.13422876713816, "learning_rate": 5.551751964760838e-07, - "logits/chosen": 0.7026504278182983, - "logits/rejected": 0.7289483547210693, - "logps/chosen": -8.396336555480957, - "logps/rejected": -9.454975128173828, - "loss": 0.4576, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.396336555480957, - "rewards/margins": 1.058638334274292, - "rewards/rejected": -9.454975128173828, - "semantic_entropy": 0.003976074513047934, + "logits/chosen": -0.07308979332447052, + "logits/rejected": -0.050973545759916306, + "logps/chosen": -3.8907017707824707, + "logps/rejected": -4.847752094268799, + "loss": 0.4746, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.8907017707824707, + "rewards/margins": 0.9570503234863281, + "rewards/rejected": -4.847752094268799, "step": 2905 }, { "epoch": 1.557451078775715, - "grad_norm": 21.34998407613261, + "grad_norm": 23.02362478633762, "learning_rate": 5.536270401136145e-07, - "logits/chosen": 0.6059376001358032, - "logits/rejected": 0.6654237508773804, - "logps/chosen": -8.55673599243164, - "logps/rejected": -9.484312057495117, - "loss": 0.4952, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.55673599243164, - "rewards/margins": 0.927575945854187, - "rewards/rejected": -9.484312057495117, - "semantic_entropy": 0.0034220025409013033, + "logits/chosen": -0.15105082094669342, + "logits/rejected": -0.018276555463671684, + "logps/chosen": -4.224644184112549, + "logps/rejected": -5.093600273132324, + "loss": 0.4937, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.224644184112549, + "rewards/margins": 0.8689563870429993, + "rewards/rejected": -5.093600273132324, "step": 2910 }, { "epoch": 1.5601271115571165, - "grad_norm": 25.338171297276553, + "grad_norm": 23.190466924863443, "learning_rate": 5.520783634613667e-07, - "logits/chosen": 0.6434666514396667, - "logits/rejected": 0.7561715841293335, - "logps/chosen": -8.727119445800781, - "logps/rejected": -9.780064582824707, - "loss": 0.5051, + "logits/chosen": -0.1338997781276703, + "logits/rejected": 0.0801398903131485, + "logps/chosen": -4.213241100311279, + "logps/rejected": -5.244158744812012, + "loss": 0.486, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.727119445800781, - "rewards/margins": 1.0529462099075317, - "rewards/rejected": -9.780064582824707, - "semantic_entropy": 0.002783454256132245, + "rewards/chosen": -4.213241100311279, + "rewards/margins": 1.0309178829193115, + "rewards/rejected": -5.244158744812012, "step": 2915 }, { "epoch": 1.5628031443385182, - "grad_norm": 19.446663229697716, + "grad_norm": 21.995648860256928, "learning_rate": 5.505291815446082e-07, - "logits/chosen": 0.622826099395752, - "logits/rejected": 0.6913628578186035, - "logps/chosen": -8.684412002563477, - "logps/rejected": -9.680010795593262, - "loss": 0.5004, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.684412002563477, - "rewards/margins": 0.9955987930297852, - "rewards/rejected": -9.680010795593262, - "semantic_entropy": 0.0030565441120415926, + "logits/chosen": -0.12605342268943787, + "logits/rejected": -0.0040770829655230045, + "logps/chosen": -4.4240007400512695, + "logps/rejected": -5.362966537475586, + "loss": 0.5245, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.4240007400512695, + "rewards/margins": 0.9389656186103821, + "rewards/rejected": -5.362966537475586, "step": 2920 }, { "epoch": 1.5654791771199197, - "grad_norm": 21.499839435107912, + "grad_norm": 17.722246953590286, "learning_rate": 5.489795093935089e-07, - "logits/chosen": 0.66752690076828, - "logits/rejected": 0.7305563688278198, - "logps/chosen": -8.636419296264648, - "logps/rejected": -9.558130264282227, - "loss": 0.5297, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.636419296264648, - "rewards/margins": 0.9217103123664856, - "rewards/rejected": -9.558130264282227, - "semantic_entropy": 0.0032252557575702667, + "logits/chosen": -0.12273578345775604, + "logits/rejected": -0.005004202015697956, + "logps/chosen": -4.183632850646973, + "logps/rejected": -5.099278450012207, + "loss": 0.5161, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.183632850646973, + "rewards/margins": 0.9156457781791687, + "rewards/rejected": -5.099278450012207, "step": 2925 }, { "epoch": 1.5681552099013212, - "grad_norm": 17.043070253231157, + "grad_norm": 15.092068550343328, "learning_rate": 5.474293620429946e-07, - "logits/chosen": 0.6017109155654907, - "logits/rejected": 0.6921178698539734, - "logps/chosen": -8.539863586425781, - "logps/rejected": -9.874781608581543, - "loss": 0.455, - "rewards/accuracies": 0.8125, - "rewards/chosen": -8.539863586425781, - "rewards/margins": 1.334917426109314, - "rewards/rejected": -9.874781608581543, - "semantic_entropy": 0.0031048119999468327, + "logits/chosen": -0.24956543743610382, + "logits/rejected": -0.06484591960906982, + "logps/chosen": -4.036123752593994, + "logps/rejected": -5.323879241943359, + "loss": 0.4256, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.036123752593994, + "rewards/margins": 1.2877554893493652, + "rewards/rejected": -5.323879241943359, "step": 2930 }, { "epoch": 1.570831242682723, - "grad_norm": 17.31470565155956, + "grad_norm": 17.162860560844383, "learning_rate": 5.458787545326018e-07, - "logits/chosen": 0.6002456545829773, - "logits/rejected": 0.6670821905136108, - "logps/chosen": -8.838860511779785, - "logps/rejected": -9.790765762329102, - "loss": 0.4888, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.838860511779785, - "rewards/margins": 0.9519071578979492, - "rewards/rejected": -9.790765762329102, - "semantic_entropy": 0.0028922937344759703, + "logits/chosen": -0.20718617737293243, + "logits/rejected": -0.09000232070684433, + "logps/chosen": -4.496607303619385, + "logps/rejected": -5.470868110656738, + "loss": 0.4779, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.496607303619385, + "rewards/margins": 0.9742609262466431, + "rewards/rejected": -5.470868110656738, "step": 2935 }, { "epoch": 1.5735072754641244, - "grad_norm": 18.420355066758702, + "grad_norm": 18.41669645344611, "learning_rate": 5.443277019063311e-07, - "logits/chosen": 0.6272684335708618, - "logits/rejected": 0.7411568760871887, - "logps/chosen": -8.951470375061035, - "logps/rejected": -10.118718147277832, - "loss": 0.4787, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.951470375061035, - "rewards/margins": 1.1672481298446655, - "rewards/rejected": -10.118718147277832, - "semantic_entropy": 0.0028697990346699953, + "logits/chosen": -0.18697470426559448, + "logits/rejected": -0.006905508227646351, + "logps/chosen": -4.588727951049805, + "logps/rejected": -5.690907955169678, + "loss": 0.4919, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.588727951049805, + "rewards/margins": 1.1021801233291626, + "rewards/rejected": -5.690907955169678, "step": 2940 }, { "epoch": 1.5761833082455259, - "grad_norm": 24.156663129325842, + "grad_norm": 20.435923918445454, "learning_rate": 5.427762192125023e-07, - "logits/chosen": 0.6460695862770081, - "logits/rejected": 0.7259084582328796, - "logps/chosen": -8.902268409729004, - "logps/rejected": -9.860254287719727, - "loss": 0.5189, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.902268409729004, - "rewards/margins": 0.9579856991767883, - "rewards/rejected": -9.860254287719727, - "semantic_entropy": 0.0026515666395425797, + "logits/chosen": -0.12378094345331192, + "logits/rejected": 0.02095230296254158, + "logps/chosen": -4.525023937225342, + "logps/rejected": -5.497542381286621, + "loss": 0.4924, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.525023937225342, + "rewards/margins": 0.9725181460380554, + "rewards/rejected": -5.497542381286621, "step": 2945 }, { "epoch": 1.5788593410269276, - "grad_norm": 28.217781936607324, + "grad_norm": 19.546590434641306, "learning_rate": 5.41224321503607e-07, - "logits/chosen": 0.6646770238876343, - "logits/rejected": 0.7777436375617981, - "logps/chosen": -8.907236099243164, - "logps/rejected": -10.02210807800293, - "loss": 0.422, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -8.907236099243164, - "rewards/margins": 1.1148706674575806, - "rewards/rejected": -10.02210807800293, - "semantic_entropy": 0.0026082415133714676, + "logits/chosen": -0.15033094584941864, + "logits/rejected": 0.11329858005046844, + "logps/chosen": -4.530350685119629, + "logps/rejected": -5.605813026428223, + "loss": 0.4394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.530350685119629, + "rewards/margins": 1.0754621028900146, + "rewards/rejected": -5.605813026428223, "step": 2950 }, { "epoch": 1.5815353738083293, - "grad_norm": 22.254732967367147, + "grad_norm": 23.78030408410312, "learning_rate": 5.396720238361637e-07, - "logits/chosen": 0.7216917872428894, - "logits/rejected": 0.781305193901062, - "logps/chosen": -8.932952880859375, - "logps/rejected": -9.814626693725586, - "loss": 0.5321, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.932952880859375, - "rewards/margins": 0.8816744089126587, - "rewards/rejected": -9.814626693725586, - "semantic_entropy": 0.0031717985402792692, + "logits/chosen": -0.06801259517669678, + "logits/rejected": 0.06830494105815887, + "logps/chosen": -4.690810680389404, + "logps/rejected": -5.577322959899902, + "loss": 0.5261, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.690810680389404, + "rewards/margins": 0.8865123987197876, + "rewards/rejected": -5.577322959899902, "step": 2955 }, { "epoch": 1.5842114065897306, - "grad_norm": 17.792398556391674, + "grad_norm": 11.692522431015478, "learning_rate": 5.381193412705711e-07, - "logits/chosen": 0.6349023580551147, - "logits/rejected": 0.7170180678367615, - "logps/chosen": -8.807271003723145, - "logps/rejected": -9.855276107788086, - "loss": 0.4582, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.807271003723145, - "rewards/margins": 1.0480067729949951, - "rewards/rejected": -9.855276107788086, - "semantic_entropy": 0.0029146361630409956, + "logits/chosen": -0.1936405748128891, + "logits/rejected": -0.018577950075268745, + "logps/chosen": -4.5329084396362305, + "logps/rejected": -5.558853626251221, + "loss": 0.4417, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.5329084396362305, + "rewards/margins": 1.0259445905685425, + "rewards/rejected": -5.558853626251221, "step": 2960 }, { "epoch": 1.5868874393711323, - "grad_norm": 16.167553601713156, + "grad_norm": 15.807997760154327, "learning_rate": 5.365662888709622e-07, - "logits/chosen": 0.6512748599052429, - "logits/rejected": 0.7206937074661255, - "logps/chosen": -8.949135780334473, - "logps/rejected": -10.088407516479492, - "loss": 0.4511, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.949135780334473, - "rewards/margins": 1.139272689819336, - "rewards/rejected": -10.088407516479492, - "semantic_entropy": 0.0026951334439218044, + "logits/chosen": -0.13124777376651764, + "logits/rejected": 0.01589716598391533, + "logps/chosen": -4.806272983551025, + "logps/rejected": -5.8966875076293945, + "loss": 0.4767, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.806272983551025, + "rewards/margins": 1.09041428565979, + "rewards/rejected": -5.8966875076293945, "step": 2965 }, { "epoch": 1.589563472152534, - "grad_norm": 21.334679501529834, + "grad_norm": 21.02532327442932, "learning_rate": 5.350128817050585e-07, - "logits/chosen": 0.6061184406280518, - "logits/rejected": 0.6971312761306763, - "logps/chosen": -9.021596908569336, - "logps/rejected": -9.995620727539062, - "loss": 0.5042, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.021596908569336, - "rewards/margins": 0.974023163318634, - "rewards/rejected": -9.995620727539062, - "semantic_entropy": 0.002106505911797285, + "logits/chosen": -0.18969455361366272, + "logits/rejected": 0.033272065222263336, + "logps/chosen": -4.86229944229126, + "logps/rejected": -5.850628852844238, + "loss": 0.5099, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.86229944229126, + "rewards/margins": 0.9883286356925964, + "rewards/rejected": -5.850628852844238, "step": 2970 }, { "epoch": 1.5922395049339353, - "grad_norm": 27.712943840731256, + "grad_norm": 20.069984417279098, "learning_rate": 5.334591348440229e-07, - "logits/chosen": 0.6605676412582397, - "logits/rejected": 0.7498981952667236, - "logps/chosen": -8.881559371948242, - "logps/rejected": -9.735440254211426, - "loss": 0.5282, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.881559371948242, - "rewards/margins": 0.8538818359375, - "rewards/rejected": -9.735440254211426, - "semantic_entropy": 0.0024334299378097057, + "logits/chosen": -0.15437844395637512, + "logits/rejected": 0.03944334387779236, + "logps/chosen": -4.461657524108887, + "logps/rejected": -5.339333534240723, + "loss": 0.5176, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.461657524108887, + "rewards/margins": 0.8776756525039673, + "rewards/rejected": -5.339333534240723, "step": 2975 }, { "epoch": 1.594915537715337, - "grad_norm": 14.991037570020522, + "grad_norm": 15.556311753489473, "learning_rate": 5.319050633623141e-07, - "logits/chosen": 0.6337238550186157, - "logits/rejected": 0.7245572805404663, - "logps/chosen": -8.76764965057373, - "logps/rejected": -9.690933227539062, - "loss": 0.482, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.76764965057373, - "rewards/margins": 0.9232838749885559, - "rewards/rejected": -9.690933227539062, - "semantic_entropy": 0.002793360035866499, + "logits/chosen": -0.16790291666984558, + "logits/rejected": 0.03451234847307205, + "logps/chosen": -4.535186767578125, + "logps/rejected": -5.3687744140625, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.535186767578125, + "rewards/margins": 0.8335875272750854, + "rewards/rejected": -5.3687744140625, "step": 2980 }, { "epoch": 1.5975915704967387, - "grad_norm": 24.077369910851743, + "grad_norm": 20.89927611576879, "learning_rate": 5.303506823375409e-07, - "logits/chosen": 0.5908278226852417, - "logits/rejected": 0.7130194902420044, - "logps/chosen": -8.76014518737793, - "logps/rejected": -9.954129219055176, - "loss": 0.5029, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.76014518737793, - "rewards/margins": 1.1939831972122192, - "rewards/rejected": -9.954129219055176, - "semantic_entropy": 0.003040383802726865, + "logits/chosen": -0.15259508788585663, + "logits/rejected": 0.09763552993535995, + "logps/chosen": -4.506748199462891, + "logps/rejected": -5.663259506225586, + "loss": 0.4533, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.506748199462891, + "rewards/margins": 1.156510591506958, + "rewards/rejected": -5.663259506225586, "step": 2985 }, { "epoch": 1.60026760327814, - "grad_norm": 15.211762910677832, + "grad_norm": 15.423759871468647, "learning_rate": 5.287960068503143e-07, - "logits/chosen": 0.6387141942977905, - "logits/rejected": 0.7284534573554993, - "logps/chosen": -8.648200035095215, - "logps/rejected": -9.790531158447266, - "loss": 0.4399, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.648200035095215, - "rewards/margins": 1.1423308849334717, - "rewards/rejected": -9.790531158447266, - "semantic_entropy": 0.003230876522138715, + "logits/chosen": -0.17297227680683136, + "logits/rejected": 0.05435257405042648, + "logps/chosen": -4.407393455505371, + "logps/rejected": -5.477519989013672, + "loss": 0.4562, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.407393455505371, + "rewards/margins": 1.070127248764038, + "rewards/rejected": -5.477519989013672, "step": 2990 }, { "epoch": 1.6029436360595417, - "grad_norm": 20.214549437081253, + "grad_norm": 18.031386403181592, "learning_rate": 5.272410519841032e-07, - "logits/chosen": 0.6860362887382507, - "logits/rejected": 0.7700978517532349, - "logps/chosen": -8.748394012451172, - "logps/rejected": -9.925196647644043, - "loss": 0.4639, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.748394012451172, - "rewards/margins": 1.1768031120300293, - "rewards/rejected": -9.925196647644043, - "semantic_entropy": 0.002992126392200589, + "logits/chosen": -0.08059139549732208, + "logits/rejected": 0.08161180466413498, + "logps/chosen": -4.596780300140381, + "logps/rejected": -5.738574504852295, + "loss": 0.4574, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.596780300140381, + "rewards/margins": 1.1417946815490723, + "rewards/rejected": -5.738574504852295, "step": 2995 }, { "epoch": 1.6056196688409434, - "grad_norm": 13.676962647539636, + "grad_norm": 12.687254790516409, "learning_rate": 5.256858328250861e-07, - "logits/chosen": 0.6779240965843201, - "logits/rejected": 0.7815280556678772, - "logps/chosen": -8.563261985778809, - "logps/rejected": -9.472494125366211, - "loss": 0.5373, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.563261985778809, - "rewards/margins": 0.9092334508895874, - "rewards/rejected": -9.472494125366211, - "semantic_entropy": 0.0036033024080097675, + "logits/chosen": -0.13768552243709564, + "logits/rejected": 0.09984893351793289, + "logps/chosen": -4.220940113067627, + "logps/rejected": -5.111268043518066, + "loss": 0.5187, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.220940113067627, + "rewards/margins": 0.8903279304504395, + "rewards/rejected": -5.111268043518066, "step": 3000 }, { "epoch": 1.608295701622345, - "grad_norm": 35.682884824697986, + "grad_norm": 32.357158889411856, "learning_rate": 5.241303644620063e-07, - "logits/chosen": 0.6307097673416138, - "logits/rejected": 0.7322098612785339, - "logps/chosen": -8.694342613220215, - "logps/rejected": -9.436209678649902, - "loss": 0.5628, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.694342613220215, - "rewards/margins": 0.7418667078018188, - "rewards/rejected": -9.436209678649902, - "semantic_entropy": 0.003090116661041975, + "logits/chosen": -0.22303049266338348, + "logits/rejected": -0.027170103043317795, + "logps/chosen": -4.376584529876709, + "logps/rejected": -5.081091403961182, + "loss": 0.5708, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.376584529876709, + "rewards/margins": 0.7045064568519592, + "rewards/rejected": -5.081091403961182, "step": 3005 }, { "epoch": 1.6109717344037464, - "grad_norm": 20.559441487883948, + "grad_norm": 24.408056116758363, "learning_rate": 5.225746619860248e-07, - "logits/chosen": 0.651374101638794, - "logits/rejected": 0.7265350222587585, - "logps/chosen": -8.542802810668945, - "logps/rejected": -9.415166854858398, - "loss": 0.5648, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.542802810668945, - "rewards/margins": 0.8723649978637695, - "rewards/rejected": -9.415166854858398, - "semantic_entropy": 0.0034926377702504396, + "logits/chosen": -0.17798592150211334, + "logits/rejected": -0.01685064099729061, + "logps/chosen": -4.4778547286987305, + "logps/rejected": -5.306777000427246, + "loss": 0.5816, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.4778547286987305, + "rewards/margins": 0.8289216756820679, + "rewards/rejected": -5.306777000427246, "step": 3010 }, { "epoch": 1.6136477671851481, - "grad_norm": 24.620427653936158, + "grad_norm": 20.367905967572472, "learning_rate": 5.210187404905735e-07, - "logits/chosen": 0.7381612658500671, - "logits/rejected": 0.7867849469184875, - "logps/chosen": -8.523360252380371, - "logps/rejected": -9.46071720123291, - "loss": 0.5041, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.523360252380371, - "rewards/margins": 0.9373563528060913, - "rewards/rejected": -9.46071720123291, - "semantic_entropy": 0.003742937697097659, + "logits/chosen": 0.023410063236951828, + "logits/rejected": 0.11523191630840302, + "logps/chosen": -4.377657890319824, + "logps/rejected": -5.284204959869385, + "loss": 0.5063, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.377657890319824, + "rewards/margins": 0.906546413898468, + "rewards/rejected": -5.284204959869385, "step": 3015 }, { "epoch": 1.6163237999665496, - "grad_norm": 20.150742954590974, + "grad_norm": 15.982725203624536, "learning_rate": 5.194626150712098e-07, - "logits/chosen": 0.6840203404426575, - "logits/rejected": 0.7242007851600647, - "logps/chosen": -8.462261199951172, - "logps/rejected": -9.2748384475708, - "loss": 0.5223, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.462261199951172, - "rewards/margins": 0.8125771284103394, - "rewards/rejected": -9.2748384475708, - "semantic_entropy": 0.0034177147317677736, + "logits/chosen": -0.10671371221542358, + "logits/rejected": 0.01962994411587715, + "logps/chosen": -4.419652938842773, + "logps/rejected": -5.214144229888916, + "loss": 0.5143, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.419652938842773, + "rewards/margins": 0.7944914698600769, + "rewards/rejected": -5.214144229888916, "step": 3020 }, { "epoch": 1.6189998327479511, - "grad_norm": 22.925549219305793, + "grad_norm": 19.538461660040223, "learning_rate": 5.179063008254695e-07, - "logits/chosen": 0.6633858680725098, - "logits/rejected": 0.7617511749267578, - "logps/chosen": -8.355466842651367, - "logps/rejected": -9.252098083496094, - "loss": 0.5333, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.355466842651367, - "rewards/margins": 0.896629810333252, - "rewards/rejected": -9.252098083496094, - "semantic_entropy": 0.003791496157646179, + "logits/chosen": -0.14063067734241486, + "logits/rejected": 0.05624137446284294, + "logps/chosen": -4.218569755554199, + "logps/rejected": -5.073284149169922, + "loss": 0.5264, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.218569755554199, + "rewards/margins": 0.8547142148017883, + "rewards/rejected": -5.073284149169922, "step": 3025 }, { "epoch": 1.6216758655293528, - "grad_norm": 24.09406066747849, + "grad_norm": 15.902668337846292, "learning_rate": 5.163498128527199e-07, - "logits/chosen": 0.6741195917129517, - "logits/rejected": 0.7528184056282043, - "logps/chosen": -8.633956909179688, - "logps/rejected": -9.625297546386719, - "loss": 0.5164, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.633956909179688, - "rewards/margins": 0.9913405179977417, - "rewards/rejected": -9.625297546386719, - "semantic_entropy": 0.0035090327728539705, + "logits/chosen": -0.06368695199489594, + "logits/rejected": 0.10362465679645538, + "logps/chosen": -4.577967166900635, + "logps/rejected": -5.447230815887451, + "loss": 0.5207, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.577967166900635, + "rewards/margins": 0.8692638278007507, + "rewards/rejected": -5.447230815887451, "step": 3030 }, { "epoch": 1.6243518983107543, - "grad_norm": 17.245493463371307, + "grad_norm": 18.618757121688425, "learning_rate": 5.147931662540144e-07, - "logits/chosen": 0.7293022871017456, - "logits/rejected": 0.8059667348861694, - "logps/chosen": -8.407529830932617, - "logps/rejected": -9.319184303283691, - "loss": 0.5002, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.407529830932617, - "rewards/margins": 0.9116536378860474, - "rewards/rejected": -9.319184303283691, - "semantic_entropy": 0.004189362749457359, + "logits/chosen": 0.002679321216419339, + "logits/rejected": 0.16721181571483612, + "logps/chosen": -4.334213733673096, + "logps/rejected": -5.127976417541504, + "loss": 0.5157, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.334213733673096, + "rewards/margins": 0.7937629818916321, + "rewards/rejected": -5.127976417541504, "step": 3035 }, { "epoch": 1.6270279310921558, - "grad_norm": 22.579101949094103, + "grad_norm": 15.165316457324282, "learning_rate": 5.132363761319449e-07, - "logits/chosen": 0.6055505275726318, - "logits/rejected": 0.6722275018692017, - "logps/chosen": -8.391597747802734, - "logps/rejected": -9.499109268188477, - "loss": 0.4719, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.391597747802734, - "rewards/margins": 1.1075109243392944, - "rewards/rejected": -9.499109268188477, - "semantic_entropy": 0.003600142430514097, + "logits/chosen": -0.09818046540021896, + "logits/rejected": -0.005679869093000889, + "logps/chosen": -4.2758283615112305, + "logps/rejected": -5.330855369567871, + "loss": 0.4658, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.2758283615112305, + "rewards/margins": 1.055027961730957, + "rewards/rejected": -5.330855369567871, "step": 3040 }, { "epoch": 1.6297039638735575, - "grad_norm": 42.73415984103176, + "grad_norm": 28.53155654068744, "learning_rate": 5.116794575904962e-07, - "logits/chosen": 0.6817172765731812, - "logits/rejected": 0.7639212608337402, - "logps/chosen": -8.343725204467773, - "logps/rejected": -9.283025741577148, - "loss": 0.5163, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.343725204467773, - "rewards/margins": 0.939300537109375, - "rewards/rejected": -9.283025741577148, - "semantic_entropy": 0.004117668606340885, + "logits/chosen": -0.0761515349149704, + "logits/rejected": 0.052153535187244415, + "logps/chosen": -3.8393592834472656, + "logps/rejected": -4.6910529136657715, + "loss": 0.5157, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.8393592834472656, + "rewards/margins": 0.8516939282417297, + "rewards/rejected": -4.6910529136657715, "step": 3045 }, { "epoch": 1.632379996654959, - "grad_norm": 13.671217913035836, + "grad_norm": 12.742945844712942, "learning_rate": 5.101224257348987e-07, - "logits/chosen": 0.6588679552078247, - "logits/rejected": 0.7599143385887146, - "logps/chosen": -8.530394554138184, - "logps/rejected": -9.683368682861328, - "loss": 0.4385, + "logits/chosen": -0.09062933921813965, + "logits/rejected": 0.07079356908798218, + "logps/chosen": -4.113347053527832, + "logps/rejected": -5.1801629066467285, + "loss": 0.4354, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.530394554138184, - "rewards/margins": 1.1529743671417236, - "rewards/rejected": -9.683368682861328, - "semantic_entropy": 0.003528149798512459, + "rewards/chosen": -4.113347053527832, + "rewards/margins": 1.0668165683746338, + "rewards/rejected": -5.1801629066467285, "step": 3050 }, { "epoch": 1.6350560294363605, - "grad_norm": 17.926663757907765, + "grad_norm": 15.143344376036385, "learning_rate": 5.085652956714823e-07, - "logits/chosen": 0.6311002373695374, - "logits/rejected": 0.7291450500488281, - "logps/chosen": -8.84853744506836, - "logps/rejected": -9.728075981140137, - "loss": 0.5199, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.84853744506836, - "rewards/margins": 0.8795391917228699, - "rewards/rejected": -9.728075981140137, - "semantic_entropy": 0.0028904026839882135, + "logits/chosen": -0.16082966327667236, + "logits/rejected": 0.015038782730698586, + "logps/chosen": -4.255005836486816, + "logps/rejected": -5.055515289306641, + "loss": 0.5259, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.255005836486816, + "rewards/margins": 0.800508975982666, + "rewards/rejected": -5.055515289306641, "step": 3055 }, { "epoch": 1.6377320622177622, - "grad_norm": 17.340322531594975, + "grad_norm": 15.044671329639788, "learning_rate": 5.070080825075298e-07, - "logits/chosen": 0.7018830180168152, - "logits/rejected": 0.8387192487716675, - "logps/chosen": -8.556188583374023, - "logps/rejected": -9.5839204788208, - "loss": 0.5109, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.556188583374023, - "rewards/margins": 1.0277318954467773, - "rewards/rejected": -9.5839204788208, - "semantic_entropy": 0.004027285613119602, + "logits/chosen": -0.15031415224075317, + "logits/rejected": 0.07118190079927444, + "logps/chosen": -4.20255708694458, + "logps/rejected": -5.11803674697876, + "loss": 0.5288, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.20255708694458, + "rewards/margins": 0.9154800176620483, + "rewards/rejected": -5.11803674697876, "step": 3060 }, { "epoch": 1.6404080949991637, - "grad_norm": 15.73972180449081, + "grad_norm": 15.420445476428075, "learning_rate": 5.0545080135113e-07, - "logits/chosen": 0.6703477501869202, - "logits/rejected": 0.7229039669036865, - "logps/chosen": -8.628973007202148, - "logps/rejected": -9.581435203552246, - "loss": 0.5456, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.628973007202148, - "rewards/margins": 0.9524634480476379, - "rewards/rejected": -9.581435203552246, - "semantic_entropy": 0.003708144649863243, + "logits/chosen": -0.11134666204452515, + "logits/rejected": -0.006379268132150173, + "logps/chosen": -4.11629581451416, + "logps/rejected": -5.032707214355469, + "loss": 0.5305, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.11629581451416, + "rewards/margins": 0.9164111018180847, + "rewards/rejected": -5.032707214355469, "step": 3065 }, { "epoch": 1.6430841277805652, - "grad_norm": 23.480077098467, + "grad_norm": 22.452347441802623, "learning_rate": 5.038934673110316e-07, - "logits/chosen": 0.6456252336502075, - "logits/rejected": 0.7398085594177246, - "logps/chosen": -8.677080154418945, - "logps/rejected": -9.685178756713867, - "loss": 0.5225, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -8.677080154418945, - "rewards/margins": 1.0080986022949219, - "rewards/rejected": -9.685178756713867, - "semantic_entropy": 0.0029380209743976593, + "logits/chosen": -0.13836877048015594, + "logits/rejected": -0.012528735212981701, + "logps/chosen": -4.3048095703125, + "logps/rejected": -5.227591037750244, + "loss": 0.522, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.3048095703125, + "rewards/margins": 0.9227815866470337, + "rewards/rejected": -5.227591037750244, "step": 3070 }, { "epoch": 1.645760160561967, - "grad_norm": 15.433407858304248, + "grad_norm": 16.138077689656445, "learning_rate": 5.023360954964963e-07, - "logits/chosen": 0.6237664222717285, - "logits/rejected": 0.6907469630241394, - "logps/chosen": -8.563250541687012, - "logps/rejected": -9.583332061767578, - "loss": 0.4477, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -8.563250541687012, - "rewards/margins": 1.0200810432434082, - "rewards/rejected": -9.583332061767578, - "semantic_entropy": 0.0032678351271897554, + "logits/chosen": -0.19199270009994507, + "logits/rejected": -0.09373234212398529, + "logps/chosen": -3.9371368885040283, + "logps/rejected": -4.86196231842041, + "loss": 0.4506, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.9371368885040283, + "rewards/margins": 0.9248256683349609, + "rewards/rejected": -4.86196231842041, "step": 3075 }, { "epoch": 1.6484361933433684, - "grad_norm": 15.623034117961172, + "grad_norm": 12.54611105713789, "learning_rate": 5.007787010171524e-07, - "logits/chosen": 0.5593664050102234, - "logits/rejected": 0.6882971525192261, - "logps/chosen": -8.571015357971191, - "logps/rejected": -9.626784324645996, - "loss": 0.4314, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -8.571015357971191, - "rewards/margins": 1.055769681930542, - "rewards/rejected": -9.626784324645996, - "semantic_entropy": 0.003013583132997155, + "logits/chosen": -0.22732773423194885, + "logits/rejected": -0.018737921491265297, + "logps/chosen": -3.984222412109375, + "logps/rejected": -5.039055824279785, + "loss": 0.418, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.984222412109375, + "rewards/margins": 1.0548332929611206, + "rewards/rejected": -5.039055824279785, "step": 3080 }, { "epoch": 1.65111222612477, - "grad_norm": 19.95888030099492, + "grad_norm": 20.917349099935535, "learning_rate": 4.992212989828477e-07, - "logits/chosen": 0.6781376004219055, - "logits/rejected": 0.7037637233734131, - "logps/chosen": -8.676929473876953, - "logps/rejected": -9.475358009338379, - "loss": 0.523, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.676929473876953, - "rewards/margins": 0.7984285354614258, - "rewards/rejected": -9.475358009338379, - "semantic_entropy": 0.003011090215295553, + "logits/chosen": -0.08695609867572784, + "logits/rejected": -0.04461172968149185, + "logps/chosen": -4.1265549659729, + "logps/rejected": -4.9023756980896, + "loss": 0.5261, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.1265549659729, + "rewards/margins": 0.7758212685585022, + "rewards/rejected": -4.9023756980896, "step": 3085 }, { "epoch": 1.6537882589061716, - "grad_norm": 23.790232395129802, + "grad_norm": 19.167861398379152, "learning_rate": 4.976639045035036e-07, - "logits/chosen": 0.6791437268257141, - "logits/rejected": 0.7153705358505249, - "logps/chosen": -8.594088554382324, - "logps/rejected": -9.416778564453125, - "loss": 0.5839, + "logits/chosen": -0.07181166857481003, + "logits/rejected": 0.022754056379199028, + "logps/chosen": -4.001575946807861, + "logps/rejected": -4.776135444641113, + "loss": 0.5852, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.594088554382324, - "rewards/margins": 0.8226897120475769, - "rewards/rejected": -9.416778564453125, - "semantic_entropy": 0.0035625225864350796, + "rewards/chosen": -4.001575946807861, + "rewards/margins": 0.7745591402053833, + "rewards/rejected": -4.776135444641113, "step": 3090 }, { "epoch": 1.6564642916875731, - "grad_norm": 19.641758817894523, + "grad_norm": 20.88223288933374, "learning_rate": 4.961065326889683e-07, - "logits/chosen": 0.6901504397392273, - "logits/rejected": 0.7671633958816528, - "logps/chosen": -8.59467601776123, - "logps/rejected": -9.425036430358887, - "loss": 0.52, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.59467601776123, - "rewards/margins": 0.8303607702255249, - "rewards/rejected": -9.425036430358887, - "semantic_entropy": 0.003136052517220378, + "logits/chosen": -0.07407991588115692, + "logits/rejected": 0.0893828421831131, + "logps/chosen": -4.302178859710693, + "logps/rejected": -5.163575172424316, + "loss": 0.5056, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.302178859710693, + "rewards/margins": 0.8613961935043335, + "rewards/rejected": -5.163575172424316, "step": 3095 }, { "epoch": 1.6591403244689746, - "grad_norm": 21.418887952150815, + "grad_norm": 18.979152348622804, "learning_rate": 4.9454919864887e-07, - "logits/chosen": 0.6037132143974304, - "logits/rejected": 0.7032198905944824, - "logps/chosen": -8.467050552368164, - "logps/rejected": -9.3862886428833, - "loss": 0.52, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.467050552368164, - "rewards/margins": 0.9192383885383606, - "rewards/rejected": -9.3862886428833, - "semantic_entropy": 0.0032492957543581724, + "logits/chosen": -0.19874528050422668, + "logits/rejected": -0.04767128452658653, + "logps/chosen": -4.221526622772217, + "logps/rejected": -5.159215927124023, + "loss": 0.4899, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.221526622772217, + "rewards/margins": 0.9376896023750305, + "rewards/rejected": -5.159215927124023, "step": 3100 }, { "epoch": 1.6618163572503764, - "grad_norm": 23.776890869160376, + "grad_norm": 22.748498491840614, "learning_rate": 4.929919174924701e-07, - "logits/chosen": 0.6591798663139343, - "logits/rejected": 0.7749283909797668, - "logps/chosen": -8.419390678405762, - "logps/rejected": -9.24329948425293, - "loss": 0.5334, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.419390678405762, - "rewards/margins": 0.8239078521728516, - "rewards/rejected": -9.24329948425293, - "semantic_entropy": 0.003245703876018524, + "logits/chosen": -0.2026331126689911, + "logits/rejected": 0.007918933406472206, + "logps/chosen": -4.143319129943848, + "logps/rejected": -4.981770992279053, + "loss": 0.5079, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.143319129943848, + "rewards/margins": 0.8384523391723633, + "rewards/rejected": -4.981770992279053, "step": 3105 }, { "epoch": 1.6644923900317778, - "grad_norm": 14.314831569908868, + "grad_norm": 17.01211406253825, "learning_rate": 4.914347043285177e-07, - "logits/chosen": 0.695237398147583, - "logits/rejected": 0.7862176299095154, - "logps/chosen": -8.383251190185547, - "logps/rejected": -9.337328910827637, - "loss": 0.474, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.383251190185547, - "rewards/margins": 0.954079270362854, - "rewards/rejected": -9.337328910827637, - "semantic_entropy": 0.0036495565436780453, + "logits/chosen": -0.12050051987171173, + "logits/rejected": 0.02635257877409458, + "logps/chosen": -4.086925506591797, + "logps/rejected": -4.978209495544434, + "loss": 0.4865, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.086925506591797, + "rewards/margins": 0.8912846446037292, + "rewards/rejected": -4.978209495544434, "step": 3110 }, { "epoch": 1.6671684228131793, - "grad_norm": 17.93218545649754, + "grad_norm": 14.504970799788612, "learning_rate": 4.898775742651013e-07, - "logits/chosen": 0.6778665781021118, - "logits/rejected": 0.7646596431732178, - "logps/chosen": -8.393171310424805, - "logps/rejected": -9.438430786132812, - "loss": 0.4243, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -8.393171310424805, - "rewards/margins": 1.0452605485916138, - "rewards/rejected": -9.438430786132812, - "semantic_entropy": 0.0038520165253430605, + "logits/chosen": -0.08566021174192429, + "logits/rejected": 0.06341859698295593, + "logps/chosen": -4.2797980308532715, + "logps/rejected": -5.325926780700684, + "loss": 0.4269, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.2797980308532715, + "rewards/margins": 1.046128511428833, + "rewards/rejected": -5.325926780700684, "step": 3115 }, { "epoch": 1.669844455594581, - "grad_norm": 11.631955236946794, + "grad_norm": 14.97043506687999, "learning_rate": 4.883205424095037e-07, - "logits/chosen": 0.6586011648178101, - "logits/rejected": 0.7384502291679382, - "logps/chosen": -8.368680000305176, - "logps/rejected": -9.391559600830078, - "loss": 0.4636, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.368680000305176, - "rewards/margins": 1.0228804349899292, - "rewards/rejected": -9.391559600830078, - "semantic_entropy": 0.0038888491690158844, + "logits/chosen": -0.19909431040287018, + "logits/rejected": -0.01872243359684944, + "logps/chosen": -4.544089317321777, + "logps/rejected": -5.535252571105957, + "loss": 0.4885, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.544089317321777, + "rewards/margins": 0.9911627769470215, + "rewards/rejected": -5.535252571105957, "step": 3120 }, { "epoch": 1.6725204883759828, - "grad_norm": 17.817638170727182, + "grad_norm": 15.74032606127491, "learning_rate": 4.86763623868055e-07, - "logits/chosen": 0.7192140817642212, - "logits/rejected": 0.7774937152862549, - "logps/chosen": -8.442755699157715, - "logps/rejected": -9.343069076538086, - "loss": 0.5072, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.442755699157715, - "rewards/margins": 0.9003141522407532, - "rewards/rejected": -9.343069076538086, - "semantic_entropy": 0.0035094446502625942, + "logits/chosen": -0.116361603140831, + "logits/rejected": 0.03129710629582405, + "logps/chosen": -4.280386924743652, + "logps/rejected": -5.212136268615723, + "loss": 0.4979, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.280386924743652, + "rewards/margins": 0.9317496418952942, + "rewards/rejected": -5.212136268615723, "step": 3125 }, { "epoch": 1.675196521157384, - "grad_norm": 15.880665450221413, + "grad_norm": 17.422126751525436, "learning_rate": 4.852068337459856e-07, - "logits/chosen": 0.7191354036331177, - "logits/rejected": 0.788988471031189, - "logps/chosen": -8.542181015014648, - "logps/rejected": -9.512245178222656, - "loss": 0.4687, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.542181015014648, - "rewards/margins": 0.9700649380683899, - "rewards/rejected": -9.512245178222656, - "semantic_entropy": 0.0031049910467118025, + "logits/chosen": -0.04978395625948906, + "logits/rejected": 0.1433534175157547, + "logps/chosen": -4.729816436767578, + "logps/rejected": -5.6223344802856445, + "loss": 0.4869, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.729816436767578, + "rewards/margins": 0.8925185203552246, + "rewards/rejected": -5.6223344802856445, "step": 3130 }, { "epoch": 1.6778725539387858, - "grad_norm": 20.022219719640713, + "grad_norm": 17.16402888128835, "learning_rate": 4.8365018714728e-07, - "logits/chosen": 0.798575222492218, - "logits/rejected": 0.8416634798049927, - "logps/chosen": -8.635334968566895, - "logps/rejected": -9.463285446166992, - "loss": 0.5202, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.635334968566895, - "rewards/margins": 0.8279510736465454, - "rewards/rejected": -9.463285446166992, - "semantic_entropy": 0.0029447092674672604, + "logits/chosen": -0.04248099401593208, + "logits/rejected": 0.05999818444252014, + "logps/chosen": -4.707949638366699, + "logps/rejected": -5.520773887634277, + "loss": 0.5197, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.707949638366699, + "rewards/margins": 0.8128247261047363, + "rewards/rejected": -5.520773887634277, "step": 3135 }, { "epoch": 1.6805485867201875, - "grad_norm": 22.26373435182509, + "grad_norm": 19.607695887922233, "learning_rate": 4.820936991745304e-07, - "logits/chosen": 0.6276187896728516, - "logits/rejected": 0.6913308501243591, - "logps/chosen": -8.587217330932617, - "logps/rejected": -9.444761276245117, - "loss": 0.5068, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.587217330932617, - "rewards/margins": 0.8575426936149597, - "rewards/rejected": -9.444761276245117, - "semantic_entropy": 0.003062673145905137, + "logits/chosen": -0.2511487305164337, + "logits/rejected": -0.11629515886306763, + "logps/chosen": -4.59854793548584, + "logps/rejected": -5.481645107269287, + "loss": 0.4882, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.59854793548584, + "rewards/margins": 0.8830973505973816, + "rewards/rejected": -5.481645107269287, "step": 3140 }, { "epoch": 1.6832246195015887, - "grad_norm": 26.04794148992061, + "grad_norm": 21.129033027158872, "learning_rate": 4.8053738492879e-07, - "logits/chosen": 0.6948825120925903, - "logits/rejected": 0.7602331042289734, - "logps/chosen": -8.406000137329102, - "logps/rejected": -9.530774116516113, - "loss": 0.4554, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.406000137329102, - "rewards/margins": 1.1247742176055908, - "rewards/rejected": -9.530774116516113, - "semantic_entropy": 0.0036121797747910023, + "logits/chosen": -0.0823192298412323, + "logits/rejected": 0.06743039190769196, + "logps/chosen": -4.393311500549316, + "logps/rejected": -5.496063232421875, + "loss": 0.479, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.393311500549316, + "rewards/margins": 1.102751612663269, + "rewards/rejected": -5.496063232421875, "step": 3145 }, { "epoch": 1.6859006522829905, - "grad_norm": 23.24113283268114, + "grad_norm": 16.591101177115327, "learning_rate": 4.789812595094265e-07, - "logits/chosen": 0.6636757254600525, - "logits/rejected": 0.7241615653038025, - "logps/chosen": -8.501133918762207, - "logps/rejected": -9.556479454040527, - "loss": 0.4467, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.501133918762207, - "rewards/margins": 1.0553454160690308, - "rewards/rejected": -9.556479454040527, - "semantic_entropy": 0.00418940931558609, + "logits/chosen": -0.21462829411029816, + "logits/rejected": -0.07454577833414078, + "logps/chosen": -4.507380485534668, + "logps/rejected": -5.64649772644043, + "loss": 0.4375, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.507380485534668, + "rewards/margins": 1.1391172409057617, + "rewards/rejected": -5.64649772644043, "step": 3150 }, { "epoch": 1.6885766850643922, - "grad_norm": 17.478492942232236, + "grad_norm": 14.267524876476317, "learning_rate": 4.774253380139752e-07, - "logits/chosen": 0.6438261270523071, - "logits/rejected": 0.7361315488815308, - "logps/chosen": -8.412601470947266, - "logps/rejected": -9.485505104064941, - "loss": 0.4467, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.412601470947266, - "rewards/margins": 1.0729031562805176, - "rewards/rejected": -9.485505104064941, - "semantic_entropy": 0.0037474199198186398, + "logits/chosen": -0.21954481303691864, + "logits/rejected": -0.05956953018903732, + "logps/chosen": -4.3540849685668945, + "logps/rejected": -5.457587242126465, + "loss": 0.4491, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.3540849685668945, + "rewards/margins": 1.1035025119781494, + "rewards/rejected": -5.457587242126465, "step": 3155 }, { "epoch": 1.6912527178457935, - "grad_norm": 19.822571481610865, + "grad_norm": 19.7328750201594, "learning_rate": 4.758696355379936e-07, - "logits/chosen": 0.7401809692382812, - "logits/rejected": 0.7346007227897644, - "logps/chosen": -8.39743423461914, - "logps/rejected": -9.354679107666016, - "loss": 0.4803, + "logits/chosen": -0.055561043322086334, + "logits/rejected": -0.09367877244949341, + "logps/chosen": -4.204165458679199, + "logps/rejected": -5.185473918914795, + "loss": 0.4812, "rewards/accuracies": 0.78125, - "rewards/chosen": -8.39743423461914, - "rewards/margins": 0.9572445154190063, - "rewards/rejected": -9.354679107666016, - "semantic_entropy": 0.004037821665406227, + "rewards/chosen": -4.204165458679199, + "rewards/margins": 0.98130863904953, + "rewards/rejected": -5.185473918914795, "step": 3160 }, { "epoch": 1.6939287506271952, - "grad_norm": 18.508878104944426, + "grad_norm": 17.807859767315552, "learning_rate": 4.743141671749138e-07, - "logits/chosen": 0.6463350057601929, - "logits/rejected": 0.7294069528579712, - "logps/chosen": -8.58276081085205, - "logps/rejected": -9.354612350463867, - "loss": 0.5592, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.58276081085205, - "rewards/margins": 0.7718508243560791, - "rewards/rejected": -9.354612350463867, - "semantic_entropy": 0.0035912543535232544, + "logits/chosen": -0.24831703305244446, + "logits/rejected": -0.07738317549228668, + "logps/chosen": -4.652713298797607, + "logps/rejected": -5.397271156311035, + "loss": 0.5803, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.652713298797607, + "rewards/margins": 0.7445586323738098, + "rewards/rejected": -5.397271156311035, "step": 3165 }, { "epoch": 1.6966047834085969, - "grad_norm": 19.828630412175407, + "grad_norm": 17.04544012147205, "learning_rate": 4.727589480158968e-07, - "logits/chosen": 0.6823207139968872, - "logits/rejected": 0.7240070104598999, - "logps/chosen": -8.653319358825684, - "logps/rejected": -9.661191940307617, - "loss": 0.4801, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.653319358825684, - "rewards/margins": 1.0078718662261963, - "rewards/rejected": -9.661191940307617, - "semantic_entropy": 0.0033484199084341526, + "logits/chosen": -0.11613799631595612, + "logits/rejected": -0.006324891000986099, + "logps/chosen": -4.461399078369141, + "logps/rejected": -5.5352678298950195, + "loss": 0.4535, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.461399078369141, + "rewards/margins": 1.073868989944458, + "rewards/rejected": -5.5352678298950195, "step": 3170 }, { "epoch": 1.6992808161899984, - "grad_norm": 20.43246886248836, + "grad_norm": 23.414639126588206, "learning_rate": 4.712039931496855e-07, - "logits/chosen": 0.6765194535255432, - "logits/rejected": 0.7426118850708008, - "logps/chosen": -8.664289474487305, - "logps/rejected": -9.393746376037598, - "loss": 0.5722, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.664289474487305, - "rewards/margins": 0.7294565439224243, - "rewards/rejected": -9.393746376037598, - "semantic_entropy": 0.003087881486862898, + "logits/chosen": -0.1476917564868927, + "logits/rejected": 0.023240093141794205, + "logps/chosen": -4.518770217895508, + "logps/rejected": -5.184556484222412, + "loss": 0.6228, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.518770217895508, + "rewards/margins": 0.6657860279083252, + "rewards/rejected": -5.184556484222412, "step": 3175 }, { "epoch": 1.7019568489713999, - "grad_norm": 21.003656385946787, + "grad_norm": 18.951114312489995, "learning_rate": 4.6964931766245905e-07, - "logits/chosen": 0.7278314828872681, - "logits/rejected": 0.7725498080253601, - "logps/chosen": -8.796308517456055, - "logps/rejected": -9.755891799926758, - "loss": 0.4998, + "logits/chosen": -0.06605806201696396, + "logits/rejected": 0.007165558636188507, + "logps/chosen": -4.520981788635254, + "logps/rejected": -5.459076881408691, + "loss": 0.4927, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.796308517456055, - "rewards/margins": 0.9595831036567688, - "rewards/rejected": -9.755891799926758, - "semantic_entropy": 0.002779710106551647, + "rewards/chosen": -4.520981788635254, + "rewards/margins": 0.9380949139595032, + "rewards/rejected": -5.459076881408691, "step": 3180 }, { "epoch": 1.7046328817528016, - "grad_norm": 21.709608881311866, + "grad_norm": 18.32807827225364, "learning_rate": 4.6809493663768575e-07, - "logits/chosen": 0.6481348276138306, - "logits/rejected": 0.6695024967193604, - "logps/chosen": -8.799505233764648, - "logps/rejected": -9.460579872131348, - "loss": 0.5856, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -8.799505233764648, - "rewards/margins": 0.6610761880874634, - "rewards/rejected": -9.460579872131348, - "semantic_entropy": 0.0028663822449743748, + "logits/chosen": -0.10945770889520645, + "logits/rejected": -0.03913877531886101, + "logps/chosen": -4.4924821853637695, + "logps/rejected": -5.100353717803955, + "loss": 0.5857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.4924821853637695, + "rewards/margins": 0.6078712940216064, + "rewards/rejected": -5.100353717803955, "step": 3185 }, { "epoch": 1.707308914534203, - "grad_norm": 16.96070709578509, + "grad_norm": 15.389053336359513, "learning_rate": 4.6654086515597716e-07, - "logits/chosen": 0.59629225730896, - "logits/rejected": 0.6733515858650208, - "logps/chosen": -8.772279739379883, - "logps/rejected": -9.883055686950684, - "loss": 0.4559, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.772279739379883, - "rewards/margins": 1.1107757091522217, - "rewards/rejected": -9.883055686950684, - "semantic_entropy": 0.0028946802485734224, + "logits/chosen": -0.21891799569129944, + "logits/rejected": -0.025166088715195656, + "logps/chosen": -4.661751747131348, + "logps/rejected": -5.695508003234863, + "loss": 0.4537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.661751747131348, + "rewards/margins": 1.0337554216384888, + "rewards/rejected": -5.695508003234863, "step": 3190 }, { "epoch": 1.7099849473156046, - "grad_norm": 15.2495463629594, + "grad_norm": 14.149911803642658, "learning_rate": 4.6498711829494154e-07, - "logits/chosen": 0.6147344708442688, - "logits/rejected": 0.6999740600585938, - "logps/chosen": -8.856141090393066, - "logps/rejected": -9.772600173950195, - "loss": 0.5042, + "logits/chosen": -0.20324234664440155, + "logits/rejected": -0.07716294378042221, + "logps/chosen": -4.39736270904541, + "logps/rejected": -5.346957683563232, + "loss": 0.4868, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.856141090393066, - "rewards/margins": 0.916458010673523, - "rewards/rejected": -9.772600173950195, - "semantic_entropy": 0.002791165839880705, + "rewards/chosen": -4.39736270904541, + "rewards/margins": 0.9495952725410461, + "rewards/rejected": -5.346957683563232, "step": 3195 }, { "epoch": 1.7126609800970063, - "grad_norm": 17.706370672383937, + "grad_norm": 18.643176189276186, "learning_rate": 4.6343371112903777e-07, - "logits/chosen": 0.7594738006591797, - "logits/rejected": 0.8430054783821106, - "logps/chosen": -8.928936958312988, - "logps/rejected": -9.845270156860352, - "loss": 0.5524, - "rewards/accuracies": 0.65625, - "rewards/chosen": -8.928936958312988, - "rewards/margins": 0.916333019733429, - "rewards/rejected": -9.845270156860352, - "semantic_entropy": 0.002889876952394843, + "logits/chosen": -0.10087054967880249, + "logits/rejected": 0.06265222281217575, + "logps/chosen": -4.439296722412109, + "logps/rejected": -5.2684125900268555, + "loss": 0.5602, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.439296722412109, + "rewards/margins": 0.8291157484054565, + "rewards/rejected": -5.2684125900268555, "step": 3200 }, { "epoch": 1.7126609800970063, - "eval_logits/chosen": 0.8485396504402161, - "eval_logits/rejected": 0.9051938652992249, - "eval_logps/chosen": -8.875685691833496, - "eval_logps/rejected": -9.834607124328613, - "eval_loss": 0.5206592679023743, - "eval_rewards/accuracies": 0.716617226600647, - "eval_rewards/chosen": -8.875685691833496, - "eval_rewards/margins": 0.9589214324951172, - "eval_rewards/rejected": -9.834607124328613, - "eval_runtime": 35.3345, - "eval_samples_per_second": 38.065, - "eval_semantic_entropy": 0.0029725246131420135, - "eval_steps_per_second": 9.537, + "eval_logits/chosen": 0.23622488975524902, + "eval_logits/rejected": 0.347230464220047, + "eval_logps/chosen": -4.331513404846191, + "eval_logps/rejected": -5.250945091247559, + "eval_loss": 0.5173797011375427, + "eval_rewards/accuracies": 0.7381305694580078, + "eval_rewards/chosen": -4.331513404846191, + "eval_rewards/margins": 0.9194318056106567, + "eval_rewards/rejected": -5.250945091247559, + "eval_runtime": 40.098, + "eval_samples_per_second": 33.543, + "eval_steps_per_second": 8.404, "step": 3200 }, { "epoch": 1.7153370128784078, - "grad_norm": 16.489166146612195, + "grad_norm": 14.560935681363715, "learning_rate": 4.618806587294291e-07, - "logits/chosen": 0.6345168948173523, - "logits/rejected": 0.724500834941864, - "logps/chosen": -8.844565391540527, - "logps/rejected": -9.868181228637695, - "loss": 0.491, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.844565391540527, - "rewards/margins": 1.023616075515747, - "rewards/rejected": -9.868181228637695, - "semantic_entropy": 0.0029267659410834312, + "logits/chosen": -0.2483961582183838, + "logits/rejected": -0.12073061615228653, + "logps/chosen": -4.123127460479736, + "logps/rejected": -5.143646240234375, + "loss": 0.4697, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.123127460479736, + "rewards/margins": 1.0205187797546387, + "rewards/rejected": -5.143646240234375, "step": 3205 }, { "epoch": 1.7180130456598093, - "grad_norm": 20.284904352984434, + "grad_norm": 18.661089209850036, "learning_rate": 4.603279761638365e-07, - "logits/chosen": 0.6574803590774536, - "logits/rejected": 0.7301944494247437, - "logps/chosen": -8.73315143585205, - "logps/rejected": -9.601076126098633, - "loss": 0.5384, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.73315143585205, - "rewards/margins": 0.8679240942001343, - "rewards/rejected": -9.601076126098633, - "semantic_entropy": 0.003197681624442339, + "logits/chosen": -0.21365094184875488, + "logits/rejected": -0.08110430836677551, + "logps/chosen": -4.18242073059082, + "logps/rejected": -4.97620153427124, + "loss": 0.548, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.18242073059082, + "rewards/margins": 0.7937807440757751, + "rewards/rejected": -4.97620153427124, "step": 3210 }, { "epoch": 1.720689078441211, - "grad_norm": 18.178656885884827, + "grad_norm": 18.751067387727602, "learning_rate": 4.5877567849639315e-07, - "logits/chosen": 0.7295519709587097, - "logits/rejected": 0.775715708732605, - "logps/chosen": -8.844693183898926, - "logps/rejected": -9.844103813171387, - "loss": 0.4747, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.844693183898926, - "rewards/margins": 0.9994112253189087, - "rewards/rejected": -9.844103813171387, - "semantic_entropy": 0.003269757376983762, + "logits/chosen": -0.13676145672798157, + "logits/rejected": -0.02357124350965023, + "logps/chosen": -4.17262077331543, + "logps/rejected": -5.113052845001221, + "loss": 0.4781, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.17262077331543, + "rewards/margins": 0.9404315948486328, + "rewards/rejected": -5.113052845001221, "step": 3215 }, { "epoch": 1.7233651112226125, - "grad_norm": 18.18536875280265, + "grad_norm": 15.14292884303689, "learning_rate": 4.572237807874979e-07, - "logits/chosen": 0.7071816325187683, - "logits/rejected": 0.8377809524536133, - "logps/chosen": -9.233766555786133, - "logps/rejected": -10.121223449707031, - "loss": 0.5734, - "rewards/accuracies": 0.6875, - "rewards/chosen": -9.233766555786133, - "rewards/margins": 0.887457549571991, - "rewards/rejected": -10.121223449707031, - "semantic_entropy": 0.0021587200462818146, + "logits/chosen": -0.254339337348938, + "logits/rejected": 0.025339126586914062, + "logps/chosen": -4.56683349609375, + "logps/rejected": -5.518930435180664, + "loss": 0.5444, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.56683349609375, + "rewards/margins": 0.9520975351333618, + "rewards/rejected": -5.518930435180664, "step": 3220 }, { "epoch": 1.726041144004014, - "grad_norm": 19.728824808262466, + "grad_norm": 15.869497785864858, "learning_rate": 4.5567229809366895e-07, - "logits/chosen": 0.7191265225410461, - "logits/rejected": 0.7812397480010986, - "logps/chosen": -8.780452728271484, - "logps/rejected": -9.705193519592285, - "loss": 0.5172, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.780452728271484, - "rewards/margins": 0.9247404932975769, - "rewards/rejected": -9.705193519592285, - "semantic_entropy": 0.0030050217173993587, + "logits/chosen": -0.15390262007713318, + "logits/rejected": -0.013458246365189552, + "logps/chosen": -4.316327095031738, + "logps/rejected": -5.195765495300293, + "loss": 0.5068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.316327095031738, + "rewards/margins": 0.8794384002685547, + "rewards/rejected": -5.195765495300293, "step": 3225 }, { "epoch": 1.7287171767854157, - "grad_norm": 22.902320646291393, + "grad_norm": 23.933945487462342, "learning_rate": 4.541212454673984e-07, - "logits/chosen": 0.7195814847946167, - "logits/rejected": 0.7792760133743286, - "logps/chosen": -9.003668785095215, - "logps/rejected": -10.163053512573242, - "loss": 0.4755, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.003668785095215, - "rewards/margins": 1.159385085105896, - "rewards/rejected": -10.163053512573242, - "semantic_entropy": 0.0027733384631574154, + "logits/chosen": -0.17181718349456787, + "logits/rejected": 0.01202545128762722, + "logps/chosen": -4.428623199462891, + "logps/rejected": -5.631206512451172, + "loss": 0.4636, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.428623199462891, + "rewards/margins": 1.2025833129882812, + "rewards/rejected": -5.631206512451172, "step": 3230 }, { "epoch": 1.7313932095668172, - "grad_norm": 21.27644304238722, + "grad_norm": 18.840855336084854, "learning_rate": 4.525706379570055e-07, - "logits/chosen": 0.754095196723938, - "logits/rejected": 0.8056744337081909, - "logps/chosen": -8.933822631835938, - "logps/rejected": -9.91698169708252, - "loss": 0.5001, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.933822631835938, - "rewards/margins": 0.983159065246582, - "rewards/rejected": -9.91698169708252, - "semantic_entropy": 0.002852677833288908, + "logits/chosen": -0.21135802567005157, + "logits/rejected": -0.09564751386642456, + "logps/chosen": -4.4705305099487305, + "logps/rejected": -5.403426170349121, + "loss": 0.493, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.4705305099487305, + "rewards/margins": 0.9328955411911011, + "rewards/rejected": -5.403426170349121, "step": 3235 }, { "epoch": 1.7340692423482187, - "grad_norm": 16.305218184451377, + "grad_norm": 15.475508350274092, "learning_rate": 4.510204906064911e-07, - "logits/chosen": 0.7781286239624023, - "logits/rejected": 0.8381627798080444, - "logps/chosen": -9.009490013122559, - "logps/rejected": -10.12246036529541, - "loss": 0.4383, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.009490013122559, - "rewards/margins": 1.1129701137542725, - "rewards/rejected": -10.12246036529541, - "semantic_entropy": 0.0021688812412321568, + "logits/chosen": -0.13211984932422638, + "logits/rejected": 0.03232773393392563, + "logps/chosen": -4.4460225105285645, + "logps/rejected": -5.514318943023682, + "loss": 0.4659, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.4460225105285645, + "rewards/margins": 1.0682958364486694, + "rewards/rejected": -5.514318943023682, "step": 3240 }, { "epoch": 1.7367452751296204, - "grad_norm": 21.167999522279032, + "grad_norm": 17.370736375041048, "learning_rate": 4.4947081845539177e-07, - "logits/chosen": 0.7233031988143921, - "logits/rejected": 0.7836328148841858, - "logps/chosen": -9.125692367553711, - "logps/rejected": -10.023492813110352, - "loss": 0.5164, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -9.125692367553711, - "rewards/margins": 0.8977994918823242, - "rewards/rejected": -10.023492813110352, - "semantic_entropy": 0.0024925144389271736, + "logits/chosen": -0.2612045407295227, + "logits/rejected": -0.1181323304772377, + "logps/chosen": -4.549315452575684, + "logps/rejected": -5.401080131530762, + "loss": 0.5392, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.549315452575684, + "rewards/margins": 0.8517640829086304, + "rewards/rejected": -5.401080131530762, "step": 3245 }, { "epoch": 1.739421307911022, - "grad_norm": 17.847444177521478, + "grad_norm": 15.891473488750362, "learning_rate": 4.479216365386333e-07, - "logits/chosen": 0.7969452142715454, - "logits/rejected": 0.8838433027267456, - "logps/chosen": -9.0504789352417, - "logps/rejected": -10.168962478637695, - "loss": 0.445, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.0504789352417, - "rewards/margins": 1.1184842586517334, - "rewards/rejected": -10.168962478637695, - "semantic_entropy": 0.00228295405395329, + "logits/chosen": -0.11899392306804657, + "logits/rejected": 0.09806482493877411, + "logps/chosen": -4.545066833496094, + "logps/rejected": -5.613814353942871, + "loss": 0.4826, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.545066833496094, + "rewards/margins": 1.068747878074646, + "rewards/rejected": -5.613814353942871, "step": 3250 }, { "epoch": 1.7420973406924234, - "grad_norm": 13.863230971283963, + "grad_norm": 12.764649010634617, "learning_rate": 4.4637295988638555e-07, - "logits/chosen": 0.7870410680770874, - "logits/rejected": 0.8611448407173157, - "logps/chosen": -8.87813663482666, - "logps/rejected": -9.888373374938965, - "loss": 0.4735, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.87813663482666, - "rewards/margins": 1.0102384090423584, - "rewards/rejected": -9.888373374938965, - "semantic_entropy": 0.00256515690125525, + "logits/chosen": -0.16263428330421448, + "logits/rejected": -0.03970929607748985, + "logps/chosen": -4.319584846496582, + "logps/rejected": -5.312398910522461, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.319584846496582, + "rewards/margins": 0.9928141832351685, + "rewards/rejected": -5.312398910522461, "step": 3255 }, { "epoch": 1.744773373473825, - "grad_norm": 23.662117895070335, + "grad_norm": 23.678446980790923, "learning_rate": 4.4482480352391623e-07, - "logits/chosen": 0.6543598175048828, - "logits/rejected": 0.7426427006721497, - "logps/chosen": -9.041504859924316, - "logps/rejected": -10.04463005065918, - "loss": 0.4806, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.041504859924316, - "rewards/margins": 1.0031250715255737, - "rewards/rejected": -10.04463005065918, - "semantic_entropy": 0.0023853727616369724, + "logits/chosen": -0.18533803522586823, + "logits/rejected": -0.04181788116693497, + "logps/chosen": -4.463208198547363, + "logps/rejected": -5.43758487701416, + "loss": 0.4871, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.463208198547363, + "rewards/margins": 0.9743775129318237, + "rewards/rejected": -5.43758487701416, "step": 3260 }, { "epoch": 1.7474494062552266, - "grad_norm": 24.256153218206705, + "grad_norm": 24.563102035360398, "learning_rate": 4.4327718247144507e-07, - "logits/chosen": 0.7513245940208435, - "logits/rejected": 0.8328276872634888, - "logps/chosen": -9.090994834899902, - "logps/rejected": -10.088811874389648, - "loss": 0.4761, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.090994834899902, - "rewards/margins": 0.9978184700012207, - "rewards/rejected": -10.088811874389648, - "semantic_entropy": 0.002217040164396167, + "logits/chosen": -0.10527654737234116, + "logits/rejected": 0.03702473267912865, + "logps/chosen": -4.315720558166504, + "logps/rejected": -5.340867042541504, + "loss": 0.4846, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.315720558166504, + "rewards/margins": 1.0251458883285522, + "rewards/rejected": -5.340867042541504, "step": 3265 }, { "epoch": 1.750125439036628, - "grad_norm": 26.314796306829628, + "grad_norm": 23.491136508107537, "learning_rate": 4.417301117439984e-07, - "logits/chosen": 0.7460024356842041, - "logits/rejected": 0.8103092312812805, - "logps/chosen": -9.169168472290039, - "logps/rejected": -10.078702926635742, - "loss": 0.5253, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -9.169168472290039, - "rewards/margins": 0.9095350503921509, - "rewards/rejected": -10.078702926635742, - "semantic_entropy": 0.0022672966588288546, + "logits/chosen": -0.15799733996391296, + "logits/rejected": -0.03357836231589317, + "logps/chosen": -4.52625036239624, + "logps/rejected": -5.480228424072266, + "loss": 0.5027, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.52625036239624, + "rewards/margins": 0.9539781808853149, + "rewards/rejected": -5.480228424072266, "step": 3270 }, { "epoch": 1.7528014718180298, - "grad_norm": 18.707979583798863, + "grad_norm": 18.13962881645596, "learning_rate": 4.401836063512631e-07, - "logits/chosen": 0.7222810983657837, - "logits/rejected": 0.8605899810791016, - "logps/chosen": -8.943084716796875, - "logps/rejected": -10.04680347442627, - "loss": 0.4723, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.943084716796875, - "rewards/margins": 1.1037187576293945, - "rewards/rejected": -10.04680347442627, - "semantic_entropy": 0.002709039021283388, + "logits/chosen": -0.22965040802955627, + "logits/rejected": 0.11366041749715805, + "logps/chosen": -4.402631759643555, + "logps/rejected": -5.404372215270996, + "loss": 0.4869, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.402631759643555, + "rewards/margins": 1.0017400979995728, + "rewards/rejected": -5.404372215270996, "step": 3275 }, { "epoch": 1.7554775045994313, - "grad_norm": 24.168153990326203, + "grad_norm": 23.516838872808083, "learning_rate": 4.386376812974413e-07, - "logits/chosen": 0.6883140802383423, - "logits/rejected": 0.7442909479141235, - "logps/chosen": -8.88626766204834, - "logps/rejected": -9.887288093566895, - "loss": 0.4843, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -8.88626766204834, - "rewards/margins": 1.0010201930999756, - "rewards/rejected": -9.887288093566895, - "semantic_entropy": 0.002547713927924633, + "logits/chosen": -0.18983802199363708, + "logits/rejected": -0.10174103081226349, + "logps/chosen": -3.8793816566467285, + "logps/rejected": -4.9282755851745605, + "loss": 0.4759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.8793816566467285, + "rewards/margins": 1.0488945245742798, + "rewards/rejected": -4.9282755851745605, "step": 3280 }, { "epoch": 1.7581535373808328, - "grad_norm": 21.306584019538054, + "grad_norm": 16.16734173697255, "learning_rate": 4.370923515811048e-07, - "logits/chosen": 0.7414734363555908, - "logits/rejected": 0.8382769823074341, - "logps/chosen": -9.08434009552002, - "logps/rejected": -10.091033935546875, - "loss": 0.4818, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.08434009552002, - "rewards/margins": 1.0066949129104614, - "rewards/rejected": -10.091033935546875, - "semantic_entropy": 0.002196715446189046, + "logits/chosen": -0.16550886631011963, + "logits/rejected": 0.0685269683599472, + "logps/chosen": -4.262026786804199, + "logps/rejected": -5.302098751068115, + "loss": 0.4625, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.262026786804199, + "rewards/margins": 1.0400713682174683, + "rewards/rejected": -5.302098751068115, "step": 3285 }, { "epoch": 1.7608295701622345, - "grad_norm": 17.567070937529156, + "grad_norm": 16.20480251179305, "learning_rate": 4.35547632195049e-07, - "logits/chosen": 0.7264934778213501, - "logits/rejected": 0.8058233261108398, - "logps/chosen": -8.905702590942383, - "logps/rejected": -9.89527416229248, - "loss": 0.456, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.905702590942383, - "rewards/margins": 0.9895727038383484, - "rewards/rejected": -9.89527416229248, - "semantic_entropy": 0.0023129256442189217, + "logits/chosen": -0.12412941455841064, + "logits/rejected": 0.01536425482481718, + "logps/chosen": -4.020644664764404, + "logps/rejected": -4.96931266784668, + "loss": 0.4688, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.020644664764404, + "rewards/margins": 0.9486686587333679, + "rewards/rejected": -4.96931266784668, "step": 3290 }, { "epoch": 1.763505602943636, - "grad_norm": 21.091679239749574, + "grad_norm": 18.49155351293187, "learning_rate": 4.340035381261484e-07, - "logits/chosen": 0.7000614404678345, - "logits/rejected": 0.7599838972091675, - "logps/chosen": -9.041738510131836, - "logps/rejected": -10.077522277832031, - "loss": 0.4989, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.041738510131836, - "rewards/margins": 1.0357847213745117, - "rewards/rejected": -10.077522277832031, - "semantic_entropy": 0.002556400140747428, + "logits/chosen": -0.12859898805618286, + "logits/rejected": -0.011179554276168346, + "logps/chosen": -4.367478847503662, + "logps/rejected": -5.33084774017334, + "loss": 0.5202, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.367478847503662, + "rewards/margins": 0.9633690714836121, + "rewards/rejected": -5.33084774017334, "step": 3295 }, { "epoch": 1.7661816357250375, - "grad_norm": 19.4750798931357, + "grad_norm": 19.236099975610713, "learning_rate": 4.324600843552104e-07, - "logits/chosen": 0.61224764585495, - "logits/rejected": 0.694617509841919, - "logps/chosen": -9.045601844787598, - "logps/rejected": -10.111780166625977, - "loss": 0.5129, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.045601844787598, - "rewards/margins": 1.0661789178848267, - "rewards/rejected": -10.111780166625977, - "semantic_entropy": 0.002750884275883436, + "logits/chosen": -0.22424840927124023, + "logits/rejected": -0.054463207721710205, + "logps/chosen": -4.646658420562744, + "logps/rejected": -5.635561943054199, + "loss": 0.508, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.646658420562744, + "rewards/margins": 0.9889039993286133, + "rewards/rejected": -5.635561943054199, "step": 3300 }, { "epoch": 1.7688576685064392, - "grad_norm": 27.700841135900863, + "grad_norm": 21.528822461151616, "learning_rate": 4.309172858568302e-07, - "logits/chosen": 0.6138121485710144, - "logits/rejected": 0.7267636060714722, - "logps/chosen": -8.853018760681152, - "logps/rejected": -9.896512985229492, - "loss": 0.4664, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.853018760681152, - "rewards/margins": 1.0434927940368652, - "rewards/rejected": -9.896512985229492, - "semantic_entropy": 0.002849545329809189, + "logits/chosen": -0.25173068046569824, + "logits/rejected": -0.03483206778764725, + "logps/chosen": -4.445437431335449, + "logps/rejected": -5.461155414581299, + "loss": 0.4912, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.445437431335449, + "rewards/margins": 1.0157172679901123, + "rewards/rejected": -5.461155414581299, "step": 3305 }, { "epoch": 1.771533701287841, - "grad_norm": 17.137464902365757, + "grad_norm": 17.72386634956042, "learning_rate": 4.293751575992455e-07, - "logits/chosen": 0.7429224848747253, - "logits/rejected": 0.792006254196167, - "logps/chosen": -8.867963790893555, - "logps/rejected": -9.816696166992188, - "loss": 0.4852, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.867963790893555, - "rewards/margins": 0.9487320184707642, - "rewards/rejected": -9.816696166992188, - "semantic_entropy": 0.0024321440141648054, + "logits/chosen": -0.033337343484163284, + "logits/rejected": 0.024440549314022064, + "logps/chosen": -4.518048286437988, + "logps/rejected": -5.440402030944824, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.518048286437988, + "rewards/margins": 0.9223540425300598, + "rewards/rejected": -5.440402030944824, "step": 3310 }, { "epoch": 1.7742097340692422, - "grad_norm": 23.093031626174174, + "grad_norm": 28.00697313993164, "learning_rate": 4.278337145441916e-07, - "logits/chosen": 0.703718900680542, - "logits/rejected": 0.7882632613182068, - "logps/chosen": -8.92485237121582, - "logps/rejected": -9.829282760620117, - "loss": 0.4997, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.92485237121582, - "rewards/margins": 0.904431939125061, - "rewards/rejected": -9.829282760620117, - "semantic_entropy": 0.00211041746661067, + "logits/chosen": -0.1750796139240265, + "logits/rejected": 0.0117019172757864, + "logps/chosen": -4.513766765594482, + "logps/rejected": -5.465371131896973, + "loss": 0.4922, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.513766765594482, + "rewards/margins": 0.9516035914421082, + "rewards/rejected": -5.465371131896973, "step": 3315 }, { "epoch": 1.776885766850644, - "grad_norm": 14.81174004133826, + "grad_norm": 14.21248158151053, "learning_rate": 4.262929716467556e-07, - "logits/chosen": 0.7307204008102417, - "logits/rejected": 0.828132152557373, - "logps/chosen": -8.699949264526367, - "logps/rejected": -9.868879318237305, - "loss": 0.4528, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.699949264526367, - "rewards/margins": 1.1689304113388062, - "rewards/rejected": -9.868879318237305, - "semantic_entropy": 0.0027522039599716663, + "logits/chosen": -0.15160226821899414, + "logits/rejected": 0.10243771225214005, + "logps/chosen": -4.445255279541016, + "logps/rejected": -5.540156364440918, + "loss": 0.4974, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.445255279541016, + "rewards/margins": 1.0949010848999023, + "rewards/rejected": -5.540156364440918, "step": 3320 }, { "epoch": 1.7795617996320456, - "grad_norm": 21.68742926157539, + "grad_norm": 16.15953465977365, "learning_rate": 4.247529438552321e-07, - "logits/chosen": 0.6795674562454224, - "logits/rejected": 0.7630687355995178, - "logps/chosen": -8.83338451385498, - "logps/rejected": -9.718835830688477, - "loss": 0.5331, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -8.83338451385498, - "rewards/margins": 0.8854507207870483, - "rewards/rejected": -9.718835830688477, - "semantic_entropy": 0.0027334585320204496, + "logits/chosen": -0.19283165037631989, + "logits/rejected": -0.012377461418509483, + "logps/chosen": -4.383761405944824, + "logps/rejected": -5.306110382080078, + "loss": 0.5112, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.383761405944824, + "rewards/margins": 0.9223492741584778, + "rewards/rejected": -5.306110382080078, "step": 3325 }, { "epoch": 1.782237832413447, - "grad_norm": 17.78803459405486, + "grad_norm": 19.934527696572324, "learning_rate": 4.232136461109773e-07, - "logits/chosen": 0.6920473575592041, - "logits/rejected": 0.7552872896194458, - "logps/chosen": -8.73144245147705, - "logps/rejected": -9.89186954498291, - "loss": 0.4425, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.73144245147705, - "rewards/margins": 1.1604268550872803, - "rewards/rejected": -9.89186954498291, - "semantic_entropy": 0.0029787137173116207, + "logits/chosen": -0.13882964849472046, + "logits/rejected": -0.0015836369711905718, + "logps/chosen": -4.2797040939331055, + "logps/rejected": -5.368746757507324, + "loss": 0.4603, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.2797040939331055, + "rewards/margins": 1.0890430212020874, + "rewards/rejected": -5.368746757507324, "step": 3330 }, { "epoch": 1.7849138651948486, - "grad_norm": 26.15974232065996, + "grad_norm": 29.360741053715895, "learning_rate": 4.216750933482646e-07, - "logits/chosen": 0.6749182939529419, - "logits/rejected": 0.7685472965240479, - "logps/chosen": -8.99049186706543, - "logps/rejected": -9.847522735595703, - "loss": 0.5483, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.99049186706543, - "rewards/margins": 0.8570305705070496, - "rewards/rejected": -9.847522735595703, - "semantic_entropy": 0.002466335194185376, + "logits/chosen": -0.15821383893489838, + "logits/rejected": 0.0275881327688694, + "logps/chosen": -4.581524848937988, + "logps/rejected": -5.432961463928223, + "loss": 0.5266, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.581524848937988, + "rewards/margins": 0.8514370918273926, + "rewards/rejected": -5.432961463928223, "step": 3335 }, { "epoch": 1.7875898979762503, - "grad_norm": 27.88902855871644, + "grad_norm": 32.562591289121045, "learning_rate": 4.2013730049413986e-07, - "logits/chosen": 0.7373770475387573, - "logits/rejected": 0.8123876452445984, - "logps/chosen": -8.785151481628418, - "logps/rejected": -9.975650787353516, - "loss": 0.4473, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -8.785151481628418, - "rewards/margins": 1.1904983520507812, - "rewards/rejected": -9.975650787353516, - "semantic_entropy": 0.0027192619163542986, + "logits/chosen": -0.1358291208744049, + "logits/rejected": 0.043612316250801086, + "logps/chosen": -4.320347309112549, + "logps/rejected": -5.4458723068237305, + "loss": 0.4657, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.320347309112549, + "rewards/margins": 1.1255255937576294, + "rewards/rejected": -5.4458723068237305, "step": 3340 }, { "epoch": 1.7902659307576518, - "grad_norm": 18.711785224251184, + "grad_norm": 14.942489098510793, "learning_rate": 4.1860028246827594e-07, - "logits/chosen": 0.7438098788261414, - "logits/rejected": 0.8496102094650269, - "logps/chosen": -8.687559127807617, - "logps/rejected": -9.705583572387695, - "loss": 0.4863, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.687559127807617, - "rewards/margins": 1.0180258750915527, - "rewards/rejected": -9.705583572387695, - "semantic_entropy": 0.0030276733450591564, + "logits/chosen": -0.13863849639892578, + "logits/rejected": 0.07033483684062958, + "logps/chosen": -4.108242511749268, + "logps/rejected": -5.095074653625488, + "loss": 0.4724, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.108242511749268, + "rewards/margins": 0.9868319630622864, + "rewards/rejected": -5.095074653625488, "step": 3345 }, { "epoch": 1.7929419635390533, - "grad_norm": 14.740013723796874, + "grad_norm": 16.96376010135844, "learning_rate": 4.170640541828285e-07, - "logits/chosen": 0.6757484674453735, - "logits/rejected": 0.7701447606086731, - "logps/chosen": -8.937789916992188, - "logps/rejected": -9.954288482666016, - "loss": 0.4742, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.937789916992188, - "rewards/margins": 1.0164979696273804, - "rewards/rejected": -9.954288482666016, - "semantic_entropy": 0.0024529777001589537, + "logits/chosen": -0.2579251825809479, + "logits/rejected": -0.12016648054122925, + "logps/chosen": -4.418516635894775, + "logps/rejected": -5.2958478927612305, + "loss": 0.5054, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.418516635894775, + "rewards/margins": 0.8773313760757446, + "rewards/rejected": -5.2958478927612305, "step": 3350 }, { "epoch": 1.795617996320455, - "grad_norm": 23.48077638997515, + "grad_norm": 23.22663933059764, "learning_rate": 4.1552863054229116e-07, - "logits/chosen": 0.7250600457191467, - "logits/rejected": 0.7637051343917847, - "logps/chosen": -8.986780166625977, - "logps/rejected": -9.95530891418457, - "loss": 0.5204, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.986780166625977, - "rewards/margins": 0.968528151512146, - "rewards/rejected": -9.95530891418457, - "semantic_entropy": 0.0026002321392297745, + "logits/chosen": -0.06798188388347626, + "logits/rejected": 0.003281953977420926, + "logps/chosen": -4.643111705780029, + "logps/rejected": -5.424739837646484, + "loss": 0.5757, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.643111705780029, + "rewards/margins": 0.7816286087036133, + "rewards/rejected": -5.424739837646484, "step": 3355 }, { "epoch": 1.7982940291018565, - "grad_norm": 21.480655584717656, + "grad_norm": 19.29847318386424, "learning_rate": 4.139940264433508e-07, - "logits/chosen": 0.6162451505661011, - "logits/rejected": 0.6867518424987793, - "logps/chosen": -8.727703094482422, - "logps/rejected": -9.817054748535156, - "loss": 0.485, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.727703094482422, - "rewards/margins": 1.0893512964248657, - "rewards/rejected": -9.817054748535156, - "semantic_entropy": 0.0029442054219543934, + "logits/chosen": -0.21622474491596222, + "logits/rejected": -0.011190185323357582, + "logps/chosen": -4.281977653503418, + "logps/rejected": -5.282488822937012, + "loss": 0.4851, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.281977653503418, + "rewards/margins": 1.0005111694335938, + "rewards/rejected": -5.282488822937012, "step": 3360 }, { "epoch": 1.800970061883258, - "grad_norm": 18.329533144366998, + "grad_norm": 16.71953168466975, "learning_rate": 4.1246025677474303e-07, - "logits/chosen": 0.6584054231643677, - "logits/rejected": 0.7416545152664185, - "logps/chosen": -8.870896339416504, - "logps/rejected": -9.782048225402832, - "loss": 0.4965, - "rewards/accuracies": 0.8125, - "rewards/chosen": -8.870896339416504, - "rewards/margins": 0.9111523628234863, - "rewards/rejected": -9.782048225402832, - "semantic_entropy": 0.0025828261859714985, + "logits/chosen": -0.22282078862190247, + "logits/rejected": -0.0275583453476429, + "logps/chosen": -4.534602165222168, + "logps/rejected": -5.3538737297058105, + "loss": 0.5107, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.534602165222168, + "rewards/margins": 0.8192712664604187, + "rewards/rejected": -5.3538737297058105, "step": 3365 }, { "epoch": 1.8036460946646597, - "grad_norm": 20.53246303343791, + "grad_norm": 19.585169856257103, "learning_rate": 4.10927336417108e-07, - "logits/chosen": 0.699885368347168, - "logits/rejected": 0.7778645753860474, - "logps/chosen": -9.00536060333252, - "logps/rejected": -9.712576866149902, - "loss": 0.6029, - "rewards/accuracies": 0.71875, - "rewards/chosen": -9.00536060333252, - "rewards/margins": 0.7072166204452515, - "rewards/rejected": -9.712576866149902, - "semantic_entropy": 0.0022258516401052475, + "logits/chosen": -0.1755405217409134, + "logits/rejected": -0.007827716879546642, + "logps/chosen": -4.58895206451416, + "logps/rejected": -5.3027544021606445, + "loss": 0.5797, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.58895206451416, + "rewards/margins": 0.7138028740882874, + "rewards/rejected": -5.3027544021606445, "step": 3370 }, { "epoch": 1.8063221274460612, - "grad_norm": 17.92382157104711, + "grad_norm": 20.23837833215681, "learning_rate": 4.093952802428457e-07, - "logits/chosen": 0.7124849557876587, - "logits/rejected": 0.773395836353302, - "logps/chosen": -9.165143013000488, - "logps/rejected": -10.00512409210205, - "loss": 0.5968, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -9.165143013000488, - "rewards/margins": 0.8399818539619446, - "rewards/rejected": -10.00512409210205, - "semantic_entropy": 0.0018791807815432549, + "logits/chosen": -0.03490014001727104, + "logits/rejected": 0.05154269188642502, + "logps/chosen": -4.746885299682617, + "logps/rejected": -5.609267234802246, + "loss": 0.5728, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.746885299682617, + "rewards/margins": 0.8623818159103394, + "rewards/rejected": -5.609267234802246, "step": 3375 }, { "epoch": 1.8089981602274627, - "grad_norm": 16.031873167807106, + "grad_norm": 15.462209370895746, "learning_rate": 4.0786410311597184e-07, - "logits/chosen": 0.6675196886062622, - "logits/rejected": 0.7558413743972778, - "logps/chosen": -8.87452220916748, - "logps/rejected": -9.87469482421875, - "loss": 0.5014, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.87452220916748, - "rewards/margins": 1.0001723766326904, - "rewards/rejected": -9.87469482421875, - "semantic_entropy": 0.0024310979060828686, + "logits/chosen": -0.2470814734697342, + "logits/rejected": -0.03646931052207947, + "logps/chosen": -4.390835762023926, + "logps/rejected": -5.270857810974121, + "loss": 0.5264, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.390835762023926, + "rewards/margins": 0.8800222277641296, + "rewards/rejected": -5.270857810974121, "step": 3380 }, { "epoch": 1.8116741930088645, - "grad_norm": 17.18330270756532, + "grad_norm": 17.048958526174218, "learning_rate": 4.063338198919737e-07, - "logits/chosen": 0.6833704710006714, - "logits/rejected": 0.6954981684684753, - "logps/chosen": -8.974740982055664, - "logps/rejected": -9.834104537963867, - "loss": 0.515, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.974740982055664, - "rewards/margins": 0.8593646883964539, - "rewards/rejected": -9.834104537963867, - "semantic_entropy": 0.002467888640239835, + "logits/chosen": -0.20745804905891418, + "logits/rejected": -0.18323490023612976, + "logps/chosen": -4.59398889541626, + "logps/rejected": -5.39055871963501, + "loss": 0.5444, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.59398889541626, + "rewards/margins": 0.7965695858001709, + "rewards/rejected": -5.39055871963501, "step": 3385 }, { "epoch": 1.814350225790266, - "grad_norm": 30.761929745103753, + "grad_norm": 25.62815952119199, "learning_rate": 4.0480444541766575e-07, - "logits/chosen": 0.7065908908843994, - "logits/rejected": 0.761638343334198, - "logps/chosen": -9.091318130493164, - "logps/rejected": -9.837724685668945, - "loss": 0.5966, - "rewards/accuracies": 0.6875, - "rewards/chosen": -9.091318130493164, - "rewards/margins": 0.7464063763618469, - "rewards/rejected": -9.837724685668945, - "semantic_entropy": 0.002267292933538556, + "logits/chosen": -0.18585902452468872, + "logits/rejected": -0.031981486827135086, + "logps/chosen": -4.265715599060059, + "logps/rejected": -5.0390825271606445, + "loss": 0.5792, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.265715599060059, + "rewards/margins": 0.7733665704727173, + "rewards/rejected": -5.0390825271606445, "step": 3390 }, { "epoch": 1.8170262585716674, - "grad_norm": 17.437367219946186, + "grad_norm": 17.271570676896122, "learning_rate": 4.0327599453104606e-07, - "logits/chosen": 0.6314225792884827, - "logits/rejected": 0.7021452784538269, - "logps/chosen": -8.934675216674805, - "logps/rejected": -9.954093933105469, - "loss": 0.4614, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.934675216674805, - "rewards/margins": 1.019417405128479, - "rewards/rejected": -9.954093933105469, - "semantic_entropy": 0.002328323433175683, + "logits/chosen": -0.2425771951675415, + "logits/rejected": -0.057721178978681564, + "logps/chosen": -4.322760105133057, + "logps/rejected": -5.353698253631592, + "loss": 0.4584, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.322760105133057, + "rewards/margins": 1.030938744544983, + "rewards/rejected": -5.353698253631592, "step": 3395 }, { "epoch": 1.8197022913530692, - "grad_norm": 23.666717824573407, + "grad_norm": 22.28847303476479, "learning_rate": 4.017484820611514e-07, - "logits/chosen": 0.6827374696731567, - "logits/rejected": 0.7666773796081543, - "logps/chosen": -9.024335861206055, - "logps/rejected": -9.977819442749023, - "loss": 0.499, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.024335861206055, - "rewards/margins": 0.9534839391708374, - "rewards/rejected": -9.977819442749023, - "semantic_entropy": 0.002623113337904215, + "logits/chosen": -0.16352578997612, + "logits/rejected": -0.01084213238209486, + "logps/chosen": -4.3149895668029785, + "logps/rejected": -5.2527360916137695, + "loss": 0.4977, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.3149895668029785, + "rewards/margins": 0.9377468228340149, + "rewards/rejected": -5.2527360916137695, "step": 3400 }, { "epoch": 1.8223783241344707, - "grad_norm": 19.47814600029652, + "grad_norm": 15.72676148973139, "learning_rate": 4.002219228279148e-07, - "logits/chosen": 0.6472792029380798, - "logits/rejected": 0.7226368188858032, - "logps/chosen": -9.10517406463623, - "logps/rejected": -9.983365058898926, - "loss": 0.4792, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.10517406463623, - "rewards/margins": 0.878190815448761, - "rewards/rejected": -9.983365058898926, - "semantic_entropy": 0.001885834732092917, + "logits/chosen": -0.19310136139392853, + "logits/rejected": -0.022855300456285477, + "logps/chosen": -4.426163673400879, + "logps/rejected": -5.339369773864746, + "loss": 0.469, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.426163673400879, + "rewards/margins": 0.9132067561149597, + "rewards/rejected": -5.339369773864746, "step": 3405 }, { "epoch": 1.8250543569158721, - "grad_norm": 15.863322204582039, + "grad_norm": 17.611493487868973, "learning_rate": 3.9869633164202045e-07, - "logits/chosen": 0.6499922871589661, - "logits/rejected": 0.7298166751861572, - "logps/chosen": -9.109966278076172, - "logps/rejected": -10.095043182373047, - "loss": 0.4607, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.109966278076172, - "rewards/margins": 0.9850764274597168, - "rewards/rejected": -10.095043182373047, - "semantic_entropy": 0.00205561937764287, + "logits/chosen": -0.19281618297100067, + "logits/rejected": 0.06217436119914055, + "logps/chosen": -4.503681182861328, + "logps/rejected": -5.3663763999938965, + "loss": 0.489, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.503681182861328, + "rewards/margins": 0.8626953363418579, + "rewards/rejected": -5.3663763999938965, "step": 3410 }, { "epoch": 1.8277303896972739, - "grad_norm": 20.471803353130138, + "grad_norm": 22.440127639740595, "learning_rate": 3.9717172330476077e-07, - "logits/chosen": 0.6554244756698608, - "logits/rejected": 0.7187341451644897, - "logps/chosen": -8.939603805541992, - "logps/rejected": -9.954214096069336, - "loss": 0.4732, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -8.939603805541992, - "rewards/margins": 1.0146093368530273, - "rewards/rejected": -9.954214096069336, - "semantic_entropy": 0.002255493775010109, + "logits/chosen": -0.18121816217899323, + "logits/rejected": -0.02774154581129551, + "logps/chosen": -4.424191951751709, + "logps/rejected": -5.374784469604492, + "loss": 0.5096, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.424191951751709, + "rewards/margins": 0.9505926370620728, + "rewards/rejected": -5.374784469604492, "step": 3415 }, { "epoch": 1.8304064224786754, - "grad_norm": 17.77008986004475, + "grad_norm": 24.430760007713115, "learning_rate": 3.956481126078927e-07, - "logits/chosen": 0.7610489726066589, - "logits/rejected": 0.8220788836479187, - "logps/chosen": -8.944317817687988, - "logps/rejected": -9.942276000976562, - "loss": 0.5332, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -8.944317817687988, - "rewards/margins": 0.9979581832885742, - "rewards/rejected": -9.942276000976562, - "semantic_entropy": 0.0026616621762514114, + "logits/chosen": -0.0848013311624527, + "logits/rejected": 0.03848361223936081, + "logps/chosen": -4.305295944213867, + "logps/rejected": -5.179749488830566, + "loss": 0.5759, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.305295944213867, + "rewards/margins": 0.8744528889656067, + "rewards/rejected": -5.179749488830566, "step": 3420 }, { "epoch": 1.8330824552600768, - "grad_norm": 17.655564242212915, + "grad_norm": 13.908430138951214, "learning_rate": 3.941255143334937e-07, - "logits/chosen": 0.6452963948249817, - "logits/rejected": 0.6714794039726257, - "logps/chosen": -9.091032028198242, - "logps/rejected": -10.047563552856445, - "loss": 0.4901, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.091032028198242, - "rewards/margins": 0.9565309286117554, - "rewards/rejected": -10.047563552856445, - "semantic_entropy": 0.0020148297771811485, + "logits/chosen": -0.19720415771007538, + "logits/rejected": -0.1273278445005417, + "logps/chosen": -4.3548479080200195, + "logps/rejected": -5.337155342102051, + "loss": 0.4799, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.3548479080200195, + "rewards/margins": 0.9823066592216492, + "rewards/rejected": -5.337155342102051, "step": 3425 }, { "epoch": 1.8357584880414786, - "grad_norm": 21.687629209942305, + "grad_norm": 21.42673110426873, "learning_rate": 3.9260394325381895e-07, - "logits/chosen": 0.6120941638946533, - "logits/rejected": 0.6991773843765259, - "logps/chosen": -8.957076072692871, - "logps/rejected": -10.24323844909668, - "loss": 0.4464, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -8.957076072692871, - "rewards/margins": 1.286162257194519, - "rewards/rejected": -10.24323844909668, - "semantic_entropy": 0.002235526219010353, + "logits/chosen": -0.18583042919635773, + "logits/rejected": -0.04028434306383133, + "logps/chosen": -4.365105152130127, + "logps/rejected": -5.638331413269043, + "loss": 0.4293, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.365105152130127, + "rewards/margins": 1.2732250690460205, + "rewards/rejected": -5.638331413269043, "step": 3430 }, { "epoch": 1.83843452082288, - "grad_norm": 20.343261070644665, + "grad_norm": 21.64752095079144, "learning_rate": 3.9108341413115784e-07, - "logits/chosen": 0.6617427468299866, - "logits/rejected": 0.7312533259391785, - "logps/chosen": -9.039822578430176, - "logps/rejected": -10.031728744506836, - "loss": 0.4638, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.039822578430176, - "rewards/margins": 0.9919074177742004, - "rewards/rejected": -10.031728744506836, - "semantic_entropy": 0.002043725224211812, + "logits/chosen": -0.19199156761169434, + "logits/rejected": -0.05491490289568901, + "logps/chosen": -4.3316545486450195, + "logps/rejected": -5.378331184387207, + "loss": 0.4224, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.3316545486450195, + "rewards/margins": 1.0466768741607666, + "rewards/rejected": -5.378331184387207, "step": 3435 }, { "epoch": 1.8411105536042816, - "grad_norm": 23.236450227403648, + "grad_norm": 21.085487019608607, "learning_rate": 3.895639417176905e-07, - "logits/chosen": 0.6440222859382629, - "logits/rejected": 0.7037031054496765, - "logps/chosen": -9.063264846801758, - "logps/rejected": -10.010323524475098, - "loss": 0.5479, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -9.063264846801758, - "rewards/margins": 0.9470599889755249, - "rewards/rejected": -10.010323524475098, - "semantic_entropy": 0.0022424368653446436, + "logits/chosen": -0.2173265963792801, + "logits/rejected": -0.10059396177530289, + "logps/chosen": -4.24751091003418, + "logps/rejected": -5.2151336669921875, + "loss": 0.5345, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.24751091003418, + "rewards/margins": 0.9676225781440735, + "rewards/rejected": -5.2151336669921875, "step": 3440 }, { "epoch": 1.8437865863856833, - "grad_norm": 19.62330937214821, + "grad_norm": 18.777693184878892, "learning_rate": 3.8804554075534497e-07, - "logits/chosen": 0.6262876987457275, - "logits/rejected": 0.7453981041908264, - "logps/chosen": -8.971453666687012, - "logps/rejected": -10.038634300231934, - "loss": 0.5028, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -8.971453666687012, - "rewards/margins": 1.067180871963501, - "rewards/rejected": -10.038634300231934, - "semantic_entropy": 0.002401644829660654, + "logits/chosen": -0.19751515984535217, + "logits/rejected": 0.03677196055650711, + "logps/chosen": -4.426905632019043, + "logps/rejected": -5.4918694496154785, + "loss": 0.4708, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.426905632019043, + "rewards/margins": 1.0649640560150146, + "rewards/rejected": -5.4918694496154785, "step": 3445 }, { "epoch": 1.8464626191670848, - "grad_norm": 20.685899363455015, + "grad_norm": 17.172953479462663, "learning_rate": 3.8652822597565403e-07, - "logits/chosen": 0.6409908533096313, - "logits/rejected": 0.7288905382156372, - "logps/chosen": -9.046746253967285, - "logps/rejected": -10.134294509887695, - "loss": 0.4441, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.046746253967285, - "rewards/margins": 1.0875482559204102, - "rewards/rejected": -10.134294509887695, - "semantic_entropy": 0.0022094310261309147, + "logits/chosen": -0.2609812617301941, + "logits/rejected": -0.07055485248565674, + "logps/chosen": -4.486452579498291, + "logps/rejected": -5.519837856292725, + "loss": 0.4686, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.486452579498291, + "rewards/margins": 1.0333855152130127, + "rewards/rejected": -5.519837856292725, "step": 3450 }, { "epoch": 1.8491386519484863, - "grad_norm": 21.553543305017367, + "grad_norm": 19.956976634962356, "learning_rate": 3.850120120996123e-07, - "logits/chosen": 0.6723691821098328, - "logits/rejected": 0.797301173210144, - "logps/chosen": -9.249448776245117, - "logps/rejected": -10.260098457336426, - "loss": 0.525, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.249448776245117, - "rewards/margins": 1.0106487274169922, - "rewards/rejected": -10.260098457336426, - "semantic_entropy": 0.0019731963984668255, + "logits/chosen": -0.171308696269989, + "logits/rejected": 0.05414406210184097, + "logps/chosen": -4.690035820007324, + "logps/rejected": -5.594363689422607, + "loss": 0.5517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.690035820007324, + "rewards/margins": 0.9043287038803101, + "rewards/rejected": -5.594363689422607, "step": 3455 }, { "epoch": 1.851814684729888, - "grad_norm": 18.840448032000573, + "grad_norm": 19.128930684530037, "learning_rate": 3.8349691383753356e-07, - "logits/chosen": 0.7298885583877563, - "logits/rejected": 0.7910041213035583, - "logps/chosen": -8.926676750183105, - "logps/rejected": -9.932170867919922, - "loss": 0.4829, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.926676750183105, - "rewards/margins": 1.0054935216903687, - "rewards/rejected": -9.932170867919922, - "semantic_entropy": 0.0026156664825975895, + "logits/chosen": -0.0617978498339653, + "logits/rejected": 0.07501844316720963, + "logps/chosen": -4.344292640686035, + "logps/rejected": -5.307211399078369, + "loss": 0.4951, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.344292640686035, + "rewards/margins": 0.962918758392334, + "rewards/rejected": -5.307211399078369, "step": 3460 }, { "epoch": 1.8544907175112895, - "grad_norm": 22.35639759207699, + "grad_norm": 15.25137918953851, "learning_rate": 3.819829458889078e-07, - "logits/chosen": 0.6428291201591492, - "logits/rejected": 0.6888445615768433, - "logps/chosen": -9.04511833190918, - "logps/rejected": -10.008955001831055, - "loss": 0.5023, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.04511833190918, - "rewards/margins": 0.9638371467590332, - "rewards/rejected": -10.008955001831055, - "semantic_entropy": 0.002010942902415991, + "logits/chosen": -0.16281290352344513, + "logits/rejected": -0.054202497005462646, + "logps/chosen": -4.475902557373047, + "logps/rejected": -5.400743007659912, + "loss": 0.5039, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.475902557373047, + "rewards/margins": 0.9248401522636414, + "rewards/rejected": -5.400743007659912, "step": 3465 }, { "epoch": 1.857166750292691, - "grad_norm": 18.756301563038658, + "grad_norm": 19.19935187011723, "learning_rate": 3.804701229422585e-07, - "logits/chosen": 0.6267444491386414, - "logits/rejected": 0.7092264890670776, - "logps/chosen": -8.965785026550293, - "logps/rejected": -10.16661548614502, - "loss": 0.4493, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.965785026550293, - "rewards/margins": 1.2008302211761475, - "rewards/rejected": -10.16661548614502, - "semantic_entropy": 0.002212436404079199, + "logits/chosen": -0.19654931128025055, + "logits/rejected": -0.083122119307518, + "logps/chosen": -4.540813446044922, + "logps/rejected": -5.519863128662109, + "loss": 0.4907, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.540813446044922, + "rewards/margins": 0.9790492057800293, + "rewards/rejected": -5.519863128662109, "step": 3470 }, { "epoch": 1.8598427830740927, - "grad_norm": 20.509740408539155, + "grad_norm": 21.557677490835825, "learning_rate": 3.789584596750007e-07, - "logits/chosen": 0.644954264163971, - "logits/rejected": 0.6691696047782898, - "logps/chosen": -9.087681770324707, - "logps/rejected": -10.077864646911621, - "loss": 0.5039, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -9.087681770324707, - "rewards/margins": 0.9901838302612305, - "rewards/rejected": -10.077864646911621, - "semantic_entropy": 0.0026786925736814737, + "logits/chosen": -0.1726953387260437, + "logits/rejected": -0.11982186138629913, + "logps/chosen": -4.424330711364746, + "logps/rejected": -5.297823905944824, + "loss": 0.5175, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.424330711364746, + "rewards/margins": 0.8734933137893677, + "rewards/rejected": -5.297823905944824, "step": 3475 }, { "epoch": 1.8625188158554944, - "grad_norm": 21.56561893288092, + "grad_norm": 17.188244260553706, "learning_rate": 3.77447970753298e-07, - "logits/chosen": 0.6968456506729126, - "logits/rejected": 0.7207155823707581, - "logps/chosen": -9.34924602508545, - "logps/rejected": -10.384513854980469, - "loss": 0.5044, + "logits/chosen": -0.06679714471101761, + "logits/rejected": -0.02872612699866295, + "logps/chosen": -4.5763044357299805, + "logps/rejected": -5.521466255187988, + "loss": 0.5031, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.34924602508545, - "rewards/margins": 1.0352654457092285, - "rewards/rejected": -10.384513854980469, - "semantic_entropy": 0.001824896433390677, + "rewards/chosen": -4.5763044357299805, + "rewards/margins": 0.9451617002487183, + "rewards/rejected": -5.521466255187988, "step": 3480 }, { "epoch": 1.8651948486368957, - "grad_norm": 27.331504916633676, + "grad_norm": 21.330308841164406, "learning_rate": 3.7593867083192057e-07, - "logits/chosen": 0.6225256323814392, - "logits/rejected": 0.7147785425186157, - "logps/chosen": -9.113728523254395, - "logps/rejected": -10.127340316772461, - "loss": 0.4956, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.113728523254395, - "rewards/margins": 1.0136115550994873, - "rewards/rejected": -10.127340316772461, - "semantic_entropy": 0.002069010864943266, + "logits/chosen": -0.12558963894844055, + "logits/rejected": 0.0574454739689827, + "logps/chosen": -4.26003360748291, + "logps/rejected": -5.228446006774902, + "loss": 0.504, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.26003360748291, + "rewards/margins": 0.9684122204780579, + "rewards/rejected": -5.228446006774902, "step": 3485 }, { "epoch": 1.8678708814182974, - "grad_norm": 24.045050734996725, + "grad_norm": 20.4385786429565, "learning_rate": 3.7443057455410276e-07, - "logits/chosen": 0.7259531617164612, - "logits/rejected": 0.767625093460083, - "logps/chosen": -9.031217575073242, - "logps/rejected": -10.065279006958008, - "loss": 0.453, + "logits/chosen": -0.12516388297080994, + "logits/rejected": 0.005091439001262188, + "logps/chosen": -4.299023628234863, + "logps/rejected": -5.376700401306152, + "loss": 0.4385, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.031217575073242, - "rewards/margins": 1.034061312675476, - "rewards/rejected": -10.065279006958008, - "semantic_entropy": 0.002121392637491226, + "rewards/chosen": -4.299023628234863, + "rewards/margins": 1.07767653465271, + "rewards/rejected": -5.376700401306152, "step": 3490 }, { "epoch": 1.870546914199699, - "grad_norm": 22.0898928139177, + "grad_norm": 15.670160037161486, "learning_rate": 3.7292369655140145e-07, - "logits/chosen": 0.6421200037002563, - "logits/rejected": 0.7500838041305542, - "logps/chosen": -9.124689102172852, - "logps/rejected": -10.039737701416016, - "loss": 0.4787, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.124689102172852, - "rewards/margins": 0.915047824382782, - "rewards/rejected": -10.039737701416016, - "semantic_entropy": 0.0020720604807138443, + "logits/chosen": -0.24718156456947327, + "logits/rejected": -0.0475880391895771, + "logps/chosen": -4.409168243408203, + "logps/rejected": -5.326218128204346, + "loss": 0.4619, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.409168243408203, + "rewards/margins": 0.9170497059822083, + "rewards/rejected": -5.326218128204346, "step": 3495 }, { "epoch": 1.8732229469811004, - "grad_norm": 17.368463245273844, + "grad_norm": 15.774441007236756, "learning_rate": 3.714180514435534e-07, - "logits/chosen": 0.6820253133773804, - "logits/rejected": 0.767578125, - "logps/chosen": -8.85074520111084, - "logps/rejected": -9.993677139282227, - "loss": 0.4724, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.85074520111084, - "rewards/margins": 1.1429319381713867, - "rewards/rejected": -9.993677139282227, - "semantic_entropy": 0.0029044263064861298, + "logits/chosen": -0.06465111672878265, + "logits/rejected": 0.11878099292516708, + "logps/chosen": -4.233417510986328, + "logps/rejected": -5.2592573165893555, + "loss": 0.4718, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.233417510986328, + "rewards/margins": 1.0258398056030273, + "rewards/rejected": -5.2592573165893555, "step": 3500 }, { "epoch": 1.875898979762502, - "grad_norm": 26.111048656274622, + "grad_norm": 25.156196483113263, "learning_rate": 3.6991365383833426e-07, - "logits/chosen": 0.6787170767784119, - "logits/rejected": 0.7552027702331543, - "logps/chosen": -8.890897750854492, - "logps/rejected": -9.884078025817871, - "loss": 0.476, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -8.890897750854492, - "rewards/margins": 0.9931808710098267, - "rewards/rejected": -9.884078025817871, - "semantic_entropy": 0.0023928822483867407, + "logits/chosen": -0.20147418975830078, + "logits/rejected": -0.03710639104247093, + "logps/chosen": -4.1881256103515625, + "logps/rejected": -5.202637672424316, + "loss": 0.465, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.1881256103515625, + "rewards/margins": 1.014512062072754, + "rewards/rejected": -5.202637672424316, "step": 3505 }, { "epoch": 1.8785750125439038, - "grad_norm": 25.720344446595462, + "grad_norm": 22.83215155289933, "learning_rate": 3.684105183314162e-07, - "logits/chosen": 0.6534699201583862, - "logits/rejected": 0.7134217619895935, - "logps/chosen": -8.654134750366211, - "logps/rejected": -9.688114166259766, - "loss": 0.4572, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.654134750366211, - "rewards/margins": 1.0339783430099487, - "rewards/rejected": -9.688114166259766, - "semantic_entropy": 0.0033304274547845125, + "logits/chosen": -0.18839846551418304, + "logits/rejected": -0.06658022105693817, + "logps/chosen": -4.085474491119385, + "logps/rejected": -5.068699836730957, + "loss": 0.4541, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.085474491119385, + "rewards/margins": 0.98322594165802, + "rewards/rejected": -5.068699836730957, "step": 3510 }, { "epoch": 1.881251045325305, - "grad_norm": 26.719905564320833, + "grad_norm": 22.57857569066431, "learning_rate": 3.669086595062263e-07, - "logits/chosen": 0.6928398609161377, - "logits/rejected": 0.7959692478179932, - "logps/chosen": -8.998773574829102, - "logps/rejected": -10.005119323730469, - "loss": 0.4712, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.998773574829102, - "rewards/margins": 1.0063453912734985, - "rewards/rejected": -10.005119323730469, - "semantic_entropy": 0.002207712968811393, + "logits/chosen": -0.17122144997119904, + "logits/rejected": 0.06019320338964462, + "logps/chosen": -4.4079766273498535, + "logps/rejected": -5.387908935546875, + "loss": 0.4658, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.4079766273498535, + "rewards/margins": 0.9799333810806274, + "rewards/rejected": -5.387908935546875, "step": 3515 }, { "epoch": 1.8839270781067068, - "grad_norm": 18.310966271031408, + "grad_norm": 17.754362317313795, "learning_rate": 3.654080919338056e-07, - "logits/chosen": 0.6792198419570923, - "logits/rejected": 0.7416011691093445, - "logps/chosen": -9.011211395263672, - "logps/rejected": -9.959385871887207, - "loss": 0.5049, - "rewards/accuracies": 0.71875, - "rewards/chosen": -9.011211395263672, - "rewards/margins": 0.9481745958328247, - "rewards/rejected": -9.959385871887207, - "semantic_entropy": 0.0023789291735738516, + "logits/chosen": -0.22097405791282654, + "logits/rejected": -0.04111310839653015, + "logps/chosen": -4.28373908996582, + "logps/rejected": -5.258654594421387, + "loss": 0.5031, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.28373908996582, + "rewards/margins": 0.9749153256416321, + "rewards/rejected": -5.258654594421387, "step": 3520 }, { "epoch": 1.8866031108881085, - "grad_norm": 23.54294511956383, + "grad_norm": 19.46081673994115, "learning_rate": 3.639088301726673e-07, - "logits/chosen": 0.7045190334320068, - "logits/rejected": 0.814649224281311, - "logps/chosen": -9.096199035644531, - "logps/rejected": -9.993253707885742, - "loss": 0.5336, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.096199035644531, - "rewards/margins": 0.8970546722412109, - "rewards/rejected": -9.993253707885742, - "semantic_entropy": 0.0019266394665464759, + "logits/chosen": -0.16798846423625946, + "logits/rejected": 0.055651675909757614, + "logps/chosen": -4.321869850158691, + "logps/rejected": -5.203028202056885, + "loss": 0.5181, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.321869850158691, + "rewards/margins": 0.8811577558517456, + "rewards/rejected": -5.203028202056885, "step": 3525 }, { "epoch": 1.88927914366951, - "grad_norm": 25.00575901129148, + "grad_norm": 16.357009247333153, "learning_rate": 3.624108887686556e-07, - "logits/chosen": 0.717838704586029, - "logits/rejected": 0.7664039134979248, - "logps/chosen": -9.02385139465332, - "logps/rejected": -9.899523735046387, - "loss": 0.4944, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.02385139465332, - "rewards/margins": 0.8756723403930664, - "rewards/rejected": -9.899523735046387, - "semantic_entropy": 0.00239885738119483, + "logits/chosen": -0.12408437579870224, + "logits/rejected": -0.03319060057401657, + "logps/chosen": -4.363692283630371, + "logps/rejected": -5.2759294509887695, + "loss": 0.4826, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.363692283630371, + "rewards/margins": 0.9122363328933716, + "rewards/rejected": -5.2759294509887695, "step": 3530 }, { "epoch": 1.8919551764509115, - "grad_norm": 14.206266277834583, + "grad_norm": 13.720223045516777, "learning_rate": 3.6091428225480433e-07, - "logits/chosen": 0.6777101755142212, - "logits/rejected": 0.7591882944107056, - "logps/chosen": -8.996365547180176, - "logps/rejected": -10.051039695739746, - "loss": 0.4775, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -8.996365547180176, - "rewards/margins": 1.0546749830245972, - "rewards/rejected": -10.051039695739746, - "semantic_entropy": 0.0023488677106797695, + "logits/chosen": -0.20934045314788818, + "logits/rejected": -0.05957505851984024, + "logps/chosen": -4.465426445007324, + "logps/rejected": -5.4984331130981445, + "loss": 0.4747, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.465426445007324, + "rewards/margins": 1.0330082178115845, + "rewards/rejected": -5.4984331130981445, "step": 3535 }, { "epoch": 1.8946312092323132, - "grad_norm": 24.582046979640456, + "grad_norm": 20.823485402135383, "learning_rate": 3.5941902515119674e-07, - "logits/chosen": 0.6657333374023438, - "logits/rejected": 0.7678895592689514, - "logps/chosen": -9.090019226074219, - "logps/rejected": -9.867273330688477, - "loss": 0.5413, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.090019226074219, - "rewards/margins": 0.7772535085678101, - "rewards/rejected": -9.867273330688477, - "semantic_entropy": 0.0021225649397820234, + "logits/chosen": -0.22213831543922424, + "logits/rejected": 0.017714042216539383, + "logps/chosen": -4.511366367340088, + "logps/rejected": -5.3054704666137695, + "loss": 0.5423, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.511366367340088, + "rewards/margins": 0.7941038012504578, + "rewards/rejected": -5.3054704666137695, "step": 3540 }, { "epoch": 1.8973072420137147, - "grad_norm": 22.002900126766963, + "grad_norm": 18.933031722518173, "learning_rate": 3.5792513196482373e-07, - "logits/chosen": 0.6315397620201111, - "logits/rejected": 0.7467874884605408, - "logps/chosen": -8.85982894897461, - "logps/rejected": -9.882316589355469, - "loss": 0.4531, + "logits/chosen": -0.349143922328949, + "logits/rejected": -0.04255596175789833, + "logps/chosen": -4.3842267990112305, + "logps/rejected": -5.422542095184326, + "loss": 0.4251, "rewards/accuracies": 0.8125, - "rewards/chosen": -8.85982894897461, - "rewards/margins": 1.0224884748458862, - "rewards/rejected": -9.882316589355469, - "semantic_entropy": 0.0025396724231541157, + "rewards/chosen": -4.3842267990112305, + "rewards/margins": 1.0383151769638062, + "rewards/rejected": -5.422542095184326, "step": 3545 }, { "epoch": 1.8999832747951162, - "grad_norm": 18.88844359088285, + "grad_norm": 22.710470757810597, "learning_rate": 3.5643261718944346e-07, - "logits/chosen": 0.7221022844314575, - "logits/rejected": 0.779187798500061, - "logps/chosen": -9.089310646057129, - "logps/rejected": -9.891887664794922, - "loss": 0.5786, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -9.089310646057129, - "rewards/margins": 0.802577018737793, - "rewards/rejected": -9.891887664794922, - "semantic_entropy": 0.002080023754388094, + "logits/chosen": -0.11484891176223755, + "logits/rejected": -0.019009608775377274, + "logps/chosen": -4.535388469696045, + "logps/rejected": -5.346112251281738, + "loss": 0.5479, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.535388469696045, + "rewards/margins": 0.8107244372367859, + "rewards/rejected": -5.346112251281738, "step": 3550 }, { "epoch": 1.902659307576518, - "grad_norm": 14.948819514875911, + "grad_norm": 15.448424438319284, "learning_rate": 3.5494149530544087e-07, - "logits/chosen": 0.6752597093582153, - "logits/rejected": 0.7371557354927063, - "logps/chosen": -8.859451293945312, - "logps/rejected": -9.911179542541504, - "loss": 0.483, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.859451293945312, - "rewards/margins": 1.051727294921875, - "rewards/rejected": -9.911179542541504, - "semantic_entropy": 0.0026133800856769085, + "logits/chosen": -0.2659691870212555, + "logits/rejected": -0.13825161755084991, + "logps/chosen": -4.363196849822998, + "logps/rejected": -5.303255081176758, + "loss": 0.5276, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.363196849822998, + "rewards/margins": 0.9400573968887329, + "rewards/rejected": -5.303255081176758, "step": 3555 }, { "epoch": 1.9053353403579194, - "grad_norm": 24.637227593137656, + "grad_norm": 19.820165953091802, "learning_rate": 3.534517807796871e-07, - "logits/chosen": 0.6935003995895386, - "logits/rejected": 0.7413294315338135, - "logps/chosen": -8.910869598388672, - "logps/rejected": -9.782427787780762, - "loss": 0.5241, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -8.910869598388672, - "rewards/margins": 0.8715595006942749, - "rewards/rejected": -9.782427787780762, - "semantic_entropy": 0.002564162714406848, + "logits/chosen": -0.20676305890083313, + "logits/rejected": -0.08138835430145264, + "logps/chosen": -4.205061435699463, + "logps/rejected": -5.133881568908691, + "loss": 0.4845, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.205061435699463, + "rewards/margins": 0.9288201332092285, + "rewards/rejected": -5.133881568908691, "step": 3560 }, { "epoch": 1.908011373139321, - "grad_norm": 16.09008068793547, + "grad_norm": 17.32119377950289, "learning_rate": 3.519634880653988e-07, - "logits/chosen": 0.7049506902694702, - "logits/rejected": 0.7636333703994751, - "logps/chosen": -9.050897598266602, - "logps/rejected": -10.201239585876465, - "loss": 0.4495, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.050897598266602, - "rewards/margins": 1.150342583656311, - "rewards/rejected": -10.201239585876465, - "semantic_entropy": 0.0020480218809098005, + "logits/chosen": -0.17501512169837952, + "logits/rejected": -0.050323713570833206, + "logps/chosen": -4.466790676116943, + "logps/rejected": -5.576212406158447, + "loss": 0.4618, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.466790676116943, + "rewards/margins": 1.1094213724136353, + "rewards/rejected": -5.576212406158447, "step": 3565 }, { "epoch": 1.9106874059207226, - "grad_norm": 17.29460063325407, + "grad_norm": 17.75795955969999, "learning_rate": 3.504766316019987e-07, - "logits/chosen": 0.6761201024055481, - "logits/rejected": 0.7888168096542358, - "logps/chosen": -8.835293769836426, - "logps/rejected": -9.917633056640625, - "loss": 0.454, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -8.835293769836426, - "rewards/margins": 1.0823395252227783, - "rewards/rejected": -9.917633056640625, - "semantic_entropy": 0.002739850664511323, + "logits/chosen": -0.22475700080394745, + "logits/rejected": -0.00793300848454237, + "logps/chosen": -4.253852844238281, + "logps/rejected": -5.2876973152160645, + "loss": 0.4467, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.253852844238281, + "rewards/margins": 1.0338438749313354, + "rewards/rejected": -5.2876973152160645, "step": 3570 }, { "epoch": 1.913363438702124, - "grad_norm": 15.951273378502073, + "grad_norm": 14.815850213121063, "learning_rate": 3.489912258149745e-07, - "logits/chosen": 0.7415227293968201, - "logits/rejected": 0.8035387992858887, - "logps/chosen": -8.881102561950684, - "logps/rejected": -9.969578742980957, - "loss": 0.4531, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -8.881102561950684, - "rewards/margins": 1.0884764194488525, - "rewards/rejected": -9.969578742980957, - "semantic_entropy": 0.0023522416595369577, + "logits/chosen": -0.09060492366552353, + "logits/rejected": 0.03362895920872688, + "logps/chosen": -4.342713356018066, + "logps/rejected": -5.367025375366211, + "loss": 0.4945, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.342713356018066, + "rewards/margins": 1.024311900138855, + "rewards/rejected": -5.367025375366211, "step": 3575 }, { "epoch": 1.9160394714835256, - "grad_norm": 18.18795560922933, + "grad_norm": 17.030311726326453, "learning_rate": 3.475072851157397e-07, - "logits/chosen": 0.7050553560256958, - "logits/rejected": 0.7514214515686035, - "logps/chosen": -8.872556686401367, - "logps/rejected": -9.893532752990723, - "loss": 0.4645, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.872556686401367, - "rewards/margins": 1.0209757089614868, - "rewards/rejected": -9.893532752990723, - "semantic_entropy": 0.0025408435612916946, + "logits/chosen": -0.16693338751792908, + "logits/rejected": -0.08966173231601715, + "logps/chosen": -4.327611923217773, + "logps/rejected": -5.33335018157959, + "loss": 0.4674, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.327611923217773, + "rewards/margins": 1.005738615989685, + "rewards/rejected": -5.33335018157959, "step": 3580 }, { "epoch": 1.9187155042649273, - "grad_norm": 15.738805690279383, + "grad_norm": 17.51759471006501, "learning_rate": 3.460248239014936e-07, - "logits/chosen": 0.7101159691810608, - "logits/rejected": 0.7551933526992798, - "logps/chosen": -9.055107116699219, - "logps/rejected": -10.217333793640137, - "loss": 0.4421, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.055107116699219, - "rewards/margins": 1.1622273921966553, - "rewards/rejected": -10.217333793640137, - "semantic_entropy": 0.002511825645342469, + "logits/chosen": -0.04630185291171074, + "logits/rejected": 0.02972991392016411, + "logps/chosen": -4.471953392028809, + "logps/rejected": -5.5180158615112305, + "loss": 0.4486, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.471953392028809, + "rewards/margins": 1.0460630655288696, + "rewards/rejected": -5.5180158615112305, "step": 3585 }, { "epoch": 1.9213915370463288, - "grad_norm": 19.41964183644154, + "grad_norm": 18.193022458183634, "learning_rate": 3.4454385655508134e-07, - "logits/chosen": 0.7462642788887024, - "logits/rejected": 0.7571308016777039, - "logps/chosen": -9.074853897094727, - "logps/rejected": -9.919515609741211, - "loss": 0.5562, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -9.074853897094727, - "rewards/margins": 0.8446613550186157, - "rewards/rejected": -9.919515609741211, - "semantic_entropy": 0.002418497810140252, + "logits/chosen": -0.0739821344614029, + "logits/rejected": -0.01234457828104496, + "logps/chosen": -4.524746894836426, + "logps/rejected": -5.325782775878906, + "loss": 0.5622, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.524746894836426, + "rewards/margins": 0.8010364770889282, + "rewards/rejected": -5.325782775878906, "step": 3590 }, { "epoch": 1.9240675698277303, - "grad_norm": 15.49567951743004, + "grad_norm": 15.888727779181057, "learning_rate": 3.4306439744485447e-07, - "logits/chosen": 0.6995843052864075, - "logits/rejected": 0.7880675792694092, - "logps/chosen": -9.252038955688477, - "logps/rejected": -10.221213340759277, - "loss": 0.5099, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.252038955688477, - "rewards/margins": 0.969176173210144, - "rewards/rejected": -10.221213340759277, - "semantic_entropy": 0.0019108497072011232, + "logits/chosen": -0.2023768424987793, + "logits/rejected": 0.018109945580363274, + "logps/chosen": -4.4982194900512695, + "logps/rejected": -5.501367092132568, + "loss": 0.4898, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.4982194900512695, + "rewards/margins": 1.003147840499878, + "rewards/rejected": -5.501367092132568, "step": 3595 }, { "epoch": 1.926743602609132, - "grad_norm": 21.391951740269207, + "grad_norm": 21.62738988988104, "learning_rate": 3.415864609245322e-07, - "logits/chosen": 0.7241548895835876, - "logits/rejected": 0.7919793725013733, - "logps/chosen": -9.234020233154297, - "logps/rejected": -10.174389839172363, - "loss": 0.5311, + "logits/chosen": -0.07273901253938675, + "logits/rejected": 0.11738868802785873, + "logps/chosen": -4.535969257354736, + "logps/rejected": -5.478444576263428, + "loss": 0.5482, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.234020233154297, - "rewards/margins": 0.9403679966926575, - "rewards/rejected": -10.174389839172363, - "semantic_entropy": 0.0019412841647863388, + "rewards/chosen": -4.535969257354736, + "rewards/margins": 0.9424754977226257, + "rewards/rejected": -5.478444576263428, "step": 3600 }, { "epoch": 1.926743602609132, - "eval_logits/chosen": 0.7883932590484619, - "eval_logits/rejected": 0.8341716527938843, - "eval_logps/chosen": -9.098273277282715, - "eval_logps/rejected": -10.07473087310791, - "eval_loss": 0.5169808268547058, - "eval_rewards/accuracies": 0.7232937812805176, - "eval_rewards/chosen": -9.098273277282715, - "eval_rewards/margins": 0.9764575362205505, - "eval_rewards/rejected": -10.07473087310791, - "eval_runtime": 35.2413, - "eval_samples_per_second": 38.165, - "eval_semantic_entropy": 0.0023804251104593277, - "eval_steps_per_second": 9.563, + "eval_logits/chosen": 0.2233075499534607, + "eval_logits/rejected": 0.33296361565589905, + "eval_logps/chosen": -4.367976188659668, + "eval_logps/rejected": -5.331971168518066, + "eval_loss": 0.5152115225791931, + "eval_rewards/accuracies": 0.7329376935958862, + "eval_rewards/chosen": -4.367976188659668, + "eval_rewards/margins": 0.9639953970909119, + "eval_rewards/rejected": -5.331971168518066, + "eval_runtime": 40.4342, + "eval_samples_per_second": 33.264, + "eval_steps_per_second": 8.335, "step": 3600 }, { "epoch": 1.9294196353905335, - "grad_norm": 20.553397352111812, + "grad_norm": 17.797301056741528, "learning_rate": 3.401100613330605e-07, - "logits/chosen": 0.7208374738693237, - "logits/rejected": 0.7372707724571228, - "logps/chosen": -8.946023941040039, - "logps/rejected": -9.88565731048584, - "loss": 0.5125, - "rewards/accuracies": 0.71875, - "rewards/chosen": -8.946023941040039, - "rewards/margins": 0.9396332502365112, - "rewards/rejected": -9.88565731048584, - "semantic_entropy": 0.0025065175723284483, + "logits/chosen": -0.1574392020702362, + "logits/rejected": -0.128297820687294, + "logps/chosen": -4.099370002746582, + "logps/rejected": -5.055327415466309, + "loss": 0.4793, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.099370002746582, + "rewards/margins": 0.9559570550918579, + "rewards/rejected": -5.055327415466309, "step": 3605 }, { "epoch": 1.932095668171935, - "grad_norm": 14.899765335539588, + "grad_norm": 16.459509862558576, "learning_rate": 3.3863521299447514e-07, - "logits/chosen": 0.66487056016922, - "logits/rejected": 0.7429142594337463, - "logps/chosen": -8.882848739624023, - "logps/rejected": -9.959342956542969, - "loss": 0.4247, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -8.882848739624023, - "rewards/margins": 1.0764933824539185, - "rewards/rejected": -9.959342956542969, - "semantic_entropy": 0.0027935917023569345, + "logits/chosen": -0.21471652388572693, + "logits/rejected": -0.03747572377324104, + "logps/chosen": -4.148980140686035, + "logps/rejected": -5.138084411621094, + "loss": 0.4397, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.148980140686035, + "rewards/margins": 0.9891044497489929, + "rewards/rejected": -5.138084411621094, "step": 3610 }, { "epoch": 1.9347717009533367, - "grad_norm": 18.125423863526265, + "grad_norm": 18.676775632577122, "learning_rate": 3.371619302177609e-07, - "logits/chosen": 0.7205886840820312, - "logits/rejected": 0.783849835395813, - "logps/chosen": -9.081689834594727, - "logps/rejected": -10.110027313232422, - "loss": 0.493, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.081689834594727, - "rewards/margins": 1.0283381938934326, - "rewards/rejected": -10.110027313232422, - "semantic_entropy": 0.0018970107194036245, + "logits/chosen": -0.09667553007602692, + "logits/rejected": 0.04763801768422127, + "logps/chosen": -4.519248962402344, + "logps/rejected": -5.497490882873535, + "loss": 0.5015, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.519248962402344, + "rewards/margins": 0.9782422780990601, + "rewards/rejected": -5.497490882873535, "step": 3615 }, { "epoch": 1.9374477337347382, - "grad_norm": 22.214798729957145, + "grad_norm": 22.888173601331346, "learning_rate": 3.3569022729671393e-07, - "logits/chosen": 0.7102506160736084, - "logits/rejected": 0.7653765678405762, - "logps/chosen": -9.172645568847656, - "logps/rejected": -10.060796737670898, - "loss": 0.5162, - "rewards/accuracies": 0.71875, - "rewards/chosen": -9.172645568847656, - "rewards/margins": 0.8881510496139526, - "rewards/rejected": -10.060796737670898, - "semantic_entropy": 0.002231413032859564, + "logits/chosen": -0.13325098156929016, + "logits/rejected": -0.022176718339323997, + "logps/chosen": -4.5649895668029785, + "logps/rejected": -5.448463439941406, + "loss": 0.5126, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.5649895668029785, + "rewards/margins": 0.8834730386734009, + "rewards/rejected": -5.448463439941406, "step": 3620 }, { "epoch": 1.9401237665161397, - "grad_norm": 16.808694866109835, + "grad_norm": 17.82416824443674, "learning_rate": 3.342201185098024e-07, - "logits/chosen": 0.7016305923461914, - "logits/rejected": 0.7090336084365845, - "logps/chosen": -8.874353408813477, - "logps/rejected": -9.903203010559082, - "loss": 0.4595, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -8.874353408813477, - "rewards/margins": 1.028850793838501, - "rewards/rejected": -9.903203010559082, - "semantic_entropy": 0.0032983936835080385, + "logits/chosen": -0.08785581588745117, + "logits/rejected": -0.09261415153741837, + "logps/chosen": -4.270584583282471, + "logps/rejected": -5.197326183319092, + "loss": 0.4818, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.270584583282471, + "rewards/margins": 0.9267421960830688, + "rewards/rejected": -5.197326183319092, "step": 3625 }, { "epoch": 1.9427997992975414, - "grad_norm": 19.764545534044544, + "grad_norm": 20.972415824663752, "learning_rate": 3.3275161812002807e-07, - "logits/chosen": 0.6539800763130188, - "logits/rejected": 0.6890634894371033, - "logps/chosen": -8.991025924682617, - "logps/rejected": -10.090238571166992, - "loss": 0.4918, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.991025924682617, - "rewards/margins": 1.0992109775543213, - "rewards/rejected": -10.090238571166992, - "semantic_entropy": 0.002287736628204584, + "logits/chosen": -0.14352867007255554, + "logits/rejected": -0.09147917479276657, + "logps/chosen": -4.372386455535889, + "logps/rejected": -5.40442419052124, + "loss": 0.5074, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.372386455535889, + "rewards/margins": 1.0320374965667725, + "rewards/rejected": -5.40442419052124, "step": 3630 }, { "epoch": 1.945475832078943, - "grad_norm": 22.455179400576377, + "grad_norm": 17.954275525914145, "learning_rate": 3.312847403747883e-07, - "logits/chosen": 0.6598862409591675, - "logits/rejected": 0.7323023676872253, - "logps/chosen": -8.853995323181152, - "logps/rejected": -10.00835132598877, - "loss": 0.4441, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -8.853995323181152, - "rewards/margins": 1.1543556451797485, - "rewards/rejected": -10.00835132598877, - "semantic_entropy": 0.0028118849731981754, + "logits/chosen": -0.2075006514787674, + "logits/rejected": -0.0739402323961258, + "logps/chosen": -4.240146160125732, + "logps/rejected": -5.294926643371582, + "loss": 0.4628, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.240146160125732, + "rewards/margins": 1.0547807216644287, + "rewards/rejected": -5.294926643371582, "step": 3635 }, { "epoch": 1.9481518648603444, - "grad_norm": 20.333660341959458, + "grad_norm": 18.46642293208032, "learning_rate": 3.2981949950573733e-07, - "logits/chosen": 0.6398320198059082, - "logits/rejected": 0.6992667317390442, - "logps/chosen": -9.079477310180664, - "logps/rejected": -10.012245178222656, - "loss": 0.4811, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.079477310180664, - "rewards/margins": 0.9327686429023743, - "rewards/rejected": -10.012245178222656, - "semantic_entropy": 0.002612376119941473, + "logits/chosen": -0.15425649285316467, + "logits/rejected": -0.07985173165798187, + "logps/chosen": -4.456984043121338, + "logps/rejected": -5.319791793823242, + "loss": 0.4899, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.456984043121338, + "rewards/margins": 0.8628080487251282, + "rewards/rejected": -5.319791793823242, "step": 3640 }, { "epoch": 1.9508278976417461, - "grad_norm": 17.83037025874042, + "grad_norm": 15.69646993563852, "learning_rate": 3.283559097286486e-07, - "logits/chosen": 0.6089428663253784, - "logits/rejected": 0.6763657331466675, - "logps/chosen": -9.060758590698242, - "logps/rejected": -9.848979949951172, - "loss": 0.5207, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.060758590698242, - "rewards/margins": 0.7882214188575745, - "rewards/rejected": -9.848979949951172, - "semantic_entropy": 0.002590155927464366, + "logits/chosen": -0.19153200089931488, + "logits/rejected": -0.051770709455013275, + "logps/chosen": -4.43072509765625, + "logps/rejected": -5.136530876159668, + "loss": 0.5403, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.43072509765625, + "rewards/margins": 0.7058048844337463, + "rewards/rejected": -5.136530876159668, "step": 3645 }, { "epoch": 1.9535039304231478, - "grad_norm": 18.640006592880273, + "grad_norm": 20.35074424745059, "learning_rate": 3.268939852432765e-07, - "logits/chosen": 0.6610291600227356, - "logits/rejected": 0.6989740133285522, - "logps/chosen": -9.217550277709961, - "logps/rejected": -9.963292121887207, - "loss": 0.5338, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.217550277709961, - "rewards/margins": 0.7457407712936401, - "rewards/rejected": -9.963292121887207, - "semantic_entropy": 0.002440792042762041, + "logits/chosen": -0.2412301003932953, + "logits/rejected": -0.14222095906734467, + "logps/chosen": -4.578049659729004, + "logps/rejected": -5.323596954345703, + "loss": 0.5504, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.578049659729004, + "rewards/margins": 0.7455474138259888, + "rewards/rejected": -5.323596954345703, "step": 3650 }, { "epoch": 1.9561799632045491, - "grad_norm": 23.51490179151936, + "grad_norm": 31.140783662852126, "learning_rate": 3.254337402332187e-07, - "logits/chosen": 0.7316364049911499, - "logits/rejected": 0.7886163592338562, - "logps/chosen": -9.14958381652832, - "logps/rejected": -10.033146858215332, - "loss": 0.5228, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.14958381652832, - "rewards/margins": 0.8835636377334595, - "rewards/rejected": -10.033146858215332, - "semantic_entropy": 0.0022072389256209135, + "logits/chosen": -0.1507803201675415, + "logits/rejected": -0.03821871429681778, + "logps/chosen": -4.569633483886719, + "logps/rejected": -5.4953765869140625, + "loss": 0.5163, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.569633483886719, + "rewards/margins": 0.9257429838180542, + "rewards/rejected": -5.4953765869140625, "step": 3655 }, { "epoch": 1.9588559959859508, - "grad_norm": 21.709681200914343, + "grad_norm": 16.48027680788937, "learning_rate": 3.239751888657788e-07, - "logits/chosen": 0.7047960162162781, - "logits/rejected": 0.7564027309417725, - "logps/chosen": -9.247949600219727, - "logps/rejected": -10.113713264465332, - "loss": 0.5189, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.247949600219727, - "rewards/margins": 0.8657627105712891, - "rewards/rejected": -10.113713264465332, - "semantic_entropy": 0.002100490964949131, + "logits/chosen": -0.17036113142967224, + "logits/rejected": -0.012807434424757957, + "logps/chosen": -4.721514701843262, + "logps/rejected": -5.606455326080322, + "loss": 0.5194, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.721514701843262, + "rewards/margins": 0.8849404454231262, + "rewards/rejected": -5.606455326080322, "step": 3660 }, { "epoch": 1.9615320287673526, - "grad_norm": 20.917519916910795, + "grad_norm": 14.496251621676281, "learning_rate": 3.2251834529182856e-07, - "logits/chosen": 0.6687744855880737, - "logits/rejected": 0.7209922671318054, - "logps/chosen": -8.913464546203613, - "logps/rejected": -10.03473949432373, - "loss": 0.4826, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -8.913464546203613, - "rewards/margins": 1.1212753057479858, - "rewards/rejected": -10.03473949432373, - "semantic_entropy": 0.0026697556022554636, + "logits/chosen": -0.19342677295207977, + "logits/rejected": -0.09373844414949417, + "logps/chosen": -4.135945796966553, + "logps/rejected": -5.1782708168029785, + "loss": 0.4884, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.135945796966553, + "rewards/margins": 1.0423250198364258, + "rewards/rejected": -5.1782708168029785, "step": 3665 }, { "epoch": 1.9642080615487538, - "grad_norm": 24.85961354517434, + "grad_norm": 16.88495152603003, "learning_rate": 3.2106322364567075e-07, - "logits/chosen": 0.7192034721374512, - "logits/rejected": 0.776824951171875, - "logps/chosen": -8.97942066192627, - "logps/rejected": -10.088353157043457, - "loss": 0.4491, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -8.97942066192627, - "rewards/margins": 1.1089332103729248, - "rewards/rejected": -10.088353157043457, - "semantic_entropy": 0.002531954552978277, + "logits/chosen": -0.16751964390277863, + "logits/rejected": -0.02767932415008545, + "logps/chosen": -4.2415852546691895, + "logps/rejected": -5.371515274047852, + "loss": 0.434, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.2415852546691895, + "rewards/margins": 1.129929780960083, + "rewards/rejected": -5.371515274047852, "step": 3670 }, { "epoch": 1.9668840943301555, - "grad_norm": 18.73523000620898, + "grad_norm": 17.95923403964621, "learning_rate": 3.1960983804490183e-07, - "logits/chosen": 0.6787633895874023, - "logits/rejected": 0.7581242322921753, - "logps/chosen": -9.208142280578613, - "logps/rejected": -10.221251487731934, - "loss": 0.5371, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.208142280578613, - "rewards/margins": 1.0131086111068726, - "rewards/rejected": -10.221251487731934, - "semantic_entropy": 0.0021549214143306017, + "logits/chosen": -0.20056357979774475, + "logits/rejected": -0.04248541221022606, + "logps/chosen": -4.552309989929199, + "logps/rejected": -5.515840530395508, + "loss": 0.5548, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.552309989929199, + "rewards/margins": 0.9635297656059265, + "rewards/rejected": -5.515840530395508, "step": 3675 }, { "epoch": 1.9695601271115573, - "grad_norm": 16.839870791359772, + "grad_norm": 13.779892767601456, "learning_rate": 3.1815820259027537e-07, - "logits/chosen": 0.6967512369155884, - "logits/rejected": 0.7626298666000366, - "logps/chosen": -8.988113403320312, - "logps/rejected": -10.06352710723877, - "loss": 0.439, - "rewards/accuracies": 0.78125, - "rewards/chosen": -8.988113403320312, - "rewards/margins": 1.0754133462905884, - "rewards/rejected": -10.06352710723877, - "semantic_entropy": 0.002239447785541415, + "logits/chosen": -0.17718617618083954, + "logits/rejected": -0.03854576498270035, + "logps/chosen": -4.03787899017334, + "logps/rejected": -5.037713050842285, + "loss": 0.4389, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.03787899017334, + "rewards/margins": 0.999834418296814, + "rewards/rejected": -5.037713050842285, "step": 3680 }, { "epoch": 1.9722361598929585, - "grad_norm": 25.890088694071558, + "grad_norm": 22.753839550735954, "learning_rate": 3.16708331365565e-07, - "logits/chosen": 0.7034773826599121, - "logits/rejected": 0.7456248998641968, - "logps/chosen": -9.344053268432617, - "logps/rejected": -10.413859367370605, - "loss": 0.4768, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.344053268432617, - "rewards/margins": 1.0698063373565674, - "rewards/rejected": -10.413859367370605, - "semantic_entropy": 0.0019446806982159615, + "logits/chosen": -0.22745446860790253, + "logits/rejected": -0.12514789402484894, + "logps/chosen": -4.472426891326904, + "logps/rejected": -5.472949028015137, + "loss": 0.4945, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.472426891326904, + "rewards/margins": 1.0005217790603638, + "rewards/rejected": -5.472949028015137, "step": 3685 }, { "epoch": 1.9749121926743602, - "grad_norm": 20.622779505918793, + "grad_norm": 17.019077482590536, "learning_rate": 3.152602384374275e-07, - "logits/chosen": 0.77290278673172, - "logits/rejected": 0.8412041664123535, - "logps/chosen": -9.321812629699707, - "logps/rejected": -10.298714637756348, - "loss": 0.4964, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.321812629699707, - "rewards/margins": 0.9769018292427063, - "rewards/rejected": -10.298714637756348, - "semantic_entropy": 0.0020173420198261738, + "logits/chosen": -0.16246798634529114, + "logits/rejected": 0.03117043897509575, + "logps/chosen": -4.572426795959473, + "logps/rejected": -5.420951843261719, + "loss": 0.525, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.572426795959473, + "rewards/margins": 0.8485256433486938, + "rewards/rejected": -5.420951843261719, "step": 3690 }, { "epoch": 1.977588225455762, - "grad_norm": 20.085568144464975, + "grad_norm": 18.350191601010074, "learning_rate": 3.1381393785526697e-07, - "logits/chosen": 0.7355653643608093, - "logits/rejected": 0.7890772819519043, - "logps/chosen": -9.242449760437012, - "logps/rejected": -10.24592399597168, - "loss": 0.476, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.242449760437012, - "rewards/margins": 1.0034732818603516, - "rewards/rejected": -10.24592399597168, - "semantic_entropy": 0.0019115330651402473, + "logits/chosen": -0.2023898810148239, + "logits/rejected": -0.09542350471019745, + "logps/chosen": -4.383685111999512, + "logps/rejected": -5.284248352050781, + "loss": 0.5038, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.383685111999512, + "rewards/margins": 0.9005634188652039, + "rewards/rejected": -5.284248352050781, "step": 3695 }, { "epoch": 1.9802642582371635, - "grad_norm": 19.738688834879806, + "grad_norm": 14.742435709682226, "learning_rate": 3.123694436510979e-07, - "logits/chosen": 0.7584089040756226, - "logits/rejected": 0.8401368260383606, - "logps/chosen": -9.147109985351562, - "logps/rejected": -10.11845874786377, - "loss": 0.4871, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.147109985351562, - "rewards/margins": 0.9713494181632996, - "rewards/rejected": -10.11845874786377, - "semantic_entropy": 0.002204468008130789, + "logits/chosen": -0.15410056710243225, + "logits/rejected": -0.0026923350524157286, + "logps/chosen": -4.132970809936523, + "logps/rejected": -5.108695030212402, + "loss": 0.4573, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.132970809936523, + "rewards/margins": 0.9757240414619446, + "rewards/rejected": -5.108695030212402, "step": 3700 }, { "epoch": 1.982940291018565, - "grad_norm": 23.899312142624215, + "grad_norm": 24.8903505962167, "learning_rate": 3.1092676983940946e-07, - "logits/chosen": 0.8023883700370789, - "logits/rejected": 0.8330841064453125, - "logps/chosen": -9.159601211547852, - "logps/rejected": -10.233617782592773, - "loss": 0.4659, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.159601211547852, - "rewards/margins": 1.0740149021148682, - "rewards/rejected": -10.233617782592773, - "semantic_entropy": 0.0021205353550612926, + "logits/chosen": -0.1800861358642578, + "logits/rejected": -0.0938093364238739, + "logps/chosen": -4.220396041870117, + "logps/rejected": -5.242137908935547, + "loss": 0.4724, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.220396041870117, + "rewards/margins": 1.0217410326004028, + "rewards/rejected": -5.242137908935547, "step": 3705 }, { "epoch": 1.9856163237999667, - "grad_norm": 19.69531748298203, + "grad_norm": 17.737688645881573, "learning_rate": 3.094859304170293e-07, - "logits/chosen": 0.8703521490097046, - "logits/rejected": 0.9046932458877563, - "logps/chosen": -9.088752746582031, - "logps/rejected": -10.052389144897461, - "loss": 0.514, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.088752746582031, - "rewards/margins": 0.9636358022689819, - "rewards/rejected": -10.052389144897461, - "semantic_entropy": 0.0021650404669344425, + "logits/chosen": -0.02814059890806675, + "logits/rejected": 0.013366499915719032, + "logps/chosen": -4.256437301635742, + "logps/rejected": -5.0900115966796875, + "loss": 0.5474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.256437301635742, + "rewards/margins": 0.8335738182067871, + "rewards/rejected": -5.0900115966796875, "step": 3710 }, { "epoch": 1.9882923565813682, - "grad_norm": 16.58014663580926, + "grad_norm": 17.994664239825184, "learning_rate": 3.0804693936298795e-07, - "logits/chosen": 0.8061652183532715, - "logits/rejected": 0.8440017700195312, - "logps/chosen": -9.173129081726074, - "logps/rejected": -10.378541946411133, - "loss": 0.4549, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.173129081726074, - "rewards/margins": 1.2054128646850586, - "rewards/rejected": -10.378541946411133, - "semantic_entropy": 0.002410900080576539, + "logits/chosen": -0.12317560613155365, + "logits/rejected": -0.062356334179639816, + "logps/chosen": -4.37381649017334, + "logps/rejected": -5.483007907867432, + "loss": 0.4514, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.37381649017334, + "rewards/margins": 1.109190821647644, + "rewards/rejected": -5.483007907867432, "step": 3715 }, { "epoch": 1.9909683893627697, - "grad_norm": 19.2115007374387, + "grad_norm": 17.26652888914337, "learning_rate": 3.066098106383826e-07, - "logits/chosen": 0.7924807071685791, - "logits/rejected": 0.8539689183235168, - "logps/chosen": -9.085798263549805, - "logps/rejected": -10.019353866577148, - "loss": 0.4867, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.085798263549805, - "rewards/margins": 0.9335559010505676, - "rewards/rejected": -10.019353866577148, - "semantic_entropy": 0.0022169214207679033, + "logits/chosen": -0.17365805804729462, + "logits/rejected": -0.04853446036577225, + "logps/chosen": -4.253520488739014, + "logps/rejected": -5.129070281982422, + "loss": 0.4973, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.253520488739014, + "rewards/margins": 0.8755496740341187, + "rewards/rejected": -5.129070281982422, "step": 3720 }, { "epoch": 1.9936444221441714, - "grad_norm": 15.73085596537741, + "grad_norm": 15.033621914246384, "learning_rate": 3.0517455818624263e-07, - "logits/chosen": 0.728915810585022, - "logits/rejected": 0.7807096838951111, - "logps/chosen": -9.136758804321289, - "logps/rejected": -10.273028373718262, - "loss": 0.4235, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.136758804321289, - "rewards/margins": 1.1362701654434204, - "rewards/rejected": -10.273028373718262, - "semantic_entropy": 0.0019926291424781084, + "logits/chosen": -0.23579935729503632, + "logits/rejected": -0.1283475011587143, + "logps/chosen": -4.27498722076416, + "logps/rejected": -5.292575359344482, + "loss": 0.4456, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.27498722076416, + "rewards/margins": 1.0175881385803223, + "rewards/rejected": -5.292575359344482, "step": 3725 }, { "epoch": 1.9963204549255729, - "grad_norm": 21.48912266471965, + "grad_norm": 14.718437754558494, "learning_rate": 3.037411959313936e-07, - "logits/chosen": 0.8049052357673645, - "logits/rejected": 0.8560088872909546, - "logps/chosen": -9.200352668762207, - "logps/rejected": -10.1701078414917, - "loss": 0.4887, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.200352668762207, - "rewards/margins": 0.9697545766830444, - "rewards/rejected": -10.1701078414917, - "semantic_entropy": 0.0019432473927736282, + "logits/chosen": -0.16861994564533234, + "logits/rejected": -0.02016618847846985, + "logps/chosen": -4.230063438415527, + "logps/rejected": -5.144595146179199, + "loss": 0.4428, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.230063438415527, + "rewards/margins": 0.9145313501358032, + "rewards/rejected": -5.144595146179199, "step": 3730 }, { "epoch": 1.9989964877069744, - "grad_norm": 29.255894045857996, + "grad_norm": 19.874496374199403, "learning_rate": 3.023097377803224e-07, - "logits/chosen": 0.8145462870597839, - "logits/rejected": 0.8601492047309875, - "logps/chosen": -9.245951652526855, - "logps/rejected": -10.164546012878418, - "loss": 0.5493, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -9.245951652526855, - "rewards/margins": 0.9185951352119446, - "rewards/rejected": -10.164546012878418, - "semantic_entropy": 0.0018242119112983346, + "logits/chosen": -0.10386800765991211, + "logits/rejected": -0.011268611066043377, + "logps/chosen": -4.465144157409668, + "logps/rejected": -5.329350471496582, + "loss": 0.5441, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.465144157409668, + "rewards/margins": 0.8642063140869141, + "rewards/rejected": -5.329350471496582, "step": 3735 }, { "epoch": 2.001672520488376, - "grad_norm": 20.524507990126196, + "grad_norm": 19.25527078021546, "learning_rate": 3.008801976210423e-07, - "logits/chosen": 0.8181111216545105, - "logits/rejected": 0.8561455607414246, - "logps/chosen": -9.23983383178711, - "logps/rejected": -10.13349723815918, - "loss": 0.4809, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.23983383178711, - "rewards/margins": 0.8936625719070435, - "rewards/rejected": -10.13349723815918, - "semantic_entropy": 0.002180408453568816, + "logits/chosen": -0.08613891899585724, + "logits/rejected": -0.017336256802082062, + "logps/chosen": -4.686578273773193, + "logps/rejected": -5.5754876136779785, + "loss": 0.4861, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.686578273773193, + "rewards/margins": 0.8889085650444031, + "rewards/rejected": -5.5754876136779785, "step": 3740 }, { "epoch": 2.0043485532697773, - "grad_norm": 17.0373176165906, + "grad_norm": 15.992865279717549, "learning_rate": 2.994525893229581e-07, - "logits/chosen": 0.8141145706176758, - "logits/rejected": 0.8517176508903503, - "logps/chosen": -9.19636344909668, - "logps/rejected": -10.442273139953613, - "loss": 0.3877, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.19636344909668, - "rewards/margins": 1.2459100484848022, - "rewards/rejected": -10.442273139953613, - "semantic_entropy": 0.002024973975494504, + "logits/chosen": -0.11262719333171844, + "logits/rejected": -0.02547881007194519, + "logps/chosen": -4.501664638519287, + "logps/rejected": -5.753330707550049, + "loss": 0.3816, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.501664638519287, + "rewards/margins": 1.2516658306121826, + "rewards/rejected": -5.753330707550049, "step": 3745 }, { "epoch": 2.007024586051179, - "grad_norm": 15.697982397516139, + "grad_norm": 12.66822523343584, "learning_rate": 2.98026926736732e-07, - "logits/chosen": 0.7669566869735718, - "logits/rejected": 0.811779797077179, - "logps/chosen": -8.987443923950195, - "logps/rejected": -10.217406272888184, - "loss": 0.4174, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -8.987443923950195, - "rewards/margins": 1.2299631834030151, - "rewards/rejected": -10.217406272888184, - "semantic_entropy": 0.0029276900459080935, + "logits/chosen": -0.2270674705505371, + "logits/rejected": -0.10812105238437653, + "logps/chosen": -4.234694957733154, + "logps/rejected": -5.380438804626465, + "loss": 0.4222, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.234694957733154, + "rewards/margins": 1.145743727684021, + "rewards/rejected": -5.380438804626465, "step": 3750 }, { "epoch": 2.0097006188325808, - "grad_norm": 14.632163572843812, + "grad_norm": 13.091888018005566, "learning_rate": 2.9660322369414846e-07, - "logits/chosen": 0.8088932037353516, - "logits/rejected": 0.8847886323928833, - "logps/chosen": -9.229738235473633, - "logps/rejected": -10.462305068969727, - "loss": 0.4008, + "logits/chosen": -0.1817486733198166, + "logits/rejected": -0.021705741062760353, + "logps/chosen": -4.557723045349121, + "logps/rejected": -5.74202299118042, + "loss": 0.4104, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.229738235473633, - "rewards/margins": 1.2325657606124878, - "rewards/rejected": -10.462305068969727, - "semantic_entropy": 0.0019744504243135452, + "rewards/chosen": -4.557723045349121, + "rewards/margins": 1.1842997074127197, + "rewards/rejected": -5.74202299118042, "step": 3755 }, { "epoch": 2.0123766516139825, - "grad_norm": 13.938257971384212, + "grad_norm": 12.644607097895934, "learning_rate": 2.9518149400798063e-07, - "logits/chosen": 0.7997492551803589, - "logits/rejected": 0.8458935022354126, - "logps/chosen": -9.3661470413208, - "logps/rejected": -10.73242473602295, - "loss": 0.4011, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.3661470413208, - "rewards/margins": 1.3662781715393066, - "rewards/rejected": -10.73242473602295, - "semantic_entropy": 0.0021405040752142668, + "logits/chosen": -0.22544129192829132, + "logits/rejected": -0.15492400527000427, + "logps/chosen": -4.618565082550049, + "logps/rejected": -5.908899784088135, + "loss": 0.3958, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.618565082550049, + "rewards/margins": 1.2903343439102173, + "rewards/rejected": -5.908899784088135, "step": 3760 }, { "epoch": 2.0150526843953838, - "grad_norm": 21.92333227216617, + "grad_norm": 20.996949929083215, "learning_rate": 2.9376175147185633e-07, - "logits/chosen": 0.7903780937194824, - "logits/rejected": 0.8987275958061218, - "logps/chosen": -9.55534553527832, - "logps/rejected": -10.687799453735352, - "loss": 0.4631, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.55534553527832, - "rewards/margins": 1.132454514503479, - "rewards/rejected": -10.687799453735352, - "semantic_entropy": 0.0018273256719112396, + "logits/chosen": -0.18929916620254517, + "logits/rejected": 0.03444962576031685, + "logps/chosen": -4.787485122680664, + "logps/rejected": -5.948728561401367, + "loss": 0.4556, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.787485122680664, + "rewards/margins": 1.161243200302124, + "rewards/rejected": -5.948728561401367, "step": 3765 }, { "epoch": 2.0177287171767855, - "grad_norm": 21.21705630664952, + "grad_norm": 26.161558999114487, "learning_rate": 2.9234400986012376e-07, - "logits/chosen": 0.7476860284805298, - "logits/rejected": 0.8315266370773315, - "logps/chosen": -9.206350326538086, - "logps/rejected": -10.599584579467773, - "loss": 0.3865, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.206350326538086, - "rewards/margins": 1.3932336568832397, - "rewards/rejected": -10.599584579467773, - "semantic_entropy": 0.002457220805808902, + "logits/chosen": -0.24766620993614197, + "logits/rejected": -0.0568489208817482, + "logps/chosen": -4.606505870819092, + "logps/rejected": -5.992148399353027, + "loss": 0.3976, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.606505870819092, + "rewards/margins": 1.3856418132781982, + "rewards/rejected": -5.992148399353027, "step": 3770 }, { "epoch": 2.020404749958187, - "grad_norm": 23.717702676896593, + "grad_norm": 21.476009987366535, "learning_rate": 2.9092828292771817e-07, - "logits/chosen": 0.8404645919799805, - "logits/rejected": 0.8653911352157593, - "logps/chosen": -9.488239288330078, - "logps/rejected": -10.67754077911377, - "loss": 0.4216, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.488239288330078, - "rewards/margins": 1.1893017292022705, - "rewards/rejected": -10.67754077911377, - "semantic_entropy": 0.001736976788379252, + "logits/chosen": -0.12161125987768173, + "logits/rejected": -0.06397388130426407, + "logps/chosen": -4.77782678604126, + "logps/rejected": -5.994265079498291, + "loss": 0.4165, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.77782678604126, + "rewards/margins": 1.2164382934570312, + "rewards/rejected": -5.994265079498291, "step": 3775 }, { "epoch": 2.0230807827395885, - "grad_norm": 18.026440727909602, + "grad_norm": 17.35934570379457, "learning_rate": 2.8951458441002875e-07, - "logits/chosen": 0.7588644027709961, - "logits/rejected": 0.804205596446991, - "logps/chosen": -9.2258882522583, - "logps/rejected": -10.435041427612305, - "loss": 0.4317, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.2258882522583, - "rewards/margins": 1.2091554403305054, - "rewards/rejected": -10.435041427612305, - "semantic_entropy": 0.0019264190923422575, + "logits/chosen": -0.1698736995458603, + "logits/rejected": -0.12036323547363281, + "logps/chosen": -4.743053913116455, + "logps/rejected": -6.029024600982666, + "loss": 0.3993, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.743053913116455, + "rewards/margins": 1.28597092628479, + "rewards/rejected": -6.029024600982666, "step": 3780 }, { "epoch": 2.02575681552099, - "grad_norm": 17.35712096542329, + "grad_norm": 13.558588937520051, "learning_rate": 2.881029280227643e-07, - "logits/chosen": 0.7276099324226379, - "logits/rejected": 0.8241073489189148, - "logps/chosen": -9.210673332214355, - "logps/rejected": -10.420351028442383, - "loss": 0.443, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.210673332214355, - "rewards/margins": 1.2096776962280273, - "rewards/rejected": -10.420351028442383, - "semantic_entropy": 0.0021508794743567705, + "logits/chosen": -0.18389973044395447, + "logits/rejected": -0.002217628061771393, + "logps/chosen": -4.9552812576293945, + "logps/rejected": -6.147763729095459, + "loss": 0.4335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.9552812576293945, + "rewards/margins": 1.1924822330474854, + "rewards/rejected": -6.147763729095459, "step": 3785 }, { "epoch": 2.028432848302392, - "grad_norm": 13.520740192725968, + "grad_norm": 15.721236781027926, "learning_rate": 2.8669332746182177e-07, - "logits/chosen": 0.6945358514785767, - "logits/rejected": 0.7733569741249084, - "logps/chosen": -9.187652587890625, - "logps/rejected": -10.517842292785645, - "loss": 0.3952, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.187652587890625, - "rewards/margins": 1.3301887512207031, - "rewards/rejected": -10.517842292785645, - "semantic_entropy": 0.0021430773194879293, + "logits/chosen": -0.22995606064796448, + "logits/rejected": -0.043004631996154785, + "logps/chosen": -4.778521537780762, + "logps/rejected": -6.072896957397461, + "loss": 0.4149, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.778521537780762, + "rewards/margins": 1.2943751811981201, + "rewards/rejected": -6.072896957397461, "step": 3790 }, { "epoch": 2.031108881083793, - "grad_norm": 20.76831890859162, + "grad_norm": 19.71845743549807, "learning_rate": 2.8528579640315156e-07, - "logits/chosen": 0.7405019998550415, - "logits/rejected": 0.7900283336639404, - "logps/chosen": -9.107033729553223, - "logps/rejected": -10.247450828552246, - "loss": 0.4329, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.107033729553223, - "rewards/margins": 1.140415906906128, - "rewards/rejected": -10.247450828552246, - "semantic_entropy": 0.0022071374114602804, + "logits/chosen": -0.12280142307281494, + "logits/rejected": -0.09112267941236496, + "logps/chosen": -4.5373406410217285, + "logps/rejected": -5.633362770080566, + "loss": 0.4468, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.5373406410217285, + "rewards/margins": 1.0960224866867065, + "rewards/rejected": -5.633362770080566, "step": 3795 }, { "epoch": 2.033784913865195, - "grad_norm": 20.516974992601128, + "grad_norm": 25.420574729511074, "learning_rate": 2.8388034850262646e-07, - "logits/chosen": 0.7259657382965088, - "logits/rejected": 0.807550311088562, - "logps/chosen": -9.008337020874023, - "logps/rejected": -10.242483139038086, - "loss": 0.4152, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.008337020874023, - "rewards/margins": 1.2341454029083252, - "rewards/rejected": -10.242483139038086, - "semantic_entropy": 0.0025359108112752438, + "logits/chosen": -0.17109845578670502, + "logits/rejected": -0.00980319269001484, + "logps/chosen": -4.882327079772949, + "logps/rejected": -6.037213325500488, + "loss": 0.4348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.882327079772949, + "rewards/margins": 1.154886245727539, + "rewards/rejected": -6.037213325500488, "step": 3800 }, { "epoch": 2.0364609466465966, - "grad_norm": 24.572088168328634, + "grad_norm": 28.532488862416187, "learning_rate": 2.824769973959079e-07, - "logits/chosen": 0.7538091540336609, - "logits/rejected": 0.8418930172920227, - "logps/chosen": -9.286577224731445, - "logps/rejected": -10.427810668945312, - "loss": 0.4257, + "logits/chosen": -0.1478937864303589, + "logits/rejected": 0.007172366138547659, + "logps/chosen": -4.888308525085449, + "logps/rejected": -6.07503080368042, + "loss": 0.4125, "rewards/accuracies": 0.8125, - "rewards/chosen": -9.286577224731445, - "rewards/margins": 1.1412330865859985, - "rewards/rejected": -10.427810668945312, - "semantic_entropy": 0.0017364490777254105, + "rewards/chosen": -4.888308525085449, + "rewards/margins": 1.1867225170135498, + "rewards/rejected": -6.07503080368042, "step": 3805 }, { "epoch": 2.039136979427998, - "grad_norm": 18.5404432038831, + "grad_norm": 16.766727691313964, "learning_rate": 2.81075756698315e-07, - "logits/chosen": 0.7726496458053589, - "logits/rejected": 0.855174720287323, - "logps/chosen": -9.180562019348145, - "logps/rejected": -10.476685523986816, - "loss": 0.3802, + "logits/chosen": -0.07402978837490082, + "logits/rejected": 0.020395079627633095, + "logps/chosen": -4.714168548583984, + "logps/rejected": -5.9696149826049805, + "loss": 0.389, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.180562019348145, - "rewards/margins": 1.29612398147583, - "rewards/rejected": -10.476685523986816, - "semantic_entropy": 0.0018982533365488052, + "rewards/chosen": -4.714168548583984, + "rewards/margins": 1.2554457187652588, + "rewards/rejected": -5.9696149826049805, "step": 3810 }, { "epoch": 2.0418130122093996, - "grad_norm": 18.915627570533157, + "grad_norm": 16.894883689533344, "learning_rate": 2.7967664000469035e-07, - "logits/chosen": 0.721420407295227, - "logits/rejected": 0.7838973999023438, - "logps/chosen": -9.233001708984375, - "logps/rejected": -10.553727149963379, - "loss": 0.3627, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.233001708984375, - "rewards/margins": 1.320725679397583, - "rewards/rejected": -10.553727149963379, - "semantic_entropy": 0.0020444700494408607, + "logits/chosen": -0.23170170187950134, + "logits/rejected": -0.10550439357757568, + "logps/chosen": -4.911438941955566, + "logps/rejected": -6.149017333984375, + "loss": 0.3761, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.911438941955566, + "rewards/margins": 1.2375777959823608, + "rewards/rejected": -6.149017333984375, "step": 3815 }, { "epoch": 2.0444890449908013, - "grad_norm": 18.14358020288826, + "grad_norm": 20.075347843639214, "learning_rate": 2.7827966088927095e-07, - "logits/chosen": 0.6938169598579407, - "logits/rejected": 0.8017823100090027, - "logps/chosen": -9.465526580810547, - "logps/rejected": -10.78095817565918, - "loss": 0.3999, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.465526580810547, - "rewards/margins": 1.315431833267212, - "rewards/rejected": -10.78095817565918, - "semantic_entropy": 0.001522608334198594, + "logits/chosen": -0.2513720393180847, + "logits/rejected": -0.006832315120846033, + "logps/chosen": -5.026620864868164, + "logps/rejected": -6.274362564086914, + "loss": 0.4177, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.026620864868164, + "rewards/margins": 1.2477415800094604, + "rewards/rejected": -6.274362564086914, "step": 3820 }, { "epoch": 2.0471650777722026, - "grad_norm": 17.57535140312262, + "grad_norm": 17.589562551423292, "learning_rate": 2.768848329055538e-07, - "logits/chosen": 0.7879313230514526, - "logits/rejected": 0.8248831629753113, - "logps/chosen": -9.305051803588867, - "logps/rejected": -10.57546615600586, - "loss": 0.3852, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.305051803588867, - "rewards/margins": 1.2704143524169922, - "rewards/rejected": -10.57546615600586, - "semantic_entropy": 0.0018570246174931526, + "logits/chosen": -0.1549982875585556, + "logits/rejected": -0.07923219352960587, + "logps/chosen": -4.768731117248535, + "logps/rejected": -6.0433831214904785, + "loss": 0.391, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.768731117248535, + "rewards/margins": 1.2746517658233643, + "rewards/rejected": -6.0433831214904785, "step": 3825 }, { "epoch": 2.0498411105536043, - "grad_norm": 20.119447880874766, + "grad_norm": 21.314235443602783, "learning_rate": 2.7549216958616657e-07, - "logits/chosen": 0.7350586652755737, - "logits/rejected": 0.8105939030647278, - "logps/chosen": -9.496885299682617, - "logps/rejected": -10.835896492004395, - "loss": 0.3968, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.496885299682617, - "rewards/margins": 1.33901047706604, - "rewards/rejected": -10.835896492004395, - "semantic_entropy": 0.0016353337559849024, + "logits/chosen": -0.23541080951690674, + "logits/rejected": -0.04748424142599106, + "logps/chosen": -5.179015636444092, + "logps/rejected": -6.491351127624512, + "loss": 0.4, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.179015636444092, + "rewards/margins": 1.312335729598999, + "rewards/rejected": -6.491351127624512, "step": 3830 }, { "epoch": 2.052517143335006, - "grad_norm": 15.071984684362736, + "grad_norm": 15.035171264869241, "learning_rate": 2.741016844427344e-07, - "logits/chosen": 0.7667199969291687, - "logits/rejected": 0.8489478826522827, - "logps/chosen": -9.403945922851562, - "logps/rejected": -10.778000831604004, - "loss": 0.3713, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.403945922851562, - "rewards/margins": 1.3740556240081787, - "rewards/rejected": -10.778000831604004, - "semantic_entropy": 0.001894004992209375, + "logits/chosen": -0.15218669176101685, + "logits/rejected": 0.024903254583477974, + "logps/chosen": -5.049431800842285, + "logps/rejected": -6.365915298461914, + "loss": 0.3989, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.049431800842285, + "rewards/margins": 1.316483736038208, + "rewards/rejected": -6.365915298461914, "step": 3835 }, { "epoch": 2.0551931761164073, - "grad_norm": 17.801746580163428, + "grad_norm": 17.14464397622768, "learning_rate": 2.7271339096575073e-07, - "logits/chosen": 0.7659896612167358, - "logits/rejected": 0.8438106775283813, - "logps/chosen": -9.306072235107422, - "logps/rejected": -10.509511947631836, - "loss": 0.4364, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.306072235107422, - "rewards/margins": 1.2034391164779663, - "rewards/rejected": -10.509511947631836, - "semantic_entropy": 0.00206328509375453, + "logits/chosen": -0.08245357125997543, + "logits/rejected": 0.0631280392408371, + "logps/chosen": -4.732628345489502, + "logps/rejected": -6.076385498046875, + "loss": 0.395, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.732628345489502, + "rewards/margins": 1.3437573909759521, + "rewards/rejected": -6.076385498046875, "step": 3840 }, { "epoch": 2.057869208897809, - "grad_norm": 16.02571773917183, + "grad_norm": 14.330383285037659, "learning_rate": 2.713273026244446e-07, - "logits/chosen": 0.7731425166130066, - "logits/rejected": 0.860715389251709, - "logps/chosen": -9.523012161254883, - "logps/rejected": -10.883203506469727, - "loss": 0.3804, + "logits/chosen": -0.2445097416639328, + "logits/rejected": 0.0006406203028745949, + "logps/chosen": -5.056388854980469, + "logps/rejected": -6.427834510803223, + "loss": 0.3647, "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.523012161254883, - "rewards/margins": 1.3601921796798706, - "rewards/rejected": -10.883203506469727, - "semantic_entropy": 0.001517820986919105, + "rewards/chosen": -5.056388854980469, + "rewards/margins": 1.371445655822754, + "rewards/rejected": -6.427834510803223, "step": 3845 }, { "epoch": 2.0605452416792107, - "grad_norm": 17.59552798096768, + "grad_norm": 18.778835155558617, "learning_rate": 2.6994343286665156e-07, - "logits/chosen": 0.7494341731071472, - "logits/rejected": 0.8325299024581909, - "logps/chosen": -9.523847579956055, - "logps/rejected": -10.638973236083984, - "loss": 0.4481, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.523847579956055, - "rewards/margins": 1.1151244640350342, - "rewards/rejected": -10.638973236083984, - "semantic_entropy": 0.0015663004014641047, + "logits/chosen": -0.1911519318819046, + "logits/rejected": 0.011799529194831848, + "logps/chosen": -5.281463146209717, + "logps/rejected": -6.34710693359375, + "loss": 0.4493, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.281463146209717, + "rewards/margins": 1.0656429529190063, + "rewards/rejected": -6.34710693359375, "step": 3850 }, { "epoch": 2.063221274460612, - "grad_norm": 21.242971901873577, + "grad_norm": 20.90226393628897, "learning_rate": 2.6856179511868156e-07, - "logits/chosen": 0.7531827092170715, - "logits/rejected": 0.8257854580879211, - "logps/chosen": -9.41790771484375, - "logps/rejected": -10.846906661987305, - "loss": 0.428, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.41790771484375, - "rewards/margins": 1.428999423980713, - "rewards/rejected": -10.846906661987305, - "semantic_entropy": 0.001963115995749831, + "logits/chosen": -0.10465452820062637, + "logits/rejected": 0.09160800278186798, + "logps/chosen": -5.096871376037598, + "logps/rejected": -6.543315887451172, + "loss": 0.4114, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.096871376037598, + "rewards/margins": 1.4464445114135742, + "rewards/rejected": -6.543315887451172, "step": 3855 }, { "epoch": 2.0658973072420137, - "grad_norm": 23.548898424244427, + "grad_norm": 20.878826905739967, "learning_rate": 2.6718240278519056e-07, - "logits/chosen": 0.7559301853179932, - "logits/rejected": 0.8003666996955872, - "logps/chosen": -9.45875072479248, - "logps/rejected": -10.853038787841797, - "loss": 0.4169, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.45875072479248, - "rewards/margins": 1.3942878246307373, - "rewards/rejected": -10.853038787841797, - "semantic_entropy": 0.0015923971077427268, + "logits/chosen": -0.05282402038574219, + "logits/rejected": 0.07892007380723953, + "logps/chosen": -5.231921672821045, + "logps/rejected": -6.597329139709473, + "loss": 0.3978, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.231921672821045, + "rewards/margins": 1.3654074668884277, + "rewards/rejected": -6.597329139709473, "step": 3860 }, { "epoch": 2.0685733400234154, - "grad_norm": 20.46304047002647, + "grad_norm": 22.477036203578066, "learning_rate": 2.6580526924904866e-07, - "logits/chosen": 0.6976224780082703, - "logits/rejected": 0.7671376466751099, - "logps/chosen": -9.384844779968262, - "logps/rejected": -10.741189956665039, - "loss": 0.3793, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.384844779968262, - "rewards/margins": 1.3563454151153564, - "rewards/rejected": -10.741189956665039, - "semantic_entropy": 0.001644113683141768, + "logits/chosen": -0.2275126874446869, + "logits/rejected": -0.06498920917510986, + "logps/chosen": -4.901474952697754, + "logps/rejected": -6.1236677169799805, + "loss": 0.4147, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.901474952697754, + "rewards/margins": 1.2221925258636475, + "rewards/rejected": -6.1236677169799805, "step": 3865 }, { "epoch": 2.0712493728048167, - "grad_norm": 24.88011754633229, + "grad_norm": 20.68086759336096, "learning_rate": 2.6443040787121186e-07, - "logits/chosen": 0.6563600301742554, - "logits/rejected": 0.6912602782249451, - "logps/chosen": -9.349275588989258, - "logps/rejected": -10.556272506713867, - "loss": 0.4195, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.349275588989258, - "rewards/margins": 1.206997036933899, - "rewards/rejected": -10.556272506713867, - "semantic_entropy": 0.0016969643766060472, + "logits/chosen": -0.21154093742370605, + "logits/rejected": -0.09025412797927856, + "logps/chosen": -4.82380485534668, + "logps/rejected": -6.02224588394165, + "loss": 0.433, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.82380485534668, + "rewards/margins": 1.1984403133392334, + "rewards/rejected": -6.02224588394165, "step": 3870 }, { "epoch": 2.0739254055862184, - "grad_norm": 24.129234255509385, + "grad_norm": 20.90739267850495, "learning_rate": 2.6305783199059084e-07, - "logits/chosen": 0.7805946469306946, - "logits/rejected": 0.8489789962768555, - "logps/chosen": -9.523481369018555, - "logps/rejected": -10.783955574035645, - "loss": 0.4536, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.523481369018555, - "rewards/margins": 1.2604728937149048, - "rewards/rejected": -10.783955574035645, - "semantic_entropy": 0.001705177710391581, + "logits/chosen": -0.11696934700012207, + "logits/rejected": -0.012949606403708458, + "logps/chosen": -5.097446918487549, + "logps/rejected": -6.3307390213012695, + "loss": 0.4333, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.097446918487549, + "rewards/margins": 1.2332918643951416, + "rewards/rejected": -6.3307390213012695, "step": 3875 }, { "epoch": 2.07660143836762, - "grad_norm": 20.770855194305607, + "grad_norm": 20.18769667227976, "learning_rate": 2.6168755492392324e-07, - "logits/chosen": 0.7947415113449097, - "logits/rejected": 0.8732272982597351, - "logps/chosen": -9.23397159576416, - "logps/rejected": -10.698512077331543, - "loss": 0.3445, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.23397159576416, - "rewards/margins": 1.4645414352416992, - "rewards/rejected": -10.698512077331543, - "semantic_entropy": 0.0018425941234454513, + "logits/chosen": -0.14381468296051025, + "logits/rejected": 0.042654264718294144, + "logps/chosen": -4.64146614074707, + "logps/rejected": -6.061637878417969, + "loss": 0.3651, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.64146614074707, + "rewards/margins": 1.4201723337173462, + "rewards/rejected": -6.061637878417969, "step": 3880 }, { "epoch": 2.0792774711490214, - "grad_norm": 19.657773873554152, + "grad_norm": 16.84859966761584, "learning_rate": 2.6031958996564274e-07, - "logits/chosen": 0.7870718240737915, - "logits/rejected": 0.8257455825805664, - "logps/chosen": -9.316213607788086, - "logps/rejected": -10.864585876464844, - "loss": 0.3707, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.316213607788086, - "rewards/margins": 1.5483721494674683, - "rewards/rejected": -10.864585876464844, - "semantic_entropy": 0.001938262372277677, + "logits/chosen": -0.12183308601379395, + "logits/rejected": -0.0017509430181235075, + "logps/chosen": -4.637547016143799, + "logps/rejected": -6.123723030090332, + "loss": 0.3572, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.637547016143799, + "rewards/margins": 1.4861762523651123, + "rewards/rejected": -6.123723030090332, "step": 3885 }, { "epoch": 2.081953503930423, - "grad_norm": 29.398133134663638, + "grad_norm": 21.7378268499826, "learning_rate": 2.589539503877518e-07, - "logits/chosen": 0.7874363660812378, - "logits/rejected": 0.8331443667411804, - "logps/chosen": -9.478517532348633, - "logps/rejected": -10.828798294067383, - "loss": 0.4304, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.478517532348633, - "rewards/margins": 1.3502806425094604, - "rewards/rejected": -10.828798294067383, - "semantic_entropy": 0.0018109595403075218, + "logits/chosen": -0.06419085711240768, + "logits/rejected": 0.041303399950265884, + "logps/chosen": -4.878803253173828, + "logps/rejected": -6.0687665939331055, + "loss": 0.4373, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.878803253173828, + "rewards/margins": 1.1899635791778564, + "rewards/rejected": -6.0687665939331055, "step": 3890 }, { "epoch": 2.084629536711825, - "grad_norm": 17.54106321474661, + "grad_norm": 17.573045037665448, "learning_rate": 2.5759064943969125e-07, - "logits/chosen": 0.7402059435844421, - "logits/rejected": 0.8292325735092163, - "logps/chosen": -9.550467491149902, - "logps/rejected": -10.937161445617676, - "loss": 0.3934, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.550467491149902, - "rewards/margins": 1.3866939544677734, - "rewards/rejected": -10.937161445617676, - "semantic_entropy": 0.0017161194700747728, + "logits/chosen": -0.12569046020507812, + "logits/rejected": 0.10935060679912567, + "logps/chosen": -4.984543800354004, + "logps/rejected": -6.226747512817383, + "loss": 0.4412, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.984543800354004, + "rewards/margins": 1.242203712463379, + "rewards/rejected": -6.226747512817383, "step": 3895 }, { "epoch": 2.087305569493226, - "grad_norm": 17.162272687965423, + "grad_norm": 17.994921209588533, "learning_rate": 2.562297003482131e-07, - "logits/chosen": 0.800572395324707, - "logits/rejected": 0.8477448225021362, - "logps/chosen": -9.5453462600708, - "logps/rejected": -10.9487943649292, - "loss": 0.3582, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.5453462600708, - "rewards/margins": 1.4034483432769775, - "rewards/rejected": -10.9487943649292, - "semantic_entropy": 0.0017864892724901438, + "logits/chosen": -0.017951691523194313, + "logits/rejected": 0.02780626341700554, + "logps/chosen": -4.891966819763184, + "logps/rejected": -6.207056999206543, + "loss": 0.3951, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.891966819763184, + "rewards/margins": 1.315090298652649, + "rewards/rejected": -6.207056999206543, "step": 3900 }, { "epoch": 2.089981602274628, - "grad_norm": 18.16669999802219, + "grad_norm": 18.26276313499837, "learning_rate": 2.548711163172512e-07, - "logits/chosen": 0.7785830497741699, - "logits/rejected": 0.8476356267929077, - "logps/chosen": -9.759795188903809, - "logps/rejected": -11.034205436706543, - "loss": 0.4168, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.759795188903809, - "rewards/margins": 1.2744102478027344, - "rewards/rejected": -11.034205436706543, - "semantic_entropy": 0.0020329877734184265, + "logits/chosen": -0.04082224518060684, + "logits/rejected": 0.055669914931058884, + "logps/chosen": -5.117550849914551, + "logps/rejected": -6.314211368560791, + "loss": 0.4383, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.117550849914551, + "rewards/margins": 1.1966601610183716, + "rewards/rejected": -6.314211368560791, "step": 3905 }, { "epoch": 2.0926576350560295, - "grad_norm": 21.502409164431704, + "grad_norm": 17.82877799265226, "learning_rate": 2.53514910527794e-07, - "logits/chosen": 0.8269200325012207, - "logits/rejected": 0.8657910227775574, - "logps/chosen": -9.461301803588867, - "logps/rejected": -10.762018203735352, - "loss": 0.3961, + "logits/chosen": -0.07770512998104095, + "logits/rejected": 0.060707222670316696, + "logps/chosen": -4.73898983001709, + "logps/rejected": -5.966341495513916, + "loss": 0.4057, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.461301803588867, - "rewards/margins": 1.300715446472168, - "rewards/rejected": -10.762018203735352, - "semantic_entropy": 0.0016499152407050133, + "rewards/chosen": -4.73898983001709, + "rewards/margins": 1.2273520231246948, + "rewards/rejected": -5.966341495513916, "step": 3910 }, { "epoch": 2.095333667837431, - "grad_norm": 22.127771721745866, + "grad_norm": 21.777415941529306, "learning_rate": 2.5216109613775573e-07, - "logits/chosen": 0.7920838594436646, - "logits/rejected": 0.8535796403884888, - "logps/chosen": -9.840039253234863, - "logps/rejected": -11.020828247070312, - "loss": 0.4512, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.840039253234863, - "rewards/margins": 1.1807891130447388, - "rewards/rejected": -11.020828247070312, - "semantic_entropy": 0.0013382106553763151, + "logits/chosen": -0.09608994424343109, + "logits/rejected": 0.054783664643764496, + "logps/chosen": -5.109392166137695, + "logps/rejected": -6.221595764160156, + "loss": 0.474, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.109392166137695, + "rewards/margins": 1.1122041940689087, + "rewards/rejected": -6.221595764160156, "step": 3915 }, { "epoch": 2.0980097006188325, - "grad_norm": 21.03906625443135, + "grad_norm": 18.643080053428573, "learning_rate": 2.5080968628184993e-07, - "logits/chosen": 0.7727931141853333, - "logits/rejected": 0.8665952682495117, - "logps/chosen": -9.525362968444824, - "logps/rejected": -11.037253379821777, - "loss": 0.3669, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.525362968444824, - "rewards/margins": 1.5118907690048218, - "rewards/rejected": -11.037253379821777, - "semantic_entropy": 0.0015195768792182207, + "logits/chosen": -0.10473886877298355, + "logits/rejected": 0.06250426173210144, + "logps/chosen": -4.815760135650635, + "logps/rejected": -6.318374156951904, + "loss": 0.3616, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.815760135650635, + "rewards/margins": 1.5026142597198486, + "rewards/rejected": -6.318374156951904, "step": 3920 }, { "epoch": 2.1006857334002342, - "grad_norm": 17.048956060445978, + "grad_norm": 16.3784351179364, "learning_rate": 2.494606940714605e-07, - "logits/chosen": 0.7970033884048462, - "logits/rejected": 0.8288514018058777, - "logps/chosen": -9.431905746459961, - "logps/rejected": -10.85214900970459, - "loss": 0.3827, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.431905746459961, - "rewards/margins": 1.4202440977096558, - "rewards/rejected": -10.85214900970459, - "semantic_entropy": 0.001813689828850329, + "logits/chosen": -0.08500103652477264, + "logits/rejected": 0.00681284349411726, + "logps/chosen": -4.789070129394531, + "logps/rejected": -6.177746295928955, + "loss": 0.3923, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.789070129394531, + "rewards/margins": 1.3886754512786865, + "rewards/rejected": -6.177746295928955, "step": 3925 }, { "epoch": 2.103361766181636, - "grad_norm": 15.3880445844123, + "grad_norm": 15.50671936077575, "learning_rate": 2.4811413259451625e-07, - "logits/chosen": 0.7811511158943176, - "logits/rejected": 0.860381007194519, - "logps/chosen": -9.466936111450195, - "logps/rejected": -10.93088436126709, - "loss": 0.376, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.466936111450195, - "rewards/margins": 1.4639488458633423, - "rewards/rejected": -10.93088436126709, - "semantic_entropy": 0.0018378589302301407, + "logits/chosen": -0.1778106391429901, + "logits/rejected": 0.011871201917529106, + "logps/chosen": -5.102924823760986, + "logps/rejected": -6.514554023742676, + "loss": 0.4073, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.102924823760986, + "rewards/margins": 1.411629319190979, + "rewards/rejected": -6.514554023742676, "step": 3930 }, { "epoch": 2.106037798963037, - "grad_norm": 15.913978912005007, + "grad_norm": 17.34634062708944, "learning_rate": 2.46770014915362e-07, - "logits/chosen": 0.7653626203536987, - "logits/rejected": 0.8481870889663696, - "logps/chosen": -9.565362930297852, - "logps/rejected": -10.939167022705078, - "loss": 0.3977, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.565362930297852, - "rewards/margins": 1.3738042116165161, - "rewards/rejected": -10.939167022705078, - "semantic_entropy": 0.0015822149580344558, + "logits/chosen": -0.11574114859104156, + "logits/rejected": 0.0067810178734362125, + "logps/chosen": -5.015803337097168, + "logps/rejected": -6.365666389465332, + "loss": 0.4064, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.015803337097168, + "rewards/margins": 1.3498625755310059, + "rewards/rejected": -6.365666389465332, "step": 3935 }, { "epoch": 2.108713831744439, - "grad_norm": 27.05791308561908, + "grad_norm": 25.575098783434893, "learning_rate": 2.45428354074634e-07, - "logits/chosen": 0.7272459268569946, - "logits/rejected": 0.768274188041687, - "logps/chosen": -9.656865119934082, - "logps/rejected": -10.986165046691895, - "loss": 0.4415, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.656865119934082, - "rewards/margins": 1.3292994499206543, - "rewards/rejected": -10.986165046691895, - "semantic_entropy": 0.0016760114813223481, + "logits/chosen": -0.13069704174995422, + "logits/rejected": -0.04793712496757507, + "logps/chosen": -5.103211402893066, + "logps/rejected": -6.425663948059082, + "loss": 0.4312, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.103211402893066, + "rewards/margins": 1.3224518299102783, + "rewards/rejected": -6.425663948059082, "step": 3940 }, { "epoch": 2.1113898645258407, - "grad_norm": 24.89803710573063, + "grad_norm": 21.809840576401907, "learning_rate": 2.4408916308913105e-07, - "logits/chosen": 0.7583307027816772, - "logits/rejected": 0.8247106671333313, - "logps/chosen": -9.691811561584473, - "logps/rejected": -10.66722297668457, - "loss": 0.4955, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.691811561584473, - "rewards/margins": 0.9754101634025574, - "rewards/rejected": -10.66722297668457, - "semantic_entropy": 0.0014091429766267538, + "logits/chosen": -0.1387799084186554, + "logits/rejected": 0.040830980986356735, + "logps/chosen": -5.29327392578125, + "logps/rejected": -6.331910610198975, + "loss": 0.4772, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.29327392578125, + "rewards/margins": 1.038636565208435, + "rewards/rejected": -6.331910610198975, "step": 3945 }, { "epoch": 2.114065897307242, - "grad_norm": 27.703608412459293, + "grad_norm": 29.450800920794595, "learning_rate": 2.4275245495169025e-07, - "logits/chosen": 0.8197698593139648, - "logits/rejected": 0.9059907793998718, - "logps/chosen": -9.506436347961426, - "logps/rejected": -10.890914916992188, - "loss": 0.4066, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.506436347961426, - "rewards/margins": 1.3844783306121826, - "rewards/rejected": -10.890914916992188, - "semantic_entropy": 0.0022954349406063557, + "logits/chosen": -0.053323328495025635, + "logits/rejected": 0.11461678892374039, + "logps/chosen": -4.997078895568848, + "logps/rejected": -6.351992607116699, + "loss": 0.4049, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.997078895568848, + "rewards/margins": 1.3549132347106934, + "rewards/rejected": -6.351992607116699, "step": 3950 }, { "epoch": 2.1167419300886436, - "grad_norm": 23.22557014778233, + "grad_norm": 27.688323064775474, "learning_rate": 2.414182426310597e-07, - "logits/chosen": 0.758955180644989, - "logits/rejected": 0.8061805963516235, - "logps/chosen": -9.516363143920898, - "logps/rejected": -10.984567642211914, - "loss": 0.4001, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.516363143920898, - "rewards/margins": 1.4682044982910156, - "rewards/rejected": -10.984567642211914, - "semantic_entropy": 0.0018032476073130965, + "logits/chosen": -0.16766704618930817, + "logits/rejected": -0.10662659257650375, + "logps/chosen": -4.863587379455566, + "logps/rejected": -6.320513725280762, + "loss": 0.3918, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.863587379455566, + "rewards/margins": 1.4569275379180908, + "rewards/rejected": -6.320513725280762, "step": 3955 }, { "epoch": 2.1194179628700454, - "grad_norm": 13.701261098953237, + "grad_norm": 14.96418211158849, "learning_rate": 2.400865390717734e-07, - "logits/chosen": 0.7926728129386902, - "logits/rejected": 0.8731076121330261, - "logps/chosen": -9.494871139526367, - "logps/rejected": -11.167115211486816, - "loss": 0.3389, - "rewards/accuracies": 0.875, - "rewards/chosen": -9.494871139526367, - "rewards/margins": 1.6722424030303955, - "rewards/rejected": -11.167115211486816, - "semantic_entropy": 0.0018036758992820978, + "logits/chosen": -0.09762614965438843, + "logits/rejected": 0.021318193525075912, + "logps/chosen": -4.821589469909668, + "logps/rejected": -6.438128471374512, + "loss": 0.3493, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.821589469909668, + "rewards/margins": 1.616538643836975, + "rewards/rejected": -6.438128471374512, "step": 3960 }, { "epoch": 2.1220939956514466, - "grad_norm": 20.712815017204772, + "grad_norm": 22.196024989069254, "learning_rate": 2.3875735719402475e-07, - "logits/chosen": 0.7888078093528748, - "logits/rejected": 0.8680068850517273, - "logps/chosen": -9.749283790588379, - "logps/rejected": -11.215142250061035, - "loss": 0.3831, + "logits/chosen": -0.11850368976593018, + "logits/rejected": 0.029255172237753868, + "logps/chosen": -5.135270595550537, + "logps/rejected": -6.527970314025879, + "loss": 0.3877, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.749283790588379, - "rewards/margins": 1.4658589363098145, - "rewards/rejected": -11.215142250061035, - "semantic_entropy": 0.0016434881836175919, + "rewards/chosen": -5.135270595550537, + "rewards/margins": 1.3926994800567627, + "rewards/rejected": -6.527970314025879, "step": 3965 }, { "epoch": 2.1247700284328483, - "grad_norm": 20.343949879193467, + "grad_norm": 17.290987731313717, "learning_rate": 2.3743070989354258e-07, - "logits/chosen": 0.8662029504776001, - "logits/rejected": 0.9218491315841675, - "logps/chosen": -9.647726058959961, - "logps/rejected": -11.065472602844238, - "loss": 0.4375, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.647726058959961, - "rewards/margins": 1.4177464246749878, - "rewards/rejected": -11.065472602844238, - "semantic_entropy": 0.001984253991395235, + "logits/chosen": -0.08603024482727051, + "logits/rejected": 0.024375446140766144, + "logps/chosen": -4.8245344161987305, + "logps/rejected": -6.2036261558532715, + "loss": 0.4437, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.8245344161987305, + "rewards/margins": 1.3790913820266724, + "rewards/rejected": -6.2036261558532715, "step": 3970 }, { "epoch": 2.12744606121425, - "grad_norm": 24.494673317414936, + "grad_norm": 18.002508242918157, "learning_rate": 2.3610661004146454e-07, - "logits/chosen": 0.8529459834098816, - "logits/rejected": 0.9132159352302551, - "logps/chosen": -9.46276569366455, - "logps/rejected": -10.809822082519531, - "loss": 0.3669, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -9.46276569366455, - "rewards/margins": 1.3470571041107178, - "rewards/rejected": -10.809822082519531, - "semantic_entropy": 0.0018992737168446183, + "logits/chosen": -0.06315483152866364, + "logits/rejected": 0.0743742361664772, + "logps/chosen": -4.623227596282959, + "logps/rejected": -6.003174781799316, + "loss": 0.3739, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.623227596282959, + "rewards/margins": 1.3799468278884888, + "rewards/rejected": -6.003174781799316, "step": 3975 }, { "epoch": 2.1301220939956513, - "grad_norm": 19.20248865933138, + "grad_norm": 18.099356334155456, "learning_rate": 2.3478507048421314e-07, - "logits/chosen": 0.8148723840713501, - "logits/rejected": 0.8485409617424011, - "logps/chosen": -9.510710716247559, - "logps/rejected": -11.025351524353027, - "loss": 0.3994, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.510710716247559, - "rewards/margins": 1.5146404504776, - "rewards/rejected": -11.025351524353027, - "semantic_entropy": 0.0019136825576424599, + "logits/chosen": -0.1367701292037964, + "logits/rejected": -0.0338885597884655, + "logps/chosen": -4.558551788330078, + "logps/rejected": -5.884751319885254, + "loss": 0.4214, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.558551788330078, + "rewards/margins": 1.3261990547180176, + "rewards/rejected": -5.884751319885254, "step": 3980 }, { "epoch": 2.132798126777053, - "grad_norm": 28.472850279807695, + "grad_norm": 25.5909902076846, "learning_rate": 2.334661040433713e-07, - "logits/chosen": 0.7736892700195312, - "logits/rejected": 0.8397472500801086, - "logps/chosen": -9.476969718933105, - "logps/rejected": -10.861968040466309, - "loss": 0.3791, + "logits/chosen": -0.20346641540527344, + "logits/rejected": -0.07897375524044037, + "logps/chosen": -4.5196943283081055, + "logps/rejected": -5.8219733238220215, + "loss": 0.3961, "rewards/accuracies": 0.84375, - "rewards/chosen": -9.476969718933105, - "rewards/margins": 1.3849985599517822, - "rewards/rejected": -10.861968040466309, - "semantic_entropy": 0.0017747702077031136, + "rewards/chosen": -4.5196943283081055, + "rewards/margins": 1.3022788763046265, + "rewards/rejected": -5.8219733238220215, "step": 3985 }, { "epoch": 2.1354741595584548, - "grad_norm": 19.047738461075404, + "grad_norm": 18.747429939961986, "learning_rate": 2.321497235155568e-07, - "logits/chosen": 0.7408386468887329, - "logits/rejected": 0.7888758778572083, - "logps/chosen": -9.377801895141602, - "logps/rejected": -10.882084846496582, - "loss": 0.3476, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -9.377801895141602, - "rewards/margins": 1.5042815208435059, - "rewards/rejected": -10.882084846496582, - "semantic_entropy": 0.00191340665332973, + "logits/chosen": -0.17417511343955994, + "logits/rejected": -0.027044976130127907, + "logps/chosen": -4.79927921295166, + "logps/rejected": -6.180619716644287, + "loss": 0.3577, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.79927921295166, + "rewards/margins": 1.3813403844833374, + "rewards/rejected": -6.180619716644287, "step": 3990 }, { "epoch": 2.138150192339856, - "grad_norm": 28.183809111461034, + "grad_norm": 27.29620567884255, "learning_rate": 2.3083594167229965e-07, - "logits/chosen": 0.748810887336731, - "logits/rejected": 0.8732229471206665, - "logps/chosen": -9.67158317565918, - "logps/rejected": -11.065845489501953, - "loss": 0.4378, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.67158317565918, - "rewards/margins": 1.3942630290985107, - "rewards/rejected": -11.065845489501953, - "semantic_entropy": 0.00166232546325773, + "logits/chosen": -0.190862774848938, + "logits/rejected": 0.07824854552745819, + "logps/chosen": -4.996739387512207, + "logps/rejected": -6.331148624420166, + "loss": 0.419, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.996739387512207, + "rewards/margins": 1.3344093561172485, + "rewards/rejected": -6.331148624420166, "step": 3995 }, { "epoch": 2.1408262251212578, - "grad_norm": 25.264626186096123, + "grad_norm": 25.300269557275737, "learning_rate": 2.295247712599167e-07, - "logits/chosen": 0.806961178779602, - "logits/rejected": 0.8473097681999207, - "logps/chosen": -9.56501579284668, - "logps/rejected": -10.97465705871582, - "loss": 0.3953, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.56501579284668, - "rewards/margins": 1.4096405506134033, - "rewards/rejected": -10.97465705871582, - "semantic_entropy": 0.0016302301082760096, + "logits/chosen": -0.09739263355731964, + "logits/rejected": -0.017596794292330742, + "logps/chosen": -4.7809834480285645, + "logps/rejected": -6.144955158233643, + "loss": 0.4259, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.7809834480285645, + "rewards/margins": 1.3639717102050781, + "rewards/rejected": -6.144955158233643, "step": 4000 }, { "epoch": 2.1408262251212578, - "eval_logits/chosen": 0.9285687804222107, - "eval_logits/rejected": 0.978207528591156, - "eval_logps/chosen": -9.8406982421875, - "eval_logps/rejected": -10.940929412841797, - "eval_loss": 0.526120126247406, - "eval_rewards/accuracies": 0.719584584236145, - "eval_rewards/chosen": -9.8406982421875, - "eval_rewards/margins": 1.100231647491455, - "eval_rewards/rejected": -10.940929412841797, - "eval_runtime": 35.0954, - "eval_samples_per_second": 38.324, - "eval_semantic_entropy": 0.001473370473831892, - "eval_steps_per_second": 9.602, + "eval_logits/chosen": 0.21429647505283356, + "eval_logits/rejected": 0.31026172637939453, + "eval_logps/chosen": -5.137231826782227, + "eval_logps/rejected": -6.215559005737305, + "eval_loss": 0.5296037197113037, + "eval_rewards/accuracies": 0.7270029783248901, + "eval_rewards/chosen": -5.137231826782227, + "eval_rewards/margins": 1.0783268213272095, + "eval_rewards/rejected": -6.215559005737305, + "eval_runtime": 40.3196, + "eval_samples_per_second": 33.358, + "eval_steps_per_second": 8.358, "step": 4000 }, { "epoch": 2.1435022579026595, - "grad_norm": 23.667192452025375, + "grad_norm": 18.112565981519023, "learning_rate": 2.2821622499938948e-07, - "logits/chosen": 0.8243509531021118, - "logits/rejected": 0.9055337905883789, - "logps/chosen": -9.871681213378906, - "logps/rejected": -11.096675872802734, - "loss": 0.4597, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.871681213378906, - "rewards/margins": 1.224994421005249, - "rewards/rejected": -11.096675872802734, - "semantic_entropy": 0.0016837811563163996, + "logits/chosen": -0.08458664268255234, + "logits/rejected": 0.11572712659835815, + "logps/chosen": -5.339541912078857, + "logps/rejected": -6.523773193359375, + "loss": 0.4424, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.339541912078857, + "rewards/margins": 1.1842305660247803, + "rewards/rejected": -6.523773193359375, "step": 4005 }, { "epoch": 2.1461782906840607, - "grad_norm": 27.290675701525487, + "grad_norm": 22.33261746733283, "learning_rate": 2.269103155862391e-07, - "logits/chosen": 0.76947021484375, - "logits/rejected": 0.8418231010437012, - "logps/chosen": -9.789541244506836, - "logps/rejected": -10.954092979431152, - "loss": 0.4684, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.789541244506836, - "rewards/margins": 1.1645511388778687, - "rewards/rejected": -10.954092979431152, - "semantic_entropy": 0.0013261919375509024, + "logits/chosen": -0.15472854673862457, + "logits/rejected": -0.02648771181702614, + "logps/chosen": -4.984389781951904, + "logps/rejected": -6.1533203125, + "loss": 0.4479, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.984389781951904, + "rewards/margins": 1.1689307689666748, + "rewards/rejected": -6.1533203125, "step": 4010 }, { "epoch": 2.1488543234654625, - "grad_norm": 22.016730657163862, + "grad_norm": 22.27551483252129, "learning_rate": 2.2560705569040483e-07, - "logits/chosen": 0.7831665873527527, - "logits/rejected": 0.8772950172424316, - "logps/chosen": -9.797338485717773, - "logps/rejected": -11.046935081481934, - "loss": 0.4435, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.797338485717773, - "rewards/margins": 1.249597191810608, - "rewards/rejected": -11.046935081481934, - "semantic_entropy": 0.0014855300541967154, + "logits/chosen": -0.12389322370290756, + "logits/rejected": 0.11753962188959122, + "logps/chosen": -4.9815168380737305, + "logps/rejected": -6.161324977874756, + "loss": 0.4486, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.9815168380737305, + "rewards/margins": 1.1798080205917358, + "rewards/rejected": -6.161324977874756, "step": 4015 }, { "epoch": 2.151530356246864, - "grad_norm": 18.024857287329926, + "grad_norm": 15.820542641069238, "learning_rate": 2.2430645795611963e-07, - "logits/chosen": 0.7459646463394165, - "logits/rejected": 0.8185451626777649, - "logps/chosen": -9.737576484680176, - "logps/rejected": -11.164031982421875, - "loss": 0.3774, + "logits/chosen": -0.20444515347480774, + "logits/rejected": -0.04208673909306526, + "logps/chosen": -5.146454811096191, + "logps/rejected": -6.500970363616943, + "loss": 0.3866, "rewards/accuracies": 0.8125, - "rewards/chosen": -9.737576484680176, - "rewards/margins": 1.4264552593231201, - "rewards/rejected": -11.164031982421875, - "semantic_entropy": 0.0017724098870530725, + "rewards/chosen": -5.146454811096191, + "rewards/margins": 1.3545156717300415, + "rewards/rejected": -6.500970363616943, "step": 4020 }, { "epoch": 2.1542063890282654, - "grad_norm": 26.64615086075229, + "grad_norm": 25.598320140948662, "learning_rate": 2.230085350017884e-07, - "logits/chosen": 0.8524463772773743, - "logits/rejected": 0.8867918848991394, - "logps/chosen": -9.587358474731445, - "logps/rejected": -10.673149108886719, - "loss": 0.4612, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.587358474731445, - "rewards/margins": 1.0857917070388794, - "rewards/rejected": -10.673149108886719, - "semantic_entropy": 0.0018163727363571525, + "logits/chosen": -0.11888334900140762, + "logits/rejected": -0.006392848677933216, + "logps/chosen": -4.706912994384766, + "logps/rejected": -5.921402931213379, + "loss": 0.4402, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.706912994384766, + "rewards/margins": 1.2144898176193237, + "rewards/rejected": -5.921402931213379, "step": 4025 }, { "epoch": 2.156882421809667, - "grad_norm": 16.629223181883795, + "grad_norm": 17.213931855252735, "learning_rate": 2.2171329941986554e-07, - "logits/chosen": 0.7545696496963501, - "logits/rejected": 0.8315639495849609, - "logps/chosen": -9.440114974975586, - "logps/rejected": -10.97568130493164, - "loss": 0.35, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.440114974975586, - "rewards/margins": 1.5355665683746338, - "rewards/rejected": -10.97568130493164, - "semantic_entropy": 0.0017560431733727455, + "logits/chosen": -0.1645926535129547, + "logits/rejected": -0.04748887941241264, + "logps/chosen": -4.64597225189209, + "logps/rejected": -6.07511043548584, + "loss": 0.3516, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.64597225189209, + "rewards/margins": 1.4291377067565918, + "rewards/rejected": -6.07511043548584, "step": 4030 }, { "epoch": 2.159558454591069, - "grad_norm": 16.933778427501522, + "grad_norm": 19.326827646640584, "learning_rate": 2.2042076377673202e-07, - "logits/chosen": 0.7635836005210876, - "logits/rejected": 0.7833055853843689, - "logps/chosen": -9.408586502075195, - "logps/rejected": -10.607271194458008, - "loss": 0.4138, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.408586502075195, - "rewards/margins": 1.1986857652664185, - "rewards/rejected": -10.607271194458008, - "semantic_entropy": 0.0017117311945185065, + "logits/chosen": -0.1034109964966774, + "logits/rejected": -0.08365767449140549, + "logps/chosen": -4.7854743003845215, + "logps/rejected": -5.908322811126709, + "loss": 0.4373, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.7854743003845215, + "rewards/margins": 1.1228487491607666, + "rewards/rejected": -5.908322811126709, "step": 4035 }, { "epoch": 2.16223448737247, - "grad_norm": 23.28336770400718, + "grad_norm": 20.244396139070197, "learning_rate": 2.1913094061257476e-07, - "logits/chosen": 0.8093854784965515, - "logits/rejected": 0.8086700439453125, - "logps/chosen": -9.568441390991211, - "logps/rejected": -10.82945442199707, - "loss": 0.4363, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.568441390991211, - "rewards/margins": 1.2610145807266235, - "rewards/rejected": -10.82945442199707, - "semantic_entropy": 0.0016031649429351091, + "logits/chosen": -0.08402053266763687, + "logits/rejected": -0.08290643990039825, + "logps/chosen": -4.744356155395508, + "logps/rejected": -5.956753253936768, + "loss": 0.4092, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.744356155395508, + "rewards/margins": 1.2123966217041016, + "rewards/rejected": -5.956753253936768, "step": 4040 }, { "epoch": 2.164910520153872, - "grad_norm": 21.12058624108258, + "grad_norm": 21.05213555519116, "learning_rate": 2.178438424412633e-07, - "logits/chosen": 0.8389409780502319, - "logits/rejected": 0.8938524127006531, - "logps/chosen": -9.587557792663574, - "logps/rejected": -10.771153450012207, - "loss": 0.4503, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -9.587557792663574, - "rewards/margins": 1.1835949420928955, - "rewards/rejected": -10.771153450012207, - "semantic_entropy": 0.0016682265559211373, + "logits/chosen": -0.0868399515748024, + "logits/rejected": 0.047896645963191986, + "logps/chosen": -4.763881683349609, + "logps/rejected": -5.940675735473633, + "loss": 0.4391, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.763881683349609, + "rewards/margins": 1.1767939329147339, + "rewards/rejected": -5.940675735473633, "step": 4045 }, { "epoch": 2.1675865529352736, - "grad_norm": 31.758164864561788, + "grad_norm": 25.13718176362216, "learning_rate": 2.165594817502302e-07, - "logits/chosen": 0.8181732892990112, - "logits/rejected": 0.8720429539680481, - "logps/chosen": -9.709211349487305, - "logps/rejected": -10.695291519165039, - "loss": 0.5042, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.709211349487305, - "rewards/margins": 0.9860790371894836, - "rewards/rejected": -10.695291519165039, - "semantic_entropy": 0.0015186185482889414, + "logits/chosen": -0.17063532769680023, + "logits/rejected": -0.031031867489218712, + "logps/chosen": -5.0333662033081055, + "logps/rejected": -6.058009624481201, + "loss": 0.496, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -5.0333662033081055, + "rewards/margins": 1.0246433019638062, + "rewards/rejected": -6.058009624481201, "step": 4050 }, { "epoch": 2.170262585716675, - "grad_norm": 22.613433139861957, + "grad_norm": 21.615979190632327, "learning_rate": 2.1527787100034806e-07, - "logits/chosen": 0.8588092923164368, - "logits/rejected": 0.8922932744026184, - "logps/chosen": -9.441000938415527, - "logps/rejected": -10.503214836120605, - "loss": 0.4521, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.441000938415527, - "rewards/margins": 1.062213659286499, - "rewards/rejected": -10.503214836120605, - "semantic_entropy": 0.0017032899195328355, + "logits/chosen": -0.057680655270814896, + "logits/rejected": 0.01926508918404579, + "logps/chosen": -4.754338264465332, + "logps/rejected": -5.772378921508789, + "loss": 0.4446, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.754338264465332, + "rewards/margins": 1.0180412530899048, + "rewards/rejected": -5.772378921508789, "step": 4055 }, { "epoch": 2.1729386184980766, - "grad_norm": 18.889543716610927, + "grad_norm": 18.15191275770377, "learning_rate": 2.1399902262581037e-07, - "logits/chosen": 0.9263399243354797, - "logits/rejected": 0.967937171459198, - "logps/chosen": -9.547457695007324, - "logps/rejected": -10.7252836227417, - "loss": 0.4549, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.547457695007324, - "rewards/margins": 1.177826166152954, - "rewards/rejected": -10.7252836227417, - "semantic_entropy": 0.0017254750709980726, + "logits/chosen": -0.018411552533507347, + "logits/rejected": 0.14466091990470886, + "logps/chosen": -4.764962196350098, + "logps/rejected": -5.915167808532715, + "loss": 0.4655, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.764962196350098, + "rewards/margins": 1.1502063274383545, + "rewards/rejected": -5.915167808532715, "step": 4060 }, { "epoch": 2.1756146512794783, - "grad_norm": 20.68930586575862, + "grad_norm": 18.60840981132187, "learning_rate": 2.127229490340094e-07, - "logits/chosen": 0.7730456590652466, - "logits/rejected": 0.801898181438446, - "logps/chosen": -9.515592575073242, - "logps/rejected": -10.984514236450195, - "loss": 0.3715, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.515592575073242, - "rewards/margins": 1.4689228534698486, - "rewards/rejected": -10.984514236450195, - "semantic_entropy": 0.0015030469512566924, + "logits/chosen": -0.14498677849769592, + "logits/rejected": -0.061090052127838135, + "logps/chosen": -4.791174411773682, + "logps/rejected": -6.255467891693115, + "loss": 0.3654, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.791174411773682, + "rewards/margins": 1.4642934799194336, + "rewards/rejected": -6.255467891693115, "step": 4065 }, { "epoch": 2.1782906840608796, - "grad_norm": 24.806502277573657, + "grad_norm": 26.127045144370538, "learning_rate": 2.1144966260541698e-07, - "logits/chosen": 0.8492151498794556, - "logits/rejected": 0.9006759524345398, - "logps/chosen": -9.50521183013916, - "logps/rejected": -11.008363723754883, - "loss": 0.4001, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.50521183013916, - "rewards/margins": 1.5031511783599854, - "rewards/rejected": -11.008363723754883, - "semantic_entropy": 0.0017372198635712266, + "logits/chosen": -0.022417958825826645, + "logits/rejected": 0.16336455941200256, + "logps/chosen": -4.942807674407959, + "logps/rejected": -6.345865726470947, + "loss": 0.4281, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.942807674407959, + "rewards/margins": 1.4030578136444092, + "rewards/rejected": -6.345865726470947, "step": 4070 }, { "epoch": 2.1809667168422813, - "grad_norm": 19.324917536101047, + "grad_norm": 17.9428086270337, "learning_rate": 2.1017917569346332e-07, - "logits/chosen": 0.814143180847168, - "logits/rejected": 0.8865806460380554, - "logps/chosen": -9.452108383178711, - "logps/rejected": -10.87096118927002, - "loss": 0.3736, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.452108383178711, - "rewards/margins": 1.4188525676727295, - "rewards/rejected": -10.87096118927002, - "semantic_entropy": 0.0015151125844568014, + "logits/chosen": -0.10057320445775986, + "logits/rejected": 0.09053415060043335, + "logps/chosen": -4.8520002365112305, + "logps/rejected": -6.100706100463867, + "loss": 0.3995, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.8520002365112305, + "rewards/margins": 1.248705267906189, + "rewards/rejected": -6.100706100463867, "step": 4075 }, { "epoch": 2.183642749623683, - "grad_norm": 18.81790999844241, + "grad_norm": 16.365975127406138, "learning_rate": 2.0891150062441837e-07, - "logits/chosen": 0.7656540870666504, - "logits/rejected": 0.8226898312568665, - "logps/chosen": -9.545916557312012, - "logps/rejected": -10.923840522766113, - "loss": 0.4004, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.545916557312012, - "rewards/margins": 1.3779232501983643, - "rewards/rejected": -10.923840522766113, - "semantic_entropy": 0.0021084630861878395, + "logits/chosen": -0.17272022366523743, + "logits/rejected": -0.02849605120718479, + "logps/chosen": -4.930100917816162, + "logps/rejected": -6.2937822341918945, + "loss": 0.3989, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.930100917816162, + "rewards/margins": 1.3636811971664429, + "rewards/rejected": -6.2937822341918945, "step": 4080 }, { "epoch": 2.1863187824050843, - "grad_norm": 21.06645650645646, + "grad_norm": 21.320589440646135, "learning_rate": 2.0764664969727086e-07, - "logits/chosen": 0.8369568586349487, - "logits/rejected": 0.9141233563423157, - "logps/chosen": -9.500614166259766, - "logps/rejected": -10.795511245727539, - "loss": 0.3764, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.500614166259766, - "rewards/margins": 1.2948954105377197, - "rewards/rejected": -10.795511245727539, - "semantic_entropy": 0.0015911769587546587, + "logits/chosen": -0.10787606239318848, + "logits/rejected": -0.026618346571922302, + "logps/chosen": -4.602447509765625, + "logps/rejected": -5.905449390411377, + "loss": 0.3799, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.602447509765625, + "rewards/margins": 1.3030017614364624, + "rewards/rejected": -5.905449390411377, "step": 4085 }, { "epoch": 2.188994815186486, - "grad_norm": 21.33908508690175, + "grad_norm": 16.556071465795146, "learning_rate": 2.0638463518361033e-07, - "logits/chosen": 0.7690576314926147, - "logits/rejected": 0.8658930063247681, - "logps/chosen": -9.389853477478027, - "logps/rejected": -10.788896560668945, - "loss": 0.3887, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.389853477478027, - "rewards/margins": 1.3990432024002075, - "rewards/rejected": -10.788896560668945, - "semantic_entropy": 0.002215514425188303, + "logits/chosen": -0.2298423796892166, + "logits/rejected": -0.02418164536356926, + "logps/chosen": -4.732496738433838, + "logps/rejected": -6.079532623291016, + "loss": 0.3926, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.732496738433838, + "rewards/margins": 1.3470360040664673, + "rewards/rejected": -6.079532623291016, "step": 4090 }, { "epoch": 2.1916708479678877, - "grad_norm": 22.867041410821965, + "grad_norm": 24.992470021334146, "learning_rate": 2.0512546932750702e-07, - "logits/chosen": 0.7939780950546265, - "logits/rejected": 0.8460676074028015, - "logps/chosen": -9.5419282913208, - "logps/rejected": -10.808481216430664, - "loss": 0.3914, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.5419282913208, - "rewards/margins": 1.2665529251098633, - "rewards/rejected": -10.808481216430664, - "semantic_entropy": 0.0017644502222537994, + "logits/chosen": -0.17826859652996063, + "logits/rejected": -0.08537033945322037, + "logps/chosen": -4.960954189300537, + "logps/rejected": -6.143200874328613, + "loss": 0.413, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.960954189300537, + "rewards/margins": 1.1822469234466553, + "rewards/rejected": -6.143200874328613, "step": 4095 }, { "epoch": 2.194346880749289, - "grad_norm": 23.636550449380298, + "grad_norm": 23.36686606632312, "learning_rate": 2.0386916434539343e-07, - "logits/chosen": 0.8169394731521606, - "logits/rejected": 0.8834837675094604, - "logps/chosen": -9.410249710083008, - "logps/rejected": -10.827165603637695, - "loss": 0.3956, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.410249710083008, - "rewards/margins": 1.416915774345398, - "rewards/rejected": -10.827165603637695, - "semantic_entropy": 0.001839539734646678, + "logits/chosen": -0.1087033748626709, + "logits/rejected": 0.05511082336306572, + "logps/chosen": -4.639834403991699, + "logps/rejected": -6.106034278869629, + "loss": 0.3491, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.639834403991699, + "rewards/margins": 1.4661996364593506, + "rewards/rejected": -6.106034278869629, "step": 4100 }, { "epoch": 2.1970229135306907, - "grad_norm": 23.012604021963845, + "grad_norm": 22.41888363027506, "learning_rate": 2.0261573242594627e-07, - "logits/chosen": 0.853449821472168, - "logits/rejected": 0.9539750218391418, - "logps/chosen": -9.770658493041992, - "logps/rejected": -11.06202507019043, - "loss": 0.4337, + "logits/chosen": -0.09782413393259048, + "logits/rejected": 0.09498012065887451, + "logps/chosen": -5.267067909240723, + "logps/rejected": -6.525610446929932, + "loss": 0.425, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.770658493041992, - "rewards/margins": 1.2913668155670166, - "rewards/rejected": -11.06202507019043, - "semantic_entropy": 0.0014862673124298453, + "rewards/chosen": -5.267067909240723, + "rewards/margins": 1.2585420608520508, + "rewards/rejected": -6.525610446929932, "step": 4105 }, { "epoch": 2.1996989463120924, - "grad_norm": 25.51169802963628, + "grad_norm": 30.843444316257614, "learning_rate": 2.0136518572996724e-07, - "logits/chosen": 0.7688170671463013, - "logits/rejected": 0.8904584646224976, - "logps/chosen": -9.553049087524414, - "logps/rejected": -11.032699584960938, - "loss": 0.3851, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.553049087524414, - "rewards/margins": 1.4796515703201294, - "rewards/rejected": -11.032699584960938, - "semantic_entropy": 0.0020472349133342505, + "logits/chosen": -0.10866514593362808, + "logits/rejected": 0.09582415968179703, + "logps/chosen": -4.913560390472412, + "logps/rejected": -6.337634086608887, + "loss": 0.3818, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.913560390472412, + "rewards/margins": 1.4240734577178955, + "rewards/rejected": -6.337634086608887, "step": 4110 }, { "epoch": 2.202374979093494, - "grad_norm": 23.510129434710546, + "grad_norm": 19.114191510735747, "learning_rate": 2.0011753639026617e-07, - "logits/chosen": 0.7789877653121948, - "logits/rejected": 0.8432193994522095, - "logps/chosen": -9.628904342651367, - "logps/rejected": -10.928987503051758, - "loss": 0.4125, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.628904342651367, - "rewards/margins": 1.3000822067260742, - "rewards/rejected": -10.928987503051758, - "semantic_entropy": 0.001494646305218339, + "logits/chosen": -0.06542578339576721, + "logits/rejected": 0.03997331112623215, + "logps/chosen": -5.068845748901367, + "logps/rejected": -6.337197780609131, + "loss": 0.3987, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.068845748901367, + "rewards/margins": 1.2683517932891846, + "rewards/rejected": -6.337197780609131, "step": 4115 }, { "epoch": 2.2050510118748954, - "grad_norm": 25.695966110067108, + "grad_norm": 24.22664011368655, "learning_rate": 1.988727965115421e-07, - "logits/chosen": 0.8373724222183228, - "logits/rejected": 0.8486580848693848, - "logps/chosen": -9.40330696105957, - "logps/rejected": -10.690084457397461, - "loss": 0.4098, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.40330696105957, - "rewards/margins": 1.2867774963378906, - "rewards/rejected": -10.690084457397461, - "semantic_entropy": 0.0017762102652341127, + "logits/chosen": -0.10922899097204208, + "logits/rejected": 0.00666669150814414, + "logps/chosen": -4.851667881011963, + "logps/rejected": -6.107888221740723, + "loss": 0.4033, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.851667881011963, + "rewards/margins": 1.2562209367752075, + "rewards/rejected": -6.107888221740723, "step": 4120 }, { "epoch": 2.207727044656297, - "grad_norm": 17.76197982327494, + "grad_norm": 18.565365643960025, "learning_rate": 1.9763097817026713e-07, - "logits/chosen": 0.7693505883216858, - "logits/rejected": 0.8610559701919556, - "logps/chosen": -9.42241382598877, - "logps/rejected": -11.041936874389648, - "loss": 0.3312, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -9.42241382598877, - "rewards/margins": 1.619523286819458, - "rewards/rejected": -11.041936874389648, - "semantic_entropy": 0.0017702898476272821, + "logits/chosen": -0.14397795498371124, + "logits/rejected": 0.07487878948450089, + "logps/chosen": -5.009173393249512, + "logps/rejected": -6.57416296005249, + "loss": 0.3508, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.009173393249512, + "rewards/margins": 1.5649892091751099, + "rewards/rejected": -6.57416296005249, "step": 4125 }, { "epoch": 2.210403077437699, - "grad_norm": 17.95604315957035, + "grad_norm": 17.928197983166015, "learning_rate": 1.9639209341456796e-07, - "logits/chosen": 0.8016265630722046, - "logits/rejected": 0.8611480593681335, - "logps/chosen": -9.537622451782227, - "logps/rejected": -10.907397270202637, - "loss": 0.4239, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.537622451782227, - "rewards/margins": 1.3697750568389893, - "rewards/rejected": -10.907397270202637, - "semantic_entropy": 0.001632682979106903, + "logits/chosen": -0.052387069910764694, + "logits/rejected": 0.043669622391462326, + "logps/chosen": -5.129462242126465, + "logps/rejected": -6.466664791107178, + "loss": 0.4198, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.129462242126465, + "rewards/margins": 1.337203025817871, + "rewards/rejected": -6.466664791107178, "step": 4130 }, { "epoch": 2.2130791102191, - "grad_norm": 16.93384833519254, + "grad_norm": 16.249988956992546, "learning_rate": 1.951561542641102e-07, - "logits/chosen": 0.8083820343017578, - "logits/rejected": 0.8523054122924805, - "logps/chosen": -9.715473175048828, - "logps/rejected": -11.148602485656738, - "loss": 0.4369, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.715473175048828, - "rewards/margins": 1.4331295490264893, - "rewards/rejected": -11.148602485656738, - "semantic_entropy": 0.0015578053425997496, + "logits/chosen": -0.014087068848311901, + "logits/rejected": -0.011342120356857777, + "logps/chosen": -5.105328559875488, + "logps/rejected": -6.4483962059021, + "loss": 0.462, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.105328559875488, + "rewards/margins": 1.3430687189102173, + "rewards/rejected": -6.4483962059021, "step": 4135 }, { "epoch": 2.215755143000502, - "grad_norm": 18.601513607697683, + "grad_norm": 19.979869382905456, "learning_rate": 1.939231727099806e-07, - "logits/chosen": 0.7638577818870544, - "logits/rejected": 0.7983411550521851, - "logps/chosen": -9.586159706115723, - "logps/rejected": -10.826416015625, - "loss": 0.4351, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.586159706115723, - "rewards/margins": 1.2402559518814087, - "rewards/rejected": -10.826416015625, - "semantic_entropy": 0.001649503014050424, + "logits/chosen": -0.23013098537921906, + "logits/rejected": -0.16305793821811676, + "logps/chosen": -4.987565040588379, + "logps/rejected": -6.233033180236816, + "loss": 0.4445, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.987565040588379, + "rewards/margins": 1.2454670667648315, + "rewards/rejected": -6.233033180236816, "step": 4140 }, { "epoch": 2.2184311757819035, - "grad_norm": 24.016266301167416, + "grad_norm": 20.493509475947135, "learning_rate": 1.926931607145719e-07, - "logits/chosen": 0.8312221765518188, - "logits/rejected": 0.8952839970588684, - "logps/chosen": -9.727499008178711, - "logps/rejected": -11.005497932434082, - "loss": 0.4144, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.727499008178711, - "rewards/margins": 1.2779988050460815, - "rewards/rejected": -11.005497932434082, - "semantic_entropy": 0.0015145648503676057, + "logits/chosen": 0.013960001058876514, + "logits/rejected": 0.1441245824098587, + "logps/chosen": -5.227877140045166, + "logps/rejected": -6.4722795486450195, + "loss": 0.4363, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.227877140045166, + "rewards/margins": 1.2444020509719849, + "rewards/rejected": -6.4722795486450195, "step": 4145 }, { "epoch": 2.221107208563305, - "grad_norm": 20.566896327558037, + "grad_norm": 22.73569844884668, "learning_rate": 1.9146613021146564e-07, - "logits/chosen": 0.8225449323654175, - "logits/rejected": 0.8508152961730957, - "logps/chosen": -9.408650398254395, - "logps/rejected": -10.707399368286133, - "loss": 0.4173, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.408650398254395, - "rewards/margins": 1.2987501621246338, - "rewards/rejected": -10.707399368286133, - "semantic_entropy": 0.0019972771406173706, + "logits/chosen": -0.08814932405948639, + "logits/rejected": 0.02544325590133667, + "logps/chosen": -4.631189823150635, + "logps/rejected": -5.854756832122803, + "loss": 0.4443, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.631189823150635, + "rewards/margins": 1.2235660552978516, + "rewards/rejected": -5.854756832122803, "step": 4150 }, { "epoch": 2.2237832413447065, - "grad_norm": 22.344243213376547, + "grad_norm": 22.621549228813542, "learning_rate": 1.9024209310531736e-07, - "logits/chosen": 0.847356915473938, - "logits/rejected": 0.8640506863594055, - "logps/chosen": -9.579663276672363, - "logps/rejected": -10.968865394592285, - "loss": 0.4137, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.579663276672363, - "rewards/margins": 1.3892011642456055, - "rewards/rejected": -10.968865394592285, - "semantic_entropy": 0.0021100840531289577, + "logits/chosen": -0.05686334893107414, + "logits/rejected": -0.03554892539978027, + "logps/chosen": -5.02664852142334, + "logps/rejected": -6.327553749084473, + "loss": 0.4125, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.02664852142334, + "rewards/margins": 1.3009048700332642, + "rewards/rejected": -6.327553749084473, "step": 4155 }, { "epoch": 2.2264592741261082, - "grad_norm": 19.57624931515473, + "grad_norm": 22.76210968999022, "learning_rate": 1.890210612717401e-07, - "logits/chosen": 0.8184317350387573, - "logits/rejected": 0.88373863697052, - "logps/chosen": -9.572199821472168, - "logps/rejected": -11.031925201416016, - "loss": 0.3741, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.572199821472168, - "rewards/margins": 1.4597254991531372, - "rewards/rejected": -11.031925201416016, - "semantic_entropy": 0.0016687295865267515, + "logits/chosen": -0.11689623445272446, + "logits/rejected": 0.01751454547047615, + "logps/chosen": -5.113454341888428, + "logps/rejected": -6.341330051422119, + "loss": 0.4179, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.113454341888428, + "rewards/margins": 1.2278764247894287, + "rewards/rejected": -6.341330051422119, "step": 4160 }, { "epoch": 2.2291353069075095, - "grad_norm": 24.504791180268253, + "grad_norm": 19.734526632222263, "learning_rate": 1.8780304655719054e-07, - "logits/chosen": 0.8567901849746704, - "logits/rejected": 0.9107440114021301, - "logps/chosen": -9.613670349121094, - "logps/rejected": -11.115751266479492, - "loss": 0.3757, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.613670349121094, - "rewards/margins": 1.5020800828933716, - "rewards/rejected": -11.115751266479492, - "semantic_entropy": 0.0012760651297867298, + "logits/chosen": -0.08869336545467377, + "logits/rejected": 0.062269002199172974, + "logps/chosen": -4.9844841957092285, + "logps/rejected": -6.499150276184082, + "loss": 0.3792, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.9844841957092285, + "rewards/margins": 1.5146653652191162, + "rewards/rejected": -6.499150276184082, "step": 4165 }, { "epoch": 2.231811339688911, - "grad_norm": 35.6519577235443, + "grad_norm": 32.2380310663105, "learning_rate": 1.865880607788523e-07, - "logits/chosen": 0.8858783841133118, - "logits/rejected": 0.9201458096504211, - "logps/chosen": -9.616140365600586, - "logps/rejected": -10.997381210327148, - "loss": 0.4086, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.616140365600586, - "rewards/margins": 1.3812413215637207, - "rewards/rejected": -10.997381210327148, - "semantic_entropy": 0.0018040050053969026, + "logits/chosen": 0.03483510762453079, + "logits/rejected": 0.09509845077991486, + "logps/chosen": -4.917486190795898, + "logps/rejected": -6.278813362121582, + "loss": 0.4064, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.917486190795898, + "rewards/margins": 1.3613275289535522, + "rewards/rejected": -6.278813362121582, "step": 4170 }, { "epoch": 2.234487372470313, - "grad_norm": 26.229983342336126, + "grad_norm": 32.58317440727863, "learning_rate": 1.8537611572452316e-07, - "logits/chosen": 0.8341430425643921, - "logits/rejected": 0.8626706004142761, - "logps/chosen": -9.763750076293945, - "logps/rejected": -10.994816780090332, - "loss": 0.4061, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.763750076293945, - "rewards/margins": 1.2310662269592285, - "rewards/rejected": -10.994816780090332, - "semantic_entropy": 0.001325559918768704, + "logits/chosen": -0.06791558116674423, + "logits/rejected": 0.022037990391254425, + "logps/chosen": -5.197055816650391, + "logps/rejected": -6.390126705169678, + "loss": 0.4262, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.197055816650391, + "rewards/margins": 1.1930711269378662, + "rewards/rejected": -6.390126705169678, "step": 4175 }, { "epoch": 2.237163405251714, - "grad_norm": 19.660501965854834, + "grad_norm": 23.393381544188244, "learning_rate": 1.84167223152499e-07, - "logits/chosen": 0.8540051579475403, - "logits/rejected": 0.913652777671814, - "logps/chosen": -9.744295120239258, - "logps/rejected": -11.07103157043457, - "loss": 0.4022, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.744295120239258, - "rewards/margins": 1.3267360925674438, - "rewards/rejected": -11.07103157043457, - "semantic_entropy": 0.0015319742960855365, + "logits/chosen": -0.11754951626062393, + "logits/rejected": 0.07604455947875977, + "logps/chosen": -5.231292247772217, + "logps/rejected": -6.58504581451416, + "loss": 0.408, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.231292247772217, + "rewards/margins": 1.3537530899047852, + "rewards/rejected": -6.58504581451416, "step": 4180 }, { "epoch": 2.239839438033116, - "grad_norm": 22.89745562199727, + "grad_norm": 26.980147803770603, "learning_rate": 1.8296139479146112e-07, - "logits/chosen": 0.7796264886856079, - "logits/rejected": 0.8492299318313599, - "logps/chosen": -9.544806480407715, - "logps/rejected": -10.937907218933105, - "loss": 0.3939, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.544806480407715, - "rewards/margins": 1.3931005001068115, - "rewards/rejected": -10.937907218933105, - "semantic_entropy": 0.0017392231384292245, + "logits/chosen": -0.17101764678955078, + "logits/rejected": -0.11444918811321259, + "logps/chosen": -4.8785295486450195, + "logps/rejected": -6.166184425354004, + "loss": 0.4392, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.8785295486450195, + "rewards/margins": 1.2876551151275635, + "rewards/rejected": -6.166184425354004, "step": 4185 }, { "epoch": 2.2425154708145176, - "grad_norm": 21.62752883795121, + "grad_norm": 20.870217855404228, "learning_rate": 1.8175864234036132e-07, - "logits/chosen": 0.8781774640083313, - "logits/rejected": 0.9093042612075806, - "logps/chosen": -9.595394134521484, - "logps/rejected": -10.983304977416992, - "loss": 0.417, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.595394134521484, - "rewards/margins": 1.3879096508026123, - "rewards/rejected": -10.983304977416992, - "semantic_entropy": 0.0014014368643984199, + "logits/chosen": -0.002415428403764963, + "logits/rejected": 0.07124073803424835, + "logps/chosen": -4.8486175537109375, + "logps/rejected": -6.175110816955566, + "loss": 0.4358, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.8486175537109375, + "rewards/margins": 1.3264930248260498, + "rewards/rejected": -6.175110816955566, "step": 4190 }, { "epoch": 2.245191503595919, - "grad_norm": 22.389579813909606, + "grad_norm": 18.049131852361974, "learning_rate": 1.805589774683094e-07, - "logits/chosen": 0.7380444407463074, - "logits/rejected": 0.7925786972045898, - "logps/chosen": -9.564143180847168, - "logps/rejected": -10.847589492797852, - "loss": 0.3963, + "logits/chosen": -0.21017411351203918, + "logits/rejected": -0.07306221127510071, + "logps/chosen": -4.8675384521484375, + "logps/rejected": -6.002571105957031, + "loss": 0.4246, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.564143180847168, - "rewards/margins": 1.2834450006484985, - "rewards/rejected": -10.847589492797852, - "semantic_entropy": 0.0015661569777876139, + "rewards/chosen": -4.8675384521484375, + "rewards/margins": 1.1350324153900146, + "rewards/rejected": -6.002571105957031, "step": 4195 }, { "epoch": 2.2478675363773206, - "grad_norm": 23.059520479950827, + "grad_norm": 23.551208931006524, "learning_rate": 1.79362411814459e-07, - "logits/chosen": 0.8489105105400085, - "logits/rejected": 0.8289289474487305, - "logps/chosen": -9.792860984802246, - "logps/rejected": -10.936185836791992, - "loss": 0.4414, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.792860984802246, - "rewards/margins": 1.1433252096176147, - "rewards/rejected": -10.936185836791992, - "semantic_entropy": 0.0015545317437499762, + "logits/chosen": 0.0052331797778606415, + "logits/rejected": -0.023628724738955498, + "logps/chosen": -5.03818941116333, + "logps/rejected": -6.037562370300293, + "loss": 0.4887, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -5.03818941116333, + "rewards/margins": 0.9993731379508972, + "rewards/rejected": -6.037562370300293, "step": 4200 }, { "epoch": 2.2505435691587223, - "grad_norm": 20.613769102957825, + "grad_norm": 21.766269067895124, "learning_rate": 1.7816895698789552e-07, - "logits/chosen": 0.7959033250808716, - "logits/rejected": 0.8639119267463684, - "logps/chosen": -9.70136833190918, - "logps/rejected": -10.945596694946289, - "loss": 0.4202, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.70136833190918, - "rewards/margins": 1.244227409362793, - "rewards/rejected": -10.945596694946289, - "semantic_entropy": 0.0013822594191879034, + "logits/chosen": -0.17379242181777954, + "logits/rejected": -0.0772695392370224, + "logps/chosen": -4.918708801269531, + "logps/rejected": -6.115952968597412, + "loss": 0.4068, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.918708801269531, + "rewards/margins": 1.1972440481185913, + "rewards/rejected": -6.115952968597412, "step": 4205 }, { "epoch": 2.2532196019401236, - "grad_norm": 17.406251682106838, + "grad_norm": 20.17731705984455, "learning_rate": 1.7697862456752271e-07, - "logits/chosen": 0.7929319143295288, - "logits/rejected": 0.8579456210136414, - "logps/chosen": -9.719596862792969, - "logps/rejected": -11.37035083770752, + "logits/chosen": -0.15179087221622467, + "logits/rejected": -0.0037357851397246122, + "logps/chosen": -4.983081817626953, + "logps/rejected": -6.539898872375488, "loss": 0.3609, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.719596862792969, - "rewards/margins": 1.6507545709609985, - "rewards/rejected": -11.37035083770752, - "semantic_entropy": 0.0013113311724737287, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.983081817626953, + "rewards/margins": 1.5568169355392456, + "rewards/rejected": -6.539898872375488, "step": 4210 }, { "epoch": 2.2558956347215253, - "grad_norm": 20.15920926644943, + "grad_norm": 22.239064822174928, "learning_rate": 1.7579142610195124e-07, - "logits/chosen": 0.7851302623748779, - "logits/rejected": 0.8529064059257507, - "logps/chosen": -9.851489067077637, - "logps/rejected": -11.166707038879395, - "loss": 0.4203, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.851489067077637, - "rewards/margins": 1.3152183294296265, - "rewards/rejected": -11.166707038879395, - "semantic_entropy": 0.0013941864017397165, + "logits/chosen": -0.13442015647888184, + "logits/rejected": 0.019773270934820175, + "logps/chosen": -5.089333534240723, + "logps/rejected": -6.431842803955078, + "loss": 0.427, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.089333534240723, + "rewards/margins": 1.342509150505066, + "rewards/rejected": -6.431842803955078, "step": 4215 }, { "epoch": 2.258571667502927, - "grad_norm": 19.09139226174037, + "grad_norm": 15.866734682101688, "learning_rate": 1.7460737310938568e-07, - "logits/chosen": 0.8212282061576843, - "logits/rejected": 0.877540111541748, - "logps/chosen": -9.683355331420898, - "logps/rejected": -11.168909072875977, - "loss": 0.3788, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.683355331420898, - "rewards/margins": 1.4855531454086304, - "rewards/rejected": -11.168909072875977, - "semantic_entropy": 0.0017290354007855058, + "logits/chosen": -0.1598428189754486, + "logits/rejected": 0.0658881664276123, + "logps/chosen": -4.9839558601379395, + "logps/rejected": -6.466966152191162, + "loss": 0.3554, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.9839558601379395, + "rewards/margins": 1.4830102920532227, + "rewards/rejected": -6.466966152191162, "step": 4220 }, { "epoch": 2.2612477002843283, - "grad_norm": 19.755486537220495, + "grad_norm": 22.614591957345525, "learning_rate": 1.734264770775133e-07, - "logits/chosen": 0.7770802974700928, - "logits/rejected": 0.8633508682250977, - "logps/chosen": -9.642881393432617, - "logps/rejected": -11.013988494873047, - "loss": 0.4055, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.642881393432617, - "rewards/margins": 1.3711069822311401, - "rewards/rejected": -11.013988494873047, - "semantic_entropy": 0.0014985213056206703, + "logits/chosen": -0.1323753148317337, + "logits/rejected": 0.11011920124292374, + "logps/chosen": -5.068212509155273, + "logps/rejected": -6.346895217895508, + "loss": 0.4386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.068212509155273, + "rewards/margins": 1.2786825895309448, + "rewards/rejected": -6.346895217895508, "step": 4225 }, { "epoch": 2.26392373306573, - "grad_norm": 21.927676822159636, + "grad_norm": 22.01996671612011, "learning_rate": 1.7224874946339241e-07, - "logits/chosen": 0.8036985397338867, - "logits/rejected": 0.8095115423202515, - "logps/chosen": -9.7802152633667, - "logps/rejected": -11.069761276245117, - "loss": 0.4268, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.7802152633667, - "rewards/margins": 1.289547085762024, - "rewards/rejected": -11.069761276245117, - "semantic_entropy": 0.0012104662600904703, + "logits/chosen": -0.13518470525741577, + "logits/rejected": -0.03457862138748169, + "logps/chosen": -5.019261837005615, + "logps/rejected": -6.380149841308594, + "loss": 0.4216, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.019261837005615, + "rewards/margins": 1.3608882427215576, + "rewards/rejected": -6.380149841308594, "step": 4230 }, { "epoch": 2.2665997658471317, - "grad_norm": 16.00842803469047, + "grad_norm": 16.008967472936693, "learning_rate": 1.7107420169334186e-07, - "logits/chosen": 0.7866020202636719, - "logits/rejected": 0.8434419631958008, - "logps/chosen": -9.776571273803711, - "logps/rejected": -11.023462295532227, - "loss": 0.4244, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.776571273803711, - "rewards/margins": 1.2468903064727783, - "rewards/rejected": -11.023462295532227, - "semantic_entropy": 0.0012111186515539885, + "logits/chosen": -0.0846535712480545, + "logits/rejected": 0.0003061186580453068, + "logps/chosen": -5.015387535095215, + "logps/rejected": -6.243290901184082, + "loss": 0.4396, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.015387535095215, + "rewards/margins": 1.2279034852981567, + "rewards/rejected": -6.243290901184082, "step": 4235 }, { "epoch": 2.269275798628533, - "grad_norm": 17.323870558007282, + "grad_norm": 17.372709009777022, "learning_rate": 1.6990284516282893e-07, - "logits/chosen": 0.8010492324829102, - "logits/rejected": 0.8359723091125488, - "logps/chosen": -9.49112319946289, - "logps/rejected": -10.895849227905273, - "loss": 0.3877, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.49112319946289, - "rewards/margins": 1.404726266860962, - "rewards/rejected": -10.895849227905273, - "semantic_entropy": 0.0014989904593676329, + "logits/chosen": -0.07557263970375061, + "logits/rejected": 0.030812371522188187, + "logps/chosen": -4.888629913330078, + "logps/rejected": -6.254495143890381, + "loss": 0.3753, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.888629913330078, + "rewards/margins": 1.365865707397461, + "rewards/rejected": -6.254495143890381, "step": 4240 }, { "epoch": 2.2719518314099347, - "grad_norm": 17.259101259042957, + "grad_norm": 19.979914862080996, "learning_rate": 1.687346912363602e-07, - "logits/chosen": 0.8071710467338562, - "logits/rejected": 0.8544157147407532, - "logps/chosen": -9.638090133666992, - "logps/rejected": -11.011279106140137, - "loss": 0.3814, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.638090133666992, - "rewards/margins": 1.3731900453567505, - "rewards/rejected": -11.011279106140137, - "semantic_entropy": 0.0015442619333043694, + "logits/chosen": -0.12858818471431732, + "logits/rejected": 0.03362492471933365, + "logps/chosen": -5.134631633758545, + "logps/rejected": -6.462745666503906, + "loss": 0.3893, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.134631633758545, + "rewards/margins": 1.3281140327453613, + "rewards/rejected": -6.462745666503906, "step": 4245 }, { "epoch": 2.2746278641913364, - "grad_norm": 15.415019825942553, + "grad_norm": 19.185771828418037, "learning_rate": 1.675697512473697e-07, - "logits/chosen": 0.8083289861679077, - "logits/rejected": 0.9057637453079224, - "logps/chosen": -9.574909210205078, - "logps/rejected": -10.998074531555176, - "loss": 0.3687, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.574909210205078, - "rewards/margins": 1.4231641292572021, - "rewards/rejected": -10.998074531555176, - "semantic_entropy": 0.00155646784696728, + "logits/chosen": -0.1295214593410492, + "logits/rejected": 0.08469869196414948, + "logps/chosen": -5.124062538146973, + "logps/rejected": -6.534721374511719, + "loss": 0.3645, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.124062538146973, + "rewards/margins": 1.4106590747833252, + "rewards/rejected": -6.534721374511719, "step": 4250 }, { "epoch": 2.2773038969727377, - "grad_norm": 21.576286115755277, + "grad_norm": 19.81334564521635, "learning_rate": 1.6640803649811087e-07, - "logits/chosen": 0.8501211404800415, - "logits/rejected": 0.9308522343635559, - "logps/chosen": -9.679555892944336, - "logps/rejected": -11.19702434539795, - "loss": 0.3827, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.679555892944336, - "rewards/margins": 1.517469048500061, - "rewards/rejected": -11.19702434539795, - "semantic_entropy": 0.0016793437534943223, + "logits/chosen": -0.11614228785037994, + "logits/rejected": 0.14005012810230255, + "logps/chosen": -5.132724285125732, + "logps/rejected": -6.546229362487793, + "loss": 0.3712, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.132724285125732, + "rewards/margins": 1.4135043621063232, + "rewards/rejected": -6.546229362487793, "step": 4255 }, { "epoch": 2.2799799297541394, - "grad_norm": 26.719598028515872, + "grad_norm": 21.588721641122426, "learning_rate": 1.6524955825954472e-07, - "logits/chosen": 0.8302766680717468, - "logits/rejected": 0.8800037503242493, - "logps/chosen": -9.66600513458252, - "logps/rejected": -10.949085235595703, - "loss": 0.4151, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.66600513458252, - "rewards/margins": 1.2830795049667358, - "rewards/rejected": -10.949085235595703, - "semantic_entropy": 0.001497269026003778, + "logits/chosen": -0.0608552023768425, + "logits/rejected": 0.020806238055229187, + "logps/chosen": -4.950244903564453, + "logps/rejected": -6.293306350708008, + "loss": 0.4074, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.950244903564453, + "rewards/margins": 1.3430612087249756, + "rewards/rejected": -6.293306350708008, "step": 4260 }, { "epoch": 2.282655962535541, - "grad_norm": 18.483844690586892, + "grad_norm": 18.232378231462494, "learning_rate": 1.6409432777123277e-07, - "logits/chosen": 0.8208200335502625, - "logits/rejected": 0.8599546551704407, - "logps/chosen": -9.824455261230469, - "logps/rejected": -11.335619926452637, - "loss": 0.3885, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.824455261230469, - "rewards/margins": 1.511163353919983, - "rewards/rejected": -11.335619926452637, - "semantic_entropy": 0.0013760743895545602, + "logits/chosen": -0.12025491893291473, + "logits/rejected": 0.04680377617478371, + "logps/chosen": -5.3034772872924805, + "logps/rejected": -6.804583549499512, + "loss": 0.4041, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.3034772872924805, + "rewards/margins": 1.5011054277420044, + "rewards/rejected": -6.804583549499512, "step": 4265 }, { "epoch": 2.285331995316943, - "grad_norm": 20.65468077960879, + "grad_norm": 19.607360004659856, "learning_rate": 1.6294235624122577e-07, - "logits/chosen": 0.8452394604682922, - "logits/rejected": 0.9035156965255737, - "logps/chosen": -9.813318252563477, - "logps/rejected": -11.151365280151367, - "loss": 0.4044, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.813318252563477, - "rewards/margins": 1.338047742843628, - "rewards/rejected": -11.151365280151367, - "semantic_entropy": 0.0013139288639649749, + "logits/chosen": -0.012470404617488384, + "logits/rejected": 0.20672759413719177, + "logps/chosen": -5.386794567108154, + "logps/rejected": -6.70856237411499, + "loss": 0.4224, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.386794567108154, + "rewards/margins": 1.321767807006836, + "rewards/rejected": -6.70856237411499, "step": 4270 }, { "epoch": 2.288008028098344, - "grad_norm": 24.257011022001592, + "grad_norm": 19.204180836080123, "learning_rate": 1.6179365484595697e-07, - "logits/chosen": 0.7976378202438354, - "logits/rejected": 0.8221977353096008, - "logps/chosen": -9.780439376831055, - "logps/rejected": -11.116656303405762, - "loss": 0.4225, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.780439376831055, - "rewards/margins": 1.3362162113189697, - "rewards/rejected": -11.116656303405762, - "semantic_entropy": 0.0013957961928099394, + "logits/chosen": -0.12236963212490082, + "logits/rejected": -0.004489085171371698, + "logps/chosen": -5.0256028175354, + "logps/rejected": -6.2447028160095215, + "loss": 0.4351, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.0256028175354, + "rewards/margins": 1.2191003561019897, + "rewards/rejected": -6.2447028160095215, "step": 4275 }, { "epoch": 2.290684060879746, - "grad_norm": 22.65516964126615, + "grad_norm": 20.47607848916119, "learning_rate": 1.60648234730132e-07, - "logits/chosen": 0.8350450396537781, - "logits/rejected": 0.8612421154975891, - "logps/chosen": -9.680601119995117, - "logps/rejected": -11.20177936553955, - "loss": 0.3626, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.680601119995117, - "rewards/margins": 1.5211775302886963, - "rewards/rejected": -11.20177936553955, - "semantic_entropy": 0.0013702240539714694, + "logits/chosen": -0.10036102682352066, + "logits/rejected": -0.0004423752543516457, + "logps/chosen": -5.1717209815979, + "logps/rejected": -6.622015476226807, + "loss": 0.3553, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.1717209815979, + "rewards/margins": 1.4502942562103271, + "rewards/rejected": -6.622015476226807, "step": 4280 }, { "epoch": 2.293360093661147, - "grad_norm": 29.58219434395317, + "grad_norm": 30.105695190029074, "learning_rate": 1.595061070066222e-07, - "logits/chosen": 0.8323311805725098, - "logits/rejected": 0.8706264495849609, - "logps/chosen": -9.795225143432617, - "logps/rejected": -11.236984252929688, - "loss": 0.3767, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.795225143432617, - "rewards/margins": 1.441759467124939, - "rewards/rejected": -11.236984252929688, - "semantic_entropy": 0.0013441203627735376, + "logits/chosen": -0.06739501655101776, + "logits/rejected": -0.05254412814974785, + "logps/chosen": -4.969847679138184, + "logps/rejected": -6.367839813232422, + "loss": 0.3799, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.969847679138184, + "rewards/margins": 1.3979917764663696, + "rewards/rejected": -6.367839813232422, "step": 4285 }, { "epoch": 2.296036126442549, - "grad_norm": 29.102509702206316, + "grad_norm": 34.64655938000007, "learning_rate": 1.5836728275635542e-07, - "logits/chosen": 0.7748151421546936, - "logits/rejected": 0.81391441822052, - "logps/chosen": -9.950288772583008, - "logps/rejected": -11.165694236755371, - "loss": 0.4377, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.950288772583008, - "rewards/margins": 1.215405821800232, - "rewards/rejected": -11.165694236755371, - "semantic_entropy": 0.0010313175152987242, + "logits/chosen": -0.17743846774101257, + "logits/rejected": -0.008018224500119686, + "logps/chosen": -5.1956892013549805, + "logps/rejected": -6.402718544006348, + "loss": 0.4466, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.1956892013549805, + "rewards/margins": 1.2070289850234985, + "rewards/rejected": -6.402718544006348, "step": 4290 }, { "epoch": 2.2987121592239506, - "grad_norm": 21.581567035471323, + "grad_norm": 20.232754309267072, "learning_rate": 1.5723177302820984e-07, - "logits/chosen": 0.8050596117973328, - "logits/rejected": 0.8407198190689087, - "logps/chosen": -9.88306999206543, - "logps/rejected": -11.050642013549805, - "loss": 0.4321, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.88306999206543, - "rewards/margins": 1.167571783065796, - "rewards/rejected": -11.050642013549805, - "semantic_entropy": 0.0010738309938460588, + "logits/chosen": -0.15359598398208618, + "logits/rejected": -0.06650884449481964, + "logps/chosen": -5.070862770080566, + "logps/rejected": -6.234358310699463, + "loss": 0.4208, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.070862770080566, + "rewards/margins": 1.1634957790374756, + "rewards/rejected": -6.234358310699463, "step": 4295 }, { "epoch": 2.3013881920053523, - "grad_norm": 18.06623114943259, + "grad_norm": 18.095741662664786, "learning_rate": 1.5609958883890544e-07, - "logits/chosen": 0.8042596578598022, - "logits/rejected": 0.8785734176635742, - "logps/chosen": -9.795085906982422, - "logps/rejected": -11.104022979736328, - "loss": 0.3878, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.795085906982422, - "rewards/margins": 1.308937907218933, - "rewards/rejected": -11.104022979736328, - "semantic_entropy": 0.0012584684882313013, + "logits/chosen": -0.06105056405067444, + "logits/rejected": 0.06964129954576492, + "logps/chosen": -5.018832206726074, + "logps/rejected": -6.238587379455566, + "loss": 0.398, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.018832206726074, + "rewards/margins": 1.2197558879852295, + "rewards/rejected": -6.238587379455566, "step": 4300 }, { "epoch": 2.3040642247867535, - "grad_norm": 24.33590640447122, + "grad_norm": 20.572922956322742, "learning_rate": 1.5497074117289865e-07, - "logits/chosen": 0.7726608514785767, - "logits/rejected": 0.8293962478637695, - "logps/chosen": -9.73291015625, - "logps/rejected": -11.144505500793457, - "loss": 0.4062, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.73291015625, - "rewards/margins": 1.4115943908691406, - "rewards/rejected": -11.144505500793457, - "semantic_entropy": 0.002003467408940196, + "logits/chosen": -0.18592335283756256, + "logits/rejected": -0.05864933878183365, + "logps/chosen": -4.784070014953613, + "logps/rejected": -6.188055038452148, + "loss": 0.3878, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.784070014953613, + "rewards/margins": 1.4039851427078247, + "rewards/rejected": -6.188055038452148, "step": 4305 }, { "epoch": 2.3067402575681553, - "grad_norm": 19.852698261404328, + "grad_norm": 21.138989371275557, "learning_rate": 1.5384524098227402e-07, - "logits/chosen": 0.8046091198921204, - "logits/rejected": 0.8712922930717468, - "logps/chosen": -9.885152816772461, - "logps/rejected": -11.513572692871094, - "loss": 0.3311, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.885152816772461, - "rewards/margins": 1.6284195184707642, - "rewards/rejected": -11.513572692871094, - "semantic_entropy": 0.0014156540855765343, + "logits/chosen": -0.149384543299675, + "logits/rejected": 0.03493823856115341, + "logps/chosen": -5.17222785949707, + "logps/rejected": -6.66733455657959, + "loss": 0.3622, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.17222785949707, + "rewards/margins": 1.4951070547103882, + "rewards/rejected": -6.66733455657959, "step": 4310 }, { "epoch": 2.3094162903495565, - "grad_norm": 23.28791831420902, + "grad_norm": 21.235992441343203, "learning_rate": 1.5272309918663974e-07, - "logits/chosen": 0.7911036610603333, - "logits/rejected": 0.8605779409408569, - "logps/chosen": -9.974563598632812, - "logps/rejected": -11.249377250671387, - "loss": 0.467, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.974563598632812, - "rewards/margins": 1.2748134136199951, - "rewards/rejected": -11.249377250671387, - "semantic_entropy": 0.0012652326840907335, + "logits/chosen": -0.1005704402923584, + "logits/rejected": 0.056185394525527954, + "logps/chosen": -5.148066997528076, + "logps/rejected": -6.280634880065918, + "loss": 0.4734, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.148066997528076, + "rewards/margins": 1.132568597793579, + "rewards/rejected": -6.280634880065918, "step": 4315 }, { "epoch": 2.3120923231309582, - "grad_norm": 18.031587200544166, + "grad_norm": 18.706004458361146, "learning_rate": 1.516043266730201e-07, - "logits/chosen": 0.8097645044326782, - "logits/rejected": 0.8588771820068359, - "logps/chosen": -9.800325393676758, - "logps/rejected": -11.185141563415527, - "loss": 0.393, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.800325393676758, - "rewards/margins": 1.3848176002502441, - "rewards/rejected": -11.185141563415527, - "semantic_entropy": 0.0018925167387351394, + "logits/chosen": -0.11011295020580292, + "logits/rejected": 0.03181259706616402, + "logps/chosen": -5.103433132171631, + "logps/rejected": -6.414648532867432, + "loss": 0.4006, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.103433132171631, + "rewards/margins": 1.3112151622772217, + "rewards/rejected": -6.414648532867432, "step": 4320 }, { "epoch": 2.31476835591236, - "grad_norm": 29.399301801147324, + "grad_norm": 29.980658866625355, "learning_rate": 1.504889342957512e-07, - "logits/chosen": 0.7945131063461304, - "logits/rejected": 0.8561771512031555, - "logps/chosen": -9.814886093139648, - "logps/rejected": -11.265652656555176, - "loss": 0.4417, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.814886093139648, - "rewards/margins": 1.45076584815979, - "rewards/rejected": -11.265652656555176, - "semantic_entropy": 0.0013577769277617335, + "logits/chosen": -0.10422752797603607, + "logits/rejected": 0.06190290302038193, + "logps/chosen": -5.039002895355225, + "logps/rejected": -6.2669677734375, + "loss": 0.4503, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.039002895355225, + "rewards/margins": 1.2279642820358276, + "rewards/rejected": -6.2669677734375, "step": 4325 }, { "epoch": 2.3174443886937617, - "grad_norm": 18.818414850816456, + "grad_norm": 17.48635292071583, "learning_rate": 1.4937693287637453e-07, - "logits/chosen": 0.782823920249939, - "logits/rejected": 0.8478276133537292, - "logps/chosen": -9.825902938842773, - "logps/rejected": -11.110780715942383, - "loss": 0.4206, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.825902938842773, - "rewards/margins": 1.2848764657974243, - "rewards/rejected": -11.110780715942383, - "semantic_entropy": 0.0011613890528678894, + "logits/chosen": -0.11002322286367416, + "logits/rejected": 0.0498075895011425, + "logps/chosen": -5.191160678863525, + "logps/rejected": -6.519074440002441, + "loss": 0.4026, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.191160678863525, + "rewards/margins": 1.3279129266738892, + "rewards/rejected": -6.519074440002441, "step": 4330 }, { "epoch": 2.320120421475163, - "grad_norm": 23.834103955908468, + "grad_norm": 19.365475895161786, "learning_rate": 1.4826833320353305e-07, - "logits/chosen": 0.7609673142433167, - "logits/rejected": 0.8239792585372925, - "logps/chosen": -9.851580619812012, - "logps/rejected": -11.248353958129883, - "loss": 0.385, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.851580619812012, - "rewards/margins": 1.3967727422714233, - "rewards/rejected": -11.248353958129883, - "semantic_entropy": 0.0012605976080521941, + "logits/chosen": -0.1270432472229004, + "logits/rejected": -0.029448796063661575, + "logps/chosen": -5.0405988693237305, + "logps/rejected": -6.4376959800720215, + "loss": 0.3882, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.0405988693237305, + "rewards/margins": 1.3970969915390015, + "rewards/rejected": -6.4376959800720215, "step": 4335 }, { "epoch": 2.3227964542565647, - "grad_norm": 23.686138337629355, + "grad_norm": 21.37133118316228, "learning_rate": 1.4716314603286528e-07, - "logits/chosen": 0.8113320469856262, - "logits/rejected": 0.853225588798523, - "logps/chosen": -9.844433784484863, - "logps/rejected": -11.266097068786621, - "loss": 0.4029, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.844433784484863, - "rewards/margins": 1.4216625690460205, - "rewards/rejected": -11.266097068786621, - "semantic_entropy": 0.0013559302315115929, + "logits/chosen": -0.17585240304470062, + "logits/rejected": 0.020751286298036575, + "logps/chosen": -4.995090484619141, + "logps/rejected": -6.4500837326049805, + "loss": 0.3617, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.995090484619141, + "rewards/margins": 1.4549932479858398, + "rewards/rejected": -6.4500837326049805, "step": 4340 }, { "epoch": 2.3254724870379664, - "grad_norm": 25.669925692295536, + "grad_norm": 36.524090722454844, "learning_rate": 1.4606138208690233e-07, - "logits/chosen": 0.7266643643379211, - "logits/rejected": 0.8064903020858765, - "logps/chosen": -9.897780418395996, - "logps/rejected": -11.229866981506348, - "loss": 0.4261, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.897780418395996, - "rewards/margins": 1.3320866823196411, - "rewards/rejected": -11.229866981506348, - "semantic_entropy": 0.0012010873761028051, + "logits/chosen": -0.10854163020849228, + "logits/rejected": -0.022362036630511284, + "logps/chosen": -5.364623069763184, + "logps/rejected": -6.612184047698975, + "loss": 0.4517, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -5.364623069763184, + "rewards/margins": 1.2475614547729492, + "rewards/rejected": -6.612184047698975, "step": 4345 }, { "epoch": 2.3281485198193677, - "grad_norm": 24.94777596239184, + "grad_norm": 17.965294847555942, "learning_rate": 1.4496305205496251e-07, - "logits/chosen": 0.796136200428009, - "logits/rejected": 0.8669074177742004, - "logps/chosen": -9.997017860412598, - "logps/rejected": -11.391059875488281, - "loss": 0.3978, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.997017860412598, - "rewards/margins": 1.3940420150756836, - "rewards/rejected": -11.391059875488281, - "semantic_entropy": 0.0011204956099390984, + "logits/chosen": -0.07267121970653534, + "logits/rejected": 0.023109078407287598, + "logps/chosen": -5.266455173492432, + "logps/rejected": -6.731914520263672, + "loss": 0.384, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.266455173492432, + "rewards/margins": 1.4654597043991089, + "rewards/rejected": -6.731914520263672, "step": 4350 }, { "epoch": 2.3308245526007694, - "grad_norm": 25.67221825991835, + "grad_norm": 18.65632310446417, "learning_rate": 1.4386816659304895e-07, - "logits/chosen": 0.7781258225440979, - "logits/rejected": 0.8242195248603821, - "logps/chosen": -9.795356750488281, - "logps/rejected": -11.114994049072266, - "loss": 0.3966, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.795356750488281, - "rewards/margins": 1.3196370601654053, - "rewards/rejected": -11.114994049072266, - "semantic_entropy": 0.0012639164924621582, + "logits/chosen": -0.19199849665164948, + "logits/rejected": -0.05044582486152649, + "logps/chosen": -5.1083502769470215, + "logps/rejected": -6.438894748687744, + "loss": 0.3615, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.1083502769470215, + "rewards/margins": 1.3305447101593018, + "rewards/rejected": -6.438894748687744, "step": 4355 }, { "epoch": 2.333500585382171, - "grad_norm": 29.57080046877201, + "grad_norm": 21.443380697299165, "learning_rate": 1.4277673632374492e-07, - "logits/chosen": 0.7557036876678467, - "logits/rejected": 0.8132128715515137, - "logps/chosen": -9.90876579284668, - "logps/rejected": -11.25818920135498, - "loss": 0.4118, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.90876579284668, - "rewards/margins": 1.3494237661361694, - "rewards/rejected": -11.25818920135498, - "semantic_entropy": 0.0013076277682557702, + "logits/chosen": -0.15602442622184753, + "logits/rejected": 0.06887654215097427, + "logps/chosen": -5.3054304122924805, + "logps/rejected": -6.623404502868652, + "loss": 0.4034, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.3054304122924805, + "rewards/margins": 1.3179748058319092, + "rewards/rejected": -6.623404502868652, "step": 4360 }, { "epoch": 2.3361766181635724, - "grad_norm": 20.78894701815076, + "grad_norm": 20.205440822446555, "learning_rate": 1.416887718361119e-07, - "logits/chosen": 0.8289060592651367, - "logits/rejected": 0.8358928561210632, - "logps/chosen": -9.96793270111084, - "logps/rejected": -11.24679183959961, - "loss": 0.4306, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.96793270111084, - "rewards/margins": 1.2788599729537964, - "rewards/rejected": -11.24679183959961, - "semantic_entropy": 0.0013636414660140872, + "logits/chosen": -0.013370638713240623, + "logits/rejected": 0.016033630818128586, + "logps/chosen": -5.237106800079346, + "logps/rejected": -6.455960273742676, + "loss": 0.429, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.237106800079346, + "rewards/margins": 1.2188541889190674, + "rewards/rejected": -6.455960273742676, "step": 4365 }, { "epoch": 2.338852650944974, - "grad_norm": 26.519372471909644, + "grad_norm": 22.099345050230067, "learning_rate": 1.406042836855859e-07, - "logits/chosen": 0.8788352012634277, - "logits/rejected": 0.9096766710281372, - "logps/chosen": -9.882646560668945, - "logps/rejected": -11.346869468688965, - "loss": 0.3805, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.882646560668945, - "rewards/margins": 1.464221715927124, - "rewards/rejected": -11.346869468688965, - "semantic_entropy": 0.0013449579710140824, + "logits/chosen": -0.09815029799938202, + "logits/rejected": 0.0009364187717437744, + "logps/chosen": -4.8000664710998535, + "logps/rejected": -6.2519145011901855, + "loss": 0.3744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.8000664710998535, + "rewards/margins": 1.4518482685089111, + "rewards/rejected": -6.2519145011901855, "step": 4370 }, { "epoch": 2.341528683726376, - "grad_norm": 26.805430797019667, + "grad_norm": 25.88560586110221, "learning_rate": 1.3952328239387595e-07, - "logits/chosen": 0.7472053170204163, - "logits/rejected": 0.8391642570495605, - "logps/chosen": -9.822053909301758, - "logps/rejected": -11.334062576293945, - "loss": 0.3744, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.822053909301758, - "rewards/margins": 1.5120099782943726, - "rewards/rejected": -11.334062576293945, - "semantic_entropy": 0.0012804374564439058, + "logits/chosen": -0.19866248965263367, + "logits/rejected": 0.02257232367992401, + "logps/chosen": -5.0134124755859375, + "logps/rejected": -6.465268135070801, + "loss": 0.3949, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.0134124755859375, + "rewards/margins": 1.451856255531311, + "rewards/rejected": -6.465268135070801, "step": 4375 }, { "epoch": 2.344204716507777, - "grad_norm": 23.745463280246494, + "grad_norm": 20.4025446231914, "learning_rate": 1.3844577844886109e-07, - "logits/chosen": 0.8212148547172546, - "logits/rejected": 0.9034450650215149, - "logps/chosen": -9.849560737609863, - "logps/rejected": -11.316389083862305, - "loss": 0.3922, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.849560737609863, - "rewards/margins": 1.4668283462524414, - "rewards/rejected": -11.316389083862305, - "semantic_entropy": 0.0014727965462952852, + "logits/chosen": -0.137832909822464, + "logits/rejected": 0.08915011584758759, + "logps/chosen": -5.336796283721924, + "logps/rejected": -6.729138374328613, + "loss": 0.3882, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.336796283721924, + "rewards/margins": 1.3923413753509521, + "rewards/rejected": -6.729138374328613, "step": 4380 }, { "epoch": 2.346880749289179, - "grad_norm": 24.765118352803277, + "grad_norm": 27.32270577814917, "learning_rate": 1.3737178230448955e-07, - "logits/chosen": 0.706335723400116, - "logits/rejected": 0.7600988149642944, - "logps/chosen": -10.072949409484863, - "logps/rejected": -11.255311965942383, - "loss": 0.4585, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -10.072949409484863, - "rewards/margins": 1.1823631525039673, - "rewards/rejected": -11.255311965942383, - "semantic_entropy": 0.0010486546671018004, + "logits/chosen": -0.17152224481105804, + "logits/rejected": -0.04678625613451004, + "logps/chosen": -5.453312873840332, + "logps/rejected": -6.617292881011963, + "loss": 0.4614, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.453312873840332, + "rewards/margins": 1.1639801263809204, + "rewards/rejected": -6.617292881011963, "step": 4385 }, { "epoch": 2.3495567820705805, - "grad_norm": 16.53037049152132, + "grad_norm": 17.06453048011787, "learning_rate": 1.363013043806764e-07, - "logits/chosen": 0.8160565495491028, - "logits/rejected": 0.8825929760932922, - "logps/chosen": -9.718530654907227, - "logps/rejected": -11.098315238952637, - "loss": 0.375, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.718530654907227, - "rewards/margins": 1.3797847032546997, - "rewards/rejected": -11.098315238952637, - "semantic_entropy": 0.0014672328252345324, + "logits/chosen": -0.10966813564300537, + "logits/rejected": 0.0032951668836176395, + "logps/chosen": -5.1266961097717285, + "logps/rejected": -6.364705562591553, + "loss": 0.3984, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.1266961097717285, + "rewards/margins": 1.2380096912384033, + "rewards/rejected": -6.364705562591553, "step": 4390 }, { "epoch": 2.3522328148519818, - "grad_norm": 19.343996568570635, + "grad_norm": 18.94462468233651, "learning_rate": 1.352343550632034e-07, - "logits/chosen": 0.8211394548416138, - "logits/rejected": 0.8589351773262024, - "logps/chosen": -9.79681396484375, - "logps/rejected": -11.286918640136719, - "loss": 0.4092, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.79681396484375, - "rewards/margins": 1.4901044368743896, - "rewards/rejected": -11.286918640136719, - "semantic_entropy": 0.0013513191370293498, + "logits/chosen": -0.1321590691804886, + "logits/rejected": 0.01458764635026455, + "logps/chosen": -4.938618183135986, + "logps/rejected": -6.338255405426025, + "loss": 0.4026, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.938618183135986, + "rewards/margins": 1.399637222290039, + "rewards/rejected": -6.338255405426025, "step": 4395 }, { "epoch": 2.3549088476333835, - "grad_norm": 18.946006963734423, + "grad_norm": 18.163540668863853, "learning_rate": 1.3417094470361722e-07, - "logits/chosen": 0.777470052242279, - "logits/rejected": 0.8312585949897766, - "logps/chosen": -9.910394668579102, - "logps/rejected": -11.160832405090332, - "loss": 0.428, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.910394668579102, - "rewards/margins": 1.2504370212554932, - "rewards/rejected": -11.160832405090332, - "semantic_entropy": 0.0012193446746096015, + "logits/chosen": -0.14368955790996552, + "logits/rejected": -0.021579569205641747, + "logps/chosen": -5.094829559326172, + "logps/rejected": -6.347546577453613, + "loss": 0.4141, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.094829559326172, + "rewards/margins": 1.2527166604995728, + "rewards/rejected": -6.347546577453613, "step": 4400 }, { "epoch": 2.3549088476333835, - "eval_logits/chosen": 0.9214699268341064, - "eval_logits/rejected": 0.9721218943595886, - "eval_logps/chosen": -9.951480865478516, - "eval_logps/rejected": -11.088980674743652, - "eval_loss": 0.5250210762023926, - "eval_rewards/accuracies": 0.721068263053894, - "eval_rewards/chosen": -9.951480865478516, - "eval_rewards/margins": 1.1374988555908203, - "eval_rewards/rejected": -11.088980674743652, - "eval_runtime": 35.1208, - "eval_samples_per_second": 38.296, - "eval_semantic_entropy": 0.0012979113962501287, - "eval_steps_per_second": 9.595, + "eval_logits/chosen": 0.27749529480934143, + "eval_logits/rejected": 0.377564936876297, + "eval_logps/chosen": -5.3001298904418945, + "eval_logps/rejected": -6.399603843688965, + "eval_loss": 0.5245212316513062, + "eval_rewards/accuracies": 0.7277448177337646, + "eval_rewards/chosen": -5.3001298904418945, + "eval_rewards/margins": 1.0994741916656494, + "eval_rewards/rejected": -6.399603843688965, + "eval_runtime": 40.3604, + "eval_samples_per_second": 33.325, + "eval_steps_per_second": 8.35, "step": 4400 }, { "epoch": 2.357584880414785, - "grad_norm": 24.25396278476502, + "grad_norm": 21.194476622607322, "learning_rate": 1.3311108361913015e-07, - "logits/chosen": 0.7317711114883423, - "logits/rejected": 0.7822341322898865, - "logps/chosen": -9.765314102172852, - "logps/rejected": -11.11705207824707, - "loss": 0.3892, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.765314102172852, - "rewards/margins": 1.3517379760742188, - "rewards/rejected": -11.11705207824707, - "semantic_entropy": 0.001415650942362845, + "logits/chosen": -0.1644768863916397, + "logits/rejected": -0.1227949857711792, + "logps/chosen": -5.102969646453857, + "logps/rejected": -6.4108381271362305, + "loss": 0.3817, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.102969646453857, + "rewards/margins": 1.3078687191009521, + "rewards/rejected": -6.4108381271362305, "step": 4405 }, { "epoch": 2.3602609131961865, - "grad_norm": 16.799881334103528, + "grad_norm": 17.322885935174853, "learning_rate": 1.3205478209251874e-07, - "logits/chosen": 0.8066733479499817, - "logits/rejected": 0.9000295400619507, - "logps/chosen": -9.963193893432617, - "logps/rejected": -11.469663619995117, - "loss": 0.3841, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.963193893432617, - "rewards/margins": 1.5064703226089478, - "rewards/rejected": -11.469663619995117, - "semantic_entropy": 0.001393306301906705, + "logits/chosen": -0.12027808278799057, + "logits/rejected": -0.00729673495516181, + "logps/chosen": -5.209392547607422, + "logps/rejected": -6.626528263092041, + "loss": 0.3871, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.209392547607422, + "rewards/margins": 1.41713547706604, + "rewards/rejected": -6.626528263092041, "step": 4410 }, { "epoch": 2.362936945977588, - "grad_norm": 22.10955918100729, + "grad_norm": 19.801056338251666, "learning_rate": 1.310020503720254e-07, - "logits/chosen": 0.7781058549880981, - "logits/rejected": 0.822067141532898, - "logps/chosen": -9.866239547729492, - "logps/rejected": -11.280614852905273, - "loss": 0.4068, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.866239547729492, - "rewards/margins": 1.4143754243850708, - "rewards/rejected": -11.280614852905273, - "semantic_entropy": 0.0012620962224900723, + "logits/chosen": -0.11616314947605133, + "logits/rejected": 0.04717870429158211, + "logps/chosen": -5.280298709869385, + "logps/rejected": -6.671367645263672, + "loss": 0.3998, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.280298709869385, + "rewards/margins": 1.3910691738128662, + "rewards/rejected": -6.671367645263672, "step": 4415 }, { "epoch": 2.36561297875899, - "grad_norm": 24.95675503053961, + "grad_norm": 21.88508237802675, "learning_rate": 1.2995289867125752e-07, - "logits/chosen": 0.7621157765388489, - "logits/rejected": 0.7959357500076294, - "logps/chosen": -9.751391410827637, - "logps/rejected": -10.94892692565918, - "loss": 0.4499, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.751391410827637, - "rewards/margins": 1.1975345611572266, - "rewards/rejected": -10.94892692565918, - "semantic_entropy": 0.001342722331173718, + "logits/chosen": -0.09621217846870422, + "logits/rejected": -0.020419521257281303, + "logps/chosen": -5.190130233764648, + "logps/rejected": -6.343196392059326, + "loss": 0.4389, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.190130233764648, + "rewards/margins": 1.1530662775039673, + "rewards/rejected": -6.343196392059326, "step": 4420 }, { "epoch": 2.368289011540391, - "grad_norm": 22.719615672561844, + "grad_norm": 15.896487580795284, "learning_rate": 1.2890733716908986e-07, - "logits/chosen": 0.7777091264724731, - "logits/rejected": 0.860866367816925, - "logps/chosen": -9.617403030395508, - "logps/rejected": -11.080734252929688, - "loss": 0.3296, - "rewards/accuracies": 0.875, - "rewards/chosen": -9.617403030395508, - "rewards/margins": 1.4633299112319946, - "rewards/rejected": -11.080734252929688, - "semantic_entropy": 0.0015527913346886635, + "logits/chosen": -0.10825226455926895, + "logits/rejected": -0.011760717257857323, + "logps/chosen": -4.965066432952881, + "logps/rejected": -6.311559200286865, + "loss": 0.3263, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.965066432952881, + "rewards/margins": 1.3464924097061157, + "rewards/rejected": -6.311559200286865, "step": 4425 }, { "epoch": 2.370965044321793, - "grad_norm": 22.213207378400554, + "grad_norm": 26.661302568033804, "learning_rate": 1.2786537600956454e-07, - "logits/chosen": 0.7948800325393677, - "logits/rejected": 0.8321939706802368, - "logps/chosen": -9.664536476135254, - "logps/rejected": -11.117993354797363, - "loss": 0.3919, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.664536476135254, - "rewards/margins": 1.4534571170806885, - "rewards/rejected": -11.117993354797363, - "semantic_entropy": 0.0012703756801784039, + "logits/chosen": -0.13480785489082336, + "logits/rejected": 0.04589816555380821, + "logps/chosen": -5.229303359985352, + "logps/rejected": -6.593223571777344, + "loss": 0.4132, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.229303359985352, + "rewards/margins": 1.3639203310012817, + "rewards/rejected": -6.593223571777344, "step": 4430 }, { "epoch": 2.3736410771031946, - "grad_norm": 17.553736783320986, + "grad_norm": 17.143757223041874, "learning_rate": 1.268270253017933e-07, - "logits/chosen": 0.8016083836555481, - "logits/rejected": 0.8781582117080688, - "logps/chosen": -9.703906059265137, - "logps/rejected": -11.203841209411621, - "loss": 0.3737, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.703906059265137, - "rewards/margins": 1.4999356269836426, - "rewards/rejected": -11.203841209411621, - "semantic_entropy": 0.0014606801560148597, + "logits/chosen": -0.09140481799840927, + "logits/rejected": 0.08829962462186813, + "logps/chosen": -5.079370975494385, + "logps/rejected": -6.4564995765686035, + "loss": 0.4034, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.079370975494385, + "rewards/margins": 1.3771284818649292, + "rewards/rejected": -6.4564995765686035, "step": 4435 }, { "epoch": 2.376317109884596, - "grad_norm": 20.144746908461087, + "grad_norm": 18.8940806380552, "learning_rate": 1.257922951198591e-07, - "logits/chosen": 0.7229622602462769, - "logits/rejected": 0.8273760676383972, - "logps/chosen": -9.712352752685547, - "logps/rejected": -11.075556755065918, - "loss": 0.4149, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.712352752685547, - "rewards/margins": 1.363203763961792, - "rewards/rejected": -11.075556755065918, - "semantic_entropy": 0.0015043210005387664, + "logits/chosen": -0.18549498915672302, + "logits/rejected": 0.09089966118335724, + "logps/chosen": -5.08328104019165, + "logps/rejected": -6.345778465270996, + "loss": 0.4248, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.08328104019165, + "rewards/margins": 1.2624980211257935, + "rewards/rejected": -6.345778465270996, "step": 4440 }, { "epoch": 2.3789931426659976, - "grad_norm": 21.29344289082141, + "grad_norm": 22.945672160885042, "learning_rate": 1.24761195502719e-07, - "logits/chosen": 0.765488862991333, - "logits/rejected": 0.8269311189651489, - "logps/chosen": -9.819905281066895, - "logps/rejected": -11.093830108642578, - "loss": 0.4315, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.819905281066895, - "rewards/margins": 1.2739253044128418, - "rewards/rejected": -11.093830108642578, - "semantic_entropy": 0.0011897517833858728, + "logits/chosen": -0.11048096418380737, + "logits/rejected": 0.11021719127893448, + "logps/chosen": -5.0687761306762695, + "logps/rejected": -6.13747501373291, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.0687761306762695, + "rewards/margins": 1.0686986446380615, + "rewards/rejected": -6.13747501373291, "step": 4445 }, { "epoch": 2.3816691754473993, - "grad_norm": 31.858698430890097, + "grad_norm": 23.87765289653876, "learning_rate": 1.2373373645410573e-07, - "logits/chosen": 0.8017476797103882, - "logits/rejected": 0.8760465383529663, - "logps/chosen": -9.925119400024414, - "logps/rejected": -11.373433113098145, - "loss": 0.4195, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.925119400024414, - "rewards/margins": 1.4483143091201782, - "rewards/rejected": -11.373433113098145, - "semantic_entropy": 0.001272709690965712, + "logits/chosen": -0.10407520830631256, + "logits/rejected": 0.03466368839144707, + "logps/chosen": -5.068427085876465, + "logps/rejected": -6.4751129150390625, + "loss": 0.4011, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.068427085876465, + "rewards/margins": 1.4066853523254395, + "rewards/rejected": -6.4751129150390625, "step": 4450 }, { "epoch": 2.384345208228801, - "grad_norm": 21.93569257046853, + "grad_norm": 21.584394481561436, "learning_rate": 1.2270992794243175e-07, - "logits/chosen": 0.7383990287780762, - "logits/rejected": 0.8073341250419617, - "logps/chosen": -9.690180778503418, - "logps/rejected": -11.078446388244629, - "loss": 0.4162, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.690180778503418, - "rewards/margins": 1.3882659673690796, - "rewards/rejected": -11.078446388244629, - "semantic_entropy": 0.0013964849058538675, + "logits/chosen": -0.18179450929164886, + "logits/rejected": -0.05064668506383896, + "logps/chosen": -4.943591117858887, + "logps/rejected": -6.263463020324707, + "loss": 0.4035, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.943591117858887, + "rewards/margins": 1.319872260093689, + "rewards/rejected": -6.263463020324707, "step": 4455 }, { "epoch": 2.3870212410102023, - "grad_norm": 19.97322050449576, + "grad_norm": 14.597396737807875, "learning_rate": 1.2168977990069147e-07, - "logits/chosen": 0.7342582941055298, - "logits/rejected": 0.7973549962043762, - "logps/chosen": -9.67171573638916, - "logps/rejected": -10.98813533782959, - "loss": 0.4229, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.67171573638916, - "rewards/margins": 1.316420316696167, - "rewards/rejected": -10.98813533782959, - "semantic_entropy": 0.001446625916287303, + "logits/chosen": -0.1527225226163864, + "logits/rejected": 0.05775179713964462, + "logps/chosen": -4.93073844909668, + "logps/rejected": -6.238767147064209, + "loss": 0.4162, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.93073844909668, + "rewards/margins": 1.3080288171768188, + "rewards/rejected": -6.238767147064209, "step": 4460 }, { "epoch": 2.389697273791604, - "grad_norm": 20.71353603983738, + "grad_norm": 20.647566709920508, "learning_rate": 1.206733022263659e-07, - "logits/chosen": 0.7267228960990906, - "logits/rejected": 0.8412584066390991, - "logps/chosen": -9.8917875289917, - "logps/rejected": -11.24048137664795, - "loss": 0.4238, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.8917875289917, - "rewards/margins": 1.3486926555633545, - "rewards/rejected": -11.24048137664795, - "semantic_entropy": 0.0015950720990076661, + "logits/chosen": -0.12142185121774673, + "logits/rejected": 0.08596687018871307, + "logps/chosen": -5.363245010375977, + "logps/rejected": -6.603110313415527, + "loss": 0.4493, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.363245010375977, + "rewards/margins": 1.2398649454116821, + "rewards/rejected": -6.603110313415527, "step": 4465 }, { "epoch": 2.3923733065730053, - "grad_norm": 24.253708429812765, + "grad_norm": 19.81314203838739, "learning_rate": 1.1966050478132572e-07, - "logits/chosen": 0.7586138844490051, - "logits/rejected": 0.8224746584892273, - "logps/chosen": -9.68405532836914, - "logps/rejected": -11.180296897888184, - "loss": 0.3864, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.68405532836914, - "rewards/margins": 1.4962437152862549, - "rewards/rejected": -11.180296897888184, - "semantic_entropy": 0.0014107396127656102, + "logits/chosen": -0.08592655509710312, + "logits/rejected": 0.005946418736129999, + "logps/chosen": -4.868677616119385, + "logps/rejected": -6.154007911682129, + "loss": 0.4245, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.868677616119385, + "rewards/margins": 1.2853299379348755, + "rewards/rejected": -6.154007911682129, "step": 4470 }, { "epoch": 2.395049339354407, - "grad_norm": 21.753207135798707, + "grad_norm": 28.328876727886627, "learning_rate": 1.1865139739173635e-07, - "logits/chosen": 0.7383859753608704, - "logits/rejected": 0.8205004930496216, - "logps/chosen": -9.776418685913086, - "logps/rejected": -11.116586685180664, - "loss": 0.4023, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.776418685913086, - "rewards/margins": 1.34016752243042, - "rewards/rejected": -11.116586685180664, - "semantic_entropy": 0.0015196467284113169, + "logits/chosen": -0.1599574089050293, + "logits/rejected": 0.03871563822031021, + "logps/chosen": -5.0680742263793945, + "logps/rejected": -6.307386875152588, + "loss": 0.4097, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.0680742263793945, + "rewards/margins": 1.2393124103546143, + "rewards/rejected": -6.307386875152588, "step": 4475 }, { "epoch": 2.3977253721358087, - "grad_norm": 37.45298900218617, + "grad_norm": 20.921897739437483, "learning_rate": 1.1764598984796187e-07, - "logits/chosen": 0.7673597931861877, - "logits/rejected": 0.8331373333930969, - "logps/chosen": -9.831031799316406, - "logps/rejected": -11.022607803344727, - "loss": 0.4243, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.831031799316406, - "rewards/margins": 1.1915762424468994, - "rewards/rejected": -11.022607803344727, - "semantic_entropy": 0.001290981424972415, + "logits/chosen": -0.1660049855709076, + "logits/rejected": -0.007313963957130909, + "logps/chosen": -5.097243785858154, + "logps/rejected": -6.2743024826049805, + "loss": 0.4064, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.097243785858154, + "rewards/margins": 1.1770585775375366, + "rewards/rejected": -6.2743024826049805, "step": 4480 }, { "epoch": 2.4004014049172104, - "grad_norm": 29.063993723072343, + "grad_norm": 24.874337837474737, "learning_rate": 1.1664429190447095e-07, - "logits/chosen": 0.7792296409606934, - "logits/rejected": 0.8261978030204773, - "logps/chosen": -9.763978958129883, - "logps/rejected": -11.260164260864258, - "loss": 0.3626, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.763978958129883, - "rewards/margins": 1.496183156967163, - "rewards/rejected": -11.260164260864258, - "semantic_entropy": 0.0017484973650425673, + "logits/chosen": -0.0644039660692215, + "logits/rejected": 0.02724977768957615, + "logps/chosen": -5.004655361175537, + "logps/rejected": -6.376974105834961, + "loss": 0.3815, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.004655361175537, + "rewards/margins": 1.3723182678222656, + "rewards/rejected": -6.376974105834961, "step": 4485 }, { "epoch": 2.4030774376986117, - "grad_norm": 21.46210084893414, + "grad_norm": 26.286270602256792, "learning_rate": 1.1564631327974122e-07, - "logits/chosen": 0.7814306616783142, - "logits/rejected": 0.8631542921066284, - "logps/chosen": -9.915163040161133, - "logps/rejected": -11.228841781616211, - "loss": 0.4225, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.915163040161133, - "rewards/margins": 1.3136794567108154, - "rewards/rejected": -11.228841781616211, - "semantic_entropy": 0.0011776359751820564, + "logits/chosen": -0.12516091763973236, + "logits/rejected": 0.06686491519212723, + "logps/chosen": -5.128063678741455, + "logps/rejected": -6.460404396057129, + "loss": 0.4134, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.128063678741455, + "rewards/margins": 1.3323404788970947, + "rewards/rejected": -6.460404396057129, "step": 4490 }, { "epoch": 2.4057534704800134, - "grad_norm": 20.87464472480962, + "grad_norm": 19.935223649549474, "learning_rate": 1.1465206365616587e-07, - "logits/chosen": 0.6937421560287476, - "logits/rejected": 0.7896022796630859, - "logps/chosen": -9.791691780090332, - "logps/rejected": -11.08985710144043, - "loss": 0.4082, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.791691780090332, - "rewards/margins": 1.2981641292572021, - "rewards/rejected": -11.08985710144043, - "semantic_entropy": 0.0014528365572914481, + "logits/chosen": -0.22854992747306824, + "logits/rejected": -0.0044035897590219975, + "logps/chosen": -5.203268051147461, + "logps/rejected": -6.425678253173828, + "loss": 0.4262, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.203268051147461, + "rewards/margins": 1.2224104404449463, + "rewards/rejected": -6.425678253173828, "step": 4495 }, { "epoch": 2.408429503261415, - "grad_norm": 22.141016604754245, + "grad_norm": 19.42764380389304, "learning_rate": 1.1366155267995887e-07, - "logits/chosen": 0.8213682174682617, - "logits/rejected": 0.8454049825668335, - "logps/chosen": -9.781519889831543, - "logps/rejected": -11.107604026794434, - "loss": 0.3978, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.781519889831543, - "rewards/margins": 1.3260858058929443, - "rewards/rejected": -11.107604026794434, - "semantic_entropy": 0.0014172891387715936, + "logits/chosen": -0.09625109285116196, + "logits/rejected": -0.061596959829330444, + "logps/chosen": -4.961366176605225, + "logps/rejected": -6.242344379425049, + "loss": 0.4005, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.961366176605225, + "rewards/margins": 1.2809776067733765, + "rewards/rejected": -6.242344379425049, "step": 4500 }, { "epoch": 2.4111055360428164, - "grad_norm": 20.81249553944227, + "grad_norm": 19.212558977147925, "learning_rate": 1.1267478996106228e-07, - "logits/chosen": 0.8247060775756836, - "logits/rejected": 0.9181084632873535, - "logps/chosen": -9.852422714233398, - "logps/rejected": -11.161924362182617, - "loss": 0.4153, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.852422714233398, - "rewards/margins": 1.3095014095306396, - "rewards/rejected": -11.161924362182617, - "semantic_entropy": 0.0011818426428362727, + "logits/chosen": -0.10269101709127426, + "logits/rejected": 0.11532378196716309, + "logps/chosen": -5.016781806945801, + "logps/rejected": -6.184000492095947, + "loss": 0.428, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.016781806945801, + "rewards/margins": 1.167218804359436, + "rewards/rejected": -6.184000492095947, "step": 4505 }, { "epoch": 2.413781568824218, - "grad_norm": 22.065604053110732, + "grad_norm": 21.63862627929706, "learning_rate": 1.116917850730521e-07, - "logits/chosen": 0.7819138765335083, - "logits/rejected": 0.8265460133552551, - "logps/chosen": -9.933201789855957, - "logps/rejected": -11.143549919128418, - "loss": 0.5036, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.933201789855957, - "rewards/margins": 1.2103482484817505, - "rewards/rejected": -11.143549919128418, - "semantic_entropy": 0.0012393039651215076, + "logits/chosen": -0.12781012058258057, + "logits/rejected": 0.00042679012403823435, + "logps/chosen": -5.128924369812012, + "logps/rejected": -6.288200378417969, + "loss": 0.4704, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.128924369812012, + "rewards/margins": 1.1592756509780884, + "rewards/rejected": -6.288200378417969, "step": 4510 }, { "epoch": 2.41645760160562, - "grad_norm": 17.547878202239335, + "grad_norm": 17.447187611449852, "learning_rate": 1.1071254755304637e-07, - "logits/chosen": 0.7445524334907532, - "logits/rejected": 0.7736660242080688, - "logps/chosen": -9.694409370422363, - "logps/rejected": -11.00683307647705, - "loss": 0.42, + "logits/chosen": -0.07410897314548492, + "logits/rejected": 0.008558815345168114, + "logps/chosen": -5.0079169273376465, + "logps/rejected": -6.22298526763916, + "loss": 0.441, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.694409370422363, - "rewards/margins": 1.3124234676361084, - "rewards/rejected": -11.00683307647705, - "semantic_entropy": 0.001383893541060388, + "rewards/chosen": -5.0079169273376465, + "rewards/margins": 1.2150683403015137, + "rewards/rejected": -6.22298526763916, "step": 4515 }, { "epoch": 2.419133634387021, - "grad_norm": 23.631179882315553, + "grad_norm": 20.83707759610774, "learning_rate": 1.0973708690161143e-07, - "logits/chosen": 0.792984664440155, - "logits/rejected": 0.8252687454223633, - "logps/chosen": -9.863832473754883, - "logps/rejected": -11.273421287536621, - "loss": 0.3964, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.863832473754883, - "rewards/margins": 1.409589409828186, - "rewards/rejected": -11.273421287536621, - "semantic_entropy": 0.001443797373212874, + "logits/chosen": -0.11768940836191177, + "logits/rejected": -0.00017354413284920156, + "logps/chosen": -5.107953071594238, + "logps/rejected": -6.3962836265563965, + "loss": 0.3982, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.107953071594238, + "rewards/margins": 1.2883305549621582, + "rewards/rejected": -6.3962836265563965, "step": 4520 }, { "epoch": 2.421809667168423, - "grad_norm": 31.566956401950897, + "grad_norm": 30.939917138007264, "learning_rate": 1.0876541258267119e-07, - "logits/chosen": 0.7816181182861328, - "logits/rejected": 0.873005747795105, - "logps/chosen": -9.954813003540039, - "logps/rejected": -11.395962715148926, - "loss": 0.3918, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.954813003540039, - "rewards/margins": 1.4411489963531494, - "rewards/rejected": -11.395962715148926, - "semantic_entropy": 0.0011875508353114128, + "logits/chosen": -0.14560499787330627, + "logits/rejected": 0.052821893244981766, + "logps/chosen": -5.192101955413818, + "logps/rejected": -6.642346382141113, + "loss": 0.3869, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.192101955413818, + "rewards/margins": 1.450244665145874, + "rewards/rejected": -6.642346382141113, "step": 4525 }, { "epoch": 2.4244856999498245, - "grad_norm": 25.748276815440583, + "grad_norm": 20.941876937927635, "learning_rate": 1.0779753402341379e-07, - "logits/chosen": 0.7940434813499451, - "logits/rejected": 0.840873122215271, - "logps/chosen": -9.88911247253418, - "logps/rejected": -11.033079147338867, - "loss": 0.4659, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.88911247253418, - "rewards/margins": 1.1439659595489502, - "rewards/rejected": -11.033079147338867, - "semantic_entropy": 0.0013449281686916947, + "logits/chosen": -0.12555193901062012, + "logits/rejected": -0.016242671757936478, + "logps/chosen": -5.071659088134766, + "logps/rejected": -6.193231105804443, + "loss": 0.4431, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.071659088134766, + "rewards/margins": 1.1215723752975464, + "rewards/rejected": -6.193231105804443, "step": 4530 }, { "epoch": 2.427161732731226, - "grad_norm": 23.532526327852437, + "grad_norm": 23.41449629724555, "learning_rate": 1.0683346061420157e-07, - "logits/chosen": 0.8855890035629272, - "logits/rejected": 0.8981055021286011, - "logps/chosen": -9.778970718383789, - "logps/rejected": -11.14280891418457, - "loss": 0.4195, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.778970718383789, - "rewards/margins": 1.363840103149414, - "rewards/rejected": -11.14280891418457, - "semantic_entropy": 0.001345223980024457, + "logits/chosen": -0.01965397037565708, + "logits/rejected": 0.0760636031627655, + "logps/chosen": -4.710206985473633, + "logps/rejected": -6.0541582107543945, + "loss": 0.4219, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.710206985473633, + "rewards/margins": 1.3439505100250244, + "rewards/rejected": -6.0541582107543945, "step": 4535 }, { "epoch": 2.4298377655126275, - "grad_norm": 23.144715325749804, + "grad_norm": 22.372225169717023, "learning_rate": 1.0587320170847874e-07, - "logits/chosen": 0.7933780550956726, - "logits/rejected": 0.8676679730415344, - "logps/chosen": -9.759759902954102, - "logps/rejected": -10.895282745361328, - "loss": 0.4728, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.759759902954102, - "rewards/margins": 1.135524034500122, - "rewards/rejected": -10.895282745361328, - "semantic_entropy": 0.001288101659156382, + "logits/chosen": -0.07419230043888092, + "logits/rejected": 0.043887943029403687, + "logps/chosen": -4.740002632141113, + "logps/rejected": -5.940243721008301, + "loss": 0.444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.740002632141113, + "rewards/margins": 1.2002410888671875, + "rewards/rejected": -5.940243721008301, "step": 4540 }, { "epoch": 2.4325137982940293, - "grad_norm": 21.014853352663092, + "grad_norm": 17.313717637128196, "learning_rate": 1.0491676662268156e-07, - "logits/chosen": 0.8033139109611511, - "logits/rejected": 0.8606590032577515, - "logps/chosen": -9.787649154663086, - "logps/rejected": -11.041508674621582, - "loss": 0.4454, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.787649154663086, - "rewards/margins": 1.2538607120513916, - "rewards/rejected": -11.041508674621582, - "semantic_entropy": 0.0012479587458074093, + "logits/chosen": -0.04450344294309616, + "logits/rejected": 0.06495862454175949, + "logps/chosen": -4.818356513977051, + "logps/rejected": -6.039165496826172, + "loss": 0.4384, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.818356513977051, + "rewards/margins": 1.2208091020584106, + "rewards/rejected": -6.039165496826172, "step": 4545 }, { "epoch": 2.4351898310754305, - "grad_norm": 25.574697677651937, + "grad_norm": 22.779513594255896, "learning_rate": 1.0396416463614732e-07, - "logits/chosen": 0.7537301182746887, - "logits/rejected": 0.814228355884552, - "logps/chosen": -9.702180862426758, - "logps/rejected": -11.04191780090332, - "loss": 0.42, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.702180862426758, - "rewards/margins": 1.3397365808486938, - "rewards/rejected": -11.04191780090332, - "semantic_entropy": 0.001320059527643025, + "logits/chosen": -0.1395617574453354, + "logits/rejected": -0.02785954438149929, + "logps/chosen": -4.877791404724121, + "logps/rejected": -6.220587730407715, + "loss": 0.4181, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.877791404724121, + "rewards/margins": 1.3427965641021729, + "rewards/rejected": -6.220587730407715, "step": 4550 }, { "epoch": 2.4378658638568322, - "grad_norm": 24.618729215771705, + "grad_norm": 21.780719457495863, "learning_rate": 1.0301540499102479e-07, - "logits/chosen": 0.7519547343254089, - "logits/rejected": 0.8329681158065796, - "logps/chosen": -9.94709587097168, - "logps/rejected": -10.988082885742188, - "loss": 0.4853, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.94709587097168, - "rewards/margins": 1.0409865379333496, - "rewards/rejected": -10.988082885742188, - "semantic_entropy": 0.0011719849426299334, + "logits/chosen": -0.06138720363378525, + "logits/rejected": 0.052254825830459595, + "logps/chosen": -5.28981876373291, + "logps/rejected": -6.382269382476807, + "loss": 0.4624, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.28981876373291, + "rewards/margins": 1.0924499034881592, + "rewards/rejected": -6.382269382476807, "step": 4555 }, { "epoch": 2.440541896638234, - "grad_norm": 26.17462652294696, + "grad_norm": 24.923870245600042, "learning_rate": 1.0207049689218405e-07, - "logits/chosen": 0.7849665284156799, - "logits/rejected": 0.845086932182312, - "logps/chosen": -9.825363159179688, - "logps/rejected": -11.26807975769043, - "loss": 0.4079, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.825363159179688, - "rewards/margins": 1.4427168369293213, - "rewards/rejected": -11.26807975769043, - "semantic_entropy": 0.0016312900697812438, + "logits/chosen": -0.11950943619012833, + "logits/rejected": 0.09114255011081696, + "logps/chosen": -5.233829498291016, + "logps/rejected": -6.665799140930176, + "loss": 0.3935, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.233829498291016, + "rewards/margins": 1.4319689273834229, + "rewards/rejected": -6.665799140930176, "step": 4560 }, { "epoch": 2.4432179294196352, - "grad_norm": 19.057899173583554, + "grad_norm": 18.977645737815436, "learning_rate": 1.0112944950712782e-07, - "logits/chosen": 0.7082661986351013, - "logits/rejected": 0.7645975947380066, - "logps/chosen": -9.70044994354248, - "logps/rejected": -11.13469123840332, - "loss": 0.3632, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.70044994354248, - "rewards/margins": 1.434242606163025, - "rewards/rejected": -11.13469123840332, - "semantic_entropy": 0.001436132937669754, + "logits/chosen": -0.09520555287599564, + "logits/rejected": 0.03586623817682266, + "logps/chosen": -5.083714485168457, + "logps/rejected": -6.485954284667969, + "loss": 0.3719, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.083714485168457, + "rewards/margins": 1.402239441871643, + "rewards/rejected": -6.485954284667969, "step": 4565 }, { "epoch": 2.445893962201037, - "grad_norm": 22.488252138217725, + "grad_norm": 22.176538675071544, "learning_rate": 1.0019227196590174e-07, - "logits/chosen": 0.8336771130561829, - "logits/rejected": 0.8841003179550171, - "logps/chosen": -9.908154487609863, - "logps/rejected": -11.13020133972168, - "loss": 0.4829, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -9.908154487609863, - "rewards/margins": 1.2220475673675537, - "rewards/rejected": -11.13020133972168, - "semantic_entropy": 0.0014596920227631927, + "logits/chosen": -0.06180862337350845, + "logits/rejected": 0.08642885833978653, + "logps/chosen": -5.0647172927856445, + "logps/rejected": -6.232893943786621, + "loss": 0.4815, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.0647172927856445, + "rewards/margins": 1.1681764125823975, + "rewards/rejected": -6.232893943786621, "step": 4570 }, { "epoch": 2.4485699949824387, - "grad_norm": 23.13587206070571, + "grad_norm": 18.68141817480857, "learning_rate": 9.925897336100664e-08, - "logits/chosen": 0.8057346343994141, - "logits/rejected": 0.850749135017395, - "logps/chosen": -9.688154220581055, - "logps/rejected": -11.131349563598633, - "loss": 0.3866, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.688154220581055, - "rewards/margins": 1.4431952238082886, - "rewards/rejected": -11.131349563598633, - "semantic_entropy": 0.0014849931467324495, + "logits/chosen": -0.02597079798579216, + "logits/rejected": 0.04939575493335724, + "logps/chosen": -4.922869682312012, + "logps/rejected": -6.3803791999816895, + "loss": 0.3642, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.922869682312012, + "rewards/margins": 1.457509994506836, + "rewards/rejected": -6.3803791999816895, "step": 4575 }, { "epoch": 2.45124602776384, - "grad_norm": 23.357693931056765, + "grad_norm": 24.249107872987476, "learning_rate": 9.832956274730946e-08, - "logits/chosen": 0.7591571807861328, - "logits/rejected": 0.7910270094871521, - "logps/chosen": -9.584843635559082, - "logps/rejected": -10.765449523925781, - "loss": 0.4539, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.584843635559082, - "rewards/margins": 1.1806063652038574, - "rewards/rejected": -10.765449523925781, - "semantic_entropy": 0.0015608349349349737, + "logits/chosen": -0.08943992853164673, + "logits/rejected": -0.03537165746092796, + "logps/chosen": -4.793396472930908, + "logps/rejected": -5.849032402038574, + "loss": 0.4762, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.793396472930908, + "rewards/margins": 1.055635929107666, + "rewards/rejected": -5.849032402038574, "step": 4580 }, { "epoch": 2.4539220605452416, - "grad_norm": 20.637887868629868, + "grad_norm": 19.63527201675227, "learning_rate": 9.740404914195633e-08, - "logits/chosen": 0.7534157037734985, - "logits/rejected": 0.842387855052948, - "logps/chosen": -9.789365768432617, - "logps/rejected": -11.138287544250488, - "loss": 0.4176, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.789365768432617, - "rewards/margins": 1.3489205837249756, - "rewards/rejected": -11.138287544250488, - "semantic_entropy": 0.0012669655261561275, + "logits/chosen": -0.07588151842355728, + "logits/rejected": 0.08270827680826187, + "logps/chosen": -5.10926628112793, + "logps/rejected": -6.38045072555542, + "loss": 0.4166, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.10926628112793, + "rewards/margins": 1.271185040473938, + "rewards/rejected": -6.38045072555542, "step": 4585 }, { "epoch": 2.4565980933266434, - "grad_norm": 18.543572299518026, + "grad_norm": 16.721921172593518, "learning_rate": 9.648244152428392e-08, - "logits/chosen": 0.7632014751434326, - "logits/rejected": 0.8216020464897156, - "logps/chosen": -9.654337882995605, - "logps/rejected": -10.88366413116455, - "loss": 0.4224, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.654337882995605, - "rewards/margins": 1.229326605796814, - "rewards/rejected": -10.88366413116455, - "semantic_entropy": 0.0014172986848279834, + "logits/chosen": -0.16460056602954865, + "logits/rejected": -0.02666083537042141, + "logps/chosen": -4.814793586730957, + "logps/rejected": -5.954825401306152, + "loss": 0.4343, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.814793586730957, + "rewards/margins": 1.1400330066680908, + "rewards/rejected": -5.954825401306152, "step": 4590 }, { "epoch": 2.4592741261080446, - "grad_norm": 19.149278895897503, + "grad_norm": 20.907321242116318, "learning_rate": 9.556474883573379e-08, - "logits/chosen": 0.7528376579284668, - "logits/rejected": 0.8152421116828918, - "logps/chosen": -9.646097183227539, - "logps/rejected": -11.058183670043945, - "loss": 0.4187, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.646097183227539, - "rewards/margins": 1.412088394165039, - "rewards/rejected": -11.058183670043945, - "semantic_entropy": 0.0015735877677798271, + "logits/chosen": -0.14378607273101807, + "logits/rejected": -0.01512183714658022, + "logps/chosen": -4.747116565704346, + "logps/rejected": -6.175529956817627, + "loss": 0.4025, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.747116565704346, + "rewards/margins": 1.4284136295318604, + "rewards/rejected": -6.175529956817627, "step": 4595 }, { "epoch": 2.4619501588894463, - "grad_norm": 21.187119688281765, + "grad_norm": 17.850870901447962, "learning_rate": 9.465097997976412e-08, - "logits/chosen": 0.7996068000793457, - "logits/rejected": 0.8711563348770142, - "logps/chosen": -9.832134246826172, - "logps/rejected": -11.303122520446777, - "loss": 0.3744, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.832134246826172, - "rewards/margins": 1.4709880352020264, - "rewards/rejected": -11.303122520446777, - "semantic_entropy": 0.0013540387153625488, + "logits/chosen": -0.13265123963356018, + "logits/rejected": 0.09264367818832397, + "logps/chosen": -4.926660537719727, + "logps/rejected": -6.362934112548828, + "loss": 0.3765, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.926660537719727, + "rewards/margins": 1.4362733364105225, + "rewards/rejected": -6.362934112548828, "step": 4600 }, { "epoch": 2.464626191670848, - "grad_norm": 21.004241219678757, + "grad_norm": 18.126235026004135, "learning_rate": 9.374114382176457e-08, - "logits/chosen": 0.7817031741142273, - "logits/rejected": 0.8433337211608887, - "logps/chosen": -9.771738052368164, - "logps/rejected": -11.173564910888672, - "loss": 0.3894, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.771738052368164, - "rewards/margins": 1.4018254280090332, - "rewards/rejected": -11.173564910888672, - "semantic_entropy": 0.0012005962198600173, + "logits/chosen": -0.09208249300718307, + "logits/rejected": 0.07163773477077484, + "logps/chosen": -4.998213768005371, + "logps/rejected": -6.317043304443359, + "loss": 0.4168, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.998213768005371, + "rewards/margins": 1.3188292980194092, + "rewards/rejected": -6.317043304443359, "step": 4605 }, { "epoch": 2.46730222445225, - "grad_norm": 27.55976528492471, + "grad_norm": 25.14917141731815, "learning_rate": 9.283524918896945e-08, - "logits/chosen": 0.7919789552688599, - "logits/rejected": 0.8178110122680664, - "logps/chosen": -9.770918846130371, - "logps/rejected": -11.095891952514648, - "loss": 0.437, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.770918846130371, - "rewards/margins": 1.3249746561050415, - "rewards/rejected": -11.095891952514648, - "semantic_entropy": 0.0011873061303049326, + "logits/chosen": -0.10843755304813385, + "logits/rejected": 0.028638357296586037, + "logps/chosen": -5.116772174835205, + "logps/rejected": -6.423123836517334, + "loss": 0.4217, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.116772174835205, + "rewards/margins": 1.306351661682129, + "rewards/rejected": -6.423123836517334, "step": 4610 }, { "epoch": 2.469978257233651, - "grad_norm": 20.74798992950919, + "grad_norm": 20.477298140965818, "learning_rate": 9.193330487037232e-08, - "logits/chosen": 0.814818263053894, - "logits/rejected": 0.8907683491706848, - "logps/chosen": -9.838384628295898, - "logps/rejected": -11.240675926208496, - "loss": 0.3874, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.838384628295898, - "rewards/margins": 1.402291178703308, - "rewards/rejected": -11.240675926208496, - "semantic_entropy": 0.0014131965581327677, + "logits/chosen": -0.062186747789382935, + "logits/rejected": 0.1135500892996788, + "logps/chosen": -5.195328712463379, + "logps/rejected": -6.554124355316162, + "loss": 0.4177, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.195328712463379, + "rewards/margins": 1.3587956428527832, + "rewards/rejected": -6.554124355316162, "step": 4615 }, { "epoch": 2.4726542900150528, - "grad_norm": 17.77642999066438, + "grad_norm": 18.321197218301556, "learning_rate": 9.103531961664118e-08, - "logits/chosen": 0.7889447808265686, - "logits/rejected": 0.8778635859489441, - "logps/chosen": -9.612691879272461, - "logps/rejected": -10.984048843383789, - "loss": 0.3752, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.612691879272461, - "rewards/margins": 1.3713561296463013, - "rewards/rejected": -10.984048843383789, - "semantic_entropy": 0.0014132572105154395, + "logits/chosen": -0.07995424419641495, + "logits/rejected": 0.11600959300994873, + "logps/chosen": -4.770631790161133, + "logps/rejected": -6.067399024963379, + "loss": 0.3769, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.770631790161133, + "rewards/margins": 1.2967665195465088, + "rewards/rejected": -6.067399024963379, "step": 4620 }, { "epoch": 2.475330322796454, - "grad_norm": 19.68502083527493, + "grad_norm": 18.342682503111757, "learning_rate": 9.014130214003269e-08, - "logits/chosen": 0.7648957967758179, - "logits/rejected": 0.7786797881126404, - "logps/chosen": -9.624971389770508, - "logps/rejected": -11.085968971252441, - "loss": 0.3885, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.624971389770508, - "rewards/margins": 1.460997223854065, - "rewards/rejected": -11.085968971252441, - "semantic_entropy": 0.0014543391298502684, + "logits/chosen": -0.1574215590953827, + "logits/rejected": -0.13371939957141876, + "logps/chosen": -4.962009906768799, + "logps/rejected": -6.364610195159912, + "loss": 0.3874, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.962009906768799, + "rewards/margins": 1.402600884437561, + "rewards/rejected": -6.364610195159912, "step": 4625 }, { "epoch": 2.4780063555778558, - "grad_norm": 23.013655732727454, + "grad_norm": 22.483001002029635, "learning_rate": 8.925126111430848e-08, - "logits/chosen": 0.7716919183731079, - "logits/rejected": 0.8139322996139526, - "logps/chosen": -9.422433853149414, - "logps/rejected": -10.843701362609863, - "loss": 0.4095, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.422433853149414, - "rewards/margins": 1.4212672710418701, - "rewards/rejected": -10.843701362609863, - "semantic_entropy": 0.001683591166511178, + "logits/chosen": -0.01639522984623909, + "logits/rejected": 0.0729493498802185, + "logps/chosen": -4.845233917236328, + "logps/rejected": -6.188063144683838, + "loss": 0.4068, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.845233917236328, + "rewards/margins": 1.342829704284668, + "rewards/rejected": -6.188063144683838, "step": 4630 }, { "epoch": 2.4806823883592575, - "grad_norm": 24.587592896382663, + "grad_norm": 23.820751761824773, "learning_rate": 8.83652051746504e-08, - "logits/chosen": 0.9284713864326477, - "logits/rejected": 0.9669192433357239, - "logps/chosen": -9.819892883300781, - "logps/rejected": -11.273547172546387, - "loss": 0.3939, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.819892883300781, - "rewards/margins": 1.4536547660827637, - "rewards/rejected": -11.273547172546387, - "semantic_entropy": 0.0012030914658680558, + "logits/chosen": 0.005275038070976734, + "logits/rejected": 0.15345266461372375, + "logps/chosen": -4.97251033782959, + "logps/rejected": -6.336302280426025, + "loss": 0.397, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.97251033782959, + "rewards/margins": 1.3637912273406982, + "rewards/rejected": -6.336302280426025, "step": 4635 }, { "epoch": 2.483358421140659, - "grad_norm": 22.016927492062443, + "grad_norm": 18.932205852005836, "learning_rate": 8.748314291757696e-08, - "logits/chosen": 0.7996488213539124, - "logits/rejected": 0.8577510714530945, - "logps/chosen": -9.611989974975586, - "logps/rejected": -10.817550659179688, - "loss": 0.4296, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.611989974975586, - "rewards/margins": 1.2055622339248657, - "rewards/rejected": -10.817550659179688, - "semantic_entropy": 0.0014349967241287231, + "logits/chosen": -0.07357851415872574, + "logits/rejected": 0.05138392373919487, + "logps/chosen": -4.951448440551758, + "logps/rejected": -6.193153381347656, + "loss": 0.4093, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.951448440551758, + "rewards/margins": 1.2417047023773193, + "rewards/rejected": -6.193153381347656, "step": 4640 }, { "epoch": 2.4860344539220605, - "grad_norm": 20.553055577709493, + "grad_norm": 21.468348312055042, "learning_rate": 8.660508290086032e-08, - "logits/chosen": 0.8431406021118164, - "logits/rejected": 0.930561900138855, - "logps/chosen": -9.651968002319336, - "logps/rejected": -11.155494689941406, - "loss": 0.3923, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.651968002319336, - "rewards/margins": 1.5035268068313599, - "rewards/rejected": -11.155494689941406, - "semantic_entropy": 0.001411119825206697, + "logits/chosen": -0.069229856133461, + "logits/rejected": 0.08976881206035614, + "logps/chosen": -5.050375938415527, + "logps/rejected": -6.4421820640563965, + "loss": 0.4083, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.050375938415527, + "rewards/margins": 1.39180588722229, + "rewards/rejected": -6.4421820640563965, "step": 4645 }, { "epoch": 2.488710486703462, - "grad_norm": 29.66982937376926, + "grad_norm": 24.045766441301197, "learning_rate": 8.573103364344231e-08, - "logits/chosen": 0.7703269124031067, - "logits/rejected": 0.8427858352661133, - "logps/chosen": -9.545438766479492, - "logps/rejected": -10.993854522705078, - "loss": 0.3839, + "logits/chosen": -0.16926923394203186, + "logits/rejected": 0.04382326081395149, + "logps/chosen": -4.90109395980835, + "logps/rejected": -6.3292951583862305, + "loss": 0.3884, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.545438766479492, - "rewards/margins": 1.4484152793884277, - "rewards/rejected": -10.993854522705078, - "semantic_entropy": 0.0015610662521794438, + "rewards/chosen": -4.90109395980835, + "rewards/margins": 1.4282008409500122, + "rewards/rejected": -6.3292951583862305, "step": 4650 }, { "epoch": 2.4913865194848634, - "grad_norm": 24.578094627007825, + "grad_norm": 21.011048687085864, "learning_rate": 8.486100362535292e-08, - "logits/chosen": 0.7740985751152039, - "logits/rejected": 0.851282000541687, - "logps/chosen": -9.825765609741211, - "logps/rejected": -10.998079299926758, - "loss": 0.4317, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.825765609741211, - "rewards/margins": 1.1723124980926514, - "rewards/rejected": -10.998079299926758, - "semantic_entropy": 0.0011104957666248083, + "logits/chosen": -0.12171546369791031, + "logits/rejected": 0.0305899977684021, + "logps/chosen": -5.153502464294434, + "logps/rejected": -6.291139125823975, + "loss": 0.4463, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.153502464294434, + "rewards/margins": 1.137636661529541, + "rewards/rejected": -6.291139125823975, "step": 4655 }, { "epoch": 2.494062552266265, - "grad_norm": 17.212403265377716, + "grad_norm": 16.94315226378548, "learning_rate": 8.399500128762693e-08, - "logits/chosen": 0.7384323477745056, - "logits/rejected": 0.809241771697998, - "logps/chosen": -9.808893203735352, - "logps/rejected": -11.134016036987305, - "loss": 0.4052, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.808893203735352, - "rewards/margins": 1.3251229524612427, - "rewards/rejected": -11.134016036987305, - "semantic_entropy": 0.001255923300050199, + "logits/chosen": -0.0955643281340599, + "logits/rejected": 0.016864914447069168, + "logps/chosen": -5.159852027893066, + "logps/rejected": -6.46709680557251, + "loss": 0.3871, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.159852027893066, + "rewards/margins": 1.3072445392608643, + "rewards/rejected": -6.46709680557251, "step": 4660 }, { "epoch": 2.496738585047667, - "grad_norm": 24.084031431969617, + "grad_norm": 22.439882583061728, "learning_rate": 8.313303503222313e-08, - "logits/chosen": 0.8113842010498047, - "logits/rejected": 0.8640966415405273, - "logps/chosen": -9.65959644317627, - "logps/rejected": -10.88883113861084, - "loss": 0.4217, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.65959644317627, - "rewards/margins": 1.2292344570159912, - "rewards/rejected": -10.88883113861084, - "semantic_entropy": 0.0014709953684359789, + "logits/chosen": -0.10057331621646881, + "logits/rejected": -0.019338756799697876, + "logps/chosen": -4.862074375152588, + "logps/rejected": -6.097967624664307, + "loss": 0.4412, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.862074375152588, + "rewards/margins": 1.2358930110931396, + "rewards/rejected": -6.097967624664307, "step": 4665 }, { "epoch": 2.4994146178290686, - "grad_norm": 23.42979933597834, + "grad_norm": 28.388994202572086, "learning_rate": 8.227511322194164e-08, - "logits/chosen": 0.8243430256843567, - "logits/rejected": 0.8697830438613892, - "logps/chosen": -9.677289962768555, - "logps/rejected": -10.889700889587402, - "loss": 0.4429, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.677289962768555, - "rewards/margins": 1.212410807609558, - "rewards/rejected": -10.889700889587402, - "semantic_entropy": 0.001211336930282414, + "logits/chosen": -0.11687038838863373, + "logits/rejected": 0.02748199924826622, + "logps/chosen": -4.844364166259766, + "logps/rejected": -5.997697353363037, + "loss": 0.4391, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.844364166259766, + "rewards/margins": 1.1533329486846924, + "rewards/rejected": -5.997697353363037, "step": 4670 }, { "epoch": 2.50209065061047, - "grad_norm": 19.33238977643887, + "grad_norm": 26.917620286003707, "learning_rate": 8.142124418034385e-08, - "logits/chosen": 0.830100417137146, - "logits/rejected": 0.8942376971244812, - "logps/chosen": -9.691996574401855, - "logps/rejected": -11.031505584716797, - "loss": 0.4332, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.691996574401855, - "rewards/margins": 1.3395094871520996, - "rewards/rejected": -11.031505584716797, - "semantic_entropy": 0.0013690624618902802, + "logits/chosen": -0.0227427426725626, + "logits/rejected": 0.15813866257667542, + "logps/chosen": -4.876067161560059, + "logps/rejected": -6.08060884475708, + "loss": 0.4644, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.876067161560059, + "rewards/margins": 1.2045419216156006, + "rewards/rejected": -6.08060884475708, "step": 4675 }, { "epoch": 2.5047666833918716, - "grad_norm": 23.119082399990962, + "grad_norm": 22.762842210333183, "learning_rate": 8.057143619167073e-08, - "logits/chosen": 0.8294227719306946, - "logits/rejected": 0.8717254400253296, - "logps/chosen": -9.509129524230957, - "logps/rejected": -10.85628890991211, - "loss": 0.41, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.509129524230957, - "rewards/margins": 1.3471596240997314, - "rewards/rejected": -10.85628890991211, - "semantic_entropy": 0.0018529357621446252, + "logits/chosen": -0.028193265199661255, + "logits/rejected": 0.06258614361286163, + "logps/chosen": -4.616222381591797, + "logps/rejected": -5.919699192047119, + "loss": 0.4149, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.616222381591797, + "rewards/margins": 1.3034764528274536, + "rewards/rejected": -5.919699192047119, "step": 4680 }, { "epoch": 2.507442716173273, - "grad_norm": 18.56557158174028, + "grad_norm": 16.096278448308887, "learning_rate": 7.97256975007633e-08, - "logits/chosen": 0.795819878578186, - "logits/rejected": 0.88841712474823, - "logps/chosen": -9.543716430664062, - "logps/rejected": -10.928507804870605, - "loss": 0.3959, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.543716430664062, - "rewards/margins": 1.3847920894622803, - "rewards/rejected": -10.928507804870605, - "semantic_entropy": 0.0014773935545235872, + "logits/chosen": -0.11109156906604767, + "logits/rejected": 0.10410912334918976, + "logps/chosen": -4.95347785949707, + "logps/rejected": -6.317327499389648, + "loss": 0.383, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.95347785949707, + "rewards/margins": 1.3638501167297363, + "rewards/rejected": -6.317327499389648, "step": 4685 }, { "epoch": 2.5101187489546746, - "grad_norm": 26.78815904135646, + "grad_norm": 25.425832120094036, "learning_rate": 7.888403631298186e-08, - "logits/chosen": 0.7813885807991028, - "logits/rejected": 0.8535317182540894, - "logps/chosen": -9.660378456115723, - "logps/rejected": -10.954367637634277, - "loss": 0.4392, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.660378456115723, - "rewards/margins": 1.2939906120300293, - "rewards/rejected": -10.954367637634277, - "semantic_entropy": 0.0014350914862006903, + "logits/chosen": -0.04960218816995621, + "logits/rejected": 0.0151765625923872, + "logps/chosen": -4.832310676574707, + "logps/rejected": -6.093466758728027, + "loss": 0.4357, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.832310676574707, + "rewards/margins": 1.2611563205718994, + "rewards/rejected": -6.093466758728027, "step": 4690 }, { "epoch": 2.5127947817360763, - "grad_norm": 20.289843532833586, + "grad_norm": 18.001602709357115, "learning_rate": 7.804646079412719e-08, - "logits/chosen": 0.8242961168289185, - "logits/rejected": 0.9029603004455566, - "logps/chosen": -9.712553977966309, - "logps/rejected": -11.0477933883667, - "loss": 0.4122, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.712553977966309, - "rewards/margins": 1.335240125656128, - "rewards/rejected": -11.0477933883667, - "semantic_entropy": 0.0014915402280166745, + "logits/chosen": -0.05475863814353943, + "logits/rejected": 0.12167694419622421, + "logps/chosen": -5.103209495544434, + "logps/rejected": -6.43721866607666, + "loss": 0.4088, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.103209495544434, + "rewards/margins": 1.3340082168579102, + "rewards/rejected": -6.43721866607666, "step": 4695 }, { "epoch": 2.515470814517478, - "grad_norm": 21.47761754271104, + "grad_norm": 21.536757183699446, "learning_rate": 7.72129790703604e-08, - "logits/chosen": 0.787671685218811, - "logits/rejected": 0.8374601602554321, - "logps/chosen": -9.7283353805542, - "logps/rejected": -10.967700004577637, - "loss": 0.4222, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.7283353805542, - "rewards/margins": 1.2393652200698853, - "rewards/rejected": -10.967700004577637, - "semantic_entropy": 0.001365487463772297, + "logits/chosen": -0.1813846379518509, + "logits/rejected": -0.03618471696972847, + "logps/chosen": -4.8661346435546875, + "logps/rejected": -6.031615257263184, + "loss": 0.4378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.8661346435546875, + "rewards/margins": 1.165480613708496, + "rewards/rejected": -6.031615257263184, "step": 4700 }, { "epoch": 2.5181468472988793, - "grad_norm": 28.03895601653979, + "grad_norm": 23.10987094790759, "learning_rate": 7.638359922812504e-08, - "logits/chosen": 0.776307225227356, - "logits/rejected": 0.8199490308761597, - "logps/chosen": -9.557718276977539, - "logps/rejected": -10.877527236938477, - "loss": 0.4126, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.557718276977539, - "rewards/margins": 1.3198084831237793, - "rewards/rejected": -10.877527236938477, - "semantic_entropy": 0.0016608207952231169, + "logits/chosen": -0.071537546813488, + "logits/rejected": 0.007393138017505407, + "logps/chosen": -4.867361068725586, + "logps/rejected": -6.137284755706787, + "loss": 0.4288, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.867361068725586, + "rewards/margins": 1.2699235677719116, + "rewards/rejected": -6.137284755706787, "step": 4705 }, { "epoch": 2.520822880080281, - "grad_norm": 32.868150571212254, + "grad_norm": 25.911880336498683, "learning_rate": 7.555832931406774e-08, - "logits/chosen": 0.7730585336685181, - "logits/rejected": 0.8562465906143188, - "logps/chosen": -9.679393768310547, - "logps/rejected": -11.042935371398926, - "loss": 0.4244, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.679393768310547, - "rewards/margins": 1.3635411262512207, - "rewards/rejected": -11.042935371398926, - "semantic_entropy": 0.0014725655782967806, + "logits/chosen": -0.10369233787059784, + "logits/rejected": 0.10426206886768341, + "logps/chosen": -5.083745002746582, + "logps/rejected": -6.443483829498291, + "loss": 0.4051, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.083745002746582, + "rewards/margins": 1.3597382307052612, + "rewards/rejected": -6.443483829498291, "step": 4710 }, { "epoch": 2.5234989128616827, - "grad_norm": 18.781224798445212, + "grad_norm": 19.18034529162048, "learning_rate": 7.47371773349611e-08, - "logits/chosen": 0.8205526471138, - "logits/rejected": 0.8565570712089539, - "logps/chosen": -9.75233268737793, - "logps/rejected": -11.291008949279785, - "loss": 0.3533, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.75233268737793, - "rewards/margins": 1.5386755466461182, - "rewards/rejected": -11.291008949279785, - "semantic_entropy": 0.0013560467632487416, + "logits/chosen": -0.040231820195913315, + "logits/rejected": -0.006743092089891434, + "logps/chosen": -5.069214820861816, + "logps/rejected": -6.597991943359375, + "loss": 0.3348, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.069214820861816, + "rewards/margins": 1.5287768840789795, + "rewards/rejected": -6.597991943359375, "step": 4715 }, { "epoch": 2.526174945643084, - "grad_norm": 28.052854143232477, + "grad_norm": 24.774011067668457, "learning_rate": 7.392015125762496e-08, - "logits/chosen": 0.7241109609603882, - "logits/rejected": 0.8222753405570984, - "logps/chosen": -9.689494132995605, - "logps/rejected": -11.097482681274414, - "loss": 0.3869, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.689494132995605, - "rewards/margins": 1.4079889059066772, - "rewards/rejected": -11.097482681274414, - "semantic_entropy": 0.0011607788037508726, + "logits/chosen": -0.11954519897699356, + "logits/rejected": 0.03383365273475647, + "logps/chosen": -4.968479156494141, + "logps/rejected": -6.3763508796691895, + "loss": 0.3758, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.968479156494141, + "rewards/margins": 1.4078716039657593, + "rewards/rejected": -6.3763508796691895, "step": 4720 }, { "epoch": 2.5288509784244857, - "grad_norm": 25.150937293751188, + "grad_norm": 18.76336344842376, "learning_rate": 7.310725900885018e-08, - "logits/chosen": 0.7780320048332214, - "logits/rejected": 0.8369150161743164, - "logps/chosen": -9.633565902709961, - "logps/rejected": -10.885080337524414, - "loss": 0.4725, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.633565902709961, - "rewards/margins": 1.2515143156051636, - "rewards/rejected": -10.885080337524414, - "semantic_entropy": 0.001628419035114348, + "logits/chosen": -0.1333286315202713, + "logits/rejected": -0.06862220913171768, + "logps/chosen": -4.9732255935668945, + "logps/rejected": -6.212380886077881, + "loss": 0.4847, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.9732255935668945, + "rewards/margins": 1.2391555309295654, + "rewards/rejected": -6.212380886077881, "step": 4725 }, { "epoch": 2.5315270112058874, - "grad_norm": 22.549685697406634, + "grad_norm": 23.41252161745601, "learning_rate": 7.229850847532076e-08, - "logits/chosen": 0.8130934834480286, - "logits/rejected": 0.9138733744621277, - "logps/chosen": -9.572754859924316, - "logps/rejected": -11.118395805358887, - "loss": 0.3436, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.572754859924316, - "rewards/margins": 1.5456407070159912, - "rewards/rejected": -11.118395805358887, - "semantic_entropy": 0.0017241360619664192, + "logits/chosen": -0.0589381568133831, + "logits/rejected": 0.1041998490691185, + "logps/chosen": -4.850452423095703, + "logps/rejected": -6.337832450866699, + "loss": 0.3531, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.850452423095703, + "rewards/margins": 1.4873801469802856, + "rewards/rejected": -6.337832450866699, "step": 4730 }, { "epoch": 2.5342030439872887, - "grad_norm": 22.715970370218447, + "grad_norm": 20.763347682534455, "learning_rate": 7.149390750353779e-08, - "logits/chosen": 0.8037542104721069, - "logits/rejected": 0.8449680209159851, - "logps/chosen": -9.80845832824707, - "logps/rejected": -11.075779914855957, - "loss": 0.4049, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.80845832824707, - "rewards/margins": 1.2673219442367554, - "rewards/rejected": -11.075779914855957, - "semantic_entropy": 0.0013479054905474186, + "logits/chosen": -0.0004107370914425701, + "logits/rejected": -0.018195699900388718, + "logps/chosen": -5.360447406768799, + "logps/rejected": -6.654149532318115, + "loss": 0.3784, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.360447406768799, + "rewards/margins": 1.2937031984329224, + "rewards/rejected": -6.654149532318115, "step": 4735 }, { "epoch": 2.5368790767686904, - "grad_norm": 21.870424101570975, + "grad_norm": 15.515318743782364, "learning_rate": 7.069346389974374e-08, - "logits/chosen": 0.80865877866745, - "logits/rejected": 0.8610594868659973, - "logps/chosen": -9.720281600952148, - "logps/rejected": -11.03128433227539, - "loss": 0.3992, - "rewards/accuracies": 0.875, - "rewards/chosen": -9.720281600952148, - "rewards/margins": 1.3110027313232422, - "rewards/rejected": -11.03128433227539, - "semantic_entropy": 0.0014346633106470108, + "logits/chosen": -0.10219915211200714, + "logits/rejected": 0.0518370158970356, + "logps/chosen": -5.299516677856445, + "logps/rejected": -6.531324863433838, + "loss": 0.4294, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.299516677856445, + "rewards/margins": 1.2318084239959717, + "rewards/rejected": -6.531324863433838, "step": 4740 }, { "epoch": 2.539555109550092, - "grad_norm": 25.565083159355556, + "grad_norm": 22.182546677915816, "learning_rate": 6.989718542984563e-08, - "logits/chosen": 0.7875592708587646, - "logits/rejected": 0.8164467811584473, - "logps/chosen": -9.870200157165527, - "logps/rejected": -11.153292655944824, - "loss": 0.4386, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.870200157165527, - "rewards/margins": 1.2830924987792969, - "rewards/rejected": -11.153292655944824, - "semantic_entropy": 0.0011865177657455206, + "logits/chosen": -0.08616326749324799, + "logits/rejected": -0.032669879496097565, + "logps/chosen": -5.393087387084961, + "logps/rejected": -6.708924770355225, + "loss": 0.4315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.393087387084961, + "rewards/margins": 1.31583833694458, + "rewards/rejected": -6.708924770355225, "step": 4745 }, { "epoch": 2.5422311423314934, - "grad_norm": 23.70159485340149, + "grad_norm": 16.909205961777676, "learning_rate": 6.9105079819341e-08, - "logits/chosen": 0.7858568429946899, - "logits/rejected": 0.8397903442382812, - "logps/chosen": -9.606379508972168, - "logps/rejected": -11.173693656921387, - "loss": 0.3656, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.606379508972168, - "rewards/margins": 1.5673143863677979, - "rewards/rejected": -11.173693656921387, - "semantic_entropy": 0.0015011833747848868, + "logits/chosen": -0.03768426924943924, + "logits/rejected": 0.20672397315502167, + "logps/chosen": -5.092083930969238, + "logps/rejected": -6.636031150817871, + "loss": 0.3449, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.092083930969238, + "rewards/margins": 1.543947458267212, + "rewards/rejected": -6.636031150817871, "step": 4750 }, { "epoch": 2.544907175112895, - "grad_norm": 20.488177720347142, + "grad_norm": 23.031758109037636, "learning_rate": 6.831715475324163e-08, - "logits/chosen": 0.7883397936820984, - "logits/rejected": 0.8317297101020813, - "logps/chosen": -9.790312767028809, - "logps/rejected": -11.212163925170898, - "loss": 0.4019, + "logits/chosen": -0.12189313024282455, + "logits/rejected": 0.03989701718091965, + "logps/chosen": -5.158926963806152, + "logps/rejected": -6.661678314208984, + "loss": 0.387, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.790312767028809, - "rewards/margins": 1.4218522310256958, - "rewards/rejected": -11.212163925170898, - "semantic_entropy": 0.0015070982044562697, + "rewards/chosen": -5.158926963806152, + "rewards/margins": 1.5027509927749634, + "rewards/rejected": -6.661678314208984, "step": 4755 }, { "epoch": 2.547583207894297, - "grad_norm": 19.639448727845746, + "grad_norm": 19.357967284653586, "learning_rate": 6.753341787600026e-08, - "logits/chosen": 0.7966683506965637, - "logits/rejected": 0.8559118509292603, - "logps/chosen": -9.566210746765137, - "logps/rejected": -11.087980270385742, - "loss": 0.3618, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.566210746765137, - "rewards/margins": 1.5217713117599487, - "rewards/rejected": -11.087980270385742, - "semantic_entropy": 0.0015061668818816543, + "logits/chosen": -0.13845564424991608, + "logits/rejected": -0.0358344241976738, + "logps/chosen": -5.014981269836426, + "logps/rejected": -6.5170464515686035, + "loss": 0.3482, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.014981269836426, + "rewards/margins": 1.5020650625228882, + "rewards/rejected": -6.5170464515686035, "step": 4760 }, { "epoch": 2.5502592406756985, - "grad_norm": 23.858385047679327, + "grad_norm": 27.85769312488511, "learning_rate": 6.67538767914353e-08, - "logits/chosen": 0.791594922542572, - "logits/rejected": 0.8555682301521301, - "logps/chosen": -9.7157564163208, - "logps/rejected": -10.862947463989258, - "loss": 0.4575, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.7157564163208, - "rewards/margins": 1.1471917629241943, - "rewards/rejected": -10.862947463989258, - "semantic_entropy": 0.0014351477148011327, + "logits/chosen": -0.14569178223609924, + "logits/rejected": 0.020375341176986694, + "logps/chosen": -5.019095420837402, + "logps/rejected": -6.216348171234131, + "loss": 0.4405, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.019095420837402, + "rewards/margins": 1.1972529888153076, + "rewards/rejected": -6.216348171234131, "step": 4765 }, { "epoch": 2.5529352734571, - "grad_norm": 23.875748432774547, + "grad_norm": 24.822541187754556, "learning_rate": 6.597853906265793e-08, - "logits/chosen": 0.8073896169662476, - "logits/rejected": 0.850189208984375, - "logps/chosen": -9.71528434753418, - "logps/rejected": -11.275640487670898, - "loss": 0.3913, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.71528434753418, - "rewards/margins": 1.56035578250885, - "rewards/rejected": -11.275640487670898, - "semantic_entropy": 0.0013745089527219534, + "logits/chosen": -0.04982522130012512, + "logits/rejected": 0.09526404738426208, + "logps/chosen": -5.287869453430176, + "logps/rejected": -6.890192985534668, + "loss": 0.398, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.287869453430176, + "rewards/margins": 1.6023244857788086, + "rewards/rejected": -6.890192985534668, "step": 4770 }, { "epoch": 2.5556113062385015, - "grad_norm": 24.624000051473708, + "grad_norm": 24.5686157333911, "learning_rate": 6.5207412211998e-08, - "logits/chosen": 0.8650287389755249, - "logits/rejected": 0.9158161282539368, - "logps/chosen": -9.759730339050293, - "logps/rejected": -11.222911834716797, - "loss": 0.4193, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.759730339050293, - "rewards/margins": 1.463181495666504, - "rewards/rejected": -11.222911834716797, - "semantic_entropy": 0.0013576913625001907, + "logits/chosen": 0.04860395938158035, + "logits/rejected": 0.13133636116981506, + "logps/chosen": -5.331174373626709, + "logps/rejected": -6.700440883636475, + "loss": 0.4462, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.331174373626709, + "rewards/margins": 1.3692667484283447, + "rewards/rejected": -6.700440883636475, "step": 4775 }, { "epoch": 2.558287339019903, - "grad_norm": 18.880924959816383, + "grad_norm": 20.094745107962922, "learning_rate": 6.444050372093186e-08, - "logits/chosen": 0.753667950630188, - "logits/rejected": 0.8446556329727173, - "logps/chosen": -9.776894569396973, - "logps/rejected": -11.069523811340332, - "loss": 0.4017, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.776894569396973, - "rewards/margins": 1.292628526687622, - "rewards/rejected": -11.069523811340332, - "semantic_entropy": 0.0013450583210214972, + "logits/chosen": -0.11245042085647583, + "logits/rejected": 0.017978152260184288, + "logps/chosen": -5.162657260894775, + "logps/rejected": -6.4206132888793945, + "loss": 0.4097, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.162657260894775, + "rewards/margins": 1.257956862449646, + "rewards/rejected": -6.4206132888793945, "step": 4780 }, { "epoch": 2.5609633718013045, - "grad_norm": 26.78240030796786, + "grad_norm": 23.71353269707678, "learning_rate": 6.367782103000873e-08, - "logits/chosen": 0.8099533319473267, - "logits/rejected": 0.8473021388053894, - "logps/chosen": -9.640462875366211, - "logps/rejected": -10.797101974487305, - "loss": 0.4493, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.640462875366211, - "rewards/margins": 1.1566379070281982, - "rewards/rejected": -10.797101974487305, - "semantic_entropy": 0.0015895968535915017, + "logits/chosen": -0.07516603171825409, + "logits/rejected": -0.0186539888381958, + "logps/chosen": -5.131236553192139, + "logps/rejected": -6.140219688415527, + "loss": 0.4833, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.131236553192139, + "rewards/margins": 1.0089826583862305, + "rewards/rejected": -6.140219688415527, "step": 4785 }, { "epoch": 2.5636394045827062, - "grad_norm": 27.727951802633132, + "grad_norm": 21.53689958684339, "learning_rate": 6.29193715387798e-08, - "logits/chosen": 0.7754964828491211, - "logits/rejected": 0.8192489743232727, - "logps/chosen": -9.690164566040039, - "logps/rejected": -11.208440780639648, - "loss": 0.4116, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.690164566040039, - "rewards/margins": 1.5182764530181885, - "rewards/rejected": -11.208440780639648, - "semantic_entropy": 0.0017889321316033602, + "logits/chosen": -0.0936863049864769, + "logits/rejected": 0.028363818302750587, + "logps/chosen": -5.266635894775391, + "logps/rejected": -6.6888108253479, + "loss": 0.4145, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.266635894775391, + "rewards/margins": 1.4221746921539307, + "rewards/rejected": -6.6888108253479, "step": 4790 }, { "epoch": 2.566315437364108, - "grad_norm": 28.149971668259315, + "grad_norm": 27.00294717844681, "learning_rate": 6.216516260572502e-08, - "logits/chosen": 0.7809394001960754, - "logits/rejected": 0.8340757489204407, - "logps/chosen": -9.840954780578613, - "logps/rejected": -11.24518871307373, - "loss": 0.3951, + "logits/chosen": -0.03358614072203636, + "logits/rejected": 0.08967655897140503, + "logps/chosen": -5.30711555480957, + "logps/rejected": -6.664097785949707, + "loss": 0.4132, "rewards/accuracies": 0.8125, - "rewards/chosen": -9.840954780578613, - "rewards/margins": 1.4042353630065918, - "rewards/rejected": -11.24518871307373, - "semantic_entropy": 0.0015198871260508895, + "rewards/chosen": -5.30711555480957, + "rewards/margins": 1.3569823503494263, + "rewards/rejected": -6.664097785949707, "step": 4795 }, { "epoch": 2.568991470145509, - "grad_norm": 17.6357809314226, + "grad_norm": 17.918025089354956, "learning_rate": 6.141520154818297e-08, - "logits/chosen": 0.8027510643005371, - "logits/rejected": 0.8235558271408081, - "logps/chosen": -9.668852806091309, - "logps/rejected": -10.926243782043457, - "loss": 0.4394, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.668852806091309, - "rewards/margins": 1.257389783859253, - "rewards/rejected": -10.926243782043457, - "semantic_entropy": 0.001651085214689374, + "logits/chosen": -0.07204408943653107, + "logits/rejected": 0.028885483741760254, + "logps/chosen": -4.950024127960205, + "logps/rejected": -6.128761291503906, + "loss": 0.4481, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.950024127960205, + "rewards/margins": 1.1787374019622803, + "rewards/rejected": -6.128761291503906, "step": 4800 }, { "epoch": 2.568991470145509, - "eval_logits/chosen": 0.8550450205802917, - "eval_logits/rejected": 0.8955670595169067, - "eval_logps/chosen": -9.817285537719727, - "eval_logps/rejected": -10.942052841186523, - "eval_loss": 0.5237716436386108, - "eval_rewards/accuracies": 0.7255192995071411, - "eval_rewards/chosen": -9.817285537719727, - "eval_rewards/margins": 1.1247663497924805, - "eval_rewards/rejected": -10.942052841186523, - "eval_runtime": 35.1465, - "eval_samples_per_second": 38.268, - "eval_semantic_entropy": 0.0013976304326206446, - "eval_steps_per_second": 9.588, + "eval_logits/chosen": 0.31069934368133545, + "eval_logits/rejected": 0.4138917624950409, + "eval_logps/chosen": -5.2343430519104, + "eval_logps/rejected": -6.352870941162109, + "eval_loss": 0.5253465175628662, + "eval_rewards/accuracies": 0.7307121753692627, + "eval_rewards/chosen": -5.2343430519104, + "eval_rewards/margins": 1.1185271739959717, + "eval_rewards/rejected": -6.352870941162109, + "eval_runtime": 40.3666, + "eval_samples_per_second": 33.32, + "eval_steps_per_second": 8.348, "step": 4800 }, { "epoch": 2.571667502926911, - "grad_norm": 25.642642407075105, + "grad_norm": 25.582360591676196, "learning_rate": 6.066949564227897e-08, - "logits/chosen": 0.7796936631202698, - "logits/rejected": 0.817895233631134, - "logps/chosen": -9.595781326293945, - "logps/rejected": -10.862689018249512, - "loss": 0.4646, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -9.595781326293945, - "rewards/margins": 1.2669070959091187, - "rewards/rejected": -10.862689018249512, - "semantic_entropy": 0.0013227377785369754, + "logits/chosen": -0.10616904497146606, + "logits/rejected": 0.010032358579337597, + "logps/chosen": -5.034721851348877, + "logps/rejected": -6.360010147094727, + "loss": 0.4444, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.034721851348877, + "rewards/margins": 1.3252880573272705, + "rewards/rejected": -6.360010147094727, "step": 4805 }, { "epoch": 2.574343535708312, - "grad_norm": 20.93852604421033, + "grad_norm": 21.964114337463396, "learning_rate": 5.992805212285523e-08, - "logits/chosen": 0.777945339679718, - "logits/rejected": 0.7996780872344971, - "logps/chosen": -9.620034217834473, - "logps/rejected": -11.079931259155273, - "loss": 0.3832, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.620034217834473, - "rewards/margins": 1.4598976373672485, - "rewards/rejected": -11.079931259155273, - "semantic_entropy": 0.0017585292225703597, + "logits/chosen": -0.06536881625652313, + "logits/rejected": 0.03300068527460098, + "logps/chosen": -5.160165309906006, + "logps/rejected": -6.528327941894531, + "loss": 0.4099, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.160165309906006, + "rewards/margins": 1.3681637048721313, + "rewards/rejected": -6.528327941894531, "step": 4810 }, { "epoch": 2.577019568489714, - "grad_norm": 24.39765980999833, + "grad_norm": 28.188885092880874, "learning_rate": 5.9190878183399684e-08, - "logits/chosen": 0.8418199419975281, - "logits/rejected": 0.8500019311904907, - "logps/chosen": -9.530296325683594, - "logps/rejected": -10.991520881652832, - "loss": 0.4437, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.530296325683594, - "rewards/margins": 1.4612245559692383, - "rewards/rejected": -10.991520881652832, - "semantic_entropy": 0.0018125723581761122, + "logits/chosen": -0.07806040346622467, + "logits/rejected": 0.05833492428064346, + "logps/chosen": -4.769211769104004, + "logps/rejected": -6.24694299697876, + "loss": 0.4576, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.769211769104004, + "rewards/margins": 1.4777309894561768, + "rewards/rejected": -6.24694299697876, "step": 4815 }, { "epoch": 2.5796956012711156, - "grad_norm": 31.63637936690902, + "grad_norm": 21.614900921628273, "learning_rate": 5.845798097597748e-08, - "logits/chosen": 0.8116466403007507, - "logits/rejected": 0.8796448707580566, - "logps/chosen": -9.765870094299316, - "logps/rejected": -10.956972122192383, - "loss": 0.4451, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.765870094299316, - "rewards/margins": 1.191102385520935, - "rewards/rejected": -10.956972122192383, - "semantic_entropy": 0.0012761508114635944, + "logits/chosen": -0.04254940152168274, + "logits/rejected": 0.06183934211730957, + "logps/chosen": -5.051640510559082, + "logps/rejected": -6.1899003982543945, + "loss": 0.4471, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.051640510559082, + "rewards/margins": 1.1382604837417603, + "rewards/rejected": -6.1899003982543945, "step": 4820 }, { "epoch": 2.5823716340525174, - "grad_norm": 27.871272030425462, + "grad_norm": 25.077592197006048, "learning_rate": 5.772936761116026e-08, - "logits/chosen": 0.8403164148330688, - "logits/rejected": 0.900818943977356, - "logps/chosen": -9.660100936889648, - "logps/rejected": -10.994760513305664, - "loss": 0.4122, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.660100936889648, - "rewards/margins": 1.3346589803695679, - "rewards/rejected": -10.994760513305664, - "semantic_entropy": 0.001608129939995706, + "logits/chosen": -0.02731374464929104, + "logits/rejected": 0.13152813911437988, + "logps/chosen": -5.111601829528809, + "logps/rejected": -6.397869110107422, + "loss": 0.408, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.111601829528809, + "rewards/margins": 1.286267638206482, + "rewards/rejected": -6.397869110107422, "step": 4825 }, { "epoch": 2.5850476668339186, - "grad_norm": 28.248867607037017, + "grad_norm": 26.599660648109378, "learning_rate": 5.700504515795829e-08, - "logits/chosen": 0.8395519256591797, - "logits/rejected": 0.8937468528747559, - "logps/chosen": -9.703470230102539, - "logps/rejected": -11.014467239379883, - "loss": 0.413, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.703470230102539, - "rewards/margins": 1.3109973669052124, - "rewards/rejected": -11.014467239379883, - "semantic_entropy": 0.0014707682421430945, + "logits/chosen": -0.07527033984661102, + "logits/rejected": 0.09612502157688141, + "logps/chosen": -5.268985748291016, + "logps/rejected": -6.496352195739746, + "loss": 0.4297, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.268985748291016, + "rewards/margins": 1.2273662090301514, + "rewards/rejected": -6.496352195739746, "step": 4830 }, { "epoch": 2.5877236996153203, - "grad_norm": 25.030373000459893, + "grad_norm": 23.148150851722196, "learning_rate": 5.628502064375101e-08, - "logits/chosen": 0.7156926393508911, - "logits/rejected": 0.783743679523468, - "logps/chosen": -9.621664047241211, - "logps/rejected": -11.060919761657715, - "loss": 0.351, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -9.621664047241211, - "rewards/margins": 1.439256191253662, - "rewards/rejected": -11.060919761657715, - "semantic_entropy": 0.001427180483005941, + "logits/chosen": -0.19011929631233215, + "logits/rejected": 0.0017858206992968917, + "logps/chosen": -4.819035530090332, + "logps/rejected": -6.275510311126709, + "loss": 0.3503, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.819035530090332, + "rewards/margins": 1.456475019454956, + "rewards/rejected": -6.275510311126709, "step": 4835 }, { "epoch": 2.5903997323967216, - "grad_norm": 26.228171272523518, + "grad_norm": 19.299991505095925, "learning_rate": 5.55693010542197e-08, - "logits/chosen": 0.7665778398513794, - "logits/rejected": 0.8383312225341797, - "logps/chosen": -9.543096542358398, - "logps/rejected": -11.02253532409668, - "loss": 0.3674, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.543096542358398, - "rewards/margins": 1.4794379472732544, - "rewards/rejected": -11.02253532409668, - "semantic_entropy": 0.001682286150753498, + "logits/chosen": -0.1539369374513626, + "logits/rejected": 0.0787445679306984, + "logps/chosen": -4.938119888305664, + "logps/rejected": -6.429734706878662, + "loss": 0.3546, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.938119888305664, + "rewards/margins": 1.4916150569915771, + "rewards/rejected": -6.429734706878662, "step": 4840 }, { "epoch": 2.5930757651781233, - "grad_norm": 28.09304993416799, + "grad_norm": 20.4756837943824, "learning_rate": 5.485789333327856e-08, - "logits/chosen": 0.7801726460456848, - "logits/rejected": 0.8014837503433228, - "logps/chosen": -9.684420585632324, - "logps/rejected": -10.936495780944824, - "loss": 0.4075, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.684420585632324, - "rewards/margins": 1.2520757913589478, - "rewards/rejected": -10.936495780944824, - "semantic_entropy": 0.0015050426591187716, + "logits/chosen": -0.07487092912197113, + "logits/rejected": -0.02441031113266945, + "logps/chosen": -4.971838474273682, + "logps/rejected": -6.226641654968262, + "loss": 0.4232, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.971838474273682, + "rewards/margins": 1.254804015159607, + "rewards/rejected": -6.226641654968262, "step": 4845 }, { "epoch": 2.595751797959525, - "grad_norm": 23.215944833323043, + "grad_norm": 24.5902199403009, "learning_rate": 5.4150804383008675e-08, - "logits/chosen": 0.7506409883499146, - "logits/rejected": 0.8083317875862122, - "logps/chosen": -9.687183380126953, - "logps/rejected": -11.075380325317383, - "loss": 0.4257, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.687183380126953, - "rewards/margins": 1.3881968259811401, - "rewards/rejected": -11.075380325317383, - "semantic_entropy": 0.0012998328311368823, + "logits/chosen": -0.1605321615934372, + "logits/rejected": 0.006235034205019474, + "logps/chosen": -5.253085136413574, + "logps/rejected": -6.608834266662598, + "loss": 0.43, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.253085136413574, + "rewards/margins": 1.3557493686676025, + "rewards/rejected": -6.608834266662598, "step": 4850 }, { "epoch": 2.5984278307409268, - "grad_norm": 26.1051895667939, + "grad_norm": 29.26717533808806, "learning_rate": 5.344804106359002e-08, - "logits/chosen": 0.8464560508728027, - "logits/rejected": 0.884229302406311, - "logps/chosen": -9.554250717163086, - "logps/rejected": -10.965102195739746, - "loss": 0.3927, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.554250717163086, - "rewards/margins": 1.410851240158081, - "rewards/rejected": -10.965102195739746, - "semantic_entropy": 0.0016194203635677695, + "logits/chosen": -0.053511105477809906, + "logits/rejected": 0.11099044233560562, + "logps/chosen": -4.774605751037598, + "logps/rejected": -6.160122871398926, + "loss": 0.4134, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.774605751037598, + "rewards/margins": 1.3855173587799072, + "rewards/rejected": -6.160122871398926, "step": 4855 }, { "epoch": 2.601103863522328, - "grad_norm": 29.62614133583001, + "grad_norm": 23.142917357697943, "learning_rate": 5.274961019323559e-08, - "logits/chosen": 0.7584127187728882, - "logits/rejected": 0.7833465337753296, - "logps/chosen": -9.556783676147461, - "logps/rejected": -10.777464866638184, - "loss": 0.4411, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.556783676147461, - "rewards/margins": 1.2206814289093018, - "rewards/rejected": -10.777464866638184, - "semantic_entropy": 0.002015589503571391, + "logits/chosen": -0.12091958522796631, + "logits/rejected": -0.02233690768480301, + "logps/chosen": -4.776228904724121, + "logps/rejected": -6.029642581939697, + "loss": 0.4087, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.776228904724121, + "rewards/margins": 1.2534143924713135, + "rewards/rejected": -6.029642581939697, "step": 4860 }, { "epoch": 2.6037798963037297, - "grad_norm": 19.381379819748208, + "grad_norm": 15.3426480479517, "learning_rate": 5.205551854812451e-08, - "logits/chosen": 0.8153206706047058, - "logits/rejected": 0.8267370462417603, - "logps/chosen": -9.816014289855957, - "logps/rejected": -11.20833683013916, - "loss": 0.4169, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.816014289855957, - "rewards/margins": 1.3923231363296509, - "rewards/rejected": -11.20833683013916, - "semantic_entropy": 0.001474303426221013, + "logits/chosen": -0.11927191913127899, + "logits/rejected": -0.03344221040606499, + "logps/chosen": -5.345898628234863, + "logps/rejected": -6.7047929763793945, + "loss": 0.3857, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.345898628234863, + "rewards/margins": 1.3588941097259521, + "rewards/rejected": -6.7047929763793945, "step": 4865 }, { "epoch": 2.606455929085131, - "grad_norm": 16.778112141427197, + "grad_norm": 20.685281655642115, "learning_rate": 5.1365772862337177e-08, - "logits/chosen": 0.8045045137405396, - "logits/rejected": 0.8898431658744812, - "logps/chosen": -9.492764472961426, - "logps/rejected": -11.15455436706543, - "loss": 0.3243, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.492764472961426, - "rewards/margins": 1.6617908477783203, - "rewards/rejected": -11.15455436706543, - "semantic_entropy": 0.001532680937089026, + "logits/chosen": -0.0561569444835186, + "logits/rejected": 0.078944131731987, + "logps/chosen": -4.844910621643066, + "logps/rejected": -6.501934051513672, + "loss": 0.3263, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.844910621643066, + "rewards/margins": 1.6570236682891846, + "rewards/rejected": -6.501934051513672, "step": 4870 }, { "epoch": 2.6091319618665327, - "grad_norm": 22.352970563490228, + "grad_norm": 25.312086525373953, "learning_rate": 5.068037982778905e-08, - "logits/chosen": 0.8180361986160278, - "logits/rejected": 0.8790004849433899, - "logps/chosen": -9.450529098510742, - "logps/rejected": -10.990362167358398, - "loss": 0.3905, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.450529098510742, - "rewards/margins": 1.539833426475525, - "rewards/rejected": -10.990362167358398, - "semantic_entropy": 0.0015029583591967821, + "logits/chosen": -0.07162773609161377, + "logits/rejected": 0.027165304869413376, + "logps/chosen": -4.627474784851074, + "logps/rejected": -5.9752678871154785, + "loss": 0.4425, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.627474784851074, + "rewards/margins": 1.3477920293807983, + "rewards/rejected": -5.9752678871154785, "step": 4875 }, { "epoch": 2.6118079946479344, - "grad_norm": 20.68826893781038, + "grad_norm": 15.398119510105793, "learning_rate": 4.999934609416656e-08, - "logits/chosen": 0.9028242826461792, - "logits/rejected": 0.9264825582504272, - "logps/chosen": -9.678912162780762, - "logps/rejected": -11.107706069946289, - "loss": 0.3986, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.678912162780762, - "rewards/margins": 1.4287939071655273, - "rewards/rejected": -11.107706069946289, - "semantic_entropy": 0.0014561197021976113, + "logits/chosen": -0.007556693162769079, + "logits/rejected": 0.10744617879390717, + "logps/chosen": -4.965868949890137, + "logps/rejected": -6.536759376525879, + "loss": 0.3732, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.965868949890137, + "rewards/margins": 1.570890188217163, + "rewards/rejected": -6.536759376525879, "step": 4880 }, { "epoch": 2.614484027429336, - "grad_norm": 23.193032984362397, + "grad_norm": 20.76260316176352, "learning_rate": 4.932267826886183e-08, - "logits/chosen": 0.8201519846916199, - "logits/rejected": 0.8877601623535156, - "logps/chosen": -9.788244247436523, - "logps/rejected": -11.236588478088379, - "loss": 0.3943, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.788244247436523, - "rewards/margins": 1.4483439922332764, - "rewards/rejected": -11.236588478088379, - "semantic_entropy": 0.0011878965888172388, + "logits/chosen": -0.03661930933594704, + "logits/rejected": 0.031339287757873535, + "logps/chosen": -5.167394638061523, + "logps/rejected": -6.604741096496582, + "loss": 0.3942, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.167394638061523, + "rewards/margins": 1.4373458623886108, + "rewards/rejected": -6.604741096496582, "step": 4885 }, { "epoch": 2.6171600602107374, - "grad_norm": 22.557556114089028, + "grad_norm": 23.823399493853124, "learning_rate": 4.8650382916909206e-08, - "logits/chosen": 0.7988881468772888, - "logits/rejected": 0.8320202827453613, - "logps/chosen": -9.691407203674316, - "logps/rejected": -11.068517684936523, - "loss": 0.4254, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.691407203674316, - "rewards/margins": 1.3771107196807861, - "rewards/rejected": -11.068517684936523, - "semantic_entropy": 0.001317240297794342, + "logits/chosen": -0.14548127353191376, + "logits/rejected": 0.0224468931555748, + "logps/chosen": -5.202889442443848, + "logps/rejected": -6.5568647384643555, + "loss": 0.4391, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.202889442443848, + "rewards/margins": 1.353975772857666, + "rewards/rejected": -6.5568647384643555, "step": 4890 }, { "epoch": 2.619836092992139, - "grad_norm": 19.898200763361565, + "grad_norm": 16.740156235771764, "learning_rate": 4.7982466560920976e-08, - "logits/chosen": 0.7807987928390503, - "logits/rejected": 0.8615278005599976, - "logps/chosen": -9.802709579467773, - "logps/rejected": -11.011950492858887, - "loss": 0.4279, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.802709579467773, - "rewards/margins": 1.2092421054840088, - "rewards/rejected": -11.011950492858887, - "semantic_entropy": 0.0015492916572839022, + "logits/chosen": -0.0674758031964302, + "logits/rejected": 0.03641900047659874, + "logps/chosen": -5.1634345054626465, + "logps/rejected": -6.365001678466797, + "loss": 0.4512, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.1634345054626465, + "rewards/margins": 1.201567530632019, + "rewards/rejected": -6.365001678466797, "step": 4895 }, { "epoch": 2.622512125773541, - "grad_norm": 23.69089776610765, + "grad_norm": 23.253098932599745, "learning_rate": 4.7318935681024685e-08, - "logits/chosen": 0.7918484807014465, - "logits/rejected": 0.8997529149055481, - "logps/chosen": -9.777336120605469, - "logps/rejected": -11.20526123046875, - "loss": 0.374, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.777336120605469, - "rewards/margins": 1.4279241561889648, - "rewards/rejected": -11.20526123046875, - "semantic_entropy": 0.0012134136632084846, + "logits/chosen": -0.04896990582346916, + "logits/rejected": 0.12760671973228455, + "logps/chosen": -5.083669185638428, + "logps/rejected": -6.481484889984131, + "loss": 0.3754, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.083669185638428, + "rewards/margins": 1.3978158235549927, + "rewards/rejected": -6.481484889984131, "step": 4900 }, { "epoch": 2.625188158554942, - "grad_norm": 20.406979801252664, + "grad_norm": 18.929588351942524, "learning_rate": 4.6659796714799745e-08, - "logits/chosen": 0.795091986656189, - "logits/rejected": 0.8641031384468079, - "logps/chosen": -9.73538589477539, - "logps/rejected": -11.23878288269043, - "loss": 0.3381, - "rewards/accuracies": 0.875, - "rewards/chosen": -9.73538589477539, - "rewards/margins": 1.5033972263336182, - "rewards/rejected": -11.23878288269043, - "semantic_entropy": 0.0015479883877560496, + "logits/chosen": -0.04449799656867981, + "logits/rejected": 0.1239284873008728, + "logps/chosen": -5.0774641036987305, + "logps/rejected": -6.590850830078125, + "loss": 0.3578, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.0774641036987305, + "rewards/margins": 1.5133864879608154, + "rewards/rejected": -6.590850830078125, "step": 4905 }, { "epoch": 2.627864191336344, - "grad_norm": 19.21550189332208, + "grad_norm": 19.410461595111393, "learning_rate": 4.60050560572155e-08, - "logits/chosen": 0.7698862552642822, - "logits/rejected": 0.8082722425460815, - "logps/chosen": -9.636996269226074, - "logps/rejected": -11.186447143554688, - "loss": 0.3999, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.636996269226074, - "rewards/margins": 1.5494511127471924, - "rewards/rejected": -11.186447143554688, - "semantic_entropy": 0.0014376682229340076, + "logits/chosen": -0.08050365746021271, + "logits/rejected": -0.08981191366910934, + "logps/chosen": -4.894737243652344, + "logps/rejected": -6.472301483154297, + "loss": 0.3968, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.894737243652344, + "rewards/margins": 1.5775634050369263, + "rewards/rejected": -6.472301483154297, "step": 4910 }, { "epoch": 2.6305402241177456, - "grad_norm": 23.312400032128476, + "grad_norm": 26.14985977185623, "learning_rate": 4.535472006056834e-08, - "logits/chosen": 0.8084294199943542, - "logits/rejected": 0.8658881187438965, - "logps/chosen": -9.69934368133545, - "logps/rejected": -10.961128234863281, - "loss": 0.4383, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.69934368133545, - "rewards/margins": 1.261784553527832, - "rewards/rejected": -10.961128234863281, - "semantic_entropy": 0.0016576785128563643, + "logits/chosen": -0.08381062746047974, + "logits/rejected": 0.06533221155405045, + "logps/chosen": -5.02228307723999, + "logps/rejected": -6.292694568634033, + "loss": 0.4344, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.02228307723999, + "rewards/margins": 1.2704112529754639, + "rewards/rejected": -6.292694568634033, "step": 4915 }, { "epoch": 2.6332162568991473, - "grad_norm": 22.36582731335697, + "grad_norm": 24.822452649885356, "learning_rate": 4.470879503442132e-08, - "logits/chosen": 0.8091555833816528, - "logits/rejected": 0.8557069897651672, - "logps/chosen": -9.803995132446289, - "logps/rejected": -11.197736740112305, - "loss": 0.3964, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.803995132446289, - "rewards/margins": 1.3937435150146484, - "rewards/rejected": -11.197736740112305, - "semantic_entropy": 0.0012877520639449358, + "logits/chosen": -0.059259675443172455, + "logits/rejected": 0.06800401210784912, + "logps/chosen": -5.201934337615967, + "logps/rejected": -6.620414733886719, + "loss": 0.3903, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.201934337615967, + "rewards/margins": 1.4184800386428833, + "rewards/rejected": -6.620414733886719, "step": 4920 }, { "epoch": 2.6358922896805486, - "grad_norm": 18.952340559608157, + "grad_norm": 16.312371921811007, "learning_rate": 4.406728724554154e-08, - "logits/chosen": 0.7486631870269775, - "logits/rejected": 0.8488380312919617, - "logps/chosen": -9.719161987304688, - "logps/rejected": -11.249533653259277, - "loss": 0.3694, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.719161987304688, - "rewards/margins": 1.5303723812103271, - "rewards/rejected": -11.249533653259277, - "semantic_entropy": 0.001469378243200481, + "logits/chosen": -0.17114049196243286, + "logits/rejected": 0.09722797572612762, + "logps/chosen": -5.185960292816162, + "logps/rejected": -6.6258416175842285, + "loss": 0.406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.185960292816162, + "rewards/margins": 1.4398808479309082, + "rewards/rejected": -6.6258416175842285, "step": 4925 }, { "epoch": 2.6385683224619503, - "grad_norm": 18.013629249072153, + "grad_norm": 19.676878400999488, "learning_rate": 4.3430202917840664e-08, - "logits/chosen": 0.8312109708786011, - "logits/rejected": 0.9005948901176453, - "logps/chosen": -9.830782890319824, - "logps/rejected": -11.317447662353516, - "loss": 0.3873, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.830782890319824, - "rewards/margins": 1.4866645336151123, - "rewards/rejected": -11.317447662353516, - "semantic_entropy": 0.0013012022245675325, + "logits/chosen": -0.10735473781824112, + "logits/rejected": 0.08928410708904266, + "logps/chosen": -4.998087406158447, + "logps/rejected": -6.5301833152771, + "loss": 0.4004, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.998087406158447, + "rewards/margins": 1.5320956707000732, + "rewards/rejected": -6.5301833152771, "step": 4930 }, { "epoch": 2.6412443552433515, - "grad_norm": 26.68369648236472, + "grad_norm": 30.462434133149227, "learning_rate": 4.279754823231346e-08, - "logits/chosen": 0.8236324191093445, - "logits/rejected": 0.898714542388916, - "logps/chosen": -9.688699722290039, - "logps/rejected": -11.057371139526367, - "loss": 0.4296, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.688699722290039, - "rewards/margins": 1.368671178817749, - "rewards/rejected": -11.057371139526367, - "semantic_entropy": 0.0014394777826964855, + "logits/chosen": -0.1339132934808731, + "logits/rejected": 0.06191990524530411, + "logps/chosen": -5.039044380187988, + "logps/rejected": -6.337520599365234, + "loss": 0.4408, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.039044380187988, + "rewards/margins": 1.2984764575958252, + "rewards/rejected": -6.337520599365234, "step": 4935 }, { "epoch": 2.6439203880247533, - "grad_norm": 19.214329700454428, + "grad_norm": 18.173935527897648, "learning_rate": 4.216932932697859e-08, - "logits/chosen": 0.7843598127365112, - "logits/rejected": 0.8269468545913696, - "logps/chosen": -9.618779182434082, - "logps/rejected": -10.83703899383545, - "loss": 0.4075, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.618779182434082, - "rewards/margins": 1.2182590961456299, - "rewards/rejected": -10.83703899383545, - "semantic_entropy": 0.0018105891067534685, + "logits/chosen": -0.12725499272346497, + "logits/rejected": -0.016351843252778053, + "logps/chosen": -5.107796669006348, + "logps/rejected": -6.196713447570801, + "loss": 0.4468, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.107796669006348, + "rewards/margins": 1.088916540145874, + "rewards/rejected": -6.196713447570801, "step": 4940 }, { "epoch": 2.646596420806155, - "grad_norm": 25.567099589733154, + "grad_norm": 28.48334002844591, "learning_rate": 4.154555229681844e-08, - "logits/chosen": 0.771405041217804, - "logits/rejected": 0.867265522480011, - "logps/chosen": -9.69874382019043, - "logps/rejected": -11.158080101013184, - "loss": 0.3747, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.69874382019043, - "rewards/margins": 1.4593359231948853, - "rewards/rejected": -11.158080101013184, - "semantic_entropy": 0.0013225203147158027, + "logits/chosen": -0.09741954505443573, + "logits/rejected": 0.09206944704055786, + "logps/chosen": -5.100863456726074, + "logps/rejected": -6.524051666259766, + "loss": 0.3682, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.100863456726074, + "rewards/margins": 1.4231879711151123, + "rewards/rejected": -6.524051666259766, "step": 4945 }, { "epoch": 2.6492724535875567, - "grad_norm": 21.387358904048636, + "grad_norm": 23.027335415619895, "learning_rate": 4.092622319372069e-08, - "logits/chosen": 0.8330507278442383, - "logits/rejected": 0.8846317529678345, - "logps/chosen": -9.71510124206543, - "logps/rejected": -11.14268684387207, - "loss": 0.3984, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.71510124206543, - "rewards/margins": 1.4275856018066406, - "rewards/rejected": -11.14268684387207, - "semantic_entropy": 0.0013751887017861009, + "logits/chosen": -0.06516359746456146, + "logits/rejected": 0.10412494093179703, + "logps/chosen": -5.040643215179443, + "logps/rejected": -6.3509135246276855, + "loss": 0.4433, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.040643215179443, + "rewards/margins": 1.3102710247039795, + "rewards/rejected": -6.3509135246276855, "step": 4950 }, { "epoch": 2.651948486368958, - "grad_norm": 23.151467653095995, + "grad_norm": 24.349182521971393, "learning_rate": 4.031134802641889e-08, - "logits/chosen": 0.8044828176498413, - "logits/rejected": 0.8498908877372742, - "logps/chosen": -9.883355140686035, - "logps/rejected": -11.208585739135742, - "loss": 0.4105, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.883355140686035, - "rewards/margins": 1.325231909751892, - "rewards/rejected": -11.208585739135742, - "semantic_entropy": 0.0015358685050159693, + "logits/chosen": -0.1144251674413681, + "logits/rejected": -0.05235855653882027, + "logps/chosen": -4.980933666229248, + "logps/rejected": -6.194422245025635, + "loss": 0.4076, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.980933666229248, + "rewards/margins": 1.2134876251220703, + "rewards/rejected": -6.194422245025635, "step": 4955 }, { "epoch": 2.6546245191503597, - "grad_norm": 21.04274720664362, + "grad_norm": 20.643833931903057, "learning_rate": 3.970093276043468e-08, - "logits/chosen": 0.8240159749984741, - "logits/rejected": 0.9003788828849792, - "logps/chosen": -9.615509986877441, - "logps/rejected": -11.031620025634766, - "loss": 0.3856, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.615509986877441, - "rewards/margins": 1.4161105155944824, - "rewards/rejected": -11.031620025634766, - "semantic_entropy": 0.0017490362515673041, + "logits/chosen": -0.05010553449392319, + "logits/rejected": 0.0602533333003521, + "logps/chosen": -4.936505317687988, + "logps/rejected": -6.3599419593811035, + "loss": 0.383, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.936505317687988, + "rewards/margins": 1.4234373569488525, + "rewards/rejected": -6.3599419593811035, "step": 4960 }, { "epoch": 2.657300551931761, - "grad_norm": 39.33191380791592, + "grad_norm": 28.280288045907902, "learning_rate": 3.9094983318019584e-08, - "logits/chosen": 0.7968525290489197, - "logits/rejected": 0.8308472633361816, - "logps/chosen": -9.674779891967773, - "logps/rejected": -11.145748138427734, - "loss": 0.371, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.674779891967773, - "rewards/margins": 1.4709681272506714, - "rewards/rejected": -11.145748138427734, - "semantic_entropy": 0.0014223111793398857, + "logits/chosen": -0.12539887428283691, + "logits/rejected": 0.004809826612472534, + "logps/chosen": -5.122442722320557, + "logps/rejected": -6.548783779144287, + "loss": 0.3888, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.122442722320557, + "rewards/margins": 1.4263410568237305, + "rewards/rejected": -6.548783779144287, "step": 4965 }, { "epoch": 2.6599765847131627, - "grad_norm": 20.85099205117232, + "grad_norm": 22.67187446863551, "learning_rate": 3.849350557809789e-08, - "logits/chosen": 0.8421157002449036, - "logits/rejected": 0.8929288983345032, - "logps/chosen": -9.53125, - "logps/rejected": -10.943530082702637, - "loss": 0.3826, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.53125, - "rewards/margins": 1.4122816324234009, - "rewards/rejected": -10.943530082702637, - "semantic_entropy": 0.0014043385162949562, + "logits/chosen": -0.036310989409685135, + "logits/rejected": 0.05018593743443489, + "logps/chosen": -4.754838466644287, + "logps/rejected": -6.171387672424316, + "loss": 0.3768, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.754838466644287, + "rewards/margins": 1.4165490865707397, + "rewards/rejected": -6.171387672424316, "step": 4970 }, { "epoch": 2.6626526174945644, - "grad_norm": 22.939506081463986, + "grad_norm": 22.277612467986778, "learning_rate": 3.789650537620903e-08, - "logits/chosen": 0.8108006715774536, - "logits/rejected": 0.8519940376281738, - "logps/chosen": -9.818662643432617, - "logps/rejected": -11.12476921081543, - "loss": 0.3931, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.818662643432617, - "rewards/margins": 1.3061046600341797, - "rewards/rejected": -11.12476921081543, - "semantic_entropy": 0.0010739094577729702, + "logits/chosen": -0.07914546877145767, + "logits/rejected": -0.011990757659077644, + "logps/chosen": -5.055473804473877, + "logps/rejected": -6.430083274841309, + "loss": 0.3702, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.055473804473877, + "rewards/margins": 1.3746103048324585, + "rewards/rejected": -6.430083274841309, "step": 4975 }, { "epoch": 2.665328650275966, - "grad_norm": 22.129521362188676, + "grad_norm": 23.89556704651212, "learning_rate": 3.730398850445182e-08, - "logits/chosen": 0.822609543800354, - "logits/rejected": 0.8529809713363647, - "logps/chosen": -9.925312995910645, - "logps/rejected": -11.192630767822266, - "loss": 0.4489, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.925312995910645, - "rewards/margins": 1.2673180103302002, - "rewards/rejected": -11.192630767822266, - "semantic_entropy": 0.0011812245938926935, + "logits/chosen": -0.0015282646054401994, + "logits/rejected": 0.03272349759936333, + "logps/chosen": -5.372143745422363, + "logps/rejected": -6.583274841308594, + "loss": 0.4723, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.372143745422363, + "rewards/margins": 1.211130976676941, + "rewards/rejected": -6.583274841308594, "step": 4980 }, { "epoch": 2.6680046830573674, - "grad_norm": 24.19733690749803, + "grad_norm": 21.07379926612421, "learning_rate": 3.671596071142735e-08, - "logits/chosen": 0.8324605226516724, - "logits/rejected": 0.9051470756530762, - "logps/chosen": -9.69133472442627, - "logps/rejected": -11.151666641235352, - "loss": 0.4529, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.69133472442627, - "rewards/margins": 1.460331678390503, - "rewards/rejected": -11.151666641235352, - "semantic_entropy": 0.0018878221744671464, + "logits/chosen": -0.05735694617033005, + "logits/rejected": 0.12063992023468018, + "logps/chosen": -4.967268466949463, + "logps/rejected": -6.345248222351074, + "loss": 0.448, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.967268466949463, + "rewards/margins": 1.377980351448059, + "rewards/rejected": -6.345248222351074, "step": 4985 }, { "epoch": 2.670680715838769, - "grad_norm": 31.252423984045087, + "grad_norm": 21.702364988612093, "learning_rate": 3.6132427702183996e-08, - "logits/chosen": 0.8355720639228821, - "logits/rejected": 0.8954976797103882, - "logps/chosen": -9.62411880493164, - "logps/rejected": -11.132128715515137, - "loss": 0.361, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.62411880493164, - "rewards/margins": 1.5080082416534424, - "rewards/rejected": -11.132128715515137, - "semantic_entropy": 0.0016464665532112122, + "logits/chosen": -0.17596092820167542, + "logits/rejected": -0.019732654094696045, + "logps/chosen": -5.017035484313965, + "logps/rejected": -6.439484596252441, + "loss": 0.3657, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.017035484313965, + "rewards/margins": 1.4224483966827393, + "rewards/rejected": -6.439484596252441, "step": 4990 }, { "epoch": 2.6733567486201704, - "grad_norm": 21.68508018940414, + "grad_norm": 22.36391682476609, "learning_rate": 3.555339513816147e-08, - "logits/chosen": 0.7898616790771484, - "logits/rejected": 0.8546016812324524, - "logps/chosen": -9.857865333557129, - "logps/rejected": -11.005608558654785, - "loss": 0.4645, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.857865333557129, - "rewards/margins": 1.147742748260498, - "rewards/rejected": -11.005608558654785, - "semantic_entropy": 0.0011803485685959458, + "logits/chosen": -0.10035596787929535, + "logits/rejected": -0.1047847643494606, + "logps/chosen": -5.242215156555176, + "logps/rejected": -6.304671287536621, + "loss": 0.5065, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.242215156555176, + "rewards/margins": 1.0624557733535767, + "rewards/rejected": -6.304671287536621, "step": 4995 }, { "epoch": 2.676032781401572, - "grad_norm": 21.328951358859364, + "grad_norm": 18.980791708236083, "learning_rate": 3.497886863713639e-08, - "logits/chosen": 0.8253191113471985, - "logits/rejected": 0.8587236404418945, - "logps/chosen": -9.829444885253906, - "logps/rejected": -11.261645317077637, - "loss": 0.4094, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.829444885253906, - "rewards/margins": 1.4322013854980469, - "rewards/rejected": -11.261645317077637, - "semantic_entropy": 0.0012271823361515999, + "logits/chosen": -0.11741407215595245, + "logits/rejected": -0.07669083774089813, + "logps/chosen": -4.964869499206543, + "logps/rejected": -6.2265825271606445, + "loss": 0.4509, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.964869499206543, + "rewards/margins": 1.2617132663726807, + "rewards/rejected": -6.2265825271606445, "step": 5000 }, { "epoch": 2.678708814182974, - "grad_norm": 25.75553736028734, + "grad_norm": 26.560137908920456, "learning_rate": 3.440885377316721e-08, - "logits/chosen": 0.8507275581359863, - "logits/rejected": 0.8877654075622559, - "logps/chosen": -9.808802604675293, - "logps/rejected": -10.979841232299805, - "loss": 0.438, + "logits/chosen": -0.031187046319246292, + "logits/rejected": 0.0338604636490345, + "logps/chosen": -5.11460018157959, + "logps/rejected": -6.238164901733398, + "loss": 0.4289, "rewards/accuracies": 0.78125, - "rewards/chosen": -9.808802604675293, - "rewards/margins": 1.1710389852523804, - "rewards/rejected": -10.979841232299805, - "semantic_entropy": 0.0013294884702190757, + "rewards/chosen": -5.11460018157959, + "rewards/margins": 1.1235647201538086, + "rewards/rejected": -6.238164901733398, "step": 5005 }, { "epoch": 2.6813848469643755, - "grad_norm": 29.51695593361045, + "grad_norm": 26.872735827027462, "learning_rate": 3.384335607654082e-08, - "logits/chosen": 0.8268327713012695, - "logits/rejected": 0.8833521008491516, - "logps/chosen": -9.724630355834961, - "logps/rejected": -11.137059211730957, - "loss": 0.3859, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.724630355834961, - "rewards/margins": 1.412428617477417, - "rewards/rejected": -11.137059211730957, - "semantic_entropy": 0.0016626717988401651, + "logits/chosen": -0.054861366748809814, + "logits/rejected": 0.043877165764570236, + "logps/chosen": -5.015519142150879, + "logps/rejected": -6.306814670562744, + "loss": 0.3845, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.015519142150879, + "rewards/margins": 1.2912952899932861, + "rewards/rejected": -6.306814670562744, "step": 5010 }, { "epoch": 2.684060879745777, - "grad_norm": 20.483800947742463, + "grad_norm": 21.907148360541846, "learning_rate": 3.328238103371811e-08, - "logits/chosen": 0.8099279403686523, - "logits/rejected": 0.8540660738945007, - "logps/chosen": -9.710822105407715, - "logps/rejected": -11.191596984863281, - "loss": 0.3804, + "logits/chosen": -0.09043256938457489, + "logits/rejected": -0.008728738874197006, + "logps/chosen": -5.0564069747924805, + "logps/rejected": -6.3970417976379395, + "loss": 0.3915, "rewards/accuracies": 0.8125, - "rewards/chosen": -9.710822105407715, - "rewards/margins": 1.4807744026184082, - "rewards/rejected": -11.191596984863281, - "semantic_entropy": 0.0015045705949887633, + "rewards/chosen": -5.0564069747924805, + "rewards/margins": 1.3406347036361694, + "rewards/rejected": -6.3970417976379395, "step": 5015 }, { "epoch": 2.6867369125271785, - "grad_norm": 26.229172740059667, + "grad_norm": 25.516239894884727, "learning_rate": 3.272593408728169e-08, - "logits/chosen": 0.8173542022705078, - "logits/rejected": 0.8689044117927551, - "logps/chosen": -9.657730102539062, - "logps/rejected": -10.887810707092285, - "loss": 0.4424, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.657730102539062, - "rewards/margins": 1.230080008506775, - "rewards/rejected": -10.887810707092285, - "semantic_entropy": 0.0013948578853160143, + "logits/chosen": -0.1450892984867096, + "logits/rejected": 0.07460806518793106, + "logps/chosen": -4.887761116027832, + "logps/rejected": -6.16219425201416, + "loss": 0.4275, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.887761116027832, + "rewards/margins": 1.2744325399398804, + "rewards/rejected": -6.16219425201416, "step": 5020 }, { "epoch": 2.6894129453085798, - "grad_norm": 19.40648296473702, + "grad_norm": 17.866424701255326, "learning_rate": 3.217402063588204e-08, - "logits/chosen": 0.7883289456367493, - "logits/rejected": 0.8523383140563965, - "logps/chosen": -9.800715446472168, - "logps/rejected": -11.160974502563477, - "loss": 0.4105, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.800715446472168, - "rewards/margins": 1.360258936882019, - "rewards/rejected": -11.160974502563477, - "semantic_entropy": 0.00112335872836411, + "logits/chosen": -0.1516355574131012, + "logits/rejected": 3.784894943237305e-05, + "logps/chosen": -5.015979766845703, + "logps/rejected": -6.26530647277832, + "loss": 0.4354, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.015979766845703, + "rewards/margins": 1.2493269443511963, + "rewards/rejected": -6.26530647277832, "step": 5025 }, { "epoch": 2.6920889780899815, - "grad_norm": 20.18654274201843, + "grad_norm": 18.271811215308187, "learning_rate": 3.162664603418608e-08, - "logits/chosen": 0.8560435175895691, - "logits/rejected": 0.8896482586860657, - "logps/chosen": -9.66085147857666, - "logps/rejected": -11.121121406555176, - "loss": 0.3676, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.66085147857666, - "rewards/margins": 1.4602700471878052, - "rewards/rejected": -11.121121406555176, - "semantic_entropy": 0.0015098705189302564, + "logits/chosen": -0.11020305007696152, + "logits/rejected": -0.016627604141831398, + "logps/chosen": -4.83048152923584, + "logps/rejected": -6.31688117980957, + "loss": 0.4022, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.83048152923584, + "rewards/margins": 1.4863994121551514, + "rewards/rejected": -6.31688117980957, "step": 5030 }, { "epoch": 2.694765010871383, - "grad_norm": 27.638003566187923, + "grad_norm": 27.157023643527513, "learning_rate": 3.1083815592824416e-08, - "logits/chosen": 0.8065202832221985, - "logits/rejected": 0.8954984545707703, - "logps/chosen": -9.99959945678711, - "logps/rejected": -11.314790725708008, - "loss": 0.4151, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.99959945678711, - "rewards/margins": 1.3151907920837402, - "rewards/rejected": -11.314790725708008, - "semantic_entropy": 0.0012680039508268237, + "logits/chosen": -0.11241893470287323, + "logits/rejected": 0.014663688838481903, + "logps/chosen": -5.051051139831543, + "logps/rejected": -6.379415988922119, + "loss": 0.4065, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.051051139831543, + "rewards/margins": 1.3283647298812866, + "rewards/rejected": -6.379415988922119, "step": 5035 }, { "epoch": 2.697441043652785, - "grad_norm": 22.574139063646903, + "grad_norm": 20.940539004004872, "learning_rate": 3.054553457834053e-08, - "logits/chosen": 0.8925463557243347, - "logits/rejected": 0.9036859273910522, - "logps/chosen": -9.90015983581543, - "logps/rejected": -11.163106918334961, - "loss": 0.4213, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.90015983581543, - "rewards/margins": 1.262947678565979, - "rewards/rejected": -11.163106918334961, - "semantic_entropy": 0.0010416943114250898, + "logits/chosen": 0.061786603182554245, + "logits/rejected": 0.044669874012470245, + "logps/chosen": -5.184185028076172, + "logps/rejected": -6.384743690490723, + "loss": 0.431, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.184185028076172, + "rewards/margins": 1.2005587816238403, + "rewards/rejected": -6.384743690490723, "step": 5040 }, { "epoch": 2.700117076434186, - "grad_norm": 23.894727539187592, + "grad_norm": 22.82960474447674, "learning_rate": 3.0011808213139036e-08, - "logits/chosen": 0.8361862301826477, - "logits/rejected": 0.8620640635490417, - "logps/chosen": -9.725111961364746, - "logps/rejected": -11.048177719116211, - "loss": 0.3983, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.725111961364746, - "rewards/margins": 1.3230668306350708, - "rewards/rejected": -11.048177719116211, - "semantic_entropy": 0.001452545402571559, + "logits/chosen": 0.008117287419736385, + "logits/rejected": 0.007514400873333216, + "logps/chosen": -4.9580183029174805, + "logps/rejected": -6.220813274383545, + "loss": 0.4124, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.9580183029174805, + "rewards/margins": 1.262795329093933, + "rewards/rejected": -6.220813274383545, "step": 5045 }, { "epoch": 2.702793109215588, - "grad_norm": 22.35978287732217, + "grad_norm": 20.811548776732057, "learning_rate": 2.948264167543568e-08, - "logits/chosen": 0.7902384996414185, - "logits/rejected": 0.8329121470451355, - "logps/chosen": -9.640339851379395, - "logps/rejected": -10.89880657196045, - "loss": 0.4069, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.640339851379395, - "rewards/margins": 1.258466362953186, - "rewards/rejected": -10.89880657196045, - "semantic_entropy": 0.0015184081858024001, + "logits/chosen": -0.07265743613243103, + "logits/rejected": -0.004255610518157482, + "logps/chosen": -4.736907005310059, + "logps/rejected": -5.944591999053955, + "loss": 0.3979, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.736907005310059, + "rewards/margins": 1.207684874534607, + "rewards/rejected": -5.944591999053955, "step": 5050 }, { "epoch": 2.7054691419969896, - "grad_norm": 21.602439650044992, + "grad_norm": 20.696363800754362, "learning_rate": 2.8958040099206216e-08, - "logits/chosen": 0.7853751182556152, - "logits/rejected": 0.8521712422370911, - "logps/chosen": -9.592992782592773, - "logps/rejected": -11.00520133972168, - "loss": 0.3884, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.592992782592773, - "rewards/margins": 1.4122079610824585, - "rewards/rejected": -11.00520133972168, - "semantic_entropy": 0.0015211288118734956, + "logits/chosen": -0.1964438110589981, + "logits/rejected": -0.10185573250055313, + "logps/chosen": -4.745640754699707, + "logps/rejected": -6.192984580993652, + "loss": 0.3651, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.745640754699707, + "rewards/margins": 1.4473440647125244, + "rewards/rejected": -6.192984580993652, "step": 5055 }, { "epoch": 2.708145174778391, - "grad_norm": 24.292700666304096, + "grad_norm": 22.76033157896739, "learning_rate": 2.843800857413775e-08, - "logits/chosen": 0.8210417628288269, - "logits/rejected": 0.8545898199081421, - "logps/chosen": -9.68048095703125, - "logps/rejected": -10.930859565734863, - "loss": 0.4632, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.68048095703125, - "rewards/margins": 1.2503786087036133, - "rewards/rejected": -10.930859565734863, - "semantic_entropy": 0.001456740777939558, + "logits/chosen": -0.06616393476724625, + "logits/rejected": 0.00257996772415936, + "logps/chosen": -4.860841274261475, + "logps/rejected": -6.065991401672363, + "loss": 0.4729, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.860841274261475, + "rewards/margins": 1.2051498889923096, + "rewards/rejected": -6.065991401672363, "step": 5060 }, { "epoch": 2.7108212075597926, - "grad_norm": 23.853271664665353, + "grad_norm": 22.861343111472365, "learning_rate": 2.7922552145578203e-08, - "logits/chosen": 0.8457640409469604, - "logits/rejected": 0.8957823514938354, - "logps/chosen": -9.459394454956055, - "logps/rejected": -10.84212589263916, - "loss": 0.4069, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.459394454956055, - "rewards/margins": 1.3827307224273682, - "rewards/rejected": -10.84212589263916, - "semantic_entropy": 0.0017677752766758204, + "logits/chosen": -0.08517236262559891, + "logits/rejected": 0.14641715586185455, + "logps/chosen": -4.831512451171875, + "logps/rejected": -6.2311811447143555, + "loss": 0.4022, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.831512451171875, + "rewards/margins": 1.3996695280075073, + "rewards/rejected": -6.2311811447143555, "step": 5065 }, { "epoch": 2.7134972403411943, - "grad_norm": 27.366391781200583, + "grad_norm": 27.588417955798164, "learning_rate": 2.7411675814488277e-08, - "logits/chosen": 0.8697333335876465, - "logits/rejected": 0.9225956201553345, - "logps/chosen": -9.678709983825684, - "logps/rejected": -10.979662895202637, - "loss": 0.3868, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.678709983825684, - "rewards/margins": 1.3009527921676636, - "rewards/rejected": -10.979662895202637, - "semantic_entropy": 0.0014675845159217715, + "logits/chosen": -0.005851246416568756, + "logits/rejected": 0.13974998891353607, + "logps/chosen": -4.925848960876465, + "logps/rejected": -6.105233669281006, + "loss": 0.4184, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.925848960876465, + "rewards/margins": 1.1793848276138306, + "rewards/rejected": -6.105233669281006, "step": 5070 }, { "epoch": 2.7161732731225956, - "grad_norm": 29.099080644827772, + "grad_norm": 27.180526653687956, "learning_rate": 2.690538453739216e-08, - "logits/chosen": 0.8708797693252563, - "logits/rejected": 0.8982332348823547, - "logps/chosen": -9.632844924926758, - "logps/rejected": -10.764936447143555, - "loss": 0.4623, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.632844924926758, - "rewards/margins": 1.132093071937561, - "rewards/rejected": -10.764936447143555, - "semantic_entropy": 0.0013683564029633999, + "logits/chosen": -0.04187758266925812, + "logits/rejected": 0.021136537194252014, + "logps/chosen": -4.950596332550049, + "logps/rejected": -5.946451663970947, + "loss": 0.5004, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.950596332550049, + "rewards/margins": 0.9958555102348328, + "rewards/rejected": -5.946451663970947, "step": 5075 }, { "epoch": 2.7188493059039973, - "grad_norm": 23.320063008009125, + "grad_norm": 18.75311237262048, "learning_rate": 2.6403683226330298e-08, - "logits/chosen": 0.8062912225723267, - "logits/rejected": 0.8842616081237793, - "logps/chosen": -9.81375503540039, - "logps/rejected": -11.180780410766602, - "loss": 0.3998, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.81375503540039, - "rewards/margins": 1.3670246601104736, - "rewards/rejected": -11.180780410766602, - "semantic_entropy": 0.0012563010677695274, + "logits/chosen": -0.15084920823574066, + "logits/rejected": 0.01241408009082079, + "logps/chosen": -4.908097267150879, + "logps/rejected": -6.219198226928711, + "loss": 0.4318, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.908097267150879, + "rewards/margins": 1.3111016750335693, + "rewards/rejected": -6.219198226928711, "step": 5080 }, { "epoch": 2.721525338685399, - "grad_norm": 27.02131423176778, + "grad_norm": 37.96008612713335, "learning_rate": 2.5906576748810804e-08, - "logits/chosen": 0.8369554281234741, - "logits/rejected": 0.8760555386543274, - "logps/chosen": -9.69217300415039, - "logps/rejected": -11.273096084594727, - "loss": 0.3291, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.69217300415039, - "rewards/margins": 1.580923318862915, - "rewards/rejected": -11.273096084594727, - "semantic_entropy": 0.0013889706460759044, + "logits/chosen": -0.13855311274528503, + "logits/rejected": -0.030596798285841942, + "logps/chosen": -4.741514682769775, + "logps/rejected": -6.241177558898926, + "loss": 0.3672, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.741514682769775, + "rewards/margins": 1.4996625185012817, + "rewards/rejected": -6.241177558898926, "step": 5085 }, { "epoch": 2.7242013714668003, - "grad_norm": 26.902505403388286, + "grad_norm": 22.211978370713222, "learning_rate": 2.5414069927763016e-08, - "logits/chosen": 0.8435298800468445, - "logits/rejected": 0.8922918438911438, - "logps/chosen": -9.850217819213867, - "logps/rejected": -11.199501037597656, - "loss": 0.4054, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.850217819213867, - "rewards/margins": 1.3492811918258667, - "rewards/rejected": -11.199501037597656, - "semantic_entropy": 0.0012866712640970945, + "logits/chosen": -0.16319754719734192, + "logits/rejected": 0.019400810822844505, + "logps/chosen": -5.272116661071777, + "logps/rejected": -6.668057441711426, + "loss": 0.391, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.272116661071777, + "rewards/margins": 1.3959410190582275, + "rewards/rejected": -6.668057441711426, "step": 5090 }, { "epoch": 2.726877404248202, - "grad_norm": 23.139963064688857, + "grad_norm": 20.543967277410783, "learning_rate": 2.4926167541490185e-08, - "logits/chosen": 0.7457908987998962, - "logits/rejected": 0.8107954859733582, - "logps/chosen": -9.701014518737793, - "logps/rejected": -11.19543743133545, - "loss": 0.4072, + "logits/chosen": -0.22875313460826874, + "logits/rejected": -0.012035268358886242, + "logps/chosen": -4.9495415687561035, + "logps/rejected": -6.374375820159912, + "loss": 0.404, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.701014518737793, - "rewards/margins": 1.4944229125976562, - "rewards/rejected": -11.19543743133545, - "semantic_entropy": 0.001561012351885438, + "rewards/chosen": -4.9495415687561035, + "rewards/margins": 1.424835443496704, + "rewards/rejected": -6.374375820159912, "step": 5095 }, { "epoch": 2.7295534370296037, - "grad_norm": 18.505427836623596, + "grad_norm": 17.69995596230235, "learning_rate": 2.4442874323623574e-08, - "logits/chosen": 0.8288080096244812, - "logits/rejected": 0.852368950843811, - "logps/chosen": -9.831713676452637, - "logps/rejected": -11.238375663757324, - "loss": 0.4368, + "logits/chosen": -0.029152479022741318, + "logits/rejected": 0.09870848059654236, + "logps/chosen": -5.0544962882995605, + "logps/rejected": -6.465998649597168, + "loss": 0.4235, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.831713676452637, - "rewards/margins": 1.4066613912582397, - "rewards/rejected": -11.238375663757324, - "semantic_entropy": 0.0013758750865235925, + "rewards/chosen": -5.0544962882995605, + "rewards/margins": 1.411502480506897, + "rewards/rejected": -6.465998649597168, "step": 5100 }, { "epoch": 2.7322294698110055, - "grad_norm": 24.20695474770458, + "grad_norm": 23.497658533774157, "learning_rate": 2.396419496307589e-08, - "logits/chosen": 0.7989641427993774, - "logits/rejected": 0.8495559692382812, - "logps/chosen": -9.894341468811035, - "logps/rejected": -11.228631019592285, - "loss": 0.3974, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.894341468811035, - "rewards/margins": 1.3342888355255127, - "rewards/rejected": -11.228631019592285, - "semantic_entropy": 0.0011284537613391876, + "logits/chosen": -0.05298537015914917, + "logits/rejected": 0.11710648238658905, + "logps/chosen": -5.197011947631836, + "logps/rejected": -6.578624725341797, + "loss": 0.4004, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.197011947631836, + "rewards/margins": 1.3816121816635132, + "rewards/rejected": -6.578624725341797, "step": 5105 }, { "epoch": 2.7349055025924067, - "grad_norm": 19.561878478981807, + "grad_norm": 22.771484692253996, "learning_rate": 2.349013410399653e-08, - "logits/chosen": 0.7845159769058228, - "logits/rejected": 0.8294118046760559, - "logps/chosen": -9.763102531433105, - "logps/rejected": -11.050474166870117, - "loss": 0.4596, + "logits/chosen": -0.08245809376239777, + "logits/rejected": 0.01638326793909073, + "logps/chosen": -4.960375785827637, + "logps/rejected": -6.19534158706665, + "loss": 0.4482, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.763102531433105, - "rewards/margins": 1.2873718738555908, - "rewards/rejected": -11.050474166870117, - "semantic_entropy": 0.0012428943300619721, + "rewards/chosen": -4.960375785827637, + "rewards/margins": 1.2349660396575928, + "rewards/rejected": -6.19534158706665, "step": 5110 }, { "epoch": 2.7375815353738084, - "grad_norm": 26.484826469549404, + "grad_norm": 19.61403972869291, "learning_rate": 2.3020696345725954e-08, - "logits/chosen": 0.7876384258270264, - "logits/rejected": 0.8591636419296265, - "logps/chosen": -9.837101936340332, - "logps/rejected": -11.317276954650879, - "loss": 0.348, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.837101936340332, - "rewards/margins": 1.4801758527755737, - "rewards/rejected": -11.317276954650879, - "semantic_entropy": 0.0012740811798721552, + "logits/chosen": -0.17781050503253937, + "logits/rejected": 0.03603339567780495, + "logps/chosen": -5.099895000457764, + "logps/rejected": -6.584254264831543, + "loss": 0.3607, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.099895000457764, + "rewards/margins": 1.4843590259552002, + "rewards/rejected": -6.584254264831543, "step": 5115 }, { "epoch": 2.7402575681552097, - "grad_norm": 26.768124375085126, + "grad_norm": 25.239054057832988, "learning_rate": 2.2555886242751398e-08, - "logits/chosen": 0.8366681933403015, - "logits/rejected": 0.8936346173286438, - "logps/chosen": -9.770502090454102, - "logps/rejected": -11.170347213745117, - "loss": 0.3935, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.770502090454102, - "rewards/margins": 1.3998456001281738, - "rewards/rejected": -11.170347213745117, - "semantic_entropy": 0.0012818884570151567, + "logits/chosen": -0.1083039864897728, + "logits/rejected": -0.021904457360506058, + "logps/chosen": -5.019197940826416, + "logps/rejected": -6.317998886108398, + "loss": 0.3799, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.019197940826416, + "rewards/margins": 1.2988008260726929, + "rewards/rejected": -6.317998886108398, "step": 5120 }, { "epoch": 2.7429336009366114, - "grad_norm": 34.15098903260704, + "grad_norm": 33.21172825758021, "learning_rate": 2.2095708304662453e-08, - "logits/chosen": 0.7642577886581421, - "logits/rejected": 0.8753819465637207, - "logps/chosen": -9.696569442749023, - "logps/rejected": -11.150983810424805, - "loss": 0.3862, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.696569442749023, - "rewards/margins": 1.454413652420044, - "rewards/rejected": -11.150983810424805, - "semantic_entropy": 0.0013747283956035972, + "logits/chosen": -0.21706345677375793, + "logits/rejected": 0.007407322525978088, + "logps/chosen": -4.892796039581299, + "logps/rejected": -6.207766532897949, + "loss": 0.4158, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.892796039581299, + "rewards/margins": 1.31497061252594, + "rewards/rejected": -6.207766532897949, "step": 5125 }, { "epoch": 2.745609633718013, - "grad_norm": 28.8525710173051, + "grad_norm": 25.867751276539025, "learning_rate": 2.16401669961076e-08, - "logits/chosen": 0.7833540439605713, - "logits/rejected": 0.855305552482605, - "logps/chosen": -9.691442489624023, - "logps/rejected": -11.090289115905762, - "loss": 0.4098, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.691442489624023, - "rewards/margins": 1.3988467454910278, - "rewards/rejected": -11.090289115905762, - "semantic_entropy": 0.001538719516247511, + "logits/chosen": -0.21717064082622528, + "logits/rejected": -0.024591034278273582, + "logps/chosen": -4.993123531341553, + "logps/rejected": -6.321095943450928, + "loss": 0.4114, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.993123531341553, + "rewards/margins": 1.3279722929000854, + "rewards/rejected": -6.321095943450928, "step": 5130 }, { "epoch": 2.748285666499415, - "grad_norm": 30.720388291606127, + "grad_norm": 27.455183765683877, "learning_rate": 2.1189266736750532e-08, - "logits/chosen": 0.8673465847969055, - "logits/rejected": 0.9190985560417175, - "logps/chosen": -9.79682445526123, - "logps/rejected": -11.017160415649414, - "loss": 0.4352, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.79682445526123, - "rewards/margins": 1.2203348875045776, - "rewards/rejected": -11.017160415649414, - "semantic_entropy": 0.0019034147262573242, + "logits/chosen": -0.01720466837286949, + "logits/rejected": 0.059476904571056366, + "logps/chosen": -4.95101261138916, + "logps/rejected": -6.184760570526123, + "loss": 0.4214, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.95101261138916, + "rewards/margins": 1.2337473630905151, + "rewards/rejected": -6.184760570526123, "step": 5135 }, { "epoch": 2.750961699280816, - "grad_norm": 19.26413991827598, + "grad_norm": 21.366393202844144, "learning_rate": 2.0743011901227623e-08, - "logits/chosen": 0.8775045275688171, - "logits/rejected": 0.948703944683075, - "logps/chosen": -9.918733596801758, - "logps/rejected": -11.272817611694336, - "loss": 0.3995, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.918733596801758, - "rewards/margins": 1.3540844917297363, - "rewards/rejected": -11.272817611694336, - "semantic_entropy": 0.0010751333320513368, + "logits/chosen": -0.0555567666888237, + "logits/rejected": 0.11373057216405869, + "logps/chosen": -5.284670829772949, + "logps/rejected": -6.582223415374756, + "loss": 0.4005, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.284670829772949, + "rewards/margins": 1.2975528240203857, + "rewards/rejected": -6.582223415374756, "step": 5140 }, { "epoch": 2.753637732062218, - "grad_norm": 27.791159491160563, + "grad_norm": 35.644316355884875, "learning_rate": 2.030140681910508e-08, - "logits/chosen": 0.8398303985595703, - "logits/rejected": 0.8916828036308289, - "logps/chosen": -9.831799507141113, - "logps/rejected": -11.197306632995605, - "loss": 0.4334, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.831799507141113, - "rewards/margins": 1.3655065298080444, - "rewards/rejected": -11.197306632995605, - "semantic_entropy": 0.0013978518545627594, + "logits/chosen": -0.046682894229888916, + "logits/rejected": 0.08958055078983307, + "logps/chosen": -5.074850559234619, + "logps/rejected": -6.3170576095581055, + "loss": 0.4435, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.074850559234619, + "rewards/margins": 1.2422072887420654, + "rewards/rejected": -6.3170576095581055, "step": 5145 }, { "epoch": 2.756313764843619, - "grad_norm": 26.235313395338775, + "grad_norm": 20.883291628815336, "learning_rate": 1.986445577483753e-08, - "logits/chosen": 0.8134158849716187, - "logits/rejected": 0.8556219935417175, - "logps/chosen": -9.731077194213867, - "logps/rejected": -11.126041412353516, - "loss": 0.4181, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.731077194213867, - "rewards/margins": 1.394963026046753, - "rewards/rejected": -11.126041412353516, - "semantic_entropy": 0.00145871308632195, + "logits/chosen": -0.13727611303329468, + "logits/rejected": -0.022844281047582626, + "logps/chosen": -5.069186687469482, + "logps/rejected": -6.38149356842041, + "loss": 0.414, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.069186687469482, + "rewards/margins": 1.3123061656951904, + "rewards/rejected": -6.38149356842041, "step": 5150 }, { "epoch": 2.758989797625021, - "grad_norm": 19.605206487487308, + "grad_norm": 23.250925261468012, "learning_rate": 1.9432163007725765e-08, - "logits/chosen": 0.7870944738388062, - "logits/rejected": 0.8299382925033569, - "logps/chosen": -9.642583847045898, - "logps/rejected": -11.064803123474121, - "loss": 0.4027, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.642583847045898, - "rewards/margins": 1.4222198724746704, - "rewards/rejected": -11.064803123474121, - "semantic_entropy": 0.0014933927450329065, + "logits/chosen": -0.1541663110256195, + "logits/rejected": -0.06817305088043213, + "logps/chosen": -4.901342391967773, + "logps/rejected": -6.127877235412598, + "loss": 0.4354, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.901342391967773, + "rewards/margins": 1.2265350818634033, + "rewards/rejected": -6.127877235412598, "step": 5155 }, { "epoch": 2.7616658304064226, - "grad_norm": 18.551143666002904, + "grad_norm": 17.169050800181854, "learning_rate": 1.9004532711876297e-08, - "logits/chosen": 0.7672029137611389, - "logits/rejected": 0.8150334358215332, - "logps/chosen": -9.65953254699707, - "logps/rejected": -11.068132400512695, - "loss": 0.3776, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.65953254699707, - "rewards/margins": 1.4086004495620728, - "rewards/rejected": -11.068132400512695, - "semantic_entropy": 0.0015167773235589266, + "logits/chosen": -0.11514854431152344, + "logits/rejected": -0.08344851434230804, + "logps/chosen": -4.7996506690979, + "logps/rejected": -6.161312580108643, + "loss": 0.3997, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.7996506690979, + "rewards/margins": 1.3616621494293213, + "rewards/rejected": -6.161312580108643, "step": 5160 }, { "epoch": 2.7643418631878243, - "grad_norm": 28.399641966852776, + "grad_norm": 24.414615816165, "learning_rate": 1.8581569036159928e-08, - "logits/chosen": 0.8048108220100403, - "logits/rejected": 0.8365411758422852, - "logps/chosen": -9.653702735900879, - "logps/rejected": -11.053384780883789, - "loss": 0.4278, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.653702735900879, - "rewards/margins": 1.3996822834014893, - "rewards/rejected": -11.053384780883789, - "semantic_entropy": 0.0016002919292077422, + "logits/chosen": -0.13370995223522186, + "logits/rejected": 0.0559103861451149, + "logps/chosen": -4.897290229797363, + "logps/rejected": -6.227686405181885, + "loss": 0.4053, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.897290229797363, + "rewards/margins": 1.3303956985473633, + "rewards/rejected": -6.227686405181885, "step": 5165 }, { "epoch": 2.7670178959692255, - "grad_norm": 22.352042394265855, + "grad_norm": 20.164443040153937, "learning_rate": 1.8163276084172285e-08, - "logits/chosen": 0.8579298257827759, - "logits/rejected": 0.9403360486030579, - "logps/chosen": -10.103005409240723, - "logps/rejected": -11.47459602355957, - "loss": 0.3877, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -10.103005409240723, - "rewards/margins": 1.3715909719467163, - "rewards/rejected": -11.47459602355957, - "semantic_entropy": 0.001051284489221871, + "logits/chosen": -0.09423048049211502, + "logits/rejected": 0.02804609201848507, + "logps/chosen": -5.046459197998047, + "logps/rejected": -6.345981121063232, + "loss": 0.3996, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.046459197998047, + "rewards/margins": 1.299522042274475, + "rewards/rejected": -6.345981121063232, "step": 5170 }, { "epoch": 2.7696939287506273, - "grad_norm": 25.499811878639434, + "grad_norm": 20.229102031053603, "learning_rate": 1.7749657914193194e-08, - "logits/chosen": 0.8347901105880737, - "logits/rejected": 0.9103593826293945, - "logps/chosen": -9.974831581115723, - "logps/rejected": -11.45885944366455, - "loss": 0.3708, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.974831581115723, - "rewards/margins": 1.4840264320373535, - "rewards/rejected": -11.45885944366455, - "semantic_entropy": 0.0011410152073949575, + "logits/chosen": -0.07339875400066376, + "logits/rejected": 0.019816776737570763, + "logps/chosen": -5.233479022979736, + "logps/rejected": -6.714315891265869, + "loss": 0.3481, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.233479022979736, + "rewards/margins": 1.480836272239685, + "rewards/rejected": -6.714315891265869, "step": 5175 }, { "epoch": 2.7723699615320285, - "grad_norm": 28.6122272486666, + "grad_norm": 28.3758801617876, "learning_rate": 1.7340718539148203e-08, - "logits/chosen": 0.8332939147949219, - "logits/rejected": 0.8270009756088257, - "logps/chosen": -9.950929641723633, - "logps/rejected": -11.199871063232422, - "loss": 0.4188, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.950929641723633, - "rewards/margins": 1.2489430904388428, - "rewards/rejected": -11.199871063232422, - "semantic_entropy": 0.001222481718286872, + "logits/chosen": -0.03687150403857231, + "logits/rejected": 0.0157229695469141, + "logps/chosen": -5.291976451873779, + "logps/rejected": -6.420541286468506, + "loss": 0.4488, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.291976451873779, + "rewards/margins": 1.1285651922225952, + "rewards/rejected": -6.420541286468506, "step": 5180 }, { "epoch": 2.7750459943134302, - "grad_norm": 17.901053138739112, + "grad_norm": 22.082471401632258, "learning_rate": 1.6936461926568724e-08, - "logits/chosen": 0.8614856600761414, - "logits/rejected": 0.8957780599594116, - "logps/chosen": -9.616273880004883, - "logps/rejected": -11.097272872924805, - "loss": 0.3995, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.616273880004883, - "rewards/margins": 1.4809997081756592, - "rewards/rejected": -11.097272872924805, - "semantic_entropy": 0.001978642772883177, + "logits/chosen": -0.062348462641239166, + "logits/rejected": 0.0656796544790268, + "logps/chosen": -4.788423538208008, + "logps/rejected": -6.25627326965332, + "loss": 0.4136, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.788423538208008, + "rewards/margins": 1.4678497314453125, + "rewards/rejected": -6.25627326965332, "step": 5185 }, { "epoch": 2.777722027094832, - "grad_norm": 23.833732221773882, + "grad_norm": 22.737931770141596, "learning_rate": 1.6536891998554346e-08, - "logits/chosen": 0.7540593147277832, - "logits/rejected": 0.8192625045776367, - "logps/chosen": -9.756368637084961, - "logps/rejected": -11.14604663848877, - "loss": 0.3821, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -9.756368637084961, - "rewards/margins": 1.3896772861480713, - "rewards/rejected": -11.14604663848877, - "semantic_entropy": 0.0011760034831240773, + "logits/chosen": -0.18261876702308655, + "logits/rejected": -0.021240118891000748, + "logps/chosen": -4.919559001922607, + "logps/rejected": -6.250835418701172, + "loss": 0.4006, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.919559001922607, + "rewards/margins": 1.3312764167785645, + "rewards/rejected": -6.250835418701172, "step": 5190 }, { "epoch": 2.7803980598762337, - "grad_norm": 22.006338516815813, + "grad_norm": 24.15860876661104, "learning_rate": 1.6142012631734093e-08, - "logits/chosen": 0.8480769991874695, - "logits/rejected": 0.9198252558708191, - "logps/chosen": -9.712282180786133, - "logps/rejected": -11.163978576660156, - "loss": 0.3873, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.712282180786133, - "rewards/margins": 1.4516950845718384, - "rewards/rejected": -11.163978576660156, - "semantic_entropy": 0.0014456122880801558, + "logits/chosen": -0.06195025518536568, + "logits/rejected": 0.07474417984485626, + "logps/chosen": -4.909679412841797, + "logps/rejected": -6.245831489562988, + "loss": 0.4115, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.909679412841797, + "rewards/margins": 1.3361527919769287, + "rewards/rejected": -6.245831489562988, "step": 5195 }, { "epoch": 2.783074092657635, - "grad_norm": 29.759216354153438, + "grad_norm": 27.65873763908661, "learning_rate": 1.575182765722949e-08, - "logits/chosen": 0.7646334171295166, - "logits/rejected": 0.8091537356376648, - "logps/chosen": -9.873659133911133, - "logps/rejected": -11.170156478881836, - "loss": 0.4221, + "logits/chosen": -0.20309391617774963, + "logits/rejected": -0.04086611419916153, + "logps/chosen": -4.957182884216309, + "logps/rejected": -6.330117702484131, + "loss": 0.3925, "rewards/accuracies": 0.8125, - "rewards/chosen": -9.873659133911133, - "rewards/margins": 1.2964979410171509, - "rewards/rejected": -11.170156478881836, - "semantic_entropy": 0.001071856007911265, + "rewards/chosen": -4.957182884216309, + "rewards/margins": 1.3729345798492432, + "rewards/rejected": -6.330117702484131, "step": 5200 }, { "epoch": 2.783074092657635, - "eval_logits/chosen": 0.8672059774398804, - "eval_logits/rejected": 0.9047586917877197, - "eval_logps/chosen": -9.958077430725098, - "eval_logps/rejected": -11.086060523986816, - "eval_loss": 0.5239496231079102, - "eval_rewards/accuracies": 0.7247774600982666, - "eval_rewards/chosen": -9.958077430725098, - "eval_rewards/margins": 1.1279836893081665, - "eval_rewards/rejected": -11.086060523986816, - "eval_runtime": 35.0763, - "eval_samples_per_second": 38.345, - "eval_semantic_entropy": 0.0012691987212747335, - "eval_steps_per_second": 9.608, + "eval_logits/chosen": 0.24112989008426666, + "eval_logits/rejected": 0.3386378884315491, + "eval_logps/chosen": -5.209949493408203, + "eval_logps/rejected": -6.320249557495117, + "eval_loss": 0.5251456499099731, + "eval_rewards/accuracies": 0.7284866571426392, + "eval_rewards/chosen": -5.209949493408203, + "eval_rewards/margins": 1.1103001832962036, + "eval_rewards/rejected": -6.320249557495117, + "eval_runtime": 40.3829, + "eval_samples_per_second": 33.306, + "eval_steps_per_second": 8.345, "step": 5200 }, { "epoch": 2.7857501254390367, - "grad_norm": 19.396392953978733, + "grad_norm": 13.312778693112557, "learning_rate": 1.536634086061672e-08, - "logits/chosen": 0.8649656176567078, - "logits/rejected": 0.8771345019340515, - "logps/chosen": -9.787662506103516, - "logps/rejected": -11.10372543334961, - "loss": 0.4402, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.787662506103516, - "rewards/margins": 1.316063642501831, - "rewards/rejected": -11.10372543334961, - "semantic_entropy": 0.001424965332262218, + "logits/chosen": -0.06444571167230606, + "logits/rejected": 0.01334885973483324, + "logps/chosen": -4.965758323669434, + "logps/rejected": -6.318658351898193, + "loss": 0.4089, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.965758323669434, + "rewards/margins": 1.3529002666473389, + "rewards/rejected": -6.318658351898193, "step": 5205 }, { "epoch": 2.788426158220438, - "grad_norm": 26.94949255337376, + "grad_norm": 19.582665983697215, "learning_rate": 1.4985555981890495e-08, - "logits/chosen": 0.8623428344726562, - "logits/rejected": 0.8985008001327515, - "logps/chosen": -9.891159057617188, - "logps/rejected": -11.298178672790527, - "loss": 0.4101, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.891159057617188, - "rewards/margins": 1.4070203304290771, - "rewards/rejected": -11.298178672790527, - "semantic_entropy": 0.0011413523461669683, + "logits/chosen": -0.08094249665737152, + "logits/rejected": 0.03226098790764809, + "logps/chosen": -4.964107513427734, + "logps/rejected": -6.364620685577393, + "loss": 0.3996, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.964107513427734, + "rewards/margins": 1.4005130529403687, + "rewards/rejected": -6.364620685577393, "step": 5210 }, { "epoch": 2.7911021910018396, - "grad_norm": 19.482056056108192, + "grad_norm": 16.699219725164234, "learning_rate": 1.4609476715427226e-08, - "logits/chosen": 0.8556788563728333, - "logits/rejected": 0.8989516496658325, - "logps/chosen": -9.591584205627441, - "logps/rejected": -11.054668426513672, - "loss": 0.3868, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.591584205627441, - "rewards/margins": 1.4630842208862305, - "rewards/rejected": -11.054668426513672, - "semantic_entropy": 0.0015841536223888397, + "logits/chosen": -0.1029234528541565, + "logits/rejected": -0.007795036770403385, + "logps/chosen": -4.74837589263916, + "logps/rejected": -6.207093238830566, + "loss": 0.3815, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.74837589263916, + "rewards/margins": 1.458717703819275, + "rewards/rejected": -6.207093238830566, "step": 5215 }, { "epoch": 2.7937782237832414, - "grad_norm": 24.45269364730528, + "grad_norm": 17.477113131756592, "learning_rate": 1.4238106709949792e-08, - "logits/chosen": 0.7989322543144226, - "logits/rejected": 0.8648680448532104, - "logps/chosen": -9.795249938964844, - "logps/rejected": -11.304253578186035, - "loss": 0.3443, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.795249938964844, - "rewards/margins": 1.5090038776397705, - "rewards/rejected": -11.304253578186035, - "semantic_entropy": 0.0012715930351987481, + "logits/chosen": -0.1659001111984253, + "logits/rejected": -0.06113949418067932, + "logps/chosen": -4.925139427185059, + "logps/rejected": -6.529391288757324, + "loss": 0.3228, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.925139427185059, + "rewards/margins": 1.6042522192001343, + "rewards/rejected": -6.529391288757324, "step": 5220 }, { "epoch": 2.796454256564643, - "grad_norm": 29.960130426736338, + "grad_norm": 29.35848245918299, "learning_rate": 1.3871449568491511e-08, - "logits/chosen": 0.7782562971115112, - "logits/rejected": 0.8624800443649292, - "logps/chosen": -9.840250968933105, - "logps/rejected": -11.1703462600708, - "loss": 0.4095, + "logits/chosen": -0.09469349682331085, + "logits/rejected": 0.08303854614496231, + "logps/chosen": -5.004944801330566, + "logps/rejected": -6.256485462188721, + "loss": 0.4218, "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.840250968933105, - "rewards/margins": 1.3300951719284058, - "rewards/rejected": -11.1703462600708, - "semantic_entropy": 0.0011493575293570757, + "rewards/chosen": -5.004944801330566, + "rewards/margins": 1.2515411376953125, + "rewards/rejected": -6.256485462188721, "step": 5225 }, { "epoch": 2.7991302893460444, - "grad_norm": 17.26742704065323, + "grad_norm": 14.54118921652714, "learning_rate": 1.3509508848361606e-08, - "logits/chosen": 0.7447667121887207, - "logits/rejected": 0.7923721075057983, - "logps/chosen": -9.699943542480469, - "logps/rejected": -11.134923934936523, - "loss": 0.3732, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.699943542480469, - "rewards/margins": 1.4349799156188965, - "rewards/rejected": -11.134923934936523, - "semantic_entropy": 0.001493643270805478, + "logits/chosen": -0.2041049748659134, + "logits/rejected": -0.05934162065386772, + "logps/chosen": -5.090305805206299, + "logps/rejected": -6.350647926330566, + "loss": 0.4091, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.090305805206299, + "rewards/margins": 1.260342001914978, + "rewards/rejected": -6.350647926330566, "step": 5230 }, { "epoch": 2.801806322127446, - "grad_norm": 18.96292537799569, + "grad_norm": 17.413022714160352, "learning_rate": 1.3152288061110517e-08, - "logits/chosen": 0.7624896764755249, - "logits/rejected": 0.8235493898391724, - "logps/chosen": -9.626928329467773, - "logps/rejected": -11.030915260314941, - "loss": 0.3852, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.626928329467773, - "rewards/margins": 1.4039862155914307, - "rewards/rejected": -11.030915260314941, - "semantic_entropy": 0.0014617822598665953, + "logits/chosen": -0.19223091006278992, + "logits/rejected": -0.04316201061010361, + "logps/chosen": -4.843361854553223, + "logps/rejected": -6.240896224975586, + "loss": 0.3724, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.843361854553223, + "rewards/margins": 1.3975343704223633, + "rewards/rejected": -6.240896224975586, "step": 5235 }, { "epoch": 2.804482354908848, - "grad_norm": 21.629798181433536, + "grad_norm": 21.17609056207554, "learning_rate": 1.2799790672495814e-08, - "logits/chosen": 0.8226927518844604, - "logits/rejected": 0.88921058177948, - "logps/chosen": -9.711584091186523, - "logps/rejected": -11.16446590423584, - "loss": 0.3962, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.711584091186523, - "rewards/margins": 1.4528809785842896, - "rewards/rejected": -11.16446590423584, - "semantic_entropy": 0.0014884325210005045, + "logits/chosen": -0.14835722744464874, + "logits/rejected": 0.045030392706394196, + "logps/chosen": -4.93457555770874, + "logps/rejected": -6.316958427429199, + "loss": 0.4054, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.93457555770874, + "rewards/margins": 1.3823829889297485, + "rewards/rejected": -6.316958427429199, "step": 5240 }, { "epoch": 2.807158387690249, - "grad_norm": 25.58854679978555, + "grad_norm": 22.183036334519294, "learning_rate": 1.2452020102448835e-08, - "logits/chosen": 0.844528317451477, - "logits/rejected": 0.8750749826431274, - "logps/chosen": -9.763383865356445, - "logps/rejected": -11.062616348266602, - "loss": 0.4045, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.763383865356445, - "rewards/margins": 1.2992339134216309, - "rewards/rejected": -11.062616348266602, - "semantic_entropy": 0.001275677583180368, + "logits/chosen": -0.10303513705730438, + "logits/rejected": -0.03303191810846329, + "logps/chosen": -4.865030765533447, + "logps/rejected": -6.180552005767822, + "loss": 0.4028, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.865030765533447, + "rewards/margins": 1.3155206441879272, + "rewards/rejected": -6.180552005767822, "step": 5245 }, { "epoch": 2.8098344204716508, - "grad_norm": 30.415412249344016, + "grad_norm": 32.32573504667491, "learning_rate": 1.2108979725041103e-08, - "logits/chosen": 0.7932205200195312, - "logits/rejected": 0.8923565745353699, - "logps/chosen": -9.760113716125488, - "logps/rejected": -11.208516120910645, - "loss": 0.4172, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.760113716125488, - "rewards/margins": 1.4484022855758667, - "rewards/rejected": -11.208516120910645, - "semantic_entropy": 0.0014725803630426526, + "logits/chosen": -0.17047499120235443, + "logits/rejected": -0.007521389983594418, + "logps/chosen": -5.1098713874816895, + "logps/rejected": -6.507616996765137, + "loss": 0.4128, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.1098713874816895, + "rewards/margins": 1.3977460861206055, + "rewards/rejected": -6.507616996765137, "step": 5250 }, { "epoch": 2.8125104532530525, - "grad_norm": 22.204446838947426, + "grad_norm": 21.919650728160775, "learning_rate": 1.1770672868451958e-08, - "logits/chosen": 0.8339746594429016, - "logits/rejected": 0.9121615290641785, - "logps/chosen": -10.030183792114258, - "logps/rejected": -11.423551559448242, - "loss": 0.3705, - "rewards/accuracies": 0.84375, - "rewards/chosen": -10.030183792114258, - "rewards/margins": 1.393368124961853, - "rewards/rejected": -11.423551559448242, - "semantic_entropy": 0.0011341646313667297, + "logits/chosen": -0.12820717692375183, + "logits/rejected": 0.0865853875875473, + "logps/chosen": -5.3525848388671875, + "logps/rejected": -6.725950717926025, + "loss": 0.3864, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.3525848388671875, + "rewards/margins": 1.3733667135238647, + "rewards/rejected": -6.725950717926025, "step": 5255 }, { "epoch": 2.8151864860344538, - "grad_norm": 36.390773136441254, + "grad_norm": 26.25266455002211, "learning_rate": 1.1437102814935872e-08, - "logits/chosen": 0.8048036694526672, - "logits/rejected": 0.8228060603141785, - "logps/chosen": -9.797411918640137, - "logps/rejected": -11.064781188964844, - "loss": 0.4645, + "logits/chosen": -0.09210020303726196, + "logits/rejected": -0.04158976674079895, + "logps/chosen": -5.020191192626953, + "logps/rejected": -6.293526649475098, + "loss": 0.4533, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.797411918640137, - "rewards/margins": 1.267369270324707, - "rewards/rejected": -11.064781188964844, - "semantic_entropy": 0.0013338859425857663, + "rewards/chosen": -5.020191192626953, + "rewards/margins": 1.2733356952667236, + "rewards/rejected": -6.293526649475098, "step": 5260 }, { "epoch": 2.8178625188158555, - "grad_norm": 19.439863848193177, + "grad_norm": 18.17387697590232, "learning_rate": 1.1108272800791018e-08, - "logits/chosen": 0.8065903782844543, - "logits/rejected": 0.8303533792495728, - "logps/chosen": -9.820572853088379, - "logps/rejected": -11.200105667114258, - "loss": 0.3785, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.820572853088379, - "rewards/margins": 1.3795334100723267, - "rewards/rejected": -11.200105667114258, - "semantic_entropy": 0.0013253279030323029, + "logits/chosen": -0.213901549577713, + "logits/rejected": 0.00761891296133399, + "logps/chosen": -5.223311424255371, + "logps/rejected": -6.529093265533447, + "loss": 0.4079, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.223311424255371, + "rewards/margins": 1.3057810068130493, + "rewards/rejected": -6.529093265533447, "step": 5265 }, { "epoch": 2.820538551597257, - "grad_norm": 24.995280474358317, + "grad_norm": 23.39880776986712, "learning_rate": 1.078418601632769e-08, - "logits/chosen": 0.8746574521064758, - "logits/rejected": 0.8969374895095825, - "logps/chosen": -9.832982063293457, - "logps/rejected": -11.270672798156738, - "loss": 0.3779, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.832982063293457, - "rewards/margins": 1.43769109249115, - "rewards/rejected": -11.270672798156738, - "semantic_entropy": 0.0013589839218184352, + "logits/chosen": -0.05190851166844368, + "logits/rejected": 0.06831367313861847, + "logps/chosen": -5.152034759521484, + "logps/rejected": -6.503214359283447, + "loss": 0.3921, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.152034759521484, + "rewards/margins": 1.3511805534362793, + "rewards/rejected": -6.503214359283447, "step": 5270 }, { "epoch": 2.8232145843786585, - "grad_norm": 15.437216876674157, + "grad_norm": 17.35951012375128, "learning_rate": 1.0464845605837159e-08, - "logits/chosen": 0.7981137633323669, - "logits/rejected": 0.8469152450561523, - "logps/chosen": -9.720634460449219, - "logps/rejected": -11.106169700622559, - "loss": 0.3561, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.720634460449219, - "rewards/margins": 1.385535478591919, - "rewards/rejected": -11.106169700622559, - "semantic_entropy": 0.0014761090278625488, + "logits/chosen": -0.09964104741811752, + "logits/rejected": 0.04313405603170395, + "logps/chosen": -5.1447858810424805, + "logps/rejected": -6.442635536193848, + "loss": 0.377, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.1447858810424805, + "rewards/margins": 1.297849416732788, + "rewards/rejected": -6.442635536193848, "step": 5275 }, { "epoch": 2.82589061716006, - "grad_norm": 15.998655549802471, + "grad_norm": 17.88597742751946, "learning_rate": 1.0150254667561642e-08, - "logits/chosen": 0.7983990907669067, - "logits/rejected": 0.8353071212768555, - "logps/chosen": -10.041610717773438, - "logps/rejected": -11.562406539916992, - "loss": 0.377, + "logits/chosen": -0.1049385666847229, + "logits/rejected": 0.09079035371541977, + "logps/chosen": -5.354544639587402, + "logps/rejected": -6.763103485107422, + "loss": 0.3773, "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -10.041610717773438, - "rewards/margins": 1.5207948684692383, - "rewards/rejected": -11.562406539916992, - "semantic_entropy": 0.0010572883766144514, + "rewards/chosen": -5.354544639587402, + "rewards/margins": 1.40855872631073, + "rewards/rejected": -6.763103485107422, "step": 5280 }, { "epoch": 2.828566649941462, - "grad_norm": 27.424504358400647, + "grad_norm": 27.50450873282693, "learning_rate": 9.840416253663719e-09, - "logits/chosen": 0.7955508232116699, - "logits/rejected": 0.8736175298690796, - "logps/chosen": -9.822931289672852, - "logps/rejected": -11.340441703796387, - "loss": 0.3779, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.822931289672852, - "rewards/margins": 1.5175096988677979, - "rewards/rejected": -11.340441703796387, - "semantic_entropy": 0.0011566228931769729, + "logits/chosen": -0.16542509198188782, + "logits/rejected": -0.03766718506813049, + "logps/chosen": -5.01721715927124, + "logps/rejected": -6.487986087799072, + "loss": 0.3922, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.01721715927124, + "rewards/margins": 1.470768928527832, + "rewards/rejected": -6.487986087799072, "step": 5285 }, { "epoch": 2.8312426827228636, - "grad_norm": 27.591040599780438, + "grad_norm": 22.213429918330053, "learning_rate": 9.535333370197074e-09, - "logits/chosen": 0.8231992721557617, - "logits/rejected": 0.8760835528373718, - "logps/chosen": -9.819466590881348, - "logps/rejected": -11.222938537597656, - "loss": 0.4014, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.819466590881348, - "rewards/margins": 1.4034711122512817, - "rewards/rejected": -11.222938537597656, - "semantic_entropy": 0.0014938964741304517, + "logits/chosen": -0.06838265061378479, + "logits/rejected": 0.09782443940639496, + "logps/chosen": -5.169068813323975, + "logps/rejected": -6.565960884094238, + "loss": 0.3821, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.169068813323975, + "rewards/margins": 1.3968923091888428, + "rewards/rejected": -6.565960884094238, "step": 5290 }, { "epoch": 2.833918715504265, - "grad_norm": 18.470072827747238, + "grad_norm": 19.772135488776723, "learning_rate": 9.23500897707713e-09, - "logits/chosen": 0.8005205988883972, - "logits/rejected": 0.8584259748458862, - "logps/chosen": -9.931344985961914, - "logps/rejected": -11.320419311523438, - "loss": 0.4159, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.931344985961914, - "rewards/margins": 1.3890745639801025, - "rewards/rejected": -11.320419311523438, - "semantic_entropy": 0.0011897350195795298, + "logits/chosen": -0.15767113864421844, + "logits/rejected": 0.032631613314151764, + "logps/chosen": -5.245600700378418, + "logps/rejected": -6.654934883117676, + "loss": 0.3918, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.245600700378418, + "rewards/margins": 1.4093341827392578, + "rewards/rejected": -6.654934883117676, "step": 5295 }, { "epoch": 2.8365947482856666, - "grad_norm": 23.14759681156242, + "grad_norm": 25.369553596980023, "learning_rate": 8.939445988052574e-09, - "logits/chosen": 0.7812570929527283, - "logits/rejected": 0.8415404558181763, - "logps/chosen": -9.733675003051758, - "logps/rejected": -11.219903945922852, - "loss": 0.3617, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.733675003051758, - "rewards/margins": 1.4862289428710938, - "rewards/rejected": -11.219903945922852, - "semantic_entropy": 0.001550258370116353, + "logits/chosen": -0.11192308366298676, + "logits/rejected": -0.036247618496418, + "logps/chosen": -4.97711706161499, + "logps/rejected": -6.383062839508057, + "loss": 0.3837, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.97711706161499, + "rewards/margins": 1.4059457778930664, + "rewards/rejected": -6.383062839508057, "step": 5300 }, { "epoch": 2.839270781067068, - "grad_norm": 28.37485360554531, + "grad_norm": 34.58267146613646, "learning_rate": 8.648647270676656e-09, - "logits/chosen": 0.8305708169937134, - "logits/rejected": 0.8369789123535156, - "logps/chosen": -9.811845779418945, - "logps/rejected": -11.167532920837402, - "loss": 0.4223, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.811845779418945, - "rewards/margins": 1.3556877374649048, - "rewards/rejected": -11.167532920837402, - "semantic_entropy": 0.0015380210243165493, + "logits/chosen": -0.077006995677948, + "logits/rejected": 0.02099095657467842, + "logps/chosen": -5.236058712005615, + "logps/rejected": -6.505140781402588, + "loss": 0.4372, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.236058712005615, + "rewards/margins": 1.2690818309783936, + "rewards/rejected": -6.505140781402588, "step": 5305 }, { "epoch": 2.8419468138484696, - "grad_norm": 16.333026482611224, + "grad_norm": 21.90538033697602, "learning_rate": 8.362615646279991e-09, - "logits/chosen": 0.8135038614273071, - "logits/rejected": 0.8544967770576477, - "logps/chosen": -9.836585998535156, - "logps/rejected": -11.551431655883789, - "loss": 0.379, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.836585998535156, - "rewards/margins": 1.7148460149765015, - "rewards/rejected": -11.551431655883789, - "semantic_entropy": 0.0012975989375263453, + "logits/chosen": -0.2337752878665924, + "logits/rejected": -0.02170114777982235, + "logps/chosen": -5.009992599487305, + "logps/rejected": -6.556719779968262, + "loss": 0.4163, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.009992599487305, + "rewards/margins": 1.546726942062378, + "rewards/rejected": -6.556719779968262, "step": 5310 }, { "epoch": 2.8446228466298713, - "grad_norm": 22.628935794259295, + "grad_norm": 32.22805683044439, "learning_rate": 8.081353889942466e-09, - "logits/chosen": 0.8904609680175781, - "logits/rejected": 0.9412251710891724, - "logps/chosen": -9.89416217803955, - "logps/rejected": -11.147021293640137, - "loss": 0.4024, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.89416217803955, - "rewards/margins": 1.2528594732284546, - "rewards/rejected": -11.147021293640137, - "semantic_entropy": 0.001227770815603435, + "logits/chosen": -0.04680415242910385, + "logits/rejected": 0.13405628502368927, + "logps/chosen": -5.078368663787842, + "logps/rejected": -6.320135593414307, + "loss": 0.4213, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.078368663787842, + "rewards/margins": 1.241767168045044, + "rewards/rejected": -6.320135593414307, "step": 5315 }, { "epoch": 2.847298879411273, - "grad_norm": 24.067990062164654, + "grad_norm": 22.73519980870183, "learning_rate": 7.804864730467042e-09, - "logits/chosen": 0.8739885091781616, - "logits/rejected": 0.9077037572860718, - "logps/chosen": -9.861922264099121, - "logps/rejected": -11.29807186126709, - "loss": 0.3558, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.861922264099121, - "rewards/margins": 1.4361498355865479, - "rewards/rejected": -11.29807186126709, - "semantic_entropy": 0.0011800903594121337, + "logits/chosen": -0.05560026317834854, + "logits/rejected": -0.001978851156309247, + "logps/chosen": -5.024258613586426, + "logps/rejected": -6.366459846496582, + "loss": 0.369, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.024258613586426, + "rewards/margins": 1.3422011137008667, + "rewards/rejected": -6.366459846496582, "step": 5320 }, { "epoch": 2.8499749121926743, - "grad_norm": 20.07201861954392, + "grad_norm": 17.631790482611823, "learning_rate": 7.533150850352665e-09, - "logits/chosen": 0.8092811703681946, - "logits/rejected": 0.8977264165878296, - "logps/chosen": -9.86597728729248, - "logps/rejected": -11.442625045776367, - "loss": 0.3673, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.86597728729248, - "rewards/margins": 1.5766479969024658, - "rewards/rejected": -11.442625045776367, - "semantic_entropy": 0.0013031138805672526, + "logits/chosen": -0.11392883956432343, + "logits/rejected": 0.06536897271871567, + "logps/chosen": -4.977337837219238, + "logps/rejected": -6.510616302490234, + "loss": 0.3487, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.977337837219238, + "rewards/margins": 1.5332787036895752, + "rewards/rejected": -6.510616302490234, "step": 5325 }, { "epoch": 2.852650944974076, - "grad_norm": 28.074967041760427, + "grad_norm": 28.67512214532912, "learning_rate": 7.2662148857686175e-09, - "logits/chosen": 0.8255325555801392, - "logits/rejected": 0.8576027154922485, - "logps/chosen": -9.927534103393555, - "logps/rejected": -11.337235450744629, - "loss": 0.4482, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.927534103393555, - "rewards/margins": 1.4097009897232056, - "rewards/rejected": -11.337235450744629, - "semantic_entropy": 0.0012552501866593957, + "logits/chosen": -0.0496702678501606, + "logits/rejected": 0.03874925523996353, + "logps/chosen": -5.020773887634277, + "logps/rejected": -6.442172050476074, + "loss": 0.4189, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.020773887634277, + "rewards/margins": 1.4213987588882446, + "rewards/rejected": -6.442172050476074, "step": 5330 }, { "epoch": 2.8553269777554773, - "grad_norm": 20.375284103879604, + "grad_norm": 16.430757604397645, "learning_rate": 7.0040594265287635e-09, - "logits/chosen": 0.820387065410614, - "logits/rejected": 0.8405435681343079, - "logps/chosen": -9.774995803833008, - "logps/rejected": -10.992179870605469, - "loss": 0.4593, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.774995803833008, - "rewards/margins": 1.2171828746795654, - "rewards/rejected": -10.992179870605469, - "semantic_entropy": 0.001311628962866962, + "logits/chosen": -0.037665605545043945, + "logits/rejected": -0.07690034806728363, + "logps/chosen": -4.881814002990723, + "logps/rejected": -6.030283451080322, + "loss": 0.4513, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.881814002990723, + "rewards/margins": 1.1484688520431519, + "rewards/rejected": -6.030283451080322, "step": 5335 }, { "epoch": 2.858003010536879, - "grad_norm": 20.571805080727156, + "grad_norm": 19.809890115252383, "learning_rate": 6.746687016066566e-09, - "logits/chosen": 0.8561931848526001, - "logits/rejected": 0.9219743609428406, - "logps/chosen": -9.802877426147461, - "logps/rejected": -11.207548141479492, - "loss": 0.4, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.802877426147461, - "rewards/margins": 1.404672384262085, - "rewards/rejected": -11.207548141479492, - "semantic_entropy": 0.0014292590785771608, + "logits/chosen": -0.04789482802152634, + "logits/rejected": 0.03438682481646538, + "logps/chosen": -4.915477752685547, + "logps/rejected": -6.364718914031982, + "loss": 0.3852, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.915477752685547, + "rewards/margins": 1.449240803718567, + "rewards/rejected": -6.364718914031982, "step": 5340 }, { "epoch": 2.8606790433182807, - "grad_norm": 22.031680288537032, + "grad_norm": 20.27537153528591, "learning_rate": 6.494100151410276e-09, - "logits/chosen": 0.7743942141532898, - "logits/rejected": 0.8276403546333313, - "logps/chosen": -9.865171432495117, - "logps/rejected": -11.20588207244873, - "loss": 0.3737, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.865171432495117, - "rewards/margins": 1.3407100439071655, - "rewards/rejected": -11.20588207244873, - "semantic_entropy": 0.0012073902180418372, + "logits/chosen": -0.1857433170080185, + "logits/rejected": -0.008080517873167992, + "logps/chosen": -5.11199951171875, + "logps/rejected": -6.4306640625, + "loss": 0.3832, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.11199951171875, + "rewards/margins": 1.3186638355255127, + "rewards/rejected": -6.4306640625, "step": 5345 }, { "epoch": 2.8633550760996824, - "grad_norm": 23.63055718220825, + "grad_norm": 23.056126418907777, "learning_rate": 6.246301283158728e-09, - "logits/chosen": 0.8746307492256165, - "logits/rejected": 0.895931601524353, - "logps/chosen": -9.761825561523438, - "logps/rejected": -11.010538101196289, - "loss": 0.473, - "rewards/accuracies": 0.75, - "rewards/chosen": -9.761825561523438, - "rewards/margins": 1.2487126588821411, - "rewards/rejected": -11.010538101196289, - "semantic_entropy": 0.001414592145010829, + "logits/chosen": -0.041939567774534225, + "logits/rejected": -0.03831356391310692, + "logps/chosen": -4.9089789390563965, + "logps/rejected": -6.035547733306885, + "loss": 0.4768, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.9089789390563965, + "rewards/margins": 1.12656831741333, + "rewards/rejected": -6.035547733306885, "step": 5350 }, { "epoch": 2.8660311088810837, - "grad_norm": 20.296610881740666, + "grad_norm": 19.40044450648787, "learning_rate": 6.0032928154576944e-09, - "logits/chosen": 0.8312317132949829, - "logits/rejected": 0.8839332461357117, - "logps/chosen": -9.876100540161133, - "logps/rejected": -11.113783836364746, - "loss": 0.4206, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.876100540161133, - "rewards/margins": 1.2376841306686401, - "rewards/rejected": -11.113783836364746, - "semantic_entropy": 0.0015838369727134705, + "logits/chosen": -0.07604765892028809, + "logits/rejected": 0.0007553629693575203, + "logps/chosen": -5.030134201049805, + "logps/rejected": -6.209230899810791, + "loss": 0.4211, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.030134201049805, + "rewards/margins": 1.1790968179702759, + "rewards/rejected": -6.209230899810791, "step": 5355 }, { "epoch": 2.8687071416624854, - "grad_norm": 30.87457108658666, + "grad_norm": 28.533809066761783, "learning_rate": 5.76507710597629e-09, - "logits/chosen": 0.8078063726425171, - "logits/rejected": 0.8501046895980835, - "logps/chosen": -9.82047176361084, - "logps/rejected": -11.10120964050293, - "loss": 0.4325, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.82047176361084, - "rewards/margins": 1.2807366847991943, - "rewards/rejected": -11.10120964050293, - "semantic_entropy": 0.001436726888641715, + "logits/chosen": -0.12218046188354492, + "logits/rejected": 0.06249883025884628, + "logps/chosen": -5.104481220245361, + "logps/rejected": -6.352130889892578, + "loss": 0.4282, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.104481220245361, + "rewards/margins": 1.2476491928100586, + "rewards/rejected": -6.352130889892578, "step": 5360 }, { "epoch": 2.8713831744438867, - "grad_norm": 20.020061107451614, + "grad_norm": 17.016592795173896, "learning_rate": 5.531656465884438e-09, - "logits/chosen": 0.7786573171615601, - "logits/rejected": 0.8030532598495483, - "logps/chosen": -9.810864448547363, - "logps/rejected": -11.275449752807617, - "loss": 0.402, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.810864448547363, - "rewards/margins": 1.4645856618881226, - "rewards/rejected": -11.275449752807617, - "semantic_entropy": 0.0011968390317633748, + "logits/chosen": -0.15425851941108704, + "logits/rejected": -0.008609334006905556, + "logps/chosen": -4.985617160797119, + "logps/rejected": -6.449091911315918, + "loss": 0.3867, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.985617160797119, + "rewards/margins": 1.463474988937378, + "rewards/rejected": -6.449091911315918, "step": 5365 }, { "epoch": 2.8740592072252884, - "grad_norm": 27.208547849442688, + "grad_norm": 24.459665593351385, "learning_rate": 5.303033159830217e-09, - "logits/chosen": 0.8697658777236938, - "logits/rejected": 0.8987666368484497, - "logps/chosen": -9.942388534545898, - "logps/rejected": -11.106074333190918, - "loss": 0.4543, + "logits/chosen": 0.011714700609445572, + "logits/rejected": 0.049465470016002655, + "logps/chosen": -5.1374711990356445, + "logps/rejected": -6.278901100158691, + "loss": 0.457, "rewards/accuracies": 0.78125, - "rewards/chosen": -9.942388534545898, - "rewards/margins": 1.1636863946914673, - "rewards/rejected": -11.106074333190918, - "semantic_entropy": 0.0013244937872514129, + "rewards/chosen": -5.1374711990356445, + "rewards/margins": 1.1414297819137573, + "rewards/rejected": -6.278901100158691, "step": 5370 }, { "epoch": 2.87673524000669, - "grad_norm": 23.175199847112445, + "grad_norm": 20.951592589326737, "learning_rate": 5.079209405917939e-09, - "logits/chosen": 0.7859164476394653, - "logits/rejected": 0.8312576413154602, - "logps/chosen": -9.579522132873535, - "logps/rejected": -11.244876861572266, - "loss": 0.3603, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.579522132873535, - "rewards/margins": 1.665353775024414, - "rewards/rejected": -11.244876861572266, - "semantic_entropy": 0.001594201079569757, + "logits/chosen": -0.09543697535991669, + "logits/rejected": -0.0058655934408307076, + "logps/chosen": -4.832207679748535, + "logps/rejected": -6.465902805328369, + "loss": 0.3708, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.832207679748535, + "rewards/margins": 1.6336956024169922, + "rewards/rejected": -6.465902805328369, "step": 5375 }, { "epoch": 2.879411272788092, - "grad_norm": 19.400301117431837, + "grad_norm": 19.932447011247554, "learning_rate": 4.860187375686664e-09, - "logits/chosen": 0.789514422416687, - "logits/rejected": 0.8606871366500854, - "logps/chosen": -9.77333927154541, - "logps/rejected": -11.256834983825684, - "loss": 0.3748, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.77333927154541, - "rewards/margins": 1.4834961891174316, - "rewards/rejected": -11.256834983825684, - "semantic_entropy": 0.001359016285277903, + "logits/chosen": -0.1664259135723114, + "logits/rejected": 0.07564910501241684, + "logps/chosen": -5.123175144195557, + "logps/rejected": -6.544573783874512, + "loss": 0.3698, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.123175144195557, + "rewards/margins": 1.4213993549346924, + "rewards/rejected": -6.544573783874512, "step": 5380 }, { "epoch": 2.882087305569493, - "grad_norm": 16.661476052195837, + "grad_norm": 17.19812297447756, "learning_rate": 4.64596919408905e-09, - "logits/chosen": 0.8640682101249695, - "logits/rejected": 0.8913204073905945, - "logps/chosen": -9.612969398498535, - "logps/rejected": -11.022314071655273, - "loss": 0.402, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.612969398498535, - "rewards/margins": 1.4093445539474487, - "rewards/rejected": -11.022314071655273, - "semantic_entropy": 0.0015766730066388845, + "logits/chosen": -0.021460913121700287, + "logits/rejected": 0.0474419929087162, + "logps/chosen": -4.9569315910339355, + "logps/rejected": -6.284632205963135, + "loss": 0.3923, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.9569315910339355, + "rewards/margins": 1.3277006149291992, + "rewards/rejected": -6.284632205963135, "step": 5385 }, { "epoch": 2.884763338350895, - "grad_norm": 23.611323349348925, + "grad_norm": 19.00660352208009, "learning_rate": 4.436556939470814e-09, - "logits/chosen": 0.7981586456298828, - "logits/rejected": 0.8657184839248657, - "logps/chosen": -10.07356071472168, - "logps/rejected": -11.186826705932617, - "loss": 0.4737, - "rewards/accuracies": 0.75, - "rewards/chosen": -10.07356071472168, - "rewards/margins": 1.1132649183273315, - "rewards/rejected": -11.186826705932617, - "semantic_entropy": 0.001212230185046792, + "logits/chosen": -0.10958348214626312, + "logits/rejected": 0.04778730124235153, + "logps/chosen": -5.25009822845459, + "logps/rejected": -6.299006462097168, + "loss": 0.4757, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.25009822845459, + "rewards/margins": 1.0489082336425781, + "rewards/rejected": -6.299006462097168, "step": 5390 }, { "epoch": 2.887439371132296, - "grad_norm": 24.769868059281617, + "grad_norm": 18.93304657719946, "learning_rate": 4.23195264355064e-09, - "logits/chosen": 0.688225269317627, - "logits/rejected": 0.7675420641899109, - "logps/chosen": -9.678533554077148, - "logps/rejected": -11.024267196655273, - "loss": 0.4225, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -9.678533554077148, - "rewards/margins": 1.3457330465316772, - "rewards/rejected": -11.024267196655273, - "semantic_entropy": 0.001531310030259192, + "logits/chosen": -0.2551293671131134, + "logits/rejected": -0.02950384095311165, + "logps/chosen": -5.0177106857299805, + "logps/rejected": -6.315464496612549, + "loss": 0.4085, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.0177106857299805, + "rewards/margins": 1.2977533340454102, + "rewards/rejected": -6.315464496612549, "step": 5395 }, { "epoch": 2.890115403913698, - "grad_norm": 23.697079058959545, + "grad_norm": 22.61331918550436, "learning_rate": 4.032158291400245e-09, - "logits/chosen": 0.7804639339447021, - "logits/rejected": 0.8653789758682251, - "logps/chosen": -9.64900016784668, - "logps/rejected": -11.365049362182617, - "loss": 0.329, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -9.64900016784668, - "rewards/margins": 1.7160485982894897, - "rewards/rejected": -11.365049362182617, - "semantic_entropy": 0.0016292607178911567, + "logits/chosen": -0.16071471571922302, + "logits/rejected": 0.10017833858728409, + "logps/chosen": -4.8222880363464355, + "logps/rejected": -6.517637729644775, + "loss": 0.3424, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.8222880363464355, + "rewards/margins": 1.6953496932983398, + "rewards/rejected": -6.517637729644775, "step": 5400 }, { "epoch": 2.8927914366950995, - "grad_norm": 20.18703093788473, + "grad_norm": 17.435921908089636, "learning_rate": 3.837175821425398e-09, - "logits/chosen": 0.8109928369522095, - "logits/rejected": 0.8580228686332703, - "logps/chosen": -9.768526077270508, - "logps/rejected": -11.133955001831055, - "loss": 0.4179, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.768526077270508, - "rewards/margins": 1.3654298782348633, - "rewards/rejected": -11.133955001831055, - "semantic_entropy": 0.0016449004178866744, + "logits/chosen": -0.05047282576560974, + "logits/rejected": 0.01756933331489563, + "logps/chosen": -5.000189781188965, + "logps/rejected": -6.1976799964904785, + "loss": 0.4581, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.000189781188965, + "rewards/margins": 1.1974899768829346, + "rewards/rejected": -6.1976799964904785, "step": 5405 }, { "epoch": 2.8954674694765012, - "grad_norm": 14.944854500276783, + "grad_norm": 14.599639872937415, "learning_rate": 3.6470071253467683e-09, - "logits/chosen": 0.8247249722480774, - "logits/rejected": 0.8406414985656738, - "logps/chosen": -9.951112747192383, - "logps/rejected": -11.39813232421875, - "loss": 0.4228, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.951112747192383, - "rewards/margins": 1.4470199346542358, - "rewards/rejected": -11.39813232421875, - "semantic_entropy": 0.001148298499174416, + "logits/chosen": -0.08340935409069061, + "logits/rejected": 0.02749234437942505, + "logps/chosen": -5.346668720245361, + "logps/rejected": -6.856997013092041, + "loss": 0.4095, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.346668720245361, + "rewards/margins": 1.5103282928466797, + "rewards/rejected": -6.856997013092041, "step": 5410 }, { "epoch": 2.8981435022579025, - "grad_norm": 17.13225055698541, + "grad_norm": 19.61943787372246, "learning_rate": 3.461654048181939e-09, - "logits/chosen": 0.810439944267273, - "logits/rejected": 0.904525637626648, - "logps/chosen": -10.016260147094727, - "logps/rejected": -11.255754470825195, - "loss": 0.4262, + "logits/chosen": -0.12954792380332947, + "logits/rejected": 0.08068902790546417, + "logps/chosen": -5.284794807434082, + "logps/rejected": -6.404184818267822, + "loss": 0.4611, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -10.016260147094727, - "rewards/margins": 1.2394943237304688, - "rewards/rejected": -11.255754470825195, - "semantic_entropy": 0.0010481254430487752, + "rewards/chosen": -5.284794807434082, + "rewards/margins": 1.1193897724151611, + "rewards/rejected": -6.404184818267822, "step": 5415 }, { "epoch": 2.9008195350393042, - "grad_norm": 21.19682630785255, + "grad_norm": 18.009437863257947, "learning_rate": 3.281118388227255e-09, - "logits/chosen": 0.8494071960449219, - "logits/rejected": 0.8823550343513489, - "logps/chosen": -9.834370613098145, - "logps/rejected": -11.027946472167969, - "loss": 0.4672, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -9.834370613098145, - "rewards/margins": 1.193576693534851, - "rewards/rejected": -11.027946472167969, - "semantic_entropy": 0.001247903099283576, + "logits/chosen": -0.0696437731385231, + "logits/rejected": -0.004479380790144205, + "logps/chosen": -5.242532253265381, + "logps/rejected": -6.39145565032959, + "loss": 0.4682, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.242532253265381, + "rewards/margins": 1.148923635482788, + "rewards/rejected": -6.39145565032959, "step": 5420 }, { "epoch": 2.903495567820706, - "grad_norm": 25.992370109808167, + "grad_norm": 21.685463218783983, "learning_rate": 3.1054018970405048e-09, - "logits/chosen": 0.8348292112350464, - "logits/rejected": 0.8578389286994934, - "logps/chosen": -9.816828727722168, - "logps/rejected": -11.25603199005127, - "loss": 0.4049, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.816828727722168, - "rewards/margins": 1.4392026662826538, - "rewards/rejected": -11.25603199005127, - "semantic_entropy": 0.0012755084317177534, + "logits/chosen": -0.06106134504079819, + "logits/rejected": 0.05308028310537338, + "logps/chosen": -5.048851490020752, + "logps/rejected": -6.556333065032959, + "loss": 0.3797, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.048851490020752, + "rewards/margins": 1.5074818134307861, + "rewards/rejected": -6.556333065032959, "step": 5425 }, { "epoch": 2.906171600602107, - "grad_norm": 22.319154102335233, + "grad_norm": 21.61821189004268, "learning_rate": 2.9345062794238207e-09, - "logits/chosen": 0.8351479768753052, - "logits/rejected": 0.9166293144226074, - "logps/chosen": -9.844882011413574, - "logps/rejected": -11.379692077636719, - "loss": 0.35, - "rewards/accuracies": 0.875, - "rewards/chosen": -9.844882011413574, - "rewards/margins": 1.5348093509674072, - "rewards/rejected": -11.379692077636719, - "semantic_entropy": 0.0016486002132296562, + "logits/chosen": -0.11096260696649551, + "logits/rejected": 0.0796138197183609, + "logps/chosen": -5.070502281188965, + "logps/rejected": -6.48342752456665, + "loss": 0.382, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.070502281188965, + "rewards/margins": 1.412925362586975, + "rewards/rejected": -6.48342752456665, "step": 5430 }, { "epoch": 2.908847633383509, - "grad_norm": 19.717507438885043, + "grad_norm": 20.629784722229324, "learning_rate": 2.7684331934072492e-09, - "logits/chosen": 0.7874764204025269, - "logits/rejected": 0.8273738026618958, - "logps/chosen": -9.670753479003906, - "logps/rejected": -11.1673583984375, - "loss": 0.3777, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.670753479003906, - "rewards/margins": 1.4966033697128296, - "rewards/rejected": -11.1673583984375, - "semantic_entropy": 0.001804637722671032, + "logits/chosen": -0.18221724033355713, + "logits/rejected": -0.09126058220863342, + "logps/chosen": -4.96766996383667, + "logps/rejected": -6.4256439208984375, + "loss": 0.3726, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.96766996383667, + "rewards/margins": 1.4579739570617676, + "rewards/rejected": -6.4256439208984375, "step": 5435 }, { "epoch": 2.9115236661649107, - "grad_norm": 20.47002458984704, + "grad_norm": 17.033069626855507, "learning_rate": 2.6071842502326526e-09, - "logits/chosen": 0.8280852437019348, - "logits/rejected": 0.8752776980400085, - "logps/chosen": -9.854182243347168, - "logps/rejected": -11.047213554382324, - "loss": 0.4253, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.854182243347168, - "rewards/margins": 1.1930307149887085, - "rewards/rejected": -11.047213554382324, - "semantic_entropy": 0.0011610215296968818, + "logits/chosen": -0.15713807940483093, + "logits/rejected": -0.008741682395339012, + "logps/chosen": -5.051283836364746, + "logps/rejected": -6.296348571777344, + "loss": 0.4055, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.051283836364746, + "rewards/margins": 1.2450649738311768, + "rewards/rejected": -6.296348571777344, "step": 5440 }, { "epoch": 2.9141996989463124, - "grad_norm": 27.24590899012989, + "grad_norm": 28.18701518191439, "learning_rate": 2.450761014337888e-09, - "logits/chosen": 0.8899133801460266, - "logits/rejected": 0.9149841070175171, - "logps/chosen": -9.686140060424805, - "logps/rejected": -11.125383377075195, - "loss": 0.4581, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -9.686140060424805, - "rewards/margins": 1.4392426013946533, - "rewards/rejected": -11.125383377075195, - "semantic_entropy": 0.0013887417735531926, + "logits/chosen": 0.032346032559871674, + "logits/rejected": 0.10855402052402496, + "logps/chosen": -4.835173606872559, + "logps/rejected": -6.3008713722229, + "loss": 0.4366, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.835173606872559, + "rewards/margins": 1.4656974077224731, + "rewards/rejected": -6.3008713722229, "step": 5445 }, { "epoch": 2.9168757317277136, - "grad_norm": 30.201566273141413, + "grad_norm": 21.64422190910182, "learning_rate": 2.299165003341985e-09, - "logits/chosen": 0.8797470331192017, - "logits/rejected": 0.9165847897529602, - "logps/chosen": -9.838947296142578, - "logps/rejected": -11.165335655212402, - "loss": 0.4477, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.838947296142578, - "rewards/margins": 1.3263883590698242, - "rewards/rejected": -11.165335655212402, - "semantic_entropy": 0.0014228606596589088, + "logits/chosen": 0.03410804271697998, + "logits/rejected": 0.11431734263896942, + "logps/chosen": -5.048746109008789, + "logps/rejected": -6.37112283706665, + "loss": 0.4061, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.048746109008789, + "rewards/margins": 1.3223768472671509, + "rewards/rejected": -6.37112283706665, "step": 5450 }, { "epoch": 2.9195517645091154, - "grad_norm": 21.66371324186062, + "grad_norm": 23.138921983841502, "learning_rate": 2.1523976880299945e-09, - "logits/chosen": 0.7495226263999939, - "logits/rejected": 0.8447777032852173, - "logps/chosen": -9.877795219421387, - "logps/rejected": -10.990615844726562, - "loss": 0.4624, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -9.877795219421387, - "rewards/margins": 1.1128205060958862, - "rewards/rejected": -10.990615844726562, - "semantic_entropy": 0.001258770003914833, + "logits/chosen": -0.15719352662563324, + "logits/rejected": 0.01195734366774559, + "logps/chosen": -5.11898136138916, + "logps/rejected": -6.24781608581543, + "loss": 0.4524, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.11898136138916, + "rewards/margins": 1.12883460521698, + "rewards/rejected": -6.24781608581543, "step": 5455 }, { "epoch": 2.9222277972905166, - "grad_norm": 19.131164191986134, + "grad_norm": 14.375810601384593, "learning_rate": 2.010460492339161e-09, - "logits/chosen": 0.7976396083831787, - "logits/rejected": 0.8642821311950684, - "logps/chosen": -9.621539115905762, - "logps/rejected": -11.064565658569336, - "loss": 0.3859, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.621539115905762, - "rewards/margins": 1.4430257081985474, - "rewards/rejected": -11.064565658569336, - "semantic_entropy": 0.0015615615993738174, + "logits/chosen": -0.12225662171840668, + "logits/rejected": 0.03143675997853279, + "logps/chosen": -4.849614143371582, + "logps/rejected": -6.263222694396973, + "loss": 0.4034, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.849614143371582, + "rewards/margins": 1.4136087894439697, + "rewards/rejected": -6.263222694396973, "step": 5460 }, { "epoch": 2.9249038300719183, - "grad_norm": 16.998983409706852, + "grad_norm": 15.747197976201871, "learning_rate": 1.8733547933446614e-09, - "logits/chosen": 0.8069744110107422, - "logits/rejected": 0.8920931816101074, - "logps/chosen": -9.918710708618164, - "logps/rejected": -11.107327461242676, - "loss": 0.4295, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.918710708618164, - "rewards/margins": 1.1886180639266968, - "rewards/rejected": -11.107327461242676, - "semantic_entropy": 0.001179686514660716, + "logits/chosen": -0.15932923555374146, + "logits/rejected": 0.05145525932312012, + "logps/chosen": -5.191567420959473, + "logps/rejected": -6.349931716918945, + "loss": 0.4502, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -5.191567420959473, + "rewards/margins": 1.158363938331604, + "rewards/rejected": -6.349931716918945, "step": 5465 }, { "epoch": 2.92757986285332, - "grad_norm": 34.20627907884556, + "grad_norm": 33.79075762897427, "learning_rate": 1.7410819212467231e-09, - "logits/chosen": 0.8224443197250366, - "logits/rejected": 0.8717595338821411, - "logps/chosen": -9.855988502502441, - "logps/rejected": -11.113961219787598, - "loss": 0.4348, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.855988502502441, - "rewards/margins": 1.2579724788665771, - "rewards/rejected": -11.113961219787598, - "semantic_entropy": 0.0013345398474484682, + "logits/chosen": -0.08193556219339371, + "logits/rejected": 0.0048525454476475716, + "logps/chosen": -5.074016094207764, + "logps/rejected": -6.189368724822998, + "loss": 0.4527, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.074016094207764, + "rewards/margins": 1.1153522729873657, + "rewards/rejected": -6.189368724822998, "step": 5470 }, { "epoch": 2.9302558956347218, - "grad_norm": 21.07119537810863, + "grad_norm": 22.334958887111743, "learning_rate": 1.613643159357192e-09, - "logits/chosen": 0.8732544183731079, - "logits/rejected": 0.8522858619689941, - "logps/chosen": -9.735010147094727, - "logps/rejected": -10.999938011169434, - "loss": 0.4045, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.735010147094727, - "rewards/margins": 1.2649286985397339, - "rewards/rejected": -10.999938011169434, - "semantic_entropy": 0.0016018247697502375, + "logits/chosen": -0.054608434438705444, + "logits/rejected": -0.0951676219701767, + "logps/chosen": -4.858354568481445, + "logps/rejected": -6.002394676208496, + "loss": 0.4433, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.858354568481445, + "rewards/margins": 1.1440402269363403, + "rewards/rejected": -6.002394676208496, "step": 5475 }, { "epoch": 2.932931928416123, - "grad_norm": 22.6554353032534, + "grad_norm": 21.70771458096057, "learning_rate": 1.4910397440875967e-09, - "logits/chosen": 0.795121431350708, - "logits/rejected": 0.8548393249511719, - "logps/chosen": -9.826266288757324, - "logps/rejected": -11.189390182495117, - "loss": 0.4163, + "logits/chosen": -0.07198528945446014, + "logits/rejected": 0.043470971286296844, + "logps/chosen": -5.215410232543945, + "logps/rejected": -6.580199241638184, + "loss": 0.4077, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.826266288757324, - "rewards/margins": 1.363124132156372, - "rewards/rejected": -11.189390182495117, - "semantic_entropy": 0.0013476324966177344, + "rewards/chosen": -5.215410232543945, + "rewards/margins": 1.3647887706756592, + "rewards/rejected": -6.580199241638184, "step": 5480 }, { "epoch": 2.9356079611975248, - "grad_norm": 25.149551686278766, + "grad_norm": 20.363656643715917, "learning_rate": 1.3732728649368253e-09, - "logits/chosen": 0.8276500701904297, - "logits/rejected": 0.8823320269584656, - "logps/chosen": -9.684768676757812, - "logps/rejected": -10.858797073364258, - "loss": 0.4437, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.684768676757812, - "rewards/margins": 1.1740278005599976, - "rewards/rejected": -10.858797073364258, - "semantic_entropy": 0.0018609801772981882, + "logits/chosen": -0.04140637442469597, + "logits/rejected": 0.1400243639945984, + "logps/chosen": -4.853867530822754, + "logps/rejected": -6.051516532897949, + "loss": 0.4064, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.853867530822754, + "rewards/margins": 1.1976487636566162, + "rewards/rejected": -6.051516532897949, "step": 5485 }, { "epoch": 2.938283993978926, - "grad_norm": 25.88498942606627, + "grad_norm": 20.568240075290966, "learning_rate": 1.260343664479524e-09, - "logits/chosen": 0.7547510862350464, - "logits/rejected": 0.7992917895317078, - "logps/chosen": -9.716946601867676, - "logps/rejected": -10.954937934875488, - "loss": 0.4331, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.716946601867676, - "rewards/margins": 1.2379915714263916, - "rewards/rejected": -10.954937934875488, - "semantic_entropy": 0.0013116684276610613, + "logits/chosen": -0.10832848399877548, + "logits/rejected": -0.029571712017059326, + "logps/chosen": -4.894542694091797, + "logps/rejected": -6.259955406188965, + "loss": 0.3902, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.894542694091797, + "rewards/margins": 1.3654124736785889, + "rewards/rejected": -6.259955406188965, "step": 5490 }, { "epoch": 2.9409600267603278, - "grad_norm": 18.9946324561305, + "grad_norm": 15.019386194455882, "learning_rate": 1.1522532383554384e-09, - "logits/chosen": 0.8538810014724731, - "logits/rejected": 0.9081370234489441, - "logps/chosen": -9.737415313720703, - "logps/rejected": -11.309637069702148, - "loss": 0.3575, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -9.737415313720703, - "rewards/margins": 1.5722216367721558, - "rewards/rejected": -11.309637069702148, - "semantic_entropy": 0.0014735187869518995, + "logits/chosen": -0.14992813766002655, + "logits/rejected": 0.060244105756282806, + "logps/chosen": -4.905772686004639, + "logps/rejected": -6.47158670425415, + "loss": 0.3452, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.905772686004639, + "rewards/margins": 1.5658137798309326, + "rewards/rejected": -6.47158670425415, "step": 5495 }, { "epoch": 2.9436360595417295, - "grad_norm": 18.123811698473638, + "grad_norm": 17.56136885369219, "learning_rate": 1.049002635258256e-09, - "logits/chosen": 0.8666974902153015, - "logits/rejected": 0.9071024656295776, - "logps/chosen": -9.86131763458252, - "logps/rejected": -11.140911102294922, + "logits/chosen": -0.023222994059324265, + "logits/rejected": 0.0903361439704895, + "logps/chosen": -5.119718074798584, + "logps/rejected": -6.367912769317627, "loss": 0.4182, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.86131763458252, - "rewards/margins": 1.2795933485031128, - "rewards/rejected": -11.140911102294922, - "semantic_entropy": 0.0012588893296197057, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.119718074798584, + "rewards/margins": 1.248194694519043, + "rewards/rejected": -6.367912769317627, "step": 5500 }, { "epoch": 2.946312092323131, - "grad_norm": 28.867054705404758, + "grad_norm": 26.314980891432334, "learning_rate": 9.505928569258358e-10, - "logits/chosen": 0.828734278678894, - "logits/rejected": 0.8480997085571289, - "logps/chosen": -9.789422988891602, - "logps/rejected": -11.044143676757812, - "loss": 0.4458, - "rewards/accuracies": 0.78125, - "rewards/chosen": -9.789422988891602, - "rewards/margins": 1.2547214031219482, - "rewards/rejected": -11.044143676757812, - "semantic_entropy": 0.0014515508664771914, + "logits/chosen": -0.08501719683408737, + "logits/rejected": -0.052177123725414276, + "logps/chosen": -4.957799911499023, + "logps/rejected": -6.24397611618042, + "loss": 0.4067, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.957799911499023, + "rewards/margins": 1.2861764430999756, + "rewards/rejected": -6.24397611618042, "step": 5505 }, { "epoch": 2.9489881251045325, - "grad_norm": 20.361191522004823, + "grad_norm": 20.476304181561787, "learning_rate": 8.57024858130273e-10, - "logits/chosen": 0.8103793859481812, - "logits/rejected": 0.8867511749267578, - "logps/chosen": -9.793035507202148, - "logps/rejected": -11.505876541137695, - "loss": 0.3433, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -9.793035507202148, - "rewards/margins": 1.7128407955169678, - "rewards/rejected": -11.505876541137695, - "semantic_entropy": 0.0011786060640588403, + "logits/chosen": -0.12162284553050995, + "logits/rejected": 0.03268551826477051, + "logps/chosen": -5.015363693237305, + "logps/rejected": -6.568727970123291, + "loss": 0.375, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.015363693237305, + "rewards/margins": 1.5533647537231445, + "rewards/rejected": -6.568727970123291, "step": 5510 }, { "epoch": 2.951664157885934, - "grad_norm": 25.4932234461195, + "grad_norm": 19.209953741217777, "learning_rate": 7.682995466686826e-10, - "logits/chosen": 0.7820402383804321, - "logits/rejected": 0.8293735384941101, - "logps/chosen": -9.81843376159668, - "logps/rejected": -11.206514358520508, - "loss": 0.4035, + "logits/chosen": -0.17776979506015778, + "logits/rejected": -0.03494267538189888, + "logps/chosen": -5.030096530914307, + "logps/rejected": -6.362129211425781, + "loss": 0.4016, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -9.81843376159668, - "rewards/margins": 1.3880798816680908, - "rewards/rejected": -11.206514358520508, - "semantic_entropy": 0.001364008872769773, + "rewards/chosen": -5.030096530914307, + "rewards/margins": 1.3320331573486328, + "rewards/rejected": -6.362129211425781, "step": 5515 }, { "epoch": 2.9543401906673354, - "grad_norm": 22.786471299719448, + "grad_norm": 21.810120786012032, "learning_rate": 6.844177833543741e-10, - "logits/chosen": 0.8798080682754517, - "logits/rejected": 0.8902498483657837, - "logps/chosen": -9.707275390625, - "logps/rejected": -11.065264701843262, - "loss": 0.3739, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.707275390625, - "rewards/margins": 1.3579896688461304, - "rewards/rejected": -11.065264701843262, - "semantic_entropy": 0.0014018730726093054, + "logits/chosen": -0.08294065296649933, + "logits/rejected": -0.011479432694613934, + "logps/chosen": -5.019160747528076, + "logps/rejected": -6.254429817199707, + "loss": 0.4112, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.019160747528076, + "rewards/margins": 1.2352688312530518, + "rewards/rejected": -6.254429817199707, "step": 5520 }, { "epoch": 2.957016223448737, - "grad_norm": 24.344487883516077, + "grad_norm": 20.903702039664246, "learning_rate": 6.053803820087467e-10, - "logits/chosen": 0.8397024273872375, - "logits/rejected": 0.9295798540115356, - "logps/chosen": -9.954577445983887, - "logps/rejected": -11.368110656738281, - "loss": 0.4164, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.954577445983887, - "rewards/margins": 1.4135328531265259, - "rewards/rejected": -11.368110656738281, - "semantic_entropy": 0.0010031659621745348, + "logits/chosen": -0.07538093626499176, + "logits/rejected": 0.07659383118152618, + "logps/chosen": -5.180552959442139, + "logps/rejected": -6.571086883544922, + "loss": 0.4069, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.180552959442139, + "rewards/margins": 1.390533685684204, + "rewards/rejected": -6.571086883544922, "step": 5525 }, { "epoch": 2.959692256230139, - "grad_norm": 22.61990267466979, + "grad_norm": 18.08445927839968, "learning_rate": 5.311881094528514e-10, - "logits/chosen": 0.8053072094917297, - "logits/rejected": 0.8756014108657837, - "logps/chosen": -10.003788948059082, - "logps/rejected": -11.16923713684082, - "loss": 0.4491, - "rewards/accuracies": 0.75, - "rewards/chosen": -10.003788948059082, - "rewards/margins": 1.1654479503631592, - "rewards/rejected": -11.16923713684082, - "semantic_entropy": 0.001166566857136786, + "logits/chosen": -0.15429647266864777, + "logits/rejected": 0.06284980475902557, + "logps/chosen": -5.253418922424316, + "logps/rejected": -6.375418663024902, + "loss": 0.4505, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.253418922424316, + "rewards/margins": 1.121999979019165, + "rewards/rejected": -6.375418663024902, "step": 5530 }, { "epoch": 2.9623682890115406, - "grad_norm": 25.594286670666698, + "grad_norm": 26.89532398221362, "learning_rate": 4.6184168550050806e-10, - "logits/chosen": 0.8106497526168823, - "logits/rejected": 0.8661069869995117, - "logps/chosen": -9.88862419128418, - "logps/rejected": -11.237882614135742, - "loss": 0.4061, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.88862419128418, - "rewards/margins": 1.3492584228515625, - "rewards/rejected": -11.237882614135742, - "semantic_entropy": 0.0012052215170115232, + "logits/chosen": -0.1020139828324318, + "logits/rejected": -0.04405033960938454, + "logps/chosen": -5.085295677185059, + "logps/rejected": -6.340322017669678, + "loss": 0.4469, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.085295677185059, + "rewards/margins": 1.2550262212753296, + "rewards/rejected": -6.340322017669678, "step": 5535 }, { "epoch": 2.965044321792942, - "grad_norm": 22.658593939455915, + "grad_norm": 20.841693713757003, "learning_rate": 3.973417829510328e-10, - "logits/chosen": 0.7906460762023926, - "logits/rejected": 0.8491800427436829, - "logps/chosen": -9.941095352172852, - "logps/rejected": -11.274066925048828, - "loss": 0.4179, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.941095352172852, - "rewards/margins": 1.332972526550293, - "rewards/rejected": -11.274066925048828, - "semantic_entropy": 0.0011057687224820256, + "logits/chosen": -0.16356001794338226, + "logits/rejected": -0.03581715375185013, + "logps/chosen": -5.233830451965332, + "logps/rejected": -6.557257652282715, + "loss": 0.4063, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.233830451965332, + "rewards/margins": 1.3234273195266724, + "rewards/rejected": -6.557257652282715, "step": 5540 }, { "epoch": 2.9677203545743436, - "grad_norm": 22.045194562461553, + "grad_norm": 27.1761320340176, "learning_rate": 3.3768902758274377e-10, - "logits/chosen": 0.847141444683075, - "logits/rejected": 0.885520339012146, - "logps/chosen": -9.864678382873535, - "logps/rejected": -11.155853271484375, - "loss": 0.4141, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -9.864678382873535, - "rewards/margins": 1.2911745309829712, - "rewards/rejected": -11.155853271484375, - "semantic_entropy": 0.0010961454827338457, + "logits/chosen": -0.09966840595006943, + "logits/rejected": 0.0162192489951849, + "logps/chosen": -5.083858489990234, + "logps/rejected": -6.2742018699646, + "loss": 0.4337, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.083858489990234, + "rewards/margins": 1.1903434991836548, + "rewards/rejected": -6.2742018699646, "step": 5545 }, { "epoch": 2.970396387355745, - "grad_norm": 16.922925357956778, + "grad_norm": 16.16890851162645, "learning_rate": 2.8288399814691e-10, - "logits/chosen": 0.8648706674575806, - "logits/rejected": 0.9044130444526672, - "logps/chosen": -9.688464164733887, - "logps/rejected": -10.932621955871582, - "loss": 0.4164, + "logits/chosen": -0.036839328706264496, + "logits/rejected": 0.0087259067222476, + "logps/chosen": -4.835643768310547, + "logps/rejected": -6.081192493438721, + "loss": 0.3905, "rewards/accuracies": 0.8125, - "rewards/chosen": -9.688464164733887, - "rewards/margins": 1.2441574335098267, - "rewards/rejected": -10.932621955871582, - "semantic_entropy": 0.0013283784501254559, + "rewards/chosen": -4.835643768310547, + "rewards/margins": 1.2455488443374634, + "rewards/rejected": -6.081192493438721, "step": 5550 }, { "epoch": 2.9730724201371466, - "grad_norm": 25.775353395913132, + "grad_norm": 21.857007911422443, "learning_rate": 2.3292722636220066e-10, - "logits/chosen": 0.7877558469772339, - "logits/rejected": 0.8715565800666809, - "logps/chosen": -9.736814498901367, - "logps/rejected": -11.426549911499023, - "loss": 0.3466, - "rewards/accuracies": 0.84375, - "rewards/chosen": -9.736814498901367, - "rewards/margins": 1.6897351741790771, - "rewards/rejected": -11.426549911499023, - "semantic_entropy": 0.001374770887196064, + "logits/chosen": -0.13308118283748627, + "logits/rejected": 0.04274693876504898, + "logps/chosen": -5.085918426513672, + "logps/rejected": -6.537113189697266, + "loss": 0.3861, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.085918426513672, + "rewards/margins": 1.4511945247650146, + "rewards/rejected": -6.537113189697266, "step": 5555 }, { "epoch": 2.9757484529185483, - "grad_norm": 24.401341154573725, + "grad_norm": 22.16959748035582, "learning_rate": 1.8781919690946668e-10, - "logits/chosen": 0.7978461980819702, - "logits/rejected": 0.8458053469657898, - "logps/chosen": -9.92179012298584, - "logps/rejected": -11.083666801452637, - "loss": 0.4448, - "rewards/accuracies": 0.8125, - "rewards/chosen": -9.92179012298584, - "rewards/margins": 1.1618760824203491, - "rewards/rejected": -11.083666801452637, - "semantic_entropy": 0.001205159118399024, + "logits/chosen": -0.04707597941160202, + "logits/rejected": -0.008806167170405388, + "logps/chosen": -5.048376083374023, + "logps/rejected": -6.145014762878418, + "loss": 0.4667, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -5.048376083374023, + "rewards/margins": 1.0966380834579468, + "rewards/rejected": -6.145014762878418, "step": 5560 }, { "epoch": 2.97842448569995, - "grad_norm": 23.987542667982627, + "grad_norm": 21.608370595486402, "learning_rate": 1.4756034742696711e-10, - "logits/chosen": 0.8300431370735168, - "logits/rejected": 0.9039738774299622, - "logps/chosen": -9.867820739746094, - "logps/rejected": -11.240914344787598, - "loss": 0.4123, + "logits/chosen": -0.12347067892551422, + "logits/rejected": -0.0023892566096037626, + "logps/chosen": -5.1615118980407715, + "logps/rejected": -6.494134426116943, + "loss": 0.4252, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.867820739746094, - "rewards/margins": 1.3730926513671875, - "rewards/rejected": -11.240914344787598, - "semantic_entropy": 0.0011399075156077743, + "rewards/chosen": -5.1615118980407715, + "rewards/margins": 1.3326222896575928, + "rewards/rejected": -6.494134426116943, "step": 5565 }, { "epoch": 2.9811005184813513, - "grad_norm": 20.018421831607807, + "grad_norm": 17.228163389780708, "learning_rate": 1.12151068506261e-10, - "logits/chosen": 0.8630874752998352, - "logits/rejected": 0.9077743291854858, - "logps/chosen": -9.715357780456543, - "logps/rejected": -11.356972694396973, - "loss": 0.3585, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -9.715357780456543, - "rewards/margins": 1.6416149139404297, - "rewards/rejected": -11.356972694396973, - "semantic_entropy": 0.0017618630081415176, + "logits/chosen": -0.10240642726421356, + "logits/rejected": 0.034067101776599884, + "logps/chosen": -4.943631649017334, + "logps/rejected": -6.6033735275268555, + "loss": 0.3624, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.943631649017334, + "rewards/margins": 1.6597416400909424, + "rewards/rejected": -6.6033735275268555, "step": 5570 }, { "epoch": 2.983776551262753, - "grad_norm": 19.995404454240774, + "grad_norm": 21.997484193464704, "learning_rate": 8.159170368826629e-11, - "logits/chosen": 0.8361412882804871, - "logits/rejected": 0.8893247842788696, - "logps/chosen": -9.477738380432129, - "logps/rejected": -10.875368118286133, - "loss": 0.4263, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.477738380432129, - "rewards/margins": 1.3976287841796875, - "rewards/rejected": -10.875368118286133, - "semantic_entropy": 0.001689505996182561, + "logits/chosen": -0.12148864567279816, + "logits/rejected": 0.03400012105703354, + "logps/chosen": -4.589667320251465, + "logps/rejected": -5.928946495056152, + "loss": 0.4435, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.589667320251465, + "rewards/margins": 1.3392789363861084, + "rewards/rejected": -5.928946495056152, "step": 5575 }, { "epoch": 2.9864525840441547, - "grad_norm": 25.119002068756046, + "grad_norm": 22.49638505152311, "learning_rate": 5.588254946015114e-11, - "logits/chosen": 0.8052582740783691, - "logits/rejected": 0.8892769813537598, - "logps/chosen": -9.728483200073242, - "logps/rejected": -11.229433059692383, - "loss": 0.3915, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -9.728483200073242, - "rewards/margins": 1.5009489059448242, - "rewards/rejected": -11.229433059692383, - "semantic_entropy": 0.0017851864686235785, + "logits/chosen": -0.18121260404586792, + "logits/rejected": 0.07925824820995331, + "logps/chosen": -5.0184006690979, + "logps/rejected": -6.358648300170898, + "loss": 0.4067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.0184006690979, + "rewards/margins": 1.3402475118637085, + "rewards/rejected": -6.358648300170898, "step": 5580 }, { "epoch": 2.989128616825556, - "grad_norm": 17.38170767201798, + "grad_norm": 16.229481882995472, "learning_rate": 3.502385525216978e-11, - "logits/chosen": 0.7621601819992065, - "logits/rejected": 0.8389317393302917, - "logps/chosen": -9.730030059814453, - "logps/rejected": -11.127847671508789, - "loss": 0.3831, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -9.730030059814453, - "rewards/margins": 1.3978168964385986, - "rewards/rejected": -11.127847671508789, - "semantic_entropy": 0.0015055348630994558, + "logits/chosen": -0.16574475169181824, + "logits/rejected": 0.009163349866867065, + "logps/chosen": -4.97738790512085, + "logps/rejected": -6.442929744720459, + "loss": 0.3664, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.97738790512085, + "rewards/margins": 1.465541958808899, + "rewards/rejected": -6.442929744720459, "step": 5585 }, { "epoch": 2.9918046496069577, - "grad_norm": 18.851089955324714, + "grad_norm": 18.202827663684207, "learning_rate": 1.901582343555308e-11, - "logits/chosen": 0.8362342119216919, - "logits/rejected": 0.894599437713623, - "logps/chosen": -9.942944526672363, - "logps/rejected": -11.178738594055176, - "loss": 0.4489, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -9.942944526672363, - "rewards/margins": 1.2357933521270752, - "rewards/rejected": -11.178738594055176, - "semantic_entropy": 0.001227195025421679, + "logits/chosen": -0.07705139368772507, + "logits/rejected": 0.0040489910170435905, + "logps/chosen": -5.141321659088135, + "logps/rejected": -6.442171573638916, + "loss": 0.4192, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.141321659088135, + "rewards/margins": 1.3008501529693604, + "rewards/rejected": -6.442171573638916, "step": 5590 }, { "epoch": 2.9944806823883594, - "grad_norm": 33.481853049291296, + "grad_norm": 33.81771205737403, "learning_rate": 7.858609320232634e-12, - "logits/chosen": 0.8312174677848816, - "logits/rejected": 0.907203197479248, - "logps/chosen": -9.739818572998047, - "logps/rejected": -11.097038269042969, - "loss": 0.435, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -9.739818572998047, - "rewards/margins": 1.3572200536727905, - "rewards/rejected": -11.097038269042969, - "semantic_entropy": 0.0013708441983908415, + "logits/chosen": -0.12298817932605743, + "logits/rejected": 0.06417442113161087, + "logps/chosen": -4.815728187561035, + "logps/rejected": -6.2156243324279785, + "loss": 0.3802, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.815728187561035, + "rewards/margins": 1.3998963832855225, + "rewards/rejected": -6.2156243324279785, "step": 5595 }, { "epoch": 2.9971567151697607, - "grad_norm": 32.77365513835394, + "grad_norm": 22.710624361826266, "learning_rate": 1.5523211535639624e-12, - "logits/chosen": 0.8438766598701477, - "logits/rejected": 0.876266360282898, - "logps/chosen": -9.731843948364258, - "logps/rejected": -11.251733779907227, - "loss": 0.4023, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -9.731843948364258, - "rewards/margins": 1.5198904275894165, - "rewards/rejected": -11.251733779907227, - "semantic_entropy": 0.001664994633756578, + "logits/chosen": -0.122617207467556, + "logits/rejected": 0.007517233490943909, + "logps/chosen": -4.989627838134766, + "logps/rejected": -6.593759059906006, + "loss": 0.4044, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.989627838134766, + "rewards/margins": 1.6041316986083984, + "rewards/rejected": -6.593759059906006, "step": 5600 }, { "epoch": 2.9971567151697607, - "eval_logits/chosen": 0.9163224101066589, - "eval_logits/rejected": 0.9576993584632874, - "eval_logps/chosen": -9.975739479064941, - "eval_logps/rejected": -11.105352401733398, - "eval_loss": 0.52450031042099, - "eval_rewards/accuracies": 0.7240356206893921, - "eval_rewards/chosen": -9.975739479064941, - "eval_rewards/margins": 1.1296132802963257, - "eval_rewards/rejected": -11.105352401733398, - "eval_runtime": 35.057, - "eval_samples_per_second": 38.366, - "eval_semantic_entropy": 0.0012647128896787763, - "eval_steps_per_second": 9.613, + "eval_logits/chosen": 0.23467637598514557, + "eval_logits/rejected": 0.3310812711715698, + "eval_logps/chosen": -5.230490684509277, + "eval_logps/rejected": -6.346008777618408, + "eval_loss": 0.525705099105835, + "eval_rewards/accuracies": 0.7284866571426392, + "eval_rewards/chosen": -5.230490684509277, + "eval_rewards/margins": 1.1155189275741577, + "eval_rewards/rejected": -6.346008777618408, + "eval_runtime": 40.4465, + "eval_samples_per_second": 33.254, + "eval_steps_per_second": 8.332, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, - "train_loss": 0.5450739759491819, - "train_runtime": 29046.9509, - "train_samples_per_second": 6.175, - "train_steps_per_second": 0.193 + "train_loss": 0.5174550147332267, + "train_runtime": 29971.1802, + "train_samples_per_second": 5.985, + "train_steps_per_second": 0.187 } ], "logging_steps": 5,