{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998638529611981, "eval_steps": 500, "global_step": 3672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002722940776038121, "grad_norm": 2.365647699760686, "learning_rate": 5.434782608695653e-07, "logits/chosen": -0.8060563206672668, "logits/rejected": -0.5449127554893494, "logps/chosen": -397.73370361328125, "logps/rejected": -461.0686340332031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.002722940776038121, "grad_norm": 2.117870186084684, "learning_rate": 5.4347826086956525e-06, "logits/chosen": -1.0161805152893066, "logits/rejected": -0.9247782230377197, "logps/chosen": -339.845947265625, "logps/rejected": -335.201416015625, "loss": 0.6926, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.0008954557124525309, "rewards/margins": 0.0016563390381634235, "rewards/rejected": -0.0007608832092955709, "step": 10 }, { "epoch": 0.005445881552076242, "grad_norm": 2.3050788819778347, "learning_rate": 1.0869565217391305e-05, "logits/chosen": -1.1553447246551514, "logits/rejected": -1.1530015468597412, "logps/chosen": -319.328369140625, "logps/rejected": -281.5868225097656, "loss": 0.6841, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.025853103026747704, "rewards/margins": 0.01720840111374855, "rewards/rejected": 0.008644700981676579, "step": 20 }, { "epoch": 0.008168822328114363, "grad_norm": 2.018186008036389, "learning_rate": 1.630434782608696e-05, "logits/chosen": -1.2003402709960938, "logits/rejected": -1.0625836849212646, "logps/chosen": -297.01800537109375, "logps/rejected": -293.5678405761719, "loss": 0.6398, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.15041035413742065, "rewards/margins": 0.10567835718393326, "rewards/rejected": 0.0447319932281971, "step": 30 }, { "epoch": 0.010891763104152484, "grad_norm": 2.0974366425813176, "learning_rate": 2.173913043478261e-05, "logits/chosen": -1.2458221912384033, "logits/rejected": -1.1523743867874146, "logps/chosen": -333.25506591796875, "logps/rejected": -345.1227111816406, "loss": 0.5651, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.38798853754997253, "rewards/margins": 0.30759841203689575, "rewards/rejected": 0.08039011061191559, "step": 40 }, { "epoch": 0.013614703880190605, "grad_norm": 1.8585066163676949, "learning_rate": 2.7173913043478262e-05, "logits/chosen": -1.1938469409942627, "logits/rejected": -1.2074568271636963, "logps/chosen": -302.45819091796875, "logps/rejected": -314.1945495605469, "loss": 0.5028, "rewards/accuracies": 0.625, "rewards/chosen": 0.5669056177139282, "rewards/margins": 0.5880029201507568, "rewards/rejected": -0.02109731361269951, "step": 50 }, { "epoch": 0.016337644656228726, "grad_norm": 2.4589848970734356, "learning_rate": 3.260869565217392e-05, "logits/chosen": -1.1839563846588135, "logits/rejected": -1.250597357749939, "logps/chosen": -286.90283203125, "logps/rejected": -327.09271240234375, "loss": 0.4906, "rewards/accuracies": 0.625, "rewards/chosen": 0.571540892124176, "rewards/margins": 0.813601016998291, "rewards/rejected": -0.2420600950717926, "step": 60 }, { "epoch": 0.01906058543226685, "grad_norm": 2.367267226063014, "learning_rate": 3.804347826086957e-05, "logits/chosen": -1.2047820091247559, "logits/rejected": -1.2526741027832031, "logps/chosen": -298.99395751953125, "logps/rejected": -348.8773498535156, "loss": 0.4426, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3528999388217926, "rewards/margins": 1.0532305240631104, "rewards/rejected": -0.7003306150436401, "step": 70 }, { "epoch": 0.021783526208304968, "grad_norm": 3.298724009406964, "learning_rate": 4.347826086956522e-05, "logits/chosen": -1.0405532121658325, "logits/rejected": -1.1189762353897095, "logps/chosen": -368.87835693359375, "logps/rejected": -510.46112060546875, "loss": 0.3959, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.30884578824043274, "rewards/margins": 1.878394365310669, "rewards/rejected": -2.1872401237487793, "step": 80 }, { "epoch": 0.02450646698434309, "grad_norm": 5.125711190041795, "learning_rate": 4.891304347826087e-05, "logits/chosen": -0.9169028997421265, "logits/rejected": -0.8087233304977417, "logps/chosen": -467.9712829589844, "logps/rejected": -670.4898071289062, "loss": 0.3937, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1821720600128174, "rewards/margins": 1.763218879699707, "rewards/rejected": -2.9453907012939453, "step": 90 }, { "epoch": 0.02722940776038121, "grad_norm": 2.393390101676132, "learning_rate": 5.4347826086956524e-05, "logits/chosen": -0.9179345369338989, "logits/rejected": -0.9133983850479126, "logps/chosen": -389.9552307128906, "logps/rejected": -575.6070556640625, "loss": 0.3284, "rewards/accuracies": 0.8125, "rewards/chosen": -0.910792350769043, "rewards/margins": 1.9828264713287354, "rewards/rejected": -2.8936190605163574, "step": 100 }, { "epoch": 0.029952348536419333, "grad_norm": 2.9244828418124116, "learning_rate": 5.9782608695652175e-05, "logits/chosen": -0.8454478979110718, "logits/rejected": -0.7953473329544067, "logps/chosen": -379.1640625, "logps/rejected": -630.0404052734375, "loss": 0.278, "rewards/accuracies": 0.875, "rewards/chosen": -0.684828519821167, "rewards/margins": 2.8788986206054688, "rewards/rejected": -3.5637271404266357, "step": 110 }, { "epoch": 0.03267528931245745, "grad_norm": 5.119335229124842, "learning_rate": 6.521739130434783e-05, "logits/chosen": -0.4422452449798584, "logits/rejected": -0.3685118854045868, "logps/chosen": -523.5398559570312, "logps/rejected": -851.53173828125, "loss": 0.278, "rewards/accuracies": 0.875, "rewards/chosen": -2.0111899375915527, "rewards/margins": 3.533088207244873, "rewards/rejected": -5.544278144836426, "step": 120 }, { "epoch": 0.035398230088495575, "grad_norm": 6.032034269990042, "learning_rate": 7.065217391304349e-05, "logits/chosen": -0.42538633942604065, "logits/rejected": -0.3796442151069641, "logps/chosen": -537.8934326171875, "logps/rejected": -948.9290161132812, "loss": 0.318, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7698837518692017, "rewards/margins": 4.274045467376709, "rewards/rejected": -6.043929100036621, "step": 130 }, { "epoch": 0.0381211708645337, "grad_norm": 3.937864766160224, "learning_rate": 7.608695652173914e-05, "logits/chosen": -0.3227941393852234, "logits/rejected": -0.24235177040100098, "logps/chosen": -437.365234375, "logps/rejected": -727.3698120117188, "loss": 0.2836, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.2195541858673096, "rewards/margins": 3.049933671951294, "rewards/rejected": -4.269488334655762, "step": 140 }, { "epoch": 0.04084411164057182, "grad_norm": 5.771526324368126, "learning_rate": 8.152173913043478e-05, "logits/chosen": -0.3635067343711853, "logits/rejected": -0.2840490937232971, "logps/chosen": -505.41339111328125, "logps/rejected": -776.4160766601562, "loss": 0.3556, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4678966999053955, "rewards/margins": 3.195099353790283, "rewards/rejected": -4.6629958152771, "step": 150 }, { "epoch": 0.043567052416609936, "grad_norm": 4.261974292549152, "learning_rate": 8.695652173913044e-05, "logits/chosen": -0.5400364398956299, "logits/rejected": -0.5849201083183289, "logps/chosen": -511.65802001953125, "logps/rejected": -811.4276123046875, "loss": 0.3132, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.5283739566802979, "rewards/margins": 3.7420878410339355, "rewards/rejected": -5.2704620361328125, "step": 160 }, { "epoch": 0.04628999319264806, "grad_norm": 5.162764854438213, "learning_rate": 9.239130434782609e-05, "logits/chosen": -0.3992065489292145, "logits/rejected": -0.21120062470436096, "logps/chosen": -578.4542236328125, "logps/rejected": -1250.295654296875, "loss": 0.2808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.6138319969177246, "rewards/margins": 6.92595911026001, "rewards/rejected": -9.539790153503418, "step": 170 }, { "epoch": 0.04901293396868618, "grad_norm": 3.546614542325096, "learning_rate": 9.782608695652174e-05, "logits/chosen": 0.12355854362249374, "logits/rejected": 0.16381272673606873, "logps/chosen": -525.3487548828125, "logps/rejected": -877.5525512695312, "loss": 0.2683, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.712798833847046, "rewards/margins": 3.966627597808838, "rewards/rejected": -5.679426670074463, "step": 180 }, { "epoch": 0.051735874744724304, "grad_norm": 3.4061998514196326, "learning_rate": 0.00010326086956521738, "logits/chosen": -0.07915548235177994, "logits/rejected": 0.03230556100606918, "logps/chosen": -541.4509887695312, "logps/rejected": -907.5144653320312, "loss": 0.2763, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.3669848442077637, "rewards/margins": 3.6007239818573, "rewards/rejected": -5.967708587646484, "step": 190 }, { "epoch": 0.05445881552076242, "grad_norm": 4.318373028362168, "learning_rate": 0.00010869565217391305, "logits/chosen": -0.5367928743362427, "logits/rejected": -0.46212688088417053, "logps/chosen": -589.5100708007812, "logps/rejected": -1115.4957275390625, "loss": 0.2403, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.3453047275543213, "rewards/margins": 5.046809196472168, "rewards/rejected": -7.392115116119385, "step": 200 }, { "epoch": 0.05718175629680054, "grad_norm": 3.315434156465907, "learning_rate": 0.0001141304347826087, "logits/chosen": -0.3160248398780823, "logits/rejected": -0.2609514594078064, "logps/chosen": -664.8533325195312, "logps/rejected": -1347.0562744140625, "loss": 0.2439, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.291189193725586, "rewards/margins": 7.1285600662231445, "rewards/rejected": -10.419748306274414, "step": 210 }, { "epoch": 0.059904697072838665, "grad_norm": 2.9900477836167583, "learning_rate": 0.00011956521739130435, "logits/chosen": 0.034565992653369904, "logits/rejected": 0.11762849986553192, "logps/chosen": -673.8556518554688, "logps/rejected": -1427.378173828125, "loss": 0.1902, "rewards/accuracies": 0.875, "rewards/chosen": -3.362067461013794, "rewards/margins": 7.918702602386475, "rewards/rejected": -11.280771255493164, "step": 220 }, { "epoch": 0.06262763784887679, "grad_norm": 2.299272777029839, "learning_rate": 0.000125, "logits/chosen": 0.2120196521282196, "logits/rejected": 0.4011055529117584, "logps/chosen": -816.4850463867188, "logps/rejected": -1636.709716796875, "loss": 0.2206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.696841239929199, "rewards/margins": 8.66151237487793, "rewards/rejected": -13.358352661132812, "step": 230 }, { "epoch": 0.0653505786249149, "grad_norm": 2.520846682761172, "learning_rate": 0.00013043478260869567, "logits/chosen": -0.6589404344558716, "logits/rejected": -0.6260676383972168, "logps/chosen": -496.455322265625, "logps/rejected": -948.7802734375, "loss": 0.2742, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.457515835762024, "rewards/margins": 5.1725172996521, "rewards/rejected": -6.630032539367676, "step": 240 }, { "epoch": 0.06807351940095303, "grad_norm": 3.470765195985346, "learning_rate": 0.0001358695652173913, "logits/chosen": -0.129820317029953, "logits/rejected": -0.09683764725923538, "logps/chosen": -612.4053955078125, "logps/rejected": -995.4520263671875, "loss": 0.2801, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.644944667816162, "rewards/margins": 4.22878885269165, "rewards/rejected": -6.8737335205078125, "step": 250 }, { "epoch": 0.07079646017699115, "grad_norm": 3.5234122965177432, "learning_rate": 0.00014130434782608697, "logits/chosen": 0.4874610900878906, "logits/rejected": 0.5410028696060181, "logps/chosen": -530.3843383789062, "logps/rejected": -806.9224243164062, "loss": 0.2545, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.913825273513794, "rewards/margins": 2.779323101043701, "rewards/rejected": -4.693148136138916, "step": 260 }, { "epoch": 0.07351940095302927, "grad_norm": 3.1672868272882293, "learning_rate": 0.00014673913043478264, "logits/chosen": 0.5825409293174744, "logits/rejected": 0.6227949857711792, "logps/chosen": -453.08990478515625, "logps/rejected": -778.2337036132812, "loss": 0.2666, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.291534423828125, "rewards/margins": 3.5406317710876465, "rewards/rejected": -4.8321661949157715, "step": 270 }, { "epoch": 0.0762423417290674, "grad_norm": 3.4653355946787867, "learning_rate": 0.00015217391304347827, "logits/chosen": 0.7842821478843689, "logits/rejected": 0.8171672821044922, "logps/chosen": -646.3670043945312, "logps/rejected": -1298.140380859375, "loss": 0.2762, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0766921043395996, "rewards/margins": 6.976851463317871, "rewards/rejected": -10.053544998168945, "step": 280 }, { "epoch": 0.07896528250510551, "grad_norm": 4.264156688585643, "learning_rate": 0.0001576086956521739, "logits/chosen": 0.39463651180267334, "logits/rejected": 0.38154393434524536, "logps/chosen": -410.0770568847656, "logps/rejected": -697.1278686523438, "loss": 0.2482, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.6038021445274353, "rewards/margins": 3.522867202758789, "rewards/rejected": -4.126669883728027, "step": 290 }, { "epoch": 0.08168822328114364, "grad_norm": 7.548808524451859, "learning_rate": 0.00016304347826086955, "logits/chosen": 0.36318862438201904, "logits/rejected": 0.34319767355918884, "logps/chosen": -570.1300048828125, "logps/rejected": -1266.759033203125, "loss": 0.2265, "rewards/accuracies": 0.875, "rewards/chosen": -2.0891940593719482, "rewards/margins": 7.467465400695801, "rewards/rejected": -9.556659698486328, "step": 300 }, { "epoch": 0.08441116405718176, "grad_norm": 5.697122001859261, "learning_rate": 0.00016847826086956522, "logits/chosen": -0.522394597530365, "logits/rejected": -0.4420931935310364, "logps/chosen": -533.5513916015625, "logps/rejected": -984.91357421875, "loss": 0.2946, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0686821937561035, "rewards/margins": 4.774750709533691, "rewards/rejected": -6.843433380126953, "step": 310 }, { "epoch": 0.08713410483321987, "grad_norm": 5.515933709027342, "learning_rate": 0.00017391304347826088, "logits/chosen": 0.015002882108092308, "logits/rejected": 0.12198108434677124, "logps/chosen": -530.60400390625, "logps/rejected": -830.6185302734375, "loss": 0.3218, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9283926486968994, "rewards/margins": 3.346539258956909, "rewards/rejected": -5.274931907653809, "step": 320 }, { "epoch": 0.089857045609258, "grad_norm": 20.20417518678615, "learning_rate": 0.00017934782608695652, "logits/chosen": -0.8085662722587585, "logits/rejected": -0.6665461659431458, "logps/chosen": -557.08984375, "logps/rejected": -1014.3035888671875, "loss": 0.3115, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7877311706542969, "rewards/margins": 4.699117183685303, "rewards/rejected": -6.4868483543396, "step": 330 }, { "epoch": 0.09257998638529612, "grad_norm": 19.141358392191165, "learning_rate": 0.00018478260869565218, "logits/chosen": -1.1671316623687744, "logits/rejected": -1.022890329360962, "logps/chosen": -433.53460693359375, "logps/rejected": -890.1748046875, "loss": 0.278, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.8231090307235718, "rewards/margins": 4.5598225593566895, "rewards/rejected": -5.382931709289551, "step": 340 }, { "epoch": 0.09530292716133425, "grad_norm": 5.004173601837073, "learning_rate": 0.00019021739130434782, "logits/chosen": -0.16809643805027008, "logits/rejected": 0.023858536034822464, "logps/chosen": -580.4594116210938, "logps/rejected": -948.2473754882812, "loss": 0.3074, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.90035080909729, "rewards/margins": 4.385798454284668, "rewards/rejected": -6.286149024963379, "step": 350 }, { "epoch": 0.09802586793737236, "grad_norm": 18.975782176199033, "learning_rate": 0.0001956521739130435, "logits/chosen": -0.13861560821533203, "logits/rejected": -0.05721583217382431, "logps/chosen": -495.3792419433594, "logps/rejected": -899.0491943359375, "loss": 0.2447, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3074277639389038, "rewards/margins": 4.709783554077148, "rewards/rejected": -6.017210960388184, "step": 360 }, { "epoch": 0.10074880871341048, "grad_norm": 12.932097663714115, "learning_rate": 0.00019999981917872262, "logits/chosen": -0.7583842873573303, "logits/rejected": -0.7324551343917847, "logps/chosen": -502.64813232421875, "logps/rejected": -696.3719482421875, "loss": 0.4578, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4866981506347656, "rewards/margins": 2.141483783721924, "rewards/rejected": -3.6281819343566895, "step": 370 }, { "epoch": 0.10347174948944861, "grad_norm": 7.538819283985503, "learning_rate": 0.000199993490502677, "logits/chosen": 1.3222413063049316, "logits/rejected": 1.3994247913360596, "logps/chosen": -552.2727661132812, "logps/rejected": -888.35009765625, "loss": 0.3377, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7047837972640991, "rewards/margins": 4.129269123077393, "rewards/rejected": -5.834052562713623, "step": 380 }, { "epoch": 0.10619469026548672, "grad_norm": 2.698223448211145, "learning_rate": 0.0001999781214166793, "logits/chosen": -0.531406581401825, "logits/rejected": -0.29297247529029846, "logps/chosen": -579.4844970703125, "logps/rejected": -1168.0150146484375, "loss": 0.2687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2400717735290527, "rewards/margins": 6.4643354415893555, "rewards/rejected": -8.704408645629883, "step": 390 }, { "epoch": 0.10891763104152484, "grad_norm": 2.832583238601158, "learning_rate": 0.00019995371331024835, "logits/chosen": -0.9013770818710327, "logits/rejected": -0.8614107966423035, "logps/chosen": -567.0159912109375, "logps/rejected": -1161.518310546875, "loss": 0.2585, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.9346994161605835, "rewards/margins": 6.5001540184021, "rewards/rejected": -8.434852600097656, "step": 400 }, { "epoch": 0.11164057181756297, "grad_norm": 2.4836030301537275, "learning_rate": 0.00019992026839012067, "logits/chosen": -0.20795145630836487, "logits/rejected": -0.05467641353607178, "logps/chosen": -535.1635131835938, "logps/rejected": -879.9713745117188, "loss": 0.2492, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.5837352275848389, "rewards/margins": 3.8102355003356934, "rewards/rejected": -5.393970966339111, "step": 410 }, { "epoch": 0.11436351259360109, "grad_norm": 4.0678022085960155, "learning_rate": 0.00019987778968005106, "logits/chosen": -1.7985776662826538, "logits/rejected": -1.5679908990859985, "logps/chosen": -499.22686767578125, "logps/rejected": -949.0340576171875, "loss": 0.3623, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2260411977767944, "rewards/margins": 4.656464099884033, "rewards/rejected": -5.882505416870117, "step": 420 }, { "epoch": 0.11708645336963922, "grad_norm": 6.170976378523087, "learning_rate": 0.00019982628102053899, "logits/chosen": -1.7474931478500366, "logits/rejected": -1.472307562828064, "logps/chosen": -452.9400329589844, "logps/rejected": -947.8646240234375, "loss": 0.2443, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1056087017059326, "rewards/margins": 5.24057674407959, "rewards/rejected": -6.346184730529785, "step": 430 }, { "epoch": 0.11980939414567733, "grad_norm": 2.3740869343585986, "learning_rate": 0.00019976574706848154, "logits/chosen": -1.0254762172698975, "logits/rejected": -0.9232420921325684, "logps/chosen": -577.9771728515625, "logps/rejected": -1111.5164794921875, "loss": 0.2136, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2708170413970947, "rewards/margins": 5.644797325134277, "rewards/rejected": -7.915614128112793, "step": 440 }, { "epoch": 0.12253233492171545, "grad_norm": 3.062599110592225, "learning_rate": 0.00019969619329675249, "logits/chosen": -1.398494005203247, "logits/rejected": -1.2401583194732666, "logps/chosen": -548.5270385742188, "logps/rejected": -1162.383056640625, "loss": 0.3083, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0478172302246094, "rewards/margins": 6.271994590759277, "rewards/rejected": -8.31981086730957, "step": 450 }, { "epoch": 0.12525527569775358, "grad_norm": 2.4479245244845, "learning_rate": 0.00019961762599370723, "logits/chosen": -1.366804838180542, "logits/rejected": -1.2496994733810425, "logps/chosen": -391.8729553222656, "logps/rejected": -678.1502075195312, "loss": 0.2812, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6106027364730835, "rewards/margins": 3.496123790740967, "rewards/rejected": -4.106726169586182, "step": 460 }, { "epoch": 0.1279782164737917, "grad_norm": 5.570404886434436, "learning_rate": 0.00019953005226261438, "logits/chosen": -1.8177093267440796, "logits/rejected": -1.7150996923446655, "logps/chosen": -585.9056396484375, "logps/rejected": -944.1336059570312, "loss": 0.3153, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0305967330932617, "rewards/margins": 4.166149139404297, "rewards/rejected": -6.196745872497559, "step": 470 }, { "epoch": 0.1307011572498298, "grad_norm": 4.9616313649114305, "learning_rate": 0.00019943348002101371, "logits/chosen": -0.35997989773750305, "logits/rejected": -0.54933100938797, "logps/chosen": -697.2908325195312, "logps/rejected": -1242.2154541015625, "loss": 0.2813, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.610248565673828, "rewards/margins": 5.970644474029541, "rewards/rejected": -9.580892562866211, "step": 480 }, { "epoch": 0.13342409802586794, "grad_norm": 3.2104602173488095, "learning_rate": 0.0001993279180000001, "logits/chosen": 3.2386550903320312, "logits/rejected": 2.8118205070495605, "logps/chosen": -657.9302978515625, "logps/rejected": -1343.6512451171875, "loss": 0.304, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.4300479888916016, "rewards/margins": 7.140336036682129, "rewards/rejected": -10.57038402557373, "step": 490 }, { "epoch": 0.13614703880190607, "grad_norm": 3.40241770940377, "learning_rate": 0.00019921337574343423, "logits/chosen": -1.1426218748092651, "logits/rejected": -0.986798107624054, "logps/chosen": -434.1651916503906, "logps/rejected": -757.93115234375, "loss": 0.2695, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8630467653274536, "rewards/margins": 3.374795436859131, "rewards/rejected": -4.237841606140137, "step": 500 }, { "epoch": 0.13614703880190607, "eval_logits/chosen": -1.5326930284500122, "eval_logits/rejected": -1.3998456001281738, "eval_logps/chosen": -383.6803283691406, "eval_logps/rejected": -745.71533203125, "eval_loss": 0.2652795910835266, "eval_rewards/accuracies": 0.8679706454277039, "eval_rewards/chosen": -0.4398553669452667, "eval_rewards/margins": 4.098072052001953, "eval_rewards/rejected": -4.537927627563477, "eval_runtime": 3754.4478, "eval_samples_per_second": 1.306, "eval_steps_per_second": 0.109, "step": 500 }, { "epoch": 0.13886997957794417, "grad_norm": 5.474127540729177, "learning_rate": 0.00019908986360707981, "logits/chosen": -1.5042083263397217, "logits/rejected": -1.2466522455215454, "logps/chosen": -523.0069580078125, "logps/rejected": -1227.4290771484375, "loss": 0.2134, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2869787216186523, "rewards/margins": 7.605735778808594, "rewards/rejected": -9.89271354675293, "step": 510 }, { "epoch": 0.1415929203539823, "grad_norm": 7.744668859821086, "learning_rate": 0.00019895739275766717, "logits/chosen": -1.6189693212509155, "logits/rejected": -1.3868303298950195, "logps/chosen": -627.96826171875, "logps/rejected": -1323.58837890625, "loss": 0.2436, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.1383042335510254, "rewards/margins": 7.207145690917969, "rewards/rejected": -10.345449447631836, "step": 520 }, { "epoch": 0.14431586113002043, "grad_norm": 4.7297851415537995, "learning_rate": 0.00019881597517188378, "logits/chosen": -1.7887611389160156, "logits/rejected": -1.6225858926773071, "logps/chosen": -476.73077392578125, "logps/rejected": -840.6845703125, "loss": 0.3274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9709323644638062, "rewards/margins": 4.59786319732666, "rewards/rejected": -5.568795204162598, "step": 530 }, { "epoch": 0.14703880190605853, "grad_norm": 5.496852164954777, "learning_rate": 0.00019866562363529146, "logits/chosen": -0.9063366055488586, "logits/rejected": -0.7964249849319458, "logps/chosen": -589.5694580078125, "logps/rejected": -1222.3099365234375, "loss": 0.3257, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.250840425491333, "rewards/margins": 6.9674835205078125, "rewards/rejected": -9.218323707580566, "step": 540 }, { "epoch": 0.14976174268209666, "grad_norm": 2.955258018366531, "learning_rate": 0.00019850635174117033, "logits/chosen": -1.3758379220962524, "logits/rejected": -1.2850220203399658, "logps/chosen": -562.1868286132812, "logps/rejected": -1195.6204833984375, "loss": 0.2985, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2212133407592773, "rewards/margins": 7.197710990905762, "rewards/rejected": -9.418924331665039, "step": 550 }, { "epoch": 0.1524846834581348, "grad_norm": 3.245125438369836, "learning_rate": 0.00019833817388928985, "logits/chosen": -1.1794776916503906, "logits/rejected": -0.9993559122085571, "logps/chosen": -571.94921875, "logps/rejected": -1167.773681640625, "loss": 0.3072, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.339094638824463, "rewards/margins": 6.1757402420043945, "rewards/rejected": -8.5148344039917, "step": 560 }, { "epoch": 0.15520762423417292, "grad_norm": 2.604195604750394, "learning_rate": 0.00019816110528460713, "logits/chosen": -1.2078096866607666, "logits/rejected": -0.7949275970458984, "logps/chosen": -589.8275756835938, "logps/rejected": -1255.671875, "loss": 0.3152, "rewards/accuracies": 0.875, "rewards/chosen": -2.489520311355591, "rewards/margins": 6.91791296005249, "rewards/rejected": -9.40743350982666, "step": 570 }, { "epoch": 0.15793056501021102, "grad_norm": 14.441606451933733, "learning_rate": 0.00019797516193589194, "logits/chosen": -1.5580374002456665, "logits/rejected": -1.270991325378418, "logps/chosen": -670.9024658203125, "logps/rejected": -1422.287353515625, "loss": 0.4552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1384835243225098, "rewards/margins": 7.682137966156006, "rewards/rejected": -10.8206205368042, "step": 580 }, { "epoch": 0.16065350578624915, "grad_norm": 3.9273831546290814, "learning_rate": 0.00019778036065427965, "logits/chosen": -1.3679311275482178, "logits/rejected": -1.3166406154632568, "logps/chosen": -571.2752685546875, "logps/rejected": -921.2325439453125, "loss": 0.7785, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.133291006088257, "rewards/margins": 4.120671272277832, "rewards/rejected": -6.253961563110352, "step": 590 }, { "epoch": 0.16337644656228728, "grad_norm": 4.005186765164421, "learning_rate": 0.00019757671905175117, "logits/chosen": -0.47542038559913635, "logits/rejected": -0.4297063946723938, "logps/chosen": -464.59820556640625, "logps/rejected": -1003.8577880859375, "loss": 0.4314, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9565982818603516, "rewards/margins": 5.590369701385498, "rewards/rejected": -7.54696798324585, "step": 600 }, { "epoch": 0.16609938733832538, "grad_norm": 2.806683383323669, "learning_rate": 0.00019736425553954065, "logits/chosen": -1.535628318786621, "logits/rejected": -1.4295036792755127, "logps/chosen": -740.3170776367188, "logps/rejected": -1282.82861328125, "loss": 0.4249, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8235206604003906, "rewards/margins": 5.897660732269287, "rewards/rejected": -9.721181869506836, "step": 610 }, { "epoch": 0.1688223281143635, "grad_norm": 4.278979765897441, "learning_rate": 0.00019714298932647098, "logits/chosen": -0.43291616439819336, "logits/rejected": -0.4710071086883545, "logps/chosen": -545.1385498046875, "logps/rejected": -900.1130981445312, "loss": 0.5233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2823760509490967, "rewards/margins": 3.3586456775665283, "rewards/rejected": -5.641021251678467, "step": 620 }, { "epoch": 0.17154526889040164, "grad_norm": 11.430682840390572, "learning_rate": 0.0001969129404172172, "logits/chosen": 0.8560006022453308, "logits/rejected": 0.7378341555595398, "logps/chosen": -513.2482299804688, "logps/rejected": -754.9190063476562, "loss": 0.4592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5915935039520264, "rewards/margins": 2.9012575149536133, "rewards/rejected": -4.492851257324219, "step": 630 }, { "epoch": 0.17426820966643974, "grad_norm": 2.199875368036885, "learning_rate": 0.00019667412961049755, "logits/chosen": -0.1992878019809723, "logits/rejected": -0.40254640579223633, "logps/chosen": -433.77484130859375, "logps/rejected": -813.1231689453125, "loss": 0.3492, "rewards/accuracies": 0.875, "rewards/chosen": -0.7354617118835449, "rewards/margins": 4.359702110290527, "rewards/rejected": -5.0951642990112305, "step": 640 }, { "epoch": 0.17699115044247787, "grad_norm": 4.4540211798567, "learning_rate": 0.00019642657849719354, "logits/chosen": 0.4657669961452484, "logits/rejected": 0.18967926502227783, "logps/chosen": -536.6248168945312, "logps/rejected": -1097.6444091796875, "loss": 0.3092, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8782879114151, "rewards/margins": 5.84519624710083, "rewards/rejected": -7.723484992980957, "step": 650 }, { "epoch": 0.179714091218516, "grad_norm": 3.0889028203904334, "learning_rate": 0.0001961703094583975, "logits/chosen": -1.8331960439682007, "logits/rejected": -1.687788963317871, "logps/chosen": -544.7129516601562, "logps/rejected": -981.7268676757812, "loss": 0.3037, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7984678745269775, "rewards/margins": 4.749049186706543, "rewards/rejected": -6.5475172996521, "step": 660 }, { "epoch": 0.1824370319945541, "grad_norm": 7.960275160298456, "learning_rate": 0.00019590534566338934, "logits/chosen": 0.07616959512233734, "logits/rejected": 0.03734927624464035, "logps/chosen": -528.64501953125, "logps/rejected": -872.9537963867188, "loss": 0.4539, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8545458316802979, "rewards/margins": 3.848759412765503, "rewards/rejected": -5.703305721282959, "step": 670 }, { "epoch": 0.18515997277059223, "grad_norm": 12.196026749043849, "learning_rate": 0.0001956317110675417, "logits/chosen": -1.7587692737579346, "logits/rejected": -1.3882538080215454, "logps/chosen": -594.3480224609375, "logps/rejected": -1230.926513671875, "loss": 0.3005, "rewards/accuracies": 0.875, "rewards/chosen": -2.64725923538208, "rewards/margins": 6.312607765197754, "rewards/rejected": -8.959867477416992, "step": 680 }, { "epoch": 0.18788291354663036, "grad_norm": 9.39312929425058, "learning_rate": 0.00019534943041015423, "logits/chosen": -2.2219159603118896, "logits/rejected": -2.1489291191101074, "logps/chosen": -371.2558288574219, "logps/rejected": -592.716796875, "loss": 0.5019, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6846780180931091, "rewards/margins": 2.5522751808166504, "rewards/rejected": -3.2369537353515625, "step": 690 }, { "epoch": 0.1906058543226685, "grad_norm": 2.8421606651782394, "learning_rate": 0.0001950585292122169, "logits/chosen": 0.6412663459777832, "logits/rejected": 0.4461473524570465, "logps/chosen": -528.5960693359375, "logps/rejected": -914.9078369140625, "loss": 0.3292, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.2477450370788574, "rewards/margins": 3.9819042682647705, "rewards/rejected": -6.229649066925049, "step": 700 }, { "epoch": 0.1933287950987066, "grad_norm": 6.372719336489545, "learning_rate": 0.00019475903377410252, "logits/chosen": 3.104797840118408, "logits/rejected": 3.0445351600646973, "logps/chosen": -678.0279541015625, "logps/rejected": -1072.2177734375, "loss": 0.7254, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2950127124786377, "rewards/margins": 4.643362998962402, "rewards/rejected": -7.938374996185303, "step": 710 }, { "epoch": 0.19605173587474473, "grad_norm": 3.626617866099579, "learning_rate": 0.00019445097117318913, "logits/chosen": -1.6091477870941162, "logits/rejected": -1.535391926765442, "logps/chosen": -503.35784912109375, "logps/rejected": -848.8382568359375, "loss": 0.5518, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6061153411865234, "rewards/margins": 4.038764953613281, "rewards/rejected": -5.644880294799805, "step": 720 }, { "epoch": 0.19877467665078286, "grad_norm": 4.243327644286249, "learning_rate": 0.00019413436926141175, "logits/chosen": -0.8332279324531555, "logits/rejected": -0.8175574541091919, "logps/chosen": -737.3671875, "logps/rejected": -1388.236572265625, "loss": 0.3784, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.6704049110412598, "rewards/margins": 6.894456386566162, "rewards/rejected": -10.564861297607422, "step": 730 }, { "epoch": 0.20149761742682096, "grad_norm": 5.900207156659361, "learning_rate": 0.00019380925666274444, "logits/chosen": -0.7356165647506714, "logits/rejected": -0.7930446267127991, "logps/chosen": -599.1915283203125, "logps/rejected": -1108.341552734375, "loss": 0.3328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.732137680053711, "rewards/margins": 5.603833198547363, "rewards/rejected": -8.335970878601074, "step": 740 }, { "epoch": 0.2042205582028591, "grad_norm": 3.825224843937841, "learning_rate": 0.0001934756627706122, "logits/chosen": -1.3480488061904907, "logits/rejected": -1.160653829574585, "logps/chosen": -441.1055603027344, "logps/rejected": -832.5628662109375, "loss": 0.3124, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4829859733581543, "rewards/margins": 3.520029067993164, "rewards/rejected": -5.003015041351318, "step": 750 }, { "epoch": 0.20694349897889722, "grad_norm": 5.110356705719763, "learning_rate": 0.00019313361774523385, "logits/chosen": -2.0760579109191895, "logits/rejected": -2.1237432956695557, "logps/chosen": -549.3860473632812, "logps/rejected": -1049.9378662109375, "loss": 0.6153, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1024250984191895, "rewards/margins": 5.283135414123535, "rewards/rejected": -7.385560035705566, "step": 760 }, { "epoch": 0.20966643975493532, "grad_norm": 2.4636308668975317, "learning_rate": 0.00019278315251089486, "logits/chosen": -1.732892632484436, "logits/rejected": -1.539317011833191, "logps/chosen": -472.11651611328125, "logps/rejected": -894.4132690429688, "loss": 0.4139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7663090229034424, "rewards/margins": 4.492798805236816, "rewards/rejected": -6.2591071128845215, "step": 770 }, { "epoch": 0.21238938053097345, "grad_norm": 11.617472243164942, "learning_rate": 0.0001924242987531517, "logits/chosen": -1.3720897436141968, "logits/rejected": -1.4372451305389404, "logps/chosen": -642.5867919921875, "logps/rejected": -1076.4737548828125, "loss": 0.7831, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8324971199035645, "rewards/margins": 4.546250343322754, "rewards/rejected": -7.378748416900635, "step": 780 }, { "epoch": 0.21511232130701158, "grad_norm": 4.800064106894111, "learning_rate": 0.0001920570889159672, "logits/chosen": -2.131412982940674, "logits/rejected": -2.0672669410705566, "logps/chosen": -502.35125732421875, "logps/rejected": -979.3629150390625, "loss": 0.5855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5898116827011108, "rewards/margins": 6.003064155578613, "rewards/rejected": -7.592875003814697, "step": 790 }, { "epoch": 0.21783526208304968, "grad_norm": 7.9969773800091, "learning_rate": 0.0001916815561987771, "logits/chosen": -1.6771615743637085, "logits/rejected": -1.6893478631973267, "logps/chosen": -475.9574279785156, "logps/rejected": -858.0924072265625, "loss": 0.4716, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2917953729629517, "rewards/margins": 4.0918192863464355, "rewards/rejected": -5.383615016937256, "step": 800 }, { "epoch": 0.2205582028590878, "grad_norm": 15.598299647751306, "learning_rate": 0.00019129773455348864, "logits/chosen": -2.2555699348449707, "logits/rejected": -2.179759979248047, "logps/chosen": -422.21075439453125, "logps/rejected": -853.91259765625, "loss": 0.4422, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0770437717437744, "rewards/margins": 4.805014610290527, "rewards/rejected": -5.882058143615723, "step": 810 }, { "epoch": 0.22328114363512594, "grad_norm": 5.334288389286907, "learning_rate": 0.00019090565868141096, "logits/chosen": -0.569889485836029, "logits/rejected": -0.6646596193313599, "logps/chosen": -913.2365112304688, "logps/rejected": -1756.2760009765625, "loss": 1.2924, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.7498884201049805, "rewards/margins": 8.836221694946289, "rewards/rejected": -14.586111068725586, "step": 820 }, { "epoch": 0.22600408441116407, "grad_norm": 9.424443385582844, "learning_rate": 0.0001905053640301176, "logits/chosen": 1.087628722190857, "logits/rejected": 0.9164185523986816, "logps/chosen": -554.4573974609375, "logps/rejected": -922.4230346679688, "loss": 0.4635, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7598752975463867, "rewards/margins": 4.093839168548584, "rewards/rejected": -5.853714942932129, "step": 830 }, { "epoch": 0.22872702518720217, "grad_norm": 31.748839971929247, "learning_rate": 0.0001900968867902419, "logits/chosen": 1.860044240951538, "logits/rejected": 1.750836968421936, "logps/chosen": -501.2557678222656, "logps/rejected": -742.8480224609375, "loss": 0.4693, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5709228515625, "rewards/margins": 2.8358662128448486, "rewards/rejected": -4.4067888259887695, "step": 840 }, { "epoch": 0.2314499659632403, "grad_norm": 5.037429450172698, "learning_rate": 0.00018968026389220498, "logits/chosen": 1.901974081993103, "logits/rejected": 1.9345325231552124, "logps/chosen": -473.47100830078125, "logps/rejected": -704.6774291992188, "loss": 0.4027, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1274769306182861, "rewards/margins": 3.0902841091156006, "rewards/rejected": -4.217761039733887, "step": 850 }, { "epoch": 0.23417290673927843, "grad_norm": 4.938998989868357, "learning_rate": 0.0001892555330028766, "logits/chosen": 1.3002218008041382, "logits/rejected": 1.288474202156067, "logps/chosen": -481.8773498535156, "logps/rejected": -835.0935668945312, "loss": 0.3162, "rewards/accuracies": 0.875, "rewards/chosen": -1.5530319213867188, "rewards/margins": 4.073337078094482, "rewards/rejected": -5.626368522644043, "step": 860 }, { "epoch": 0.23689584751531653, "grad_norm": 4.653639204830587, "learning_rate": 0.00018882273252217004, "logits/chosen": 0.8681214451789856, "logits/rejected": 0.951123058795929, "logps/chosen": -576.9916381835938, "logps/rejected": -988.4410400390625, "loss": 0.2979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7273517847061157, "rewards/margins": 4.580703258514404, "rewards/rejected": -6.3080549240112305, "step": 870 }, { "epoch": 0.23961878829135466, "grad_norm": 3.633959384080648, "learning_rate": 0.0001883819015795701, "logits/chosen": 0.8870223760604858, "logits/rejected": 0.8765641450881958, "logps/chosen": -463.49993896484375, "logps/rejected": -962.3264770507812, "loss": 0.3212, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3259989023208618, "rewards/margins": 5.314154148101807, "rewards/rejected": -6.640152931213379, "step": 880 }, { "epoch": 0.2423417290673928, "grad_norm": 4.332123847510828, "learning_rate": 0.00018793308003059572, "logits/chosen": 1.1213561296463013, "logits/rejected": 1.1425080299377441, "logps/chosen": -459.064453125, "logps/rejected": -865.5914306640625, "loss": 0.4558, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.331207036972046, "rewards/margins": 4.389949798583984, "rewards/rejected": -5.721157073974609, "step": 890 }, { "epoch": 0.2450646698434309, "grad_norm": 15.682490544625567, "learning_rate": 0.00018747630845319612, "logits/chosen": 1.2807202339172363, "logits/rejected": 1.3666961193084717, "logps/chosen": -704.359375, "logps/rejected": -1433.5845947265625, "loss": 0.3848, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.641261577606201, "rewards/margins": 7.442375183105469, "rewards/rejected": -11.083636283874512, "step": 900 }, { "epoch": 0.24778761061946902, "grad_norm": 11.750100569612796, "learning_rate": 0.00018701162814408278, "logits/chosen": 0.8161400556564331, "logits/rejected": 0.7485871315002441, "logps/chosen": -518.4786987304688, "logps/rejected": -798.63720703125, "loss": 0.5486, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.984863042831421, "rewards/margins": 2.9147756099700928, "rewards/rejected": -4.8996381759643555, "step": 910 }, { "epoch": 0.25051055139550715, "grad_norm": 7.000935714029327, "learning_rate": 0.00018653908111499533, "logits/chosen": -0.31201204657554626, "logits/rejected": -0.2849845290184021, "logps/chosen": -527.2504272460938, "logps/rejected": -787.3291015625, "loss": 0.4349, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6963837146759033, "rewards/margins": 3.1402206420898438, "rewards/rejected": -4.836604118347168, "step": 920 }, { "epoch": 0.2532334921715453, "grad_norm": 11.18800926600743, "learning_rate": 0.00018605871008890346, "logits/chosen": -1.294745683670044, "logits/rejected": -1.240708351135254, "logps/chosen": -511.86309814453125, "logps/rejected": -744.6692504882812, "loss": 1.0639, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8128286600112915, "rewards/margins": 2.8213400840759277, "rewards/rejected": -4.634169101715088, "step": 930 }, { "epoch": 0.2559564329475834, "grad_norm": 10.556179766281154, "learning_rate": 0.0001855705584961443, "logits/chosen": 0.7220875024795532, "logits/rejected": 0.6994001269340515, "logps/chosen": -607.748291015625, "logps/rejected": -1030.421875, "loss": 0.451, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9004902839660645, "rewards/margins": 4.076137542724609, "rewards/rejected": -6.976628303527832, "step": 940 }, { "epoch": 0.2586793737236215, "grad_norm": 6.13682401768759, "learning_rate": 0.00018507467047049593, "logits/chosen": 0.3612140715122223, "logits/rejected": 0.24034667015075684, "logps/chosen": -518.5841674804688, "logps/rejected": -700.7687377929688, "loss": 0.46, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6922804117202759, "rewards/margins": 2.2763285636901855, "rewards/rejected": -3.968608856201172, "step": 950 }, { "epoch": 0.2614023144996596, "grad_norm": 6.121773697604543, "learning_rate": 0.0001845710908451872, "logits/chosen": 0.5088990330696106, "logits/rejected": 0.33550935983657837, "logps/chosen": -541.6002197265625, "logps/rejected": -806.7115478515625, "loss": 0.4299, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0940399169921875, "rewards/margins": 3.1160244941711426, "rewards/rejected": -5.21006441116333, "step": 960 }, { "epoch": 0.26412525527569775, "grad_norm": 10.899985502581202, "learning_rate": 0.00018405986514884434, "logits/chosen": 2.749624252319336, "logits/rejected": 2.3466954231262207, "logps/chosen": -582.1544799804688, "logps/rejected": -924.7894287109375, "loss": 0.4395, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.315011501312256, "rewards/margins": 3.5922577381134033, "rewards/rejected": -5.907269477844238, "step": 970 }, { "epoch": 0.2668481960517359, "grad_norm": 4.048659705337102, "learning_rate": 0.00018354103960137473, "logits/chosen": 1.1572563648223877, "logits/rejected": 1.267608404159546, "logps/chosen": -582.2197875976562, "logps/rejected": -861.1555786132812, "loss": 0.3092, "rewards/accuracies": 0.875, "rewards/chosen": -2.239684820175171, "rewards/margins": 3.4138379096984863, "rewards/rejected": -5.65352201461792, "step": 980 }, { "epoch": 0.269571136827774, "grad_norm": 6.445129082017715, "learning_rate": 0.00018301466110978826, "logits/chosen": 5.172424793243408, "logits/rejected": 5.469472885131836, "logps/chosen": -583.6631469726562, "logps/rejected": -822.27783203125, "loss": 0.3662, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1274476051330566, "rewards/margins": 3.208089828491211, "rewards/rejected": -5.335537910461426, "step": 990 }, { "epoch": 0.27229407760381213, "grad_norm": 6.566218610450217, "learning_rate": 0.00018248077726395635, "logits/chosen": 6.2127532958984375, "logits/rejected": 5.775629043579102, "logps/chosen": -699.6742553710938, "logps/rejected": -1013.6728515625, "loss": 0.4349, "rewards/accuracies": 0.875, "rewards/chosen": -3.4475739002227783, "rewards/margins": 3.560396671295166, "rewards/rejected": -7.007970333099365, "step": 1000 }, { "epoch": 0.27229407760381213, "eval_logits/chosen": 4.7867841720581055, "eval_logits/rejected": 4.172375202178955, "eval_logps/chosen": -599.8698120117188, "eval_logps/rejected": -1004.0470581054688, "eval_loss": 0.3151506781578064, "eval_rewards/accuracies": 0.8514670133590698, "eval_rewards/chosen": -2.601750135421753, "eval_rewards/margins": 4.519495487213135, "eval_rewards/rejected": -7.121245384216309, "eval_runtime": 3748.1009, "eval_samples_per_second": 1.308, "eval_steps_per_second": 0.109, "step": 1000 }, { "epoch": 0.27501701837985026, "grad_norm": 4.1859048562805325, "learning_rate": 0.0001819394363323093, "logits/chosen": 2.4643194675445557, "logits/rejected": 2.2225146293640137, "logps/chosen": -545.6774291992188, "logps/rejected": -996.73681640625, "loss": 0.2765, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.89382004737854, "rewards/margins": 5.465787410736084, "rewards/rejected": -7.3596086502075195, "step": 1010 }, { "epoch": 0.27773995915588834, "grad_norm": 4.563487274752729, "learning_rate": 0.00018139068725747253, "logits/chosen": 1.0098166465759277, "logits/rejected": 1.0116276741027832, "logps/chosen": -602.1691284179688, "logps/rejected": -1046.082763671875, "loss": 0.2743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5139992237091064, "rewards/margins": 4.713437557220459, "rewards/rejected": -7.2274370193481445, "step": 1020 }, { "epoch": 0.28046289993192647, "grad_norm": 7.5233407893974364, "learning_rate": 0.0001808345796518415, "logits/chosen": -0.4183047413825989, "logits/rejected": -0.31315118074417114, "logps/chosen": -532.0780029296875, "logps/rejected": -1177.1412353515625, "loss": 0.2154, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2215864658355713, "rewards/margins": 6.905013084411621, "rewards/rejected": -9.12660026550293, "step": 1030 }, { "epoch": 0.2831858407079646, "grad_norm": 5.735221398988511, "learning_rate": 0.00018027116379309638, "logits/chosen": -0.38075047731399536, "logits/rejected": -0.2400140017271042, "logps/chosen": -578.424072265625, "logps/rejected": -1119.6865234375, "loss": 0.3202, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2283248901367188, "rewards/margins": 5.826390743255615, "rewards/rejected": -8.054716110229492, "step": 1040 }, { "epoch": 0.2859087814840027, "grad_norm": 4.097854425465531, "learning_rate": 0.00017970049061965637, "logits/chosen": 3.559316635131836, "logits/rejected": 3.129002571105957, "logps/chosen": -522.4072265625, "logps/rejected": -1131.6197509765625, "loss": 0.2673, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1318278312683105, "rewards/margins": 6.1658806800842285, "rewards/rejected": -8.297708511352539, "step": 1050 }, { "epoch": 0.28863172226004086, "grad_norm": 5.557225788607541, "learning_rate": 0.00017912261172607437, "logits/chosen": 0.9803678393363953, "logits/rejected": 0.749468207359314, "logps/chosen": -525.6317138671875, "logps/rejected": -1126.9403076171875, "loss": 0.2935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0150177478790283, "rewards/margins": 5.855062484741211, "rewards/rejected": -7.87007999420166, "step": 1060 }, { "epoch": 0.291354663036079, "grad_norm": 16.881151266803318, "learning_rate": 0.00017853757935837235, "logits/chosen": 2.2658114433288574, "logits/rejected": 2.1108152866363525, "logps/chosen": -653.8731689453125, "logps/rejected": -1232.9185791015625, "loss": 0.2515, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2572929859161377, "rewards/margins": 6.0115251541137695, "rewards/rejected": -9.268818855285645, "step": 1070 }, { "epoch": 0.29407760381211706, "grad_norm": 2.9796008894176103, "learning_rate": 0.00017794544640931773, "logits/chosen": 0.10931304842233658, "logits/rejected": 0.12252505123615265, "logps/chosen": -558.3709106445312, "logps/rejected": -942.2255859375, "loss": 0.3009, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7539688348770142, "rewards/margins": 4.423993110656738, "rewards/rejected": -6.177962303161621, "step": 1080 }, { "epoch": 0.2968005445881552, "grad_norm": 4.281633296458022, "learning_rate": 0.00017734626641364147, "logits/chosen": 0.3914720416069031, "logits/rejected": 0.32610705494880676, "logps/chosen": -486.5399475097656, "logps/rejected": -891.3079833984375, "loss": 0.2547, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6498864889144897, "rewards/margins": 4.376126289367676, "rewards/rejected": -6.026012420654297, "step": 1090 }, { "epoch": 0.2995234853641933, "grad_norm": 5.5930892470284475, "learning_rate": 0.00017674009354319778, "logits/chosen": 0.5930287837982178, "logits/rejected": 0.653694748878479, "logps/chosen": -603.6002807617188, "logps/rejected": -1086.633544921875, "loss": 0.3199, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.18953800201416, "rewards/margins": 5.238548755645752, "rewards/rejected": -7.428086757659912, "step": 1100 }, { "epoch": 0.30224642614023145, "grad_norm": 2.722087080665883, "learning_rate": 0.00017612698260206666, "logits/chosen": 0.37087658047676086, "logits/rejected": 0.4003225266933441, "logps/chosen": -492.9803771972656, "logps/rejected": -1075.6131591796875, "loss": 0.5122, "rewards/accuracies": 0.875, "rewards/chosen": -1.6157805919647217, "rewards/margins": 6.397913932800293, "rewards/rejected": -8.013693809509277, "step": 1110 }, { "epoch": 0.3049693669162696, "grad_norm": 14.979862803572216, "learning_rate": 0.00017550698902159896, "logits/chosen": 1.7447624206542969, "logits/rejected": 1.8597869873046875, "logps/chosen": -696.9720458984375, "logps/rejected": -1302.992431640625, "loss": 0.3806, "rewards/accuracies": 0.8125, "rewards/chosen": -3.522810697555542, "rewards/margins": 6.992275238037109, "rewards/rejected": -10.51508617401123, "step": 1120 }, { "epoch": 0.3076923076923077, "grad_norm": 3.147793738872875, "learning_rate": 0.00017488016885540484, "logits/chosen": 3.254767656326294, "logits/rejected": 2.9010090827941895, "logps/chosen": -507.4468688964844, "logps/rejected": -1095.754638671875, "loss": 0.3034, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7343683242797852, "rewards/margins": 5.961197853088379, "rewards/rejected": -7.695566654205322, "step": 1130 }, { "epoch": 0.31041524846834584, "grad_norm": 3.677842943487348, "learning_rate": 0.00017424657877428596, "logits/chosen": -0.7234563231468201, "logits/rejected": -0.5526185035705566, "logps/chosen": -540.0379638671875, "logps/rejected": -1126.988037109375, "loss": 0.3271, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7782533168792725, "rewards/margins": 6.2880048751831055, "rewards/rejected": -8.066259384155273, "step": 1140 }, { "epoch": 0.3131381892443839, "grad_norm": 10.43709043871757, "learning_rate": 0.0001736062760611119, "logits/chosen": 0.05422030761837959, "logits/rejected": 0.14549703896045685, "logps/chosen": -657.1055908203125, "logps/rejected": -1551.24560546875, "loss": 0.3395, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0819263458251953, "rewards/margins": 9.380105018615723, "rewards/rejected": -12.462031364440918, "step": 1150 }, { "epoch": 0.31586113002042204, "grad_norm": 5.657046152510863, "learning_rate": 0.00017295931860564117, "logits/chosen": -0.35942110419273376, "logits/rejected": -0.25374308228492737, "logps/chosen": -654.0399169921875, "logps/rejected": -1307.819580078125, "loss": 0.2725, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.1632027626037598, "rewards/margins": 6.858729362487793, "rewards/rejected": -10.021931648254395, "step": 1160 }, { "epoch": 0.3185840707964602, "grad_norm": 6.835124252986284, "learning_rate": 0.0001723057648992875, "logits/chosen": -1.796087622642517, "logits/rejected": -1.386889934539795, "logps/chosen": -551.5918579101562, "logps/rejected": -1379.042724609375, "loss": 0.2229, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.4278125762939453, "rewards/margins": 8.520901679992676, "rewards/rejected": -10.948714256286621, "step": 1170 }, { "epoch": 0.3213070115724983, "grad_norm": 3.841956624946803, "learning_rate": 0.00017164567402983152, "logits/chosen": -1.5889203548431396, "logits/rejected": -1.3325690031051636, "logps/chosen": -705.3286743164062, "logps/rejected": -1427.21484375, "loss": 0.1891, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.543011426925659, "rewards/margins": 7.749202728271484, "rewards/rejected": -11.292215347290039, "step": 1180 }, { "epoch": 0.32402995234853643, "grad_norm": 7.006267955561037, "learning_rate": 0.00017097910567607865, "logits/chosen": -1.8383554220199585, "logits/rejected": -1.5112298727035522, "logps/chosen": -626.0816650390625, "logps/rejected": -1356.2662353515625, "loss": 0.3225, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.867626667022705, "rewards/margins": 7.212944984436035, "rewards/rejected": -10.080572128295898, "step": 1190 }, { "epoch": 0.32675289312457456, "grad_norm": 6.2805032054390075, "learning_rate": 0.0001703061201024636, "logits/chosen": -1.2520471811294556, "logits/rejected": -0.8927146792411804, "logps/chosen": -705.08154296875, "logps/rejected": -1572.4884033203125, "loss": 0.22, "rewards/accuracies": 0.875, "rewards/chosen": -3.847522020339966, "rewards/margins": 8.92027473449707, "rewards/rejected": -12.76779556274414, "step": 1200 }, { "epoch": 0.32947583390061264, "grad_norm": 5.6676894500481065, "learning_rate": 0.0001696267781536018, "logits/chosen": -1.0109606981277466, "logits/rejected": -0.7403501868247986, "logps/chosen": -720.68798828125, "logps/rejected": -1489.216064453125, "loss": 0.2525, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.673945665359497, "rewards/margins": 8.27706527709961, "rewards/rejected": -11.951011657714844, "step": 1210 }, { "epoch": 0.33219877467665077, "grad_norm": 6.806373569251935, "learning_rate": 0.0001689411412487885, "logits/chosen": -1.2114444971084595, "logits/rejected": -0.9269537925720215, "logps/chosen": -660.3043823242188, "logps/rejected": -1289.1474609375, "loss": 0.2564, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7403571605682373, "rewards/margins": 6.5076584815979, "rewards/rejected": -9.248014450073242, "step": 1220 }, { "epoch": 0.3349217154526889, "grad_norm": 1.6484966634408573, "learning_rate": 0.00016824927137644587, "logits/chosen": -1.5764806270599365, "logits/rejected": -1.3572378158569336, "logps/chosen": -622.459228515625, "logps/rejected": -1207.199951171875, "loss": 0.2831, "rewards/accuracies": 0.875, "rewards/chosen": -2.8502438068389893, "rewards/margins": 6.140684604644775, "rewards/rejected": -8.990928649902344, "step": 1230 }, { "epoch": 0.337644656228727, "grad_norm": 3.1773340218228983, "learning_rate": 0.00016755123108851843, "logits/chosen": -0.8344672918319702, "logits/rejected": -0.7575286030769348, "logps/chosen": -557.8627319335938, "logps/rejected": -1094.7261962890625, "loss": 0.2617, "rewards/accuracies": 0.875, "rewards/chosen": -2.271852970123291, "rewards/margins": 6.080749988555908, "rewards/rejected": -8.352602005004883, "step": 1240 }, { "epoch": 0.34036759700476515, "grad_norm": 7.197978920136861, "learning_rate": 0.00016684708349481804, "logits/chosen": -1.293826937675476, "logits/rejected": -1.111547589302063, "logps/chosen": -684.3062744140625, "logps/rejected": -1335.118896484375, "loss": 0.4245, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4491424560546875, "rewards/margins": 6.6166839599609375, "rewards/rejected": -10.065826416015625, "step": 1250 }, { "epoch": 0.3430905377808033, "grad_norm": 8.542463349127416, "learning_rate": 0.00016613689225731789, "logits/chosen": -1.3532450199127197, "logits/rejected": -1.0953752994537354, "logps/chosen": -587.2614135742188, "logps/rejected": -1286.6195068359375, "loss": 0.2996, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.7271084785461426, "rewards/margins": 6.817942142486572, "rewards/rejected": -9.545049667358398, "step": 1260 }, { "epoch": 0.3458134785568414, "grad_norm": 2.1114546108326517, "learning_rate": 0.00016542072158439691, "logits/chosen": -1.4009244441986084, "logits/rejected": -1.2579541206359863, "logps/chosen": -632.0369873046875, "logps/rejected": -1034.799072265625, "loss": 0.2368, "rewards/accuracies": 0.875, "rewards/chosen": -2.2211804389953613, "rewards/margins": 4.827291011810303, "rewards/rejected": -7.048470973968506, "step": 1270 }, { "epoch": 0.3485364193328795, "grad_norm": 24.697563459789354, "learning_rate": 0.00016469863622503477, "logits/chosen": -1.9558353424072266, "logits/rejected": -1.6197192668914795, "logps/chosen": -671.5903930664062, "logps/rejected": -1494.4512939453125, "loss": 0.2951, "rewards/accuracies": 0.9375, "rewards/chosen": -3.403165340423584, "rewards/margins": 8.787665367126465, "rewards/rejected": -12.190831184387207, "step": 1280 }, { "epoch": 0.3512593601089176, "grad_norm": 265.08504869734134, "learning_rate": 0.00016397070146295778, "logits/chosen": 0.3683956265449524, "logits/rejected": 0.6786172389984131, "logps/chosen": -579.5296630859375, "logps/rejected": -1230.6458740234375, "loss": 0.8783, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.450284481048584, "rewards/margins": 7.403939723968506, "rewards/rejected": -9.85422420501709, "step": 1290 }, { "epoch": 0.35398230088495575, "grad_norm": 54.97903067864863, "learning_rate": 0.00016323698311073668, "logits/chosen": 1.4707801342010498, "logits/rejected": 1.2925649881362915, "logps/chosen": -1907.9742431640625, "logps/rejected": -1993.4407958984375, "loss": 3.2747, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -15.379717826843262, "rewards/margins": 1.4004836082458496, "rewards/rejected": -16.780200958251953, "step": 1300 }, { "epoch": 0.3567052416609939, "grad_norm": 8.592614500780192, "learning_rate": 0.0001624975475038365, "logits/chosen": -1.2588945627212524, "logits/rejected": -1.2224012613296509, "logps/chosen": -716.8282470703125, "logps/rejected": -945.8975830078125, "loss": 0.843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.887277126312256, "rewards/margins": 2.443674087524414, "rewards/rejected": -6.330951690673828, "step": 1310 }, { "epoch": 0.359428182437032, "grad_norm": 5.311974463877925, "learning_rate": 0.0001617524614946192, "logits/chosen": -1.1290063858032227, "logits/rejected": -1.064706563949585, "logps/chosen": -629.7766723632812, "logps/rejected": -904.5687255859375, "loss": 0.4904, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5980165004730225, "rewards/margins": 3.1313178539276123, "rewards/rejected": -5.729334831237793, "step": 1320 }, { "epoch": 0.36215112321307014, "grad_norm": 6.474086591026475, "learning_rate": 0.00016100179244629952, "logits/chosen": -1.5740928649902344, "logits/rejected": -1.4228092432022095, "logps/chosen": -764.6334228515625, "logps/rejected": -1126.4373779296875, "loss": 0.4008, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.025094032287598, "rewards/margins": 4.259903907775879, "rewards/rejected": -8.284997940063477, "step": 1330 }, { "epoch": 0.3648740639891082, "grad_norm": 5.477184678397902, "learning_rate": 0.00016024560822685467, "logits/chosen": -2.164599657058716, "logits/rejected": -2.0423648357391357, "logps/chosen": -634.7586669921875, "logps/rejected": -1000.6339721679688, "loss": 0.2964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5424180030822754, "rewards/margins": 4.107353210449219, "rewards/rejected": -6.649771213531494, "step": 1340 }, { "epoch": 0.36759700476514634, "grad_norm": 11.647999715649028, "learning_rate": 0.0001594839772028884, "logits/chosen": 1.426068663597107, "logits/rejected": 1.4375708103179932, "logps/chosen": -649.6060791015625, "logps/rejected": -1172.4080810546875, "loss": 0.3104, "rewards/accuracies": 0.875, "rewards/chosen": -3.2215964794158936, "rewards/margins": 5.544508457183838, "rewards/rejected": -8.766103744506836, "step": 1350 }, { "epoch": 0.37031994554118447, "grad_norm": 11.349006083438557, "learning_rate": 0.00015871696823345, "logits/chosen": -1.0609710216522217, "logits/rejected": -0.9911941289901733, "logps/chosen": -605.7052001953125, "logps/rejected": -1159.8377685546875, "loss": 0.2463, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.4982380867004395, "rewards/margins": 5.881691932678223, "rewards/rejected": -8.37993049621582, "step": 1360 }, { "epoch": 0.3730428863172226, "grad_norm": 11.295379259266975, "learning_rate": 0.00015794465066380867, "logits/chosen": -1.2733609676361084, "logits/rejected": -1.1467927694320679, "logps/chosen": -688.9752197265625, "logps/rejected": -1283.682373046875, "loss": 0.3466, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.429932117462158, "rewards/margins": 6.115570545196533, "rewards/rejected": -9.545502662658691, "step": 1370 }, { "epoch": 0.37576582709326073, "grad_norm": 12.797060807657392, "learning_rate": 0.00015716709431918413, "logits/chosen": -1.3164949417114258, "logits/rejected": -1.233361840248108, "logps/chosen": -816.4059448242188, "logps/rejected": -1399.9384765625, "loss": 0.3763, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -4.322115898132324, "rewards/margins": 6.451016426086426, "rewards/rejected": -10.77313232421875, "step": 1380 }, { "epoch": 0.37848876786929886, "grad_norm": 10.821771896690832, "learning_rate": 0.0001563843694984336, "logits/chosen": 0.735268771648407, "logits/rejected": 0.6736514568328857, "logps/chosen": -542.029296875, "logps/rejected": -1001.3328857421875, "loss": 0.3622, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9188995361328125, "rewards/margins": 5.033745288848877, "rewards/rejected": -6.952645301818848, "step": 1390 }, { "epoch": 0.381211708645337, "grad_norm": 6.621280422308756, "learning_rate": 0.00015559654696769627, "logits/chosen": 3.1346821784973145, "logits/rejected": 2.477602481842041, "logps/chosen": -491.7142028808594, "logps/rejected": -1117.6937255859375, "loss": 0.2835, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8081684112548828, "rewards/margins": 6.549437522888184, "rewards/rejected": -8.357605934143066, "step": 1400 }, { "epoch": 0.38393464942137506, "grad_norm": 7.483334853774204, "learning_rate": 0.00015480369795399507, "logits/chosen": 0.8859611749649048, "logits/rejected": 0.6168441772460938, "logps/chosen": -614.8277587890625, "logps/rejected": -1257.08056640625, "loss": 0.3414, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5697829723358154, "rewards/margins": 6.640227317810059, "rewards/rejected": -9.210010528564453, "step": 1410 }, { "epoch": 0.3866575901974132, "grad_norm": 8.321072241953082, "learning_rate": 0.0001540058941387973, "logits/chosen": -1.7052541971206665, "logits/rejected": -1.3722909688949585, "logps/chosen": -543.1255493164062, "logps/rejected": -1270.39013671875, "loss": 0.3713, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.073045253753662, "rewards/margins": 7.566277980804443, "rewards/rejected": -9.639322280883789, "step": 1420 }, { "epoch": 0.3893805309734513, "grad_norm": 9.566652205072009, "learning_rate": 0.00015320320765153367, "logits/chosen": -0.6746680736541748, "logits/rejected": -0.31939640641212463, "logps/chosen": -646.5062255859375, "logps/rejected": -1472.7542724609375, "loss": 0.2957, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.08526873588562, "rewards/margins": 9.02385139465332, "rewards/rejected": -12.10912036895752, "step": 1430 }, { "epoch": 0.39210347174948945, "grad_norm": 5.674672936267165, "learning_rate": 0.00015239571106307728, "logits/chosen": -1.5170918703079224, "logits/rejected": -1.3616416454315186, "logps/chosen": -488.8348693847656, "logps/rejected": -978.12158203125, "loss": 0.3775, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6571521759033203, "rewards/margins": 5.227241516113281, "rewards/rejected": -6.88439416885376, "step": 1440 }, { "epoch": 0.3948264125255276, "grad_norm": 14.456689496224328, "learning_rate": 0.0001515834773791824, "logits/chosen": -1.5392897129058838, "logits/rejected": -1.2314963340759277, "logps/chosen": -568.02685546875, "logps/rejected": -1188.26171875, "loss": 1.526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.578427791595459, "rewards/margins": 6.594627380371094, "rewards/rejected": -9.173055648803711, "step": 1450 }, { "epoch": 0.3975493533015657, "grad_norm": 24.108809069668514, "learning_rate": 0.000150766580033884, "logits/chosen": -0.8912268877029419, "logits/rejected": -0.7923849821090698, "logps/chosen": -611.3818359375, "logps/rejected": -1145.431884765625, "loss": 0.4417, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.100924253463745, "rewards/margins": 5.812254428863525, "rewards/rejected": -8.913179397583008, "step": 1460 }, { "epoch": 0.4002722940776038, "grad_norm": 5.034388636648245, "learning_rate": 0.00014994509288285862, "logits/chosen": -1.4466712474822998, "logits/rejected": -1.3241074085235596, "logps/chosen": -714.8709716796875, "logps/rejected": -1214.1171875, "loss": 0.3186, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.5827248096466064, "rewards/margins": 5.621241092681885, "rewards/rejected": -9.20396614074707, "step": 1470 }, { "epoch": 0.4029952348536419, "grad_norm": 10.420645648436944, "learning_rate": 0.00014911909019674704, "logits/chosen": -1.1877351999282837, "logits/rejected": -0.9257787466049194, "logps/chosen": -613.208251953125, "logps/rejected": -1327.60400390625, "loss": 0.3966, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.045889139175415, "rewards/margins": 7.302800178527832, "rewards/rejected": -10.348688125610352, "step": 1480 }, { "epoch": 0.40571817562968004, "grad_norm": 51.90536060739862, "learning_rate": 0.00014828864665443952, "logits/chosen": -1.4053256511688232, "logits/rejected": -1.2433897256851196, "logps/chosen": -557.5704345703125, "logps/rejected": -1206.54345703125, "loss": 0.2779, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.4509506225585938, "rewards/margins": 6.966989040374756, "rewards/rejected": -9.417940139770508, "step": 1490 }, { "epoch": 0.4084411164057182, "grad_norm": 14.361486530833302, "learning_rate": 0.0001474538373363241, "logits/chosen": -1.412669062614441, "logits/rejected": -1.3897775411605835, "logps/chosen": -564.7686767578125, "logps/rejected": -837.3571166992188, "loss": 0.531, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.930058479309082, "rewards/margins": 3.367487668991089, "rewards/rejected": -5.29754638671875, "step": 1500 }, { "epoch": 0.4084411164057182, "eval_logits/chosen": -1.6537649631500244, "eval_logits/rejected": -1.519540786743164, "eval_logps/chosen": -582.22412109375, "eval_logps/rejected": -1098.727783203125, "eval_loss": 0.4872981011867523, "eval_rewards/accuracies": 0.7854523062705994, "eval_rewards/chosen": -2.425293445587158, "eval_rewards/margins": 5.642757415771484, "eval_rewards/rejected": -8.068050384521484, "eval_runtime": 3785.6353, "eval_samples_per_second": 1.295, "eval_steps_per_second": 0.108, "step": 1500 }, { "epoch": 0.4111640571817563, "grad_norm": 9.318696202923965, "learning_rate": 0.0001466147377174985, "logits/chosen": -0.715455174446106, "logits/rejected": -0.6652237176895142, "logps/chosen": -572.0481567382812, "logps/rejected": -1031.051513671875, "loss": 0.418, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4588992595672607, "rewards/margins": 4.753371238708496, "rewards/rejected": -7.212270259857178, "step": 1510 }, { "epoch": 0.41388699795779443, "grad_norm": 19.325914422583597, "learning_rate": 0.00014577142366094644, "logits/chosen": -0.05393592268228531, "logits/rejected": 0.009328609332442284, "logps/chosen": -620.9105224609375, "logps/rejected": -1113.5853271484375, "loss": 0.4078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8703293800354004, "rewards/margins": 5.491124629974365, "rewards/rejected": -8.36145305633545, "step": 1520 }, { "epoch": 0.41660993873383256, "grad_norm": 12.358286061777378, "learning_rate": 0.00014492397141067887, "logits/chosen": 0.14358481764793396, "logits/rejected": 0.03318501263856888, "logps/chosen": -594.7238159179688, "logps/rejected": -1354.75537109375, "loss": 0.3331, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9422993659973145, "rewards/margins": 7.6811628341674805, "rewards/rejected": -10.623462677001953, "step": 1530 }, { "epoch": 0.41933287950987064, "grad_norm": 6.2960027951032, "learning_rate": 0.00014407245758484095, "logits/chosen": -0.20292606949806213, "logits/rejected": -0.11837242543697357, "logps/chosen": -618.982666015625, "logps/rejected": -1294.1939697265625, "loss": 0.3578, "rewards/accuracies": 0.875, "rewards/chosen": -2.7013134956359863, "rewards/margins": 7.379813194274902, "rewards/rejected": -10.08112621307373, "step": 1540 }, { "epoch": 0.42205582028590877, "grad_norm": 9.817192732437798, "learning_rate": 0.00014321695916878454, "logits/chosen": -0.2398807555437088, "logits/rejected": -0.1428002268075943, "logps/chosen": -564.1031494140625, "logps/rejected": -1201.117431640625, "loss": 0.3268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.393275737762451, "rewards/margins": 6.322265625, "rewards/rejected": -8.715542793273926, "step": 1550 }, { "epoch": 0.4247787610619469, "grad_norm": 5.209206228944598, "learning_rate": 0.00014235755350810853, "logits/chosen": 0.9619453549385071, "logits/rejected": 1.1703174114227295, "logps/chosen": -648.5733642578125, "logps/rejected": -1496.5716552734375, "loss": 0.3565, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.6705708503723145, "rewards/margins": 8.732547760009766, "rewards/rejected": -11.403119087219238, "step": 1560 }, { "epoch": 0.427501701837985, "grad_norm": 4.077144779224901, "learning_rate": 0.0001414943183016655, "logits/chosen": 2.7115371227264404, "logits/rejected": 2.399930953979492, "logps/chosen": -606.0468139648438, "logps/rejected": -1309.142333984375, "loss": 0.2268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.912951707839966, "rewards/margins": 7.471800327301025, "rewards/rejected": -10.38475227355957, "step": 1570 }, { "epoch": 0.43022464261402316, "grad_norm": 17.422115506001596, "learning_rate": 0.0001406273315945374, "logits/chosen": -0.34676986932754517, "logits/rejected": -0.28898870944976807, "logps/chosen": -653.0081787109375, "logps/rejected": -1026.3204345703125, "loss": 0.3029, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.7177176475524902, "rewards/margins": 3.7674331665039062, "rewards/rejected": -6.4851508140563965, "step": 1580 }, { "epoch": 0.4329475833900613, "grad_norm": 4.86972424076265, "learning_rate": 0.00013975667177097914, "logits/chosen": -0.1830269992351532, "logits/rejected": -0.24179303646087646, "logps/chosen": -635.9149169921875, "logps/rejected": -1108.5006103515625, "loss": 0.3134, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9789676666259766, "rewards/margins": 4.767451286315918, "rewards/rejected": -7.7464189529418945, "step": 1590 }, { "epoch": 0.43567052416609936, "grad_norm": 3.596472735756183, "learning_rate": 0.00013888241754733208, "logits/chosen": -0.6846984028816223, "logits/rejected": -0.6952911019325256, "logps/chosen": -489.0577087402344, "logps/rejected": -890.4197998046875, "loss": 0.2965, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5424995422363281, "rewards/margins": 4.706803321838379, "rewards/rejected": -6.249302387237549, "step": 1600 }, { "epoch": 0.4383934649421375, "grad_norm": 8.198013945041934, "learning_rate": 0.0001380046479649073, "logits/chosen": -1.4048243761062622, "logits/rejected": -1.1473358869552612, "logps/chosen": -611.8831176757812, "logps/rejected": -1111.0921630859375, "loss": 0.2666, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.5834736824035645, "rewards/margins": 5.021603584289551, "rewards/rejected": -7.605076789855957, "step": 1610 }, { "epoch": 0.4411164057181756, "grad_norm": 9.566449101001135, "learning_rate": 0.00013712344238283933, "logits/chosen": -0.8325891494750977, "logits/rejected": -0.7076963186264038, "logps/chosen": -621.9786376953125, "logps/rejected": -1179.3758544921875, "loss": 0.2623, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.860924005508423, "rewards/margins": 6.293280601501465, "rewards/rejected": -9.154203414916992, "step": 1620 }, { "epoch": 0.44383934649421375, "grad_norm": 5.767063661996225, "learning_rate": 0.00013623888047091146, "logits/chosen": 1.593488335609436, "logits/rejected": 1.6555019617080688, "logps/chosen": -590.9708862304688, "logps/rejected": -1078.3956298828125, "loss": 0.238, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6324057579040527, "rewards/margins": 5.353615760803223, "rewards/rejected": -7.986021518707275, "step": 1630 }, { "epoch": 0.4465622872702519, "grad_norm": 41.00318315603557, "learning_rate": 0.00013535104220235261, "logits/chosen": 1.8902591466903687, "logits/rejected": 2.1262030601501465, "logps/chosen": -876.5345458984375, "logps/rejected": -1053.7332763671875, "loss": 1.1928, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.313698768615723, "rewards/margins": 2.054201602935791, "rewards/rejected": -7.367900848388672, "step": 1640 }, { "epoch": 0.44928522804629, "grad_norm": 26.088451825695778, "learning_rate": 0.00013446000784660714, "logits/chosen": 1.4706940650939941, "logits/rejected": 1.519578218460083, "logps/chosen": -631.876953125, "logps/rejected": -1176.569091796875, "loss": 0.4262, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.078620672225952, "rewards/margins": 5.974215507507324, "rewards/rejected": -9.052835464477539, "step": 1650 }, { "epoch": 0.45200816882232814, "grad_norm": 33.28614487362792, "learning_rate": 0.00013356585796207756, "logits/chosen": 2.1754348278045654, "logits/rejected": 2.0363101959228516, "logps/chosen": -577.9168701171875, "logps/rejected": -1242.7994384765625, "loss": 0.2998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5749011039733887, "rewards/margins": 7.280187129974365, "rewards/rejected": -9.855088233947754, "step": 1660 }, { "epoch": 0.4547311095983662, "grad_norm": 10.427182612989903, "learning_rate": 0.0001326686733888413, "logits/chosen": 0.6365132331848145, "logits/rejected": 0.6580590605735779, "logps/chosen": -702.520263671875, "logps/rejected": -1273.900390625, "loss": 0.3962, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.290257692337036, "rewards/margins": 5.8721795082092285, "rewards/rejected": -9.162436485290527, "step": 1670 }, { "epoch": 0.45745405037440434, "grad_norm": 4.282070776156857, "learning_rate": 0.000131768535241342, "logits/chosen": -0.5329915285110474, "logits/rejected": -0.3240591287612915, "logps/chosen": -792.3297119140625, "logps/rejected": -1584.3717041015625, "loss": 0.2377, "rewards/accuracies": 0.875, "rewards/chosen": -4.805436134338379, "rewards/margins": 8.310664176940918, "rewards/rejected": -13.11609935760498, "step": 1680 }, { "epoch": 0.46017699115044247, "grad_norm": 15.921033696679123, "learning_rate": 0.00013086552490105573, "logits/chosen": -0.6514681577682495, "logits/rejected": -0.5372867584228516, "logps/chosen": -537.2027587890625, "logps/rejected": -1119.0516357421875, "loss": 0.2402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.059156894683838, "rewards/margins": 6.330155372619629, "rewards/rejected": -8.389312744140625, "step": 1690 }, { "epoch": 0.4628999319264806, "grad_norm": 16.08695691527921, "learning_rate": 0.00012995972400913367, "logits/chosen": 0.9507571458816528, "logits/rejected": 1.0892035961151123, "logps/chosen": -657.4198608398438, "logps/rejected": -1084.3997802734375, "loss": 0.3032, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.760071039199829, "rewards/margins": 4.952934265136719, "rewards/rejected": -7.713004112243652, "step": 1700 }, { "epoch": 0.46562287270251873, "grad_norm": 15.03505603114339, "learning_rate": 0.00012905121445902067, "logits/chosen": -0.09099654108285904, "logits/rejected": 0.08855719864368439, "logps/chosen": -666.9413452148438, "logps/rejected": -1238.0989990234375, "loss": 0.3466, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3812332153320312, "rewards/margins": 5.85336971282959, "rewards/rejected": -9.234602928161621, "step": 1710 }, { "epoch": 0.46834581347855686, "grad_norm": 5.776431534520819, "learning_rate": 0.00012814007838905128, "logits/chosen": -0.9203442335128784, "logits/rejected": -0.8418833017349243, "logps/chosen": -545.5592651367188, "logps/rejected": -1109.281005859375, "loss": 0.2509, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.17928147315979, "rewards/margins": 6.00477409362793, "rewards/rejected": -8.184054374694824, "step": 1720 }, { "epoch": 0.471068754254595, "grad_norm": 17.307649652391138, "learning_rate": 0.0001272263981750237, "logits/chosen": -1.3016124963760376, "logits/rejected": -1.0513460636138916, "logps/chosen": -614.2965087890625, "logps/rejected": -1212.0992431640625, "loss": 0.3222, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.9695653915405273, "rewards/margins": 6.4604692459106445, "rewards/rejected": -9.430034637451172, "step": 1730 }, { "epoch": 0.47379169503063306, "grad_norm": 6.686259280088899, "learning_rate": 0.00012631025642275212, "logits/chosen": -0.5886275172233582, "logits/rejected": -0.3992864489555359, "logps/chosen": -639.0484619140625, "logps/rejected": -1379.347412109375, "loss": 0.3295, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.267850160598755, "rewards/margins": 7.818935394287109, "rewards/rejected": -11.086786270141602, "step": 1740 }, { "epoch": 0.4765146358066712, "grad_norm": 8.332371540931518, "learning_rate": 0.00012539173596059849, "logits/chosen": -0.30099961161613464, "logits/rejected": -0.0007561176898889244, "logps/chosen": -628.400146484375, "logps/rejected": -1538.23974609375, "loss": 0.4098, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.416485548019409, "rewards/margins": 9.312593460083008, "rewards/rejected": -12.72907829284668, "step": 1750 }, { "epoch": 0.4792375765827093, "grad_norm": 3.1397396537158566, "learning_rate": 0.00012447091983198367, "logits/chosen": -1.1242637634277344, "logits/rejected": -0.7798986434936523, "logps/chosen": -593.5460205078125, "logps/rejected": -1356.296875, "loss": 0.253, "rewards/accuracies": 0.875, "rewards/chosen": -2.9524972438812256, "rewards/margins": 7.784090995788574, "rewards/rejected": -10.736586570739746, "step": 1760 }, { "epoch": 0.48196051735874745, "grad_norm": 8.522392208842632, "learning_rate": 0.0001235478912878799, "logits/chosen": -0.9284283518791199, "logits/rejected": -0.6892914772033691, "logps/chosen": -696.424560546875, "logps/rejected": -1392.125, "loss": 0.1594, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.4410488605499268, "rewards/margins": 7.498281002044678, "rewards/rejected": -10.939330101013184, "step": 1770 }, { "epoch": 0.4846834581347856, "grad_norm": 6.034875346009079, "learning_rate": 0.00012262273377928375, "logits/chosen": -1.1968759298324585, "logits/rejected": -0.9019950032234192, "logps/chosen": -613.7315673828125, "logps/rejected": -1290.041748046875, "loss": 0.3008, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.713265895843506, "rewards/margins": 6.849099636077881, "rewards/rejected": -9.562365531921387, "step": 1780 }, { "epoch": 0.4874063989108237, "grad_norm": 9.60806575188711, "learning_rate": 0.00012169553094967146, "logits/chosen": -1.1181106567382812, "logits/rejected": -0.7999989986419678, "logps/chosen": -614.52001953125, "logps/rejected": -1425.6143798828125, "loss": 0.2867, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5628480911254883, "rewards/margins": 8.526115417480469, "rewards/rejected": -11.088964462280273, "step": 1790 }, { "epoch": 0.4901293396868618, "grad_norm": 21.702603936533567, "learning_rate": 0.00012076636662743672, "logits/chosen": 1.323265790939331, "logits/rejected": 1.2390327453613281, "logps/chosen": -779.384521484375, "logps/rejected": -1323.75390625, "loss": 0.2856, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.8515422344207764, "rewards/margins": 5.356608867645264, "rewards/rejected": -9.208149909973145, "step": 1800 }, { "epoch": 0.4928522804628999, "grad_norm": 8.429242968799562, "learning_rate": 0.0001198353248183118, "logits/chosen": 2.109199285507202, "logits/rejected": 1.8442319631576538, "logps/chosen": -719.3864135742188, "logps/rejected": -1434.2828369140625, "loss": 0.1701, "rewards/accuracies": 0.9375, "rewards/chosen": -3.646466016769409, "rewards/margins": 7.611660957336426, "rewards/rejected": -11.258125305175781, "step": 1810 }, { "epoch": 0.49557522123893805, "grad_norm": 6.886953380619344, "learning_rate": 0.0001189024896977724, "logits/chosen": 0.2526296079158783, "logits/rejected": 0.22320647537708282, "logps/chosen": -618.1570434570312, "logps/rejected": -1169.676513671875, "loss": 0.2034, "rewards/accuracies": 0.875, "rewards/chosen": -3.075936794281006, "rewards/margins": 5.886470317840576, "rewards/rejected": -8.962407112121582, "step": 1820 }, { "epoch": 0.4982981620149762, "grad_norm": 8.040343434429271, "learning_rate": 0.00011796794560342754, "logits/chosen": -0.7386595606803894, "logits/rejected": -0.6110578775405884, "logps/chosen": -660.8449096679688, "logps/rejected": -1218.1505126953125, "loss": 0.3243, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.8239331245422363, "rewards/margins": 6.139927387237549, "rewards/rejected": -8.963861465454102, "step": 1830 }, { "epoch": 0.5010211027910143, "grad_norm": 8.563143413366994, "learning_rate": 0.0001170317770273946, "logits/chosen": -1.0322635173797607, "logits/rejected": -0.8018537759780884, "logps/chosen": -529.4595336914062, "logps/rejected": -1181.968505859375, "loss": 0.2521, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1946306228637695, "rewards/margins": 6.752659797668457, "rewards/rejected": -8.947290420532227, "step": 1840 }, { "epoch": 0.5037440435670524, "grad_norm": 3.5682662969072747, "learning_rate": 0.00011609406860866023, "logits/chosen": -1.1348320245742798, "logits/rejected": -1.0071537494659424, "logps/chosen": -569.1886596679688, "logps/rejected": -1152.37255859375, "loss": 0.2472, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.218945026397705, "rewards/margins": 6.368105888366699, "rewards/rejected": -8.58704948425293, "step": 1850 }, { "epoch": 0.5064669843430906, "grad_norm": 7.758172151259579, "learning_rate": 0.00011515490512542833, "logits/chosen": -1.3560707569122314, "logits/rejected": -1.0845423936843872, "logps/chosen": -561.7158813476562, "logps/rejected": -1203.3419189453125, "loss": 0.2311, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.237051486968994, "rewards/margins": 7.016552925109863, "rewards/rejected": -9.2536039352417, "step": 1860 }, { "epoch": 0.5091899251191286, "grad_norm": 4.493807328662436, "learning_rate": 0.00011421437148745502, "logits/chosen": -1.0917279720306396, "logits/rejected": -0.9747053384780884, "logps/chosen": -645.5095825195312, "logps/rejected": -1229.0311279296875, "loss": 0.2134, "rewards/accuracies": 0.875, "rewards/chosen": -2.5472307205200195, "rewards/margins": 6.673137664794922, "rewards/rejected": -9.220368385314941, "step": 1870 }, { "epoch": 0.5119128658951668, "grad_norm": 6.201461431472656, "learning_rate": 0.00011327255272837221, "logits/chosen": -0.2544959485530853, "logits/rejected": -0.12365889549255371, "logps/chosen": -622.212890625, "logps/rejected": -1287.2431640625, "loss": 0.1549, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9083447456359863, "rewards/margins": 6.886693000793457, "rewards/rejected": -9.795038223266602, "step": 1880 }, { "epoch": 0.5146358066712049, "grad_norm": 6.2832423410312925, "learning_rate": 0.00011232953399799957, "logits/chosen": 0.22300061583518982, "logits/rejected": 0.48449450731277466, "logps/chosen": -777.6243286132812, "logps/rejected": -1476.5068359375, "loss": 0.2152, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.214239120483398, "rewards/margins": 7.588281154632568, "rewards/rejected": -11.802520751953125, "step": 1890 }, { "epoch": 0.517358747447243, "grad_norm": 9.37402734675468, "learning_rate": 0.0001113854005546461, "logits/chosen": -0.2933140695095062, "logits/rejected": -0.16291429102420807, "logps/chosen": -761.1182861328125, "logps/rejected": -1387.656494140625, "loss": 0.2514, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.257036209106445, "rewards/margins": 6.1938581466674805, "rewards/rejected": -10.450895309448242, "step": 1900 }, { "epoch": 0.5200816882232812, "grad_norm": 7.183032873292223, "learning_rate": 0.00011044023775740204, "logits/chosen": -0.7535658478736877, "logits/rejected": -0.5872179269790649, "logps/chosen": -660.9326171875, "logps/rejected": -1360.95458984375, "loss": 0.2185, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0175588130950928, "rewards/margins": 7.479903221130371, "rewards/rejected": -10.497461318969727, "step": 1910 }, { "epoch": 0.5228046289993192, "grad_norm": 15.307025263438828, "learning_rate": 0.00010949413105842147, "logits/chosen": -0.6107400059700012, "logits/rejected": -0.3973458409309387, "logps/chosen": -550.6314086914062, "logps/rejected": -1316.4263916015625, "loss": 0.2485, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.392124652862549, "rewards/margins": 8.194581985473633, "rewards/rejected": -10.586706161499023, "step": 1920 }, { "epoch": 0.5255275697753574, "grad_norm": 4.269502876033707, "learning_rate": 0.0001085471659951967, "logits/chosen": 0.16908931732177734, "logits/rejected": 0.4170844554901123, "logps/chosen": -687.2650756835938, "logps/rejected": -1543.733154296875, "loss": 0.1662, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3644537925720215, "rewards/margins": 8.979719161987305, "rewards/rejected": -12.344173431396484, "step": 1930 }, { "epoch": 0.5282505105513955, "grad_norm": 9.311111235120233, "learning_rate": 0.00010759942818282454, "logits/chosen": 0.7541287541389465, "logits/rejected": 0.9952371716499329, "logps/chosen": -687.6876220703125, "logps/rejected": -1431.085693359375, "loss": 0.2171, "rewards/accuracies": 0.875, "rewards/chosen": -3.5470268726348877, "rewards/margins": 7.848145484924316, "rewards/rejected": -11.395172119140625, "step": 1940 }, { "epoch": 0.5309734513274337, "grad_norm": 6.088442848295023, "learning_rate": 0.00010665100330626625, "logits/chosen": -0.1732356995344162, "logits/rejected": 0.006719267461448908, "logps/chosen": -520.822265625, "logps/rejected": -1267.14501953125, "loss": 0.2347, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.0856547355651855, "rewards/margins": 7.704294681549072, "rewards/rejected": -9.789949417114258, "step": 1950 }, { "epoch": 0.5336963921034718, "grad_norm": 7.6561162101042, "learning_rate": 0.0001057019771126004, "logits/chosen": -0.6633592844009399, "logits/rejected": -0.38759341835975647, "logps/chosen": -532.0450439453125, "logps/rejected": -1239.3480224609375, "loss": 0.1965, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8515552282333374, "rewards/margins": 7.4962158203125, "rewards/rejected": -9.347769737243652, "step": 1960 }, { "epoch": 0.5364193328795098, "grad_norm": 3.313605249754064, "learning_rate": 0.0001047524354032707, "logits/chosen": -0.9916669130325317, "logits/rejected": -0.6737325191497803, "logps/chosen": -566.5112915039062, "logps/rejected": -1290.538818359375, "loss": 0.2498, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.237109661102295, "rewards/margins": 8.159578323364258, "rewards/rejected": -10.396687507629395, "step": 1970 }, { "epoch": 0.539142273655548, "grad_norm": 9.196141505536001, "learning_rate": 0.0001038024640263287, "logits/chosen": -0.6640629768371582, "logits/rejected": -0.3816087543964386, "logps/chosen": -674.1700439453125, "logps/rejected": -1490.5645751953125, "loss": 0.1814, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1683545112609863, "rewards/margins": 8.756607055664062, "rewards/rejected": -11.924962997436523, "step": 1980 }, { "epoch": 0.5418652144315861, "grad_norm": 6.910953624436588, "learning_rate": 0.00010285214886867198, "logits/chosen": -1.0215368270874023, "logits/rejected": -0.815405011177063, "logps/chosen": -664.8277587890625, "logps/rejected": -1331.6253662109375, "loss": 0.2056, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.9869272708892822, "rewards/margins": 7.080696105957031, "rewards/rejected": -10.067623138427734, "step": 1990 }, { "epoch": 0.5445881552076243, "grad_norm": 8.829016742458412, "learning_rate": 0.00010190157584827953, "logits/chosen": -0.7232739329338074, "logits/rejected": -0.5292662978172302, "logps/chosen": -668.0047607421875, "logps/rejected": -1450.50830078125, "loss": 0.1681, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.479228973388672, "rewards/margins": 8.136119842529297, "rewards/rejected": -11.615349769592285, "step": 2000 }, { "epoch": 0.5445881552076243, "eval_logits/chosen": -0.37419021129608154, "eval_logits/rejected": -0.188820019364357, "eval_logps/chosen": -735.248779296875, "eval_logps/rejected": -1603.610595703125, "eval_loss": 0.20030494034290314, "eval_rewards/accuracies": 0.9089242219924927, "eval_rewards/chosen": -3.9555394649505615, "eval_rewards/margins": 9.161340713500977, "eval_rewards/rejected": -13.1168794631958, "eval_runtime": 3761.3505, "eval_samples_per_second": 1.304, "eval_steps_per_second": 0.109, "step": 2000 }, { "epoch": 0.5473110959836623, "grad_norm": 5.076536520943281, "learning_rate": 0.00010095083090644364, "logits/chosen": -0.6943289637565613, "logits/rejected": -0.5084115266799927, "logps/chosen": -745.46435546875, "logps/rejected": -1585.6259765625, "loss": 0.1903, "rewards/accuracies": 0.875, "rewards/chosen": -3.852506637573242, "rewards/margins": 9.036561012268066, "rewards/rejected": -12.889068603515625, "step": 2010 }, { "epoch": 0.5500340367597005, "grad_norm": 5.364296538757404, "learning_rate": 0.0001, "logits/chosen": -1.3891135454177856, "logits/rejected": -1.2110545635223389, "logps/chosen": -719.6419677734375, "logps/rejected": -1496.1673583984375, "loss": 0.1839, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.265209674835205, "rewards/margins": 8.223184585571289, "rewards/rejected": -11.48839282989502, "step": 2020 }, { "epoch": 0.5527569775357386, "grad_norm": 11.808959554828267, "learning_rate": 9.904916909355638e-05, "logits/chosen": -1.5816423892974854, "logits/rejected": -1.1652607917785645, "logps/chosen": -539.0081176757812, "logps/rejected": -1492.573486328125, "loss": 0.2031, "rewards/accuracies": 0.9375, "rewards/chosen": -2.16530704498291, "rewards/margins": 9.950845718383789, "rewards/rejected": -12.116150856018066, "step": 2030 }, { "epoch": 0.5554799183117767, "grad_norm": 8.952558589174538, "learning_rate": 9.809842415172048e-05, "logits/chosen": -1.3569689989089966, "logits/rejected": -1.1630733013153076, "logps/chosen": -703.2092895507812, "logps/rejected": -1485.3570556640625, "loss": 0.2262, "rewards/accuracies": 0.875, "rewards/chosen": -3.354710817337036, "rewards/margins": 8.277021408081055, "rewards/rejected": -11.631732940673828, "step": 2040 }, { "epoch": 0.5582028590878149, "grad_norm": 8.492633179657727, "learning_rate": 9.714785113132801e-05, "logits/chosen": -0.8783491849899292, "logits/rejected": -0.5752300024032593, "logps/chosen": -723.2975463867188, "logps/rejected": -1613.511962890625, "loss": 0.1688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7593815326690674, "rewards/margins": 9.781450271606445, "rewards/rejected": -13.54083251953125, "step": 2050 }, { "epoch": 0.5609257998638529, "grad_norm": 5.586376860873606, "learning_rate": 9.619753597367134e-05, "logits/chosen": -1.0634355545043945, "logits/rejected": -0.6238435506820679, "logps/chosen": -619.2913208007812, "logps/rejected": -1520.8214111328125, "loss": 0.1177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2205727100372314, "rewards/margins": 9.413098335266113, "rewards/rejected": -12.633668899536133, "step": 2060 }, { "epoch": 0.5636487406398911, "grad_norm": 4.45704315988639, "learning_rate": 9.52475645967293e-05, "logits/chosen": -0.9397182464599609, "logits/rejected": -0.5685716271400452, "logps/chosen": -667.0559692382812, "logps/rejected": -1488.3125, "loss": 0.1976, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.539982318878174, "rewards/margins": 8.752443313598633, "rewards/rejected": -12.292426109313965, "step": 2070 }, { "epoch": 0.5663716814159292, "grad_norm": 7.7241013263377445, "learning_rate": 9.429802288739963e-05, "logits/chosen": -0.5390239953994751, "logits/rejected": -0.31946104764938354, "logps/chosen": -702.8306884765625, "logps/rejected": -1439.090087890625, "loss": 0.1863, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.5139050483703613, "rewards/margins": 7.537717342376709, "rewards/rejected": -11.05162239074707, "step": 2080 }, { "epoch": 0.5690946221919673, "grad_norm": 980.6412639778208, "learning_rate": 9.334899669373379e-05, "logits/chosen": 0.06543504446744919, "logits/rejected": 0.195405513048172, "logps/chosen": -813.8810424804688, "logps/rejected": -1287.7030029296875, "loss": 0.6388, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.83243465423584, "rewards/margins": 5.0349626541137695, "rewards/rejected": -9.86739730834961, "step": 2090 }, { "epoch": 0.5718175629680055, "grad_norm": 13.897944998252893, "learning_rate": 9.240057181717546e-05, "logits/chosen": -0.2763592004776001, "logits/rejected": -0.15278960764408112, "logps/chosen": -681.8336791992188, "logps/rejected": -1307.804931640625, "loss": 0.3034, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.6334025859832764, "rewards/margins": 6.592775821685791, "rewards/rejected": -10.226178169250488, "step": 2100 }, { "epoch": 0.5745405037440435, "grad_norm": 8.530907247381231, "learning_rate": 9.145283400480333e-05, "logits/chosen": -0.15677616000175476, "logits/rejected": -0.0704054981470108, "logps/chosen": -658.6497192382812, "logps/rejected": -1345.2955322265625, "loss": 0.2164, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.112351655960083, "rewards/margins": 6.875027656555176, "rewards/rejected": -9.987378120422363, "step": 2110 }, { "epoch": 0.5772634445200817, "grad_norm": 16.381271243779008, "learning_rate": 9.050586894157854e-05, "logits/chosen": -0.28872478008270264, "logits/rejected": -0.1264825314283371, "logps/chosen": -806.0684814453125, "logps/rejected": -1522.5067138671875, "loss": 0.5353, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.404202461242676, "rewards/margins": 7.904346466064453, "rewards/rejected": -12.308548927307129, "step": 2120 }, { "epoch": 0.5799863852961198, "grad_norm": 11.196451592798955, "learning_rate": 8.955976224259798e-05, "logits/chosen": 0.03705821558833122, "logits/rejected": 0.18372680246829987, "logps/chosen": -716.6239013671875, "logps/rejected": -1472.212646484375, "loss": 0.2185, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.9464664459228516, "rewards/margins": 8.058629989624023, "rewards/rejected": -12.005096435546875, "step": 2130 }, { "epoch": 0.582709326072158, "grad_norm": 33.293697813605284, "learning_rate": 8.86145994453539e-05, "logits/chosen": -0.03169569373130798, "logits/rejected": 0.13412004709243774, "logps/chosen": -664.3828125, "logps/rejected": -1275.538330078125, "loss": 0.2639, "rewards/accuracies": 0.875, "rewards/chosen": -2.8449416160583496, "rewards/margins": 6.907110691070557, "rewards/rejected": -9.752052307128906, "step": 2140 }, { "epoch": 0.585432266848196, "grad_norm": 7.031252506857269, "learning_rate": 8.767046600200045e-05, "logits/chosen": 0.9719666242599487, "logits/rejected": 1.0802761316299438, "logps/chosen": -654.38916015625, "logps/rejected": -1507.3211669921875, "loss": 0.1713, "rewards/accuracies": 0.9375, "rewards/chosen": -3.204228639602661, "rewards/margins": 8.715561866760254, "rewards/rejected": -11.919790267944336, "step": 2150 }, { "epoch": 0.5881552076242341, "grad_norm": 98.49722190943557, "learning_rate": 8.672744727162781e-05, "logits/chosen": 1.4611481428146362, "logits/rejected": 1.6459019184112549, "logps/chosen": -720.2965698242188, "logps/rejected": -1626.4840087890625, "loss": 3.8797, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.969226360321045, "rewards/margins": 9.662763595581055, "rewards/rejected": -13.631990432739258, "step": 2160 }, { "epoch": 0.5908781484002723, "grad_norm": 81.8395749005472, "learning_rate": 8.578562851254501e-05, "logits/chosen": 1.584269404411316, "logits/rejected": 1.420672059059143, "logps/chosen": -908.0494995117188, "logps/rejected": -2301.003173828125, "loss": 0.6286, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.922436714172363, "rewards/margins": 13.946557998657227, "rewards/rejected": -19.868993759155273, "step": 2170 }, { "epoch": 0.5936010891763104, "grad_norm": 37.76082762966783, "learning_rate": 8.484509487457172e-05, "logits/chosen": 2.651890993118286, "logits/rejected": 2.771029472351074, "logps/chosen": -1107.451904296875, "logps/rejected": -1360.6392822265625, "loss": 0.6267, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.304482936859131, "rewards/margins": 3.189244508743286, "rewards/rejected": -10.49372673034668, "step": 2180 }, { "epoch": 0.5963240299523486, "grad_norm": 9.755340455132105, "learning_rate": 8.390593139133975e-05, "logits/chosen": 2.368619918823242, "logits/rejected": 1.9935481548309326, "logps/chosen": -682.6423950195312, "logps/rejected": -1282.08935546875, "loss": 0.2434, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.698317050933838, "rewards/margins": 6.121725559234619, "rewards/rejected": -9.820043563842773, "step": 2190 }, { "epoch": 0.5990469707283866, "grad_norm": 18.76492804372937, "learning_rate": 8.296822297260541e-05, "logits/chosen": 1.6552093029022217, "logits/rejected": 1.6404740810394287, "logps/chosen": -648.8519287109375, "logps/rejected": -1346.6575927734375, "loss": 0.2895, "rewards/accuracies": 0.875, "rewards/chosen": -2.9380698204040527, "rewards/margins": 7.239809989929199, "rewards/rejected": -10.17788028717041, "step": 2200 }, { "epoch": 0.6017699115044248, "grad_norm": 10.336382337282298, "learning_rate": 8.203205439657248e-05, "logits/chosen": 0.8887385129928589, "logits/rejected": 1.1281851530075073, "logps/chosen": -658.0531005859375, "logps/rejected": -1374.1337890625, "loss": 1.5843, "rewards/accuracies": 0.875, "rewards/chosen": -3.1537742614746094, "rewards/margins": 7.757615566253662, "rewards/rejected": -10.911389350891113, "step": 2210 }, { "epoch": 0.6044928522804629, "grad_norm": 4.623988802456331, "learning_rate": 8.109751030222765e-05, "logits/chosen": -0.07704190909862518, "logits/rejected": 0.13484761118888855, "logps/chosen": -662.59033203125, "logps/rejected": -1480.857177734375, "loss": 0.371, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.091067314147949, "rewards/margins": 8.361741065979004, "rewards/rejected": -11.452808380126953, "step": 2220 }, { "epoch": 0.607215793056501, "grad_norm": 11.914279665453476, "learning_rate": 8.016467518168821e-05, "logits/chosen": 0.3015810549259186, "logits/rejected": 0.36731666326522827, "logps/chosen": -736.3297729492188, "logps/rejected": -1540.991943359375, "loss": 0.2803, "rewards/accuracies": 0.875, "rewards/chosen": -3.8391730785369873, "rewards/margins": 8.72679328918457, "rewards/rejected": -12.56596565246582, "step": 2230 }, { "epoch": 0.6099387338325392, "grad_norm": 24.117684567012283, "learning_rate": 7.923363337256328e-05, "logits/chosen": 0.606380820274353, "logits/rejected": 0.8258590698242188, "logps/chosen": -774.5369873046875, "logps/rejected": -1497.198974609375, "loss": 1.462, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.283402442932129, "rewards/margins": 7.879556179046631, "rewards/rejected": -12.162959098815918, "step": 2240 }, { "epoch": 0.6126616746085772, "grad_norm": 7.619277045213005, "learning_rate": 7.830446905032858e-05, "logits/chosen": -0.02835053764283657, "logits/rejected": 0.22348585724830627, "logps/chosen": -702.3712768554688, "logps/rejected": -1577.526611328125, "loss": 0.2359, "rewards/accuracies": 0.875, "rewards/chosen": -3.479196548461914, "rewards/margins": 9.12803840637207, "rewards/rejected": -12.6072359085083, "step": 2250 }, { "epoch": 0.6153846153846154, "grad_norm": 14.605221968669897, "learning_rate": 7.737726622071628e-05, "logits/chosen": -0.7005853652954102, "logits/rejected": -0.37476682662963867, "logps/chosen": -575.2031860351562, "logps/rejected": -1441.398681640625, "loss": 0.2372, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.820068359375, "rewards/margins": 8.712976455688477, "rewards/rejected": -11.533044815063477, "step": 2260 }, { "epoch": 0.6181075561606535, "grad_norm": 8.259350540995301, "learning_rate": 7.645210871212014e-05, "logits/chosen": -0.33096233010292053, "logits/rejected": -0.10710246860980988, "logps/chosen": -687.130126953125, "logps/rejected": -1422.7803955078125, "loss": 0.2, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.4392876625061035, "rewards/margins": 7.6333770751953125, "rewards/rejected": -11.072665214538574, "step": 2270 }, { "epoch": 0.6208304969366917, "grad_norm": 11.61901143825517, "learning_rate": 7.552908016801632e-05, "logits/chosen": -0.26237788796424866, "logits/rejected": 0.03336496278643608, "logps/chosen": -818.0942993164062, "logps/rejected": -1831.728515625, "loss": 0.1969, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.504512310028076, "rewards/margins": 10.295443534851074, "rewards/rejected": -14.799957275390625, "step": 2280 }, { "epoch": 0.6235534377127298, "grad_norm": 11.742306666819731, "learning_rate": 7.460826403940155e-05, "logits/chosen": -0.6621205806732178, "logits/rejected": -0.40963059663772583, "logps/chosen": -665.1314697265625, "logps/rejected": -1418.714111328125, "loss": 0.2204, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.196091413497925, "rewards/margins": 8.150285720825195, "rewards/rejected": -11.346376419067383, "step": 2290 }, { "epoch": 0.6262763784887678, "grad_norm": 8.175831739988089, "learning_rate": 7.368974357724789e-05, "logits/chosen": -0.9455803036689758, "logits/rejected": -0.6427022218704224, "logps/chosen": -634.2056884765625, "logps/rejected": -1466.7425537109375, "loss": 0.1935, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.8602559566497803, "rewards/margins": 8.551790237426758, "rewards/rejected": -11.412044525146484, "step": 2300 }, { "epoch": 0.628999319264806, "grad_norm": 11.08633926544173, "learning_rate": 7.277360182497633e-05, "logits/chosen": -0.14578841626644135, "logits/rejected": 0.062481798231601715, "logps/chosen": -709.3231201171875, "logps/rejected": -1652.924072265625, "loss": 0.1866, "rewards/accuracies": 0.9375, "rewards/chosen": -3.703045606613159, "rewards/margins": 9.57800006866455, "rewards/rejected": -13.281045913696289, "step": 2310 }, { "epoch": 0.6317222600408441, "grad_norm": 18.419415877088248, "learning_rate": 7.185992161094876e-05, "logits/chosen": 0.5623148679733276, "logits/rejected": 0.7446078062057495, "logps/chosen": -778.8319091796875, "logps/rejected": -1452.923583984375, "loss": 0.2594, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.7897286415100098, "rewards/margins": 7.021533966064453, "rewards/rejected": -10.811262130737305, "step": 2320 }, { "epoch": 0.6344452008168823, "grad_norm": 7.760961578008244, "learning_rate": 7.094878554097935e-05, "logits/chosen": 0.16852492094039917, "logits/rejected": 0.37618380784988403, "logps/chosen": -627.3386840820312, "logps/rejected": -1635.9002685546875, "loss": 0.1427, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.179767370223999, "rewards/margins": 10.47213363647461, "rewards/rejected": -13.651899337768555, "step": 2330 }, { "epoch": 0.6371681415929203, "grad_norm": 6.902496772019522, "learning_rate": 7.004027599086635e-05, "logits/chosen": -0.08031632751226425, "logits/rejected": 0.10308752208948135, "logps/chosen": -617.7669677734375, "logps/rejected": -1439.9415283203125, "loss": 0.1795, "rewards/accuracies": 0.875, "rewards/chosen": -2.939218044281006, "rewards/margins": 8.878007888793945, "rewards/rejected": -11.817225456237793, "step": 2340 }, { "epoch": 0.6398910823689585, "grad_norm": 4.644941374043179, "learning_rate": 6.913447509894431e-05, "logits/chosen": 0.3528628647327423, "logits/rejected": 0.6444572806358337, "logps/chosen": -709.1551513671875, "logps/rejected": -1517.9844970703125, "loss": 0.2281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.4242711067199707, "rewards/margins": 8.883430480957031, "rewards/rejected": -12.307700157165527, "step": 2350 }, { "epoch": 0.6426140231449966, "grad_norm": 10.647837733586355, "learning_rate": 6.823146475865808e-05, "logits/chosen": 0.5684790015220642, "logits/rejected": 0.799397349357605, "logps/chosen": -809.0391845703125, "logps/rejected": -1618.5943603515625, "loss": 0.1812, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.392528057098389, "rewards/margins": 8.570115089416504, "rewards/rejected": -12.962644577026367, "step": 2360 }, { "epoch": 0.6453369639210347, "grad_norm": 4.992656274243724, "learning_rate": 6.73313266111587e-05, "logits/chosen": 0.6541573405265808, "logits/rejected": 0.8468329310417175, "logps/chosen": -720.4848022460938, "logps/rejected": -1716.3724365234375, "loss": 0.1339, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6888911724090576, "rewards/margins": 10.465368270874023, "rewards/rejected": -14.154258728027344, "step": 2370 }, { "epoch": 0.6480599046970729, "grad_norm": 144.2262824604598, "learning_rate": 6.643414203792245e-05, "logits/chosen": 0.5966984629631042, "logits/rejected": 0.8201514482498169, "logps/chosen": -945.392578125, "logps/rejected": -1575.818359375, "loss": 0.7, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.658278465270996, "rewards/margins": 6.733407020568848, "rewards/rejected": -12.391685485839844, "step": 2380 }, { "epoch": 0.6507828454731109, "grad_norm": 10.385459412302742, "learning_rate": 6.553999215339289e-05, "logits/chosen": -0.4542000889778137, "logits/rejected": -0.23481395840644836, "logps/chosen": -672.037841796875, "logps/rejected": -1340.35791015625, "loss": 0.2024, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1981143951416016, "rewards/margins": 7.2478790283203125, "rewards/rejected": -10.445992469787598, "step": 2390 }, { "epoch": 0.6535057862491491, "grad_norm": 3.4283866597659953, "learning_rate": 6.464895779764741e-05, "logits/chosen": -0.3537091910839081, "logits/rejected": -0.12687480449676514, "logps/chosen": -703.0349731445312, "logps/rejected": -1308.7872314453125, "loss": 0.247, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.5580172538757324, "rewards/margins": 6.497230529785156, "rewards/rejected": -10.05524730682373, "step": 2400 }, { "epoch": 0.6562287270251872, "grad_norm": 10.682248923459898, "learning_rate": 6.376111952908859e-05, "logits/chosen": -0.14068874716758728, "logits/rejected": 0.12782877683639526, "logps/chosen": -731.6837158203125, "logps/rejected": -1395.9217529296875, "loss": 0.2446, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.983330249786377, "rewards/margins": 7.223794460296631, "rewards/rejected": -11.207123756408691, "step": 2410 }, { "epoch": 0.6589516678012253, "grad_norm": 7.845658542414211, "learning_rate": 6.287655761716067e-05, "logits/chosen": -0.6590530276298523, "logits/rejected": -0.5280757546424866, "logps/chosen": -560.03173828125, "logps/rejected": -1205.8927001953125, "loss": 0.1641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.356152296066284, "rewards/margins": 7.194516181945801, "rewards/rejected": -9.550667762756348, "step": 2420 }, { "epoch": 0.6616746085772635, "grad_norm": 8.940033970881544, "learning_rate": 6.199535203509272e-05, "logits/chosen": -0.6741346120834351, "logits/rejected": -0.5807133316993713, "logps/chosen": -559.6777954101562, "logps/rejected": -1138.4764404296875, "loss": 0.1621, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8906910419464111, "rewards/margins": 6.3634185791015625, "rewards/rejected": -8.254109382629395, "step": 2430 }, { "epoch": 0.6643975493533015, "grad_norm": 13.766342422082177, "learning_rate": 6.111758245266794e-05, "logits/chosen": -0.4139633774757385, "logits/rejected": -0.12003821134567261, "logps/chosen": -607.3410034179688, "logps/rejected": -1390.0321044921875, "loss": 0.153, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9113223552703857, "rewards/margins": 7.944007873535156, "rewards/rejected": -10.855331420898438, "step": 2440 }, { "epoch": 0.6671204901293397, "grad_norm": 14.138045726244465, "learning_rate": 6.02433282290209e-05, "logits/chosen": -0.36063870787620544, "logits/rejected": -0.06381018459796906, "logps/chosen": -699.7347412109375, "logps/rejected": -1470.082275390625, "loss": 0.1933, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.4292755126953125, "rewards/margins": 8.251054763793945, "rewards/rejected": -11.680330276489258, "step": 2450 }, { "epoch": 0.6698434309053778, "grad_norm": 3.280339771466081, "learning_rate": 5.937266840546265e-05, "logits/chosen": -1.0655449628829956, "logits/rejected": -0.8338848352432251, "logps/chosen": -624.5250244140625, "logps/rejected": -1253.6978759765625, "loss": 1.1411, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6232266426086426, "rewards/margins": 6.86643123626709, "rewards/rejected": -9.48965835571289, "step": 2460 }, { "epoch": 0.672566371681416, "grad_norm": 4.570905374864043, "learning_rate": 5.850568169833449e-05, "logits/chosen": -0.7835721969604492, "logits/rejected": -0.49934712052345276, "logps/chosen": -576.0851440429688, "logps/rejected": -1278.7838134765625, "loss": 0.1643, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4718785285949707, "rewards/margins": 7.4020280838012695, "rewards/rejected": -9.873908042907715, "step": 2470 }, { "epoch": 0.675289312457454, "grad_norm": 11.384731637325952, "learning_rate": 5.76424464918915e-05, "logits/chosen": -0.3228727877140045, "logits/rejected": -0.060401104390621185, "logps/chosen": -685.0753173828125, "logps/rejected": -1462.6395263671875, "loss": 0.8961, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.290065050125122, "rewards/margins": 7.7338151931762695, "rewards/rejected": -11.023880004882812, "step": 2480 }, { "epoch": 0.6780122532334921, "grad_norm": 24.041375030089217, "learning_rate": 5.678304083121546e-05, "logits/chosen": -0.5586646199226379, "logits/rejected": -0.27681785821914673, "logps/chosen": -995.1302490234375, "logps/rejected": -1378.470947265625, "loss": 1.1643, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.8445916175842285, "rewards/margins": 4.3523359298706055, "rewards/rejected": -11.196928024291992, "step": 2490 }, { "epoch": 0.6807351940095303, "grad_norm": 11.286366339467362, "learning_rate": 5.592754241515908e-05, "logits/chosen": -0.8155454397201538, "logits/rejected": -0.5854301452636719, "logps/chosen": -649.5323486328125, "logps/rejected": -1438.3155517578125, "loss": 0.1778, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.1901750564575195, "rewards/margins": 8.069450378417969, "rewards/rejected": -11.259625434875488, "step": 2500 }, { "epoch": 0.6807351940095303, "eval_logits/chosen": -0.96079421043396, "eval_logits/rejected": -0.711790919303894, "eval_logps/chosen": -687.1464233398438, "eval_logps/rejected": -1489.6011962890625, "eval_loss": 0.2004450261592865, "eval_rewards/accuracies": 0.9242053627967834, "eval_rewards/chosen": -3.4745161533355713, "eval_rewards/margins": 8.502269744873047, "eval_rewards/rejected": -11.976785659790039, "eval_runtime": 3748.5252, "eval_samples_per_second": 1.308, "eval_steps_per_second": 0.109, "step": 2500 }, { "epoch": 0.6834581347855684, "grad_norm": 6.190600718672074, "learning_rate": 5.507602858932113e-05, "logits/chosen": -0.7578543424606323, "logits/rejected": -0.48180899024009705, "logps/chosen": -655.7686767578125, "logps/rejected": -1475.280517578125, "loss": 0.1788, "rewards/accuracies": 0.9375, "rewards/chosen": -3.3294525146484375, "rewards/margins": 8.292399406433105, "rewards/rejected": -11.62185287475586, "step": 2510 }, { "epoch": 0.6861810755616066, "grad_norm": 5.615801182575983, "learning_rate": 5.4228576339053586e-05, "logits/chosen": -0.6604496836662292, "logits/rejected": -0.3469547629356384, "logps/chosen": -687.5326538085938, "logps/rejected": -1508.2755126953125, "loss": 0.1189, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2411141395568848, "rewards/margins": 8.564860343933105, "rewards/rejected": -11.805974960327148, "step": 2520 }, { "epoch": 0.6889040163376446, "grad_norm": 9.926521527274184, "learning_rate": 5.3385262282501535e-05, "logits/chosen": -0.3594810962677002, "logits/rejected": -0.052802037447690964, "logps/chosen": -595.9348754882812, "logps/rejected": -1410.1544189453125, "loss": 0.2423, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8217885494232178, "rewards/margins": 8.592623710632324, "rewards/rejected": -11.414411544799805, "step": 2530 }, { "epoch": 0.6916269571136828, "grad_norm": 6.281514980730311, "learning_rate": 5.254616266367591e-05, "logits/chosen": -0.4293007254600525, "logits/rejected": -0.24068334698677063, "logps/chosen": -549.3060302734375, "logps/rejected": -1288.567626953125, "loss": 0.1586, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1384198665618896, "rewards/margins": 7.993442535400391, "rewards/rejected": -10.131863594055176, "step": 2540 }, { "epoch": 0.6943498978897209, "grad_norm": 6.11278397464638, "learning_rate": 5.171135334556047e-05, "logits/chosen": 0.2943124771118164, "logits/rejected": 0.4400175213813782, "logps/chosen": -577.3480224609375, "logps/rejected": -1332.40625, "loss": 0.1633, "rewards/accuracies": 0.9375, "rewards/chosen": -2.670849084854126, "rewards/margins": 7.999800682067871, "rewards/rejected": -10.670649528503418, "step": 2550 }, { "epoch": 0.697072838665759, "grad_norm": 9.008079572058884, "learning_rate": 5.088090980325297e-05, "logits/chosen": 1.101324439048767, "logits/rejected": 1.0822101831436157, "logps/chosen": -674.9332275390625, "logps/rejected": -1506.8800048828125, "loss": 0.1711, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.356271743774414, "rewards/margins": 8.419809341430664, "rewards/rejected": -11.776080131530762, "step": 2560 }, { "epoch": 0.6997957794417972, "grad_norm": 9.425933719708931, "learning_rate": 5.005490711714139e-05, "logits/chosen": 0.4512532353401184, "logits/rejected": 0.5302013158798218, "logps/chosen": -619.0537719726562, "logps/rejected": -1460.2489013671875, "loss": 0.1672, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.9995157718658447, "rewards/margins": 8.4155855178833, "rewards/rejected": -11.41510009765625, "step": 2570 }, { "epoch": 0.7025187202178352, "grad_norm": 10.533473413081708, "learning_rate": 4.9233419966116036e-05, "logits/chosen": -0.4281206727027893, "logits/rejected": -0.11214808374643326, "logps/chosen": -540.2472534179688, "logps/rejected": -1485.8243408203125, "loss": 0.209, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3536593914031982, "rewards/margins": 9.66262149810791, "rewards/rejected": -12.016282081604004, "step": 2580 }, { "epoch": 0.7052416609938734, "grad_norm": 2.6130275891308385, "learning_rate": 4.8416522620817627e-05, "logits/chosen": -0.7308027148246765, "logits/rejected": -0.41089239716529846, "logps/chosen": -613.0734252929688, "logps/rejected": -1534.1898193359375, "loss": 0.139, "rewards/accuracies": 0.9375, "rewards/chosen": -2.856041431427002, "rewards/margins": 9.61817741394043, "rewards/rejected": -12.474218368530273, "step": 2590 }, { "epoch": 0.7079646017699115, "grad_norm": 38.945197564012595, "learning_rate": 4.7604288936922735e-05, "logits/chosen": -0.625209629535675, "logits/rejected": -0.34178656339645386, "logps/chosen": -627.1326904296875, "logps/rejected": -1564.62158203125, "loss": 0.1752, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.122101306915283, "rewards/margins": 10.082366943359375, "rewards/rejected": -13.2044677734375, "step": 2600 }, { "epoch": 0.7106875425459497, "grad_norm": 3.161769729178617, "learning_rate": 4.6796792348466356e-05, "logits/chosen": -0.6596896648406982, "logits/rejected": -0.32370975613594055, "logps/chosen": -559.852783203125, "logps/rejected": -1473.3958740234375, "loss": 0.1316, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7754173278808594, "rewards/margins": 9.06057357788086, "rewards/rejected": -11.835991859436035, "step": 2610 }, { "epoch": 0.7134104833219878, "grad_norm": 12.02339802433789, "learning_rate": 4.599410586120272e-05, "logits/chosen": -0.44842538237571716, "logits/rejected": -0.05267402529716492, "logps/chosen": -645.791259765625, "logps/rejected": -1563.593994140625, "loss": 0.1445, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7652764320373535, "rewards/margins": 9.446809768676758, "rewards/rejected": -12.212087631225586, "step": 2620 }, { "epoch": 0.7161334240980258, "grad_norm": 11.056020877129452, "learning_rate": 4.5196302046004926e-05, "logits/chosen": -0.5120202302932739, "logits/rejected": -0.22029462456703186, "logps/chosen": -651.0948486328125, "logps/rejected": -1455.8291015625, "loss": 0.194, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2218902111053467, "rewards/margins": 8.378633499145508, "rewards/rejected": -11.600522994995117, "step": 2630 }, { "epoch": 0.718856364874064, "grad_norm": 3.682848580956075, "learning_rate": 4.4403453032303764e-05, "logits/chosen": -0.27802684903144836, "logits/rejected": -0.03195106238126755, "logps/chosen": -679.2535400390625, "logps/rejected": -1628.7232666015625, "loss": 0.1481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.0372445583343506, "rewards/margins": 9.984166145324707, "rewards/rejected": -13.021410942077637, "step": 2640 }, { "epoch": 0.7215793056501021, "grad_norm": 7.257808137219564, "learning_rate": 4.3615630501566384e-05, "logits/chosen": -0.1426171511411667, "logits/rejected": 0.09189265221357346, "logps/chosen": -624.9664306640625, "logps/rejected": -1382.918701171875, "loss": 0.1858, "rewards/accuracies": 0.9375, "rewards/chosen": -2.731658935546875, "rewards/margins": 8.023258209228516, "rewards/rejected": -10.75491714477539, "step": 2650 }, { "epoch": 0.7243022464261403, "grad_norm": 11.218556371279355, "learning_rate": 4.283290568081591e-05, "logits/chosen": -0.15059207379817963, "logits/rejected": -0.008915537968277931, "logps/chosen": -594.3723754882812, "logps/rejected": -1270.779052734375, "loss": 0.3277, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.626492977142334, "rewards/margins": 6.870394706726074, "rewards/rejected": -9.49688720703125, "step": 2660 }, { "epoch": 0.7270251872021783, "grad_norm": 15.714016618081846, "learning_rate": 4.2055349336191366e-05, "logits/chosen": 0.12408769130706787, "logits/rejected": 0.4155445098876953, "logps/chosen": -705.4385375976562, "logps/rejected": -1534.524169921875, "loss": 0.1837, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.389605760574341, "rewards/margins": 8.947419166564941, "rewards/rejected": -12.337023735046387, "step": 2670 }, { "epoch": 0.7297481279782164, "grad_norm": 5.674729134005591, "learning_rate": 4.128303176655002e-05, "logits/chosen": -0.021259818226099014, "logits/rejected": 0.36636242270469666, "logps/chosen": -713.9142456054688, "logps/rejected": -1694.980712890625, "loss": 0.2208, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.674048662185669, "rewards/margins": 10.161161422729492, "rewards/rejected": -13.835209846496582, "step": 2680 }, { "epoch": 0.7324710687542546, "grad_norm": 12.609521570286844, "learning_rate": 4.051602279711163e-05, "logits/chosen": -0.2557820677757263, "logits/rejected": 0.07670646160840988, "logps/chosen": -731.0612182617188, "logps/rejected": -1623.459716796875, "loss": 0.1931, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.7107300758361816, "rewards/margins": 9.202885627746582, "rewards/rejected": -12.913615226745605, "step": 2690 }, { "epoch": 0.7351940095302927, "grad_norm": 5.52670434124597, "learning_rate": 3.975439177314533e-05, "logits/chosen": -0.024277815595269203, "logits/rejected": 0.2917264699935913, "logps/chosen": -684.9026489257812, "logps/rejected": -1733.6109619140625, "loss": 0.1331, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6988205909729004, "rewards/margins": 11.1884765625, "rewards/rejected": -14.887295722961426, "step": 2700 }, { "epoch": 0.7379169503063309, "grad_norm": 3.5661251305369905, "learning_rate": 3.8998207553700506e-05, "logits/chosen": 0.26047295331954956, "logits/rejected": 0.5690510869026184, "logps/chosen": -759.6395263671875, "logps/rejected": -1651.3079833984375, "loss": 0.1748, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.069083213806152, "rewards/margins": 9.588483810424805, "rewards/rejected": -13.657567977905273, "step": 2710 }, { "epoch": 0.7406398910823689, "grad_norm": 12.452203870449546, "learning_rate": 3.824753850538082e-05, "logits/chosen": 0.25679439306259155, "logits/rejected": 0.6026118993759155, "logps/chosen": -745.6700439453125, "logps/rejected": -1589.962890625, "loss": 0.1568, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7858002185821533, "rewards/margins": 9.034173011779785, "rewards/rejected": -12.819973945617676, "step": 2720 }, { "epoch": 0.7433628318584071, "grad_norm": 8.031642884115076, "learning_rate": 3.750245249616352e-05, "logits/chosen": 0.2537817358970642, "logits/rejected": 0.6085513830184937, "logps/chosen": -645.0103759765625, "logps/rejected": -1783.4013671875, "loss": 0.1372, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.4503912925720215, "rewards/margins": 11.608026504516602, "rewards/rejected": -15.058418273925781, "step": 2730 }, { "epoch": 0.7460857726344452, "grad_norm": 14.569525144419739, "learning_rate": 3.6763016889263344e-05, "logits/chosen": 0.15033717453479767, "logits/rejected": 0.47281932830810547, "logps/chosen": -663.0330810546875, "logps/rejected": -1513.896728515625, "loss": 0.1803, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0524895191192627, "rewards/margins": 9.203824043273926, "rewards/rejected": -12.256312370300293, "step": 2740 }, { "epoch": 0.7488087134104833, "grad_norm": 62.65981235550733, "learning_rate": 3.602929853704223e-05, "logits/chosen": -0.08105476200580597, "logits/rejected": 0.24570266902446747, "logps/chosen": -862.7356567382812, "logps/rejected": -1566.857177734375, "loss": 0.8887, "rewards/accuracies": 0.9375, "rewards/chosen": -5.279505252838135, "rewards/margins": 7.579603672027588, "rewards/rejected": -12.859109878540039, "step": 2750 }, { "epoch": 0.7515316541865215, "grad_norm": 6.616456930014495, "learning_rate": 3.5301363774965256e-05, "logits/chosen": -0.20161032676696777, "logits/rejected": 0.048817120492458344, "logps/chosen": -653.1944580078125, "logps/rejected": -1532.51025390625, "loss": 0.1708, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.062190294265747, "rewards/margins": 8.815936088562012, "rewards/rejected": -11.878127098083496, "step": 2760 }, { "epoch": 0.7542545949625595, "grad_norm": 5.792226112009077, "learning_rate": 3.457927841560311e-05, "logits/chosen": -0.4394923150539398, "logits/rejected": -0.09831082820892334, "logps/chosen": -581.9075927734375, "logps/rejected": -1791.471435546875, "loss": 0.1458, "rewards/accuracies": 0.9375, "rewards/chosen": -2.953946352005005, "rewards/margins": 12.19267749786377, "rewards/rejected": -15.146624565124512, "step": 2770 }, { "epoch": 0.7569775357385977, "grad_norm": 10.191012036267383, "learning_rate": 3.3863107742682144e-05, "logits/chosen": -0.5570166707038879, "logits/rejected": -0.26251545548439026, "logps/chosen": -670.0498046875, "logps/rejected": -1517.240478515625, "loss": 0.1453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.960145950317383, "rewards/margins": 9.034058570861816, "rewards/rejected": -11.994203567504883, "step": 2780 }, { "epoch": 0.7597004765146358, "grad_norm": 5.4213794129480215, "learning_rate": 3.315291650518197e-05, "logits/chosen": -0.3676304221153259, "logits/rejected": -0.056665968149900436, "logps/chosen": -603.147705078125, "logps/rejected": -1735.9398193359375, "loss": 0.1155, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.912886381149292, "rewards/margins": 11.69940185546875, "rewards/rejected": -14.612287521362305, "step": 2790 }, { "epoch": 0.762423417290674, "grad_norm": 12.375680844171878, "learning_rate": 3.2448768911481576e-05, "logits/chosen": -0.09998101741075516, "logits/rejected": 0.16582295298576355, "logps/chosen": -644.6640625, "logps/rejected": -1918.529296875, "loss": 0.1839, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1426656246185303, "rewards/margins": 13.003247261047363, "rewards/rejected": -16.14591407775879, "step": 2800 }, { "epoch": 0.765146358066712, "grad_norm": 5.975694994931154, "learning_rate": 3.175072862355415e-05, "logits/chosen": 0.23628827929496765, "logits/rejected": 0.4834356904029846, "logps/chosen": -655.3341064453125, "logps/rejected": -1792.5072021484375, "loss": 0.1245, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2562522888183594, "rewards/margins": 11.580477714538574, "rewards/rejected": -14.836729049682617, "step": 2810 }, { "epoch": 0.7678692988427501, "grad_norm": 12.23511220826118, "learning_rate": 3.105885875121152e-05, "logits/chosen": 0.279664009809494, "logits/rejected": 0.541578471660614, "logps/chosen": -647.7738037109375, "logps/rejected": -1687.0943603515625, "loss": 0.1188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.340040922164917, "rewards/margins": 10.875506401062012, "rewards/rejected": -14.215548515319824, "step": 2820 }, { "epoch": 0.7705922396187883, "grad_norm": 6.939190892672432, "learning_rate": 3.0373221846398235e-05, "logits/chosen": 0.2148459255695343, "logits/rejected": 0.46170759201049805, "logps/chosen": -640.2728881835938, "logps/rejected": -1714.49609375, "loss": 0.127, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.9620585441589355, "rewards/margins": 11.328829765319824, "rewards/rejected": -14.290887832641602, "step": 2830 }, { "epoch": 0.7733151803948264, "grad_norm": 6.269444512733244, "learning_rate": 2.9693879897536436e-05, "logits/chosen": 0.27309393882751465, "logits/rejected": 0.5137401223182678, "logps/chosen": -668.1610107421875, "logps/rejected": -1638.63671875, "loss": 0.1535, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3333866596221924, "rewards/margins": 10.211597442626953, "rewards/rejected": -13.54498291015625, "step": 2840 }, { "epoch": 0.7760381211708646, "grad_norm": 7.741925527096101, "learning_rate": 2.9020894323921366e-05, "logits/chosen": 0.25359398126602173, "logits/rejected": 0.5846693515777588, "logps/chosen": -644.554443359375, "logps/rejected": -1534.7540283203125, "loss": 0.1861, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.090136766433716, "rewards/margins": 9.402518272399902, "rewards/rejected": -12.492655754089355, "step": 2850 }, { "epoch": 0.7787610619469026, "grad_norm": 17.942064241858763, "learning_rate": 2.8354325970168484e-05, "logits/chosen": 0.39829185605049133, "logits/rejected": 0.6095653772354126, "logps/chosen": -619.0804443359375, "logps/rejected": -1742.217041015625, "loss": 0.1313, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.099508047103882, "rewards/margins": 11.940356254577637, "rewards/rejected": -15.039861679077148, "step": 2860 }, { "epoch": 0.7814840027229408, "grad_norm": 11.47804437386106, "learning_rate": 2.7694235100712518e-05, "logits/chosen": 0.4158736765384674, "logits/rejected": 0.6686142683029175, "logps/chosen": -601.0423583984375, "logps/rejected": -1758.963623046875, "loss": 0.1718, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.962646007537842, "rewards/margins": 12.120287895202637, "rewards/rejected": -15.082934379577637, "step": 2870 }, { "epoch": 0.7842069434989789, "grad_norm": 11.324300295147452, "learning_rate": 2.7040681394358813e-05, "logits/chosen": 0.26672840118408203, "logits/rejected": 0.49848708510398865, "logps/chosen": -687.2901611328125, "logps/rejected": -1745.1597900390625, "loss": 0.2234, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.3730876445770264, "rewards/margins": 10.44471263885498, "rewards/rejected": -13.817800521850586, "step": 2880 }, { "epoch": 0.786929884275017, "grad_norm": 3.8898657638742176, "learning_rate": 2.6393723938888125e-05, "logits/chosen": 0.1845589578151703, "logits/rejected": 0.44024285674095154, "logps/chosen": -611.0074462890625, "logps/rejected": -1503.866455078125, "loss": 0.171, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.838397741317749, "rewards/margins": 9.593734741210938, "rewards/rejected": -12.432132720947266, "step": 2890 }, { "epoch": 0.7896528250510552, "grad_norm": 7.214623467902022, "learning_rate": 2.5753421225714057e-05, "logits/chosen": 0.11631506681442261, "logits/rejected": 0.3261148929595947, "logps/chosen": -659.8120727539062, "logps/rejected": -1533.37939453125, "loss": 0.1904, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.959886074066162, "rewards/margins": 9.48987865447998, "rewards/rejected": -12.449766159057617, "step": 2900 }, { "epoch": 0.7923757658270932, "grad_norm": 14.811426408672398, "learning_rate": 2.5119831144595163e-05, "logits/chosen": 0.250567227602005, "logits/rejected": 0.4331508278846741, "logps/chosen": -606.6591796875, "logps/rejected": -1445.5570068359375, "loss": 0.1582, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.611093521118164, "rewards/margins": 9.088173866271973, "rewards/rejected": -11.699268341064453, "step": 2910 }, { "epoch": 0.7950987066031314, "grad_norm": 1.3533811803679652, "learning_rate": 2.4493010978401064e-05, "logits/chosen": 0.1440388262271881, "logits/rejected": 0.2874930799007416, "logps/chosen": -635.5841674804688, "logps/rejected": -1497.6400146484375, "loss": 0.1258, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7259573936462402, "rewards/margins": 9.025880813598633, "rewards/rejected": -11.751837730407715, "step": 2920 }, { "epoch": 0.7978216473791695, "grad_norm": 8.320155633358658, "learning_rate": 2.3873017397933327e-05, "logits/chosen": 0.33122745156288147, "logits/rejected": 0.5511559247970581, "logps/chosen": -595.3291015625, "logps/rejected": -1565.5882568359375, "loss": 0.1784, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.964611530303955, "rewards/margins": 9.899821281433105, "rewards/rejected": -12.864433288574219, "step": 2930 }, { "epoch": 0.8005445881552076, "grad_norm": 11.153673486628335, "learning_rate": 2.3259906456802216e-05, "logits/chosen": 0.14781682193279266, "logits/rejected": 0.2881123125553131, "logps/chosen": -578.9710083007812, "logps/rejected": -1423.344970703125, "loss": 0.1937, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7508277893066406, "rewards/margins": 8.510377883911133, "rewards/rejected": -11.261204719543457, "step": 2940 }, { "epoch": 0.8032675289312458, "grad_norm": 6.886878526143292, "learning_rate": 2.265373358635856e-05, "logits/chosen": -0.09965401142835617, "logits/rejected": -0.01610557734966278, "logps/chosen": -640.9449462890625, "logps/rejected": -1365.075439453125, "loss": 0.2531, "rewards/accuracies": 0.875, "rewards/chosen": -2.95725154876709, "rewards/margins": 7.233819007873535, "rewards/rejected": -10.191070556640625, "step": 2950 }, { "epoch": 0.8059904697072838, "grad_norm": 16.611108572392084, "learning_rate": 2.205455359068227e-05, "logits/chosen": -0.021208569407463074, "logits/rejected": 0.1042444258928299, "logps/chosen": -568.1573486328125, "logps/rejected": -1511.6607666015625, "loss": 0.1583, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5029826164245605, "rewards/margins": 9.983243942260742, "rewards/rejected": -12.486227035522461, "step": 2960 }, { "epoch": 0.808713410483322, "grad_norm": 8.57165227758473, "learning_rate": 2.146242064162767e-05, "logits/chosen": -0.03554140776395798, "logits/rejected": 0.11067160218954086, "logps/chosen": -583.32568359375, "logps/rejected": -1428.696533203125, "loss": 0.1406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.437119960784912, "rewards/margins": 8.866755485534668, "rewards/rejected": -11.303875923156738, "step": 2970 }, { "epoch": 0.8114363512593601, "grad_norm": 6.1603778868761285, "learning_rate": 2.0877388273925646e-05, "logits/chosen": 0.10540245473384857, "logits/rejected": 0.2626660466194153, "logps/chosen": -588.477783203125, "logps/rejected": -1473.778076171875, "loss": 0.1282, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.5758628845214844, "rewards/margins": 9.339284896850586, "rewards/rejected": -11.91514778137207, "step": 2980 }, { "epoch": 0.8141592920353983, "grad_norm": 1.9188133882396965, "learning_rate": 2.029950938034364e-05, "logits/chosen": 0.17124342918395996, "logits/rejected": 0.24657151103019714, "logps/chosen": -604.7764282226562, "logps/rejected": -1434.573974609375, "loss": 0.118, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6486189365386963, "rewards/margins": 8.698214530944824, "rewards/rejected": -11.346834182739258, "step": 2990 }, { "epoch": 0.8168822328114363, "grad_norm": 6.713017303993807, "learning_rate": 1.9728836206903656e-05, "logits/chosen": 0.5552121996879578, "logits/rejected": 0.6636002063751221, "logps/chosen": -636.4107666015625, "logps/rejected": -1567.7227783203125, "loss": 0.1342, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9134068489074707, "rewards/margins": 9.883702278137207, "rewards/rejected": -12.797107696533203, "step": 3000 }, { "epoch": 0.8168822328114363, "eval_logits/chosen": 0.5428004264831543, "eval_logits/rejected": 0.672680675983429, "eval_logps/chosen": -648.9738159179688, "eval_logps/rejected": -1576.696044921875, "eval_loss": 0.1451936662197113, "eval_rewards/accuracies": 0.9339853525161743, "eval_rewards/chosen": -3.092790126800537, "eval_rewards/margins": 9.754944801330566, "eval_rewards/rejected": -12.847735404968262, "eval_runtime": 3746.5832, "eval_samples_per_second": 1.309, "eval_steps_per_second": 0.109, "step": 3000 }, { "epoch": 0.8196051735874744, "grad_norm": 3.6241792804364183, "learning_rate": 1.9165420348158526e-05, "logits/chosen": 0.6930165886878967, "logits/rejected": 0.7744175791740417, "logps/chosen": -633.44287109375, "logps/rejected": -1447.761474609375, "loss": 0.324, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.974301815032959, "rewards/margins": 8.308740615844727, "rewards/rejected": -11.283042907714844, "step": 3010 }, { "epoch": 0.8223281143635126, "grad_norm": 7.97257246955101, "learning_rate": 1.8609312742527497e-05, "logits/chosen": 0.3581380248069763, "logits/rejected": 0.5654556155204773, "logps/chosen": -625.2705688476562, "logps/rejected": -1440.2318115234375, "loss": 0.1714, "rewards/accuracies": 0.9375, "rewards/chosen": -2.855855941772461, "rewards/margins": 8.565881729125977, "rewards/rejected": -11.42173957824707, "step": 3020 }, { "epoch": 0.8250510551395507, "grad_norm": 13.920925895284425, "learning_rate": 1.8060563667690712e-05, "logits/chosen": 0.4211028218269348, "logits/rejected": 0.5939645171165466, "logps/chosen": -573.2506103515625, "logps/rejected": -1506.34814453125, "loss": 0.1703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.502913236618042, "rewards/margins": 9.767515182495117, "rewards/rejected": -12.270427703857422, "step": 3030 }, { "epoch": 0.8277739959155889, "grad_norm": 3.7906065265561137, "learning_rate": 1.7519222736043662e-05, "logits/chosen": 0.3115430474281311, "logits/rejected": 0.48292437195777893, "logps/chosen": -542.3753662109375, "logps/rejected": -1483.460205078125, "loss": 0.1485, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.395129680633545, "rewards/margins": 9.773895263671875, "rewards/rejected": -12.169026374816895, "step": 3040 }, { "epoch": 0.8304969366916269, "grad_norm": 9.127305003183238, "learning_rate": 1.6985338890211745e-05, "logits/chosen": 0.314956933259964, "logits/rejected": 0.4118734896183014, "logps/chosen": -588.65185546875, "logps/rejected": -1375.0704345703125, "loss": 0.1925, "rewards/accuracies": 0.9375, "rewards/chosen": -2.656867504119873, "rewards/margins": 7.795752048492432, "rewards/rejected": -10.452619552612305, "step": 3050 }, { "epoch": 0.8332198774676651, "grad_norm": 12.735372802298448, "learning_rate": 1.645896039862529e-05, "logits/chosen": 0.4811887741088867, "logits/rejected": 0.5578995943069458, "logps/chosen": -617.8807983398438, "logps/rejected": -1505.7147216796875, "loss": 0.1803, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.9311892986297607, "rewards/margins": 9.137002944946289, "rewards/rejected": -12.068192481994629, "step": 3060 }, { "epoch": 0.8359428182437032, "grad_norm": 7.068145484671497, "learning_rate": 1.5940134851155697e-05, "logits/chosen": 0.6108947992324829, "logits/rejected": 0.6557289361953735, "logps/chosen": -655.98828125, "logps/rejected": -1595.331298828125, "loss": 0.1364, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7303531169891357, "rewards/margins": 9.733122825622559, "rewards/rejected": -12.46347713470459, "step": 3070 }, { "epoch": 0.8386657590197413, "grad_norm": 4.310739879114603, "learning_rate": 1.542890915481282e-05, "logits/chosen": 0.7832988500595093, "logits/rejected": 0.9027649164199829, "logps/chosen": -618.2164306640625, "logps/rejected": -1467.2933349609375, "loss": 0.1309, "rewards/accuracies": 0.9375, "rewards/chosen": -2.913681745529175, "rewards/margins": 9.126543045043945, "rewards/rejected": -12.040224075317383, "step": 3080 }, { "epoch": 0.8413886997957795, "grad_norm": 4.180426497638915, "learning_rate": 1.4925329529504073e-05, "logits/chosen": 1.0196577310562134, "logits/rejected": 1.1263244152069092, "logps/chosen": -634.84716796875, "logps/rejected": -1566.780517578125, "loss": 0.1516, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2844977378845215, "rewards/margins": 9.819615364074707, "rewards/rejected": -13.10411262512207, "step": 3090 }, { "epoch": 0.8441116405718175, "grad_norm": 7.403131719337164, "learning_rate": 1.4429441503855722e-05, "logits/chosen": 0.7300751209259033, "logits/rejected": 0.9075593948364258, "logps/chosen": -662.4650268554688, "logps/rejected": -1523.392333984375, "loss": 0.1383, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.188582181930542, "rewards/margins": 9.216440200805664, "rewards/rejected": -12.405021667480469, "step": 3100 }, { "epoch": 0.8468345813478557, "grad_norm": 5.705211140347224, "learning_rate": 1.3941289911096566e-05, "logits/chosen": 0.8525094985961914, "logits/rejected": 0.8802255392074585, "logps/chosen": -633.8055419921875, "logps/rejected": -1591.3004150390625, "loss": 0.1648, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3043887615203857, "rewards/margins": 9.925670623779297, "rewards/rejected": -13.230059623718262, "step": 3110 }, { "epoch": 0.8495575221238938, "grad_norm": 4.045894996195174, "learning_rate": 1.3460918885004658e-05, "logits/chosen": 0.5131864547729492, "logits/rejected": 0.5491036772727966, "logps/chosen": -568.0333862304688, "logps/rejected": -1503.20263671875, "loss": 0.1776, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.693911552429199, "rewards/margins": 9.109321594238281, "rewards/rejected": -11.80323314666748, "step": 3120 }, { "epoch": 0.852280462899932, "grad_norm": 12.066715885022818, "learning_rate": 1.2988371855917225e-05, "logits/chosen": 0.37565484642982483, "logits/rejected": 0.5007588863372803, "logps/chosen": -703.4903564453125, "logps/rejected": -1430.76171875, "loss": 0.2114, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.1784768104553223, "rewards/margins": 7.968869686126709, "rewards/rejected": -11.147346496582031, "step": 3130 }, { "epoch": 0.85500340367597, "grad_norm": 10.07953482925563, "learning_rate": 1.2523691546803873e-05, "logits/chosen": 0.49442654848098755, "logits/rejected": 0.5744796395301819, "logps/chosen": -650.1203002929688, "logps/rejected": -1487.5498046875, "loss": 0.169, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.1168293952941895, "rewards/margins": 8.435214042663574, "rewards/rejected": -11.552044868469238, "step": 3140 }, { "epoch": 0.8577263444520081, "grad_norm": 7.211644812057612, "learning_rate": 1.206691996940431e-05, "logits/chosen": 0.4109431207180023, "logits/rejected": 0.517845630645752, "logps/chosen": -614.5814208984375, "logps/rejected": -1589.5028076171875, "loss": 0.1509, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7576184272766113, "rewards/margins": 10.276385307312012, "rewards/rejected": -13.034006118774414, "step": 3150 }, { "epoch": 0.8604492852280463, "grad_norm": 8.86779635631434, "learning_rate": 1.161809842042988e-05, "logits/chosen": 0.31158551573753357, "logits/rejected": 0.431427538394928, "logps/chosen": -564.8810424804688, "logps/rejected": -1479.360107421875, "loss": 2.1565, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4318184852600098, "rewards/margins": 9.508817672729492, "rewards/rejected": -11.94063663482666, "step": 3160 }, { "epoch": 0.8631722260040844, "grad_norm": 3.5558132087968035, "learning_rate": 1.1177267477829978e-05, "logits/chosen": 0.24956552684307098, "logits/rejected": 0.44387301802635193, "logps/chosen": -596.3125610351562, "logps/rejected": -1585.163330078125, "loss": 0.1356, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.810048818588257, "rewards/margins": 10.357275009155273, "rewards/rejected": -13.167322158813477, "step": 3170 }, { "epoch": 0.8658951667801226, "grad_norm": 5.862065971308642, "learning_rate": 1.0744466997123425e-05, "logits/chosen": 0.3437689542770386, "logits/rejected": 0.5175299644470215, "logps/chosen": -621.8297119140625, "logps/rejected": -1538.585693359375, "loss": 0.1573, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.5700974464416504, "rewards/margins": 9.909372329711914, "rewards/rejected": -12.479471206665039, "step": 3180 }, { "epoch": 0.8686181075561606, "grad_norm": 3.24472342611504, "learning_rate": 1.0319736107795041e-05, "logits/chosen": 0.37868422269821167, "logits/rejected": 0.6068054437637329, "logps/chosen": -572.27685546875, "logps/rejected": -1345.9105224609375, "loss": 0.155, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.400740623474121, "rewards/margins": 8.348073959350586, "rewards/rejected": -10.748812675476074, "step": 3190 }, { "epoch": 0.8713410483321987, "grad_norm": 2.744692426135948, "learning_rate": 9.903113209758096e-06, "logits/chosen": 0.2929421067237854, "logits/rejected": 0.5335865616798401, "logps/chosen": -701.8385009765625, "logps/rejected": -1510.951171875, "loss": 0.1221, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.2702605724334717, "rewards/margins": 8.927230834960938, "rewards/rejected": -12.197491645812988, "step": 3200 }, { "epoch": 0.8740639891082369, "grad_norm": 5.900535069753502, "learning_rate": 9.494635969882426e-06, "logits/chosen": 0.4422897696495056, "logits/rejected": 0.5750479102134705, "logps/chosen": -573.0560302734375, "logps/rejected": -1374.738037109375, "loss": 0.1683, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.55328106880188, "rewards/margins": 8.676356315612793, "rewards/rejected": -11.229637145996094, "step": 3210 }, { "epoch": 0.876786929884275, "grad_norm": 3.44462979022299, "learning_rate": 9.094341318589072e-06, "logits/chosen": 0.42226654291152954, "logits/rejected": 0.5428343415260315, "logps/chosen": -581.0510864257812, "logps/rejected": -1369.337158203125, "loss": 0.1241, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.3742847442626953, "rewards/margins": 8.42089557647705, "rewards/rejected": -10.79517936706543, "step": 3220 }, { "epoch": 0.8795098706603132, "grad_norm": 5.608307171603524, "learning_rate": 8.702265446511382e-06, "logits/chosen": 0.3774926960468292, "logits/rejected": 0.5087335705757141, "logps/chosen": -514.2108154296875, "logps/rejected": -1304.4666748046875, "loss": 0.1778, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0655770301818848, "rewards/margins": 8.323145866394043, "rewards/rejected": -10.38872241973877, "step": 3230 }, { "epoch": 0.8822328114363512, "grad_norm": 5.455708732235766, "learning_rate": 8.31844380122292e-06, "logits/chosen": 0.47580593824386597, "logits/rejected": 0.6076167821884155, "logps/chosen": -556.3704833984375, "logps/rejected": -1279.912841796875, "loss": 0.1173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.3415322303771973, "rewards/margins": 7.390820503234863, "rewards/rejected": -9.732351303100586, "step": 3240 }, { "epoch": 0.8849557522123894, "grad_norm": 7.554518467559218, "learning_rate": 7.942911084032788e-06, "logits/chosen": 0.4682241976261139, "logits/rejected": 0.6157165765762329, "logps/chosen": -655.5394287109375, "logps/rejected": -1494.0238037109375, "loss": 0.1529, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6178641319274902, "rewards/margins": 8.754510879516602, "rewards/rejected": -11.372376441955566, "step": 3250 }, { "epoch": 0.8876786929884275, "grad_norm": 5.600989385971434, "learning_rate": 7.575701246848299e-06, "logits/chosen": 0.48880425095558167, "logits/rejected": 0.6227352619171143, "logps/chosen": -593.4500122070312, "logps/rejected": -1501.2703857421875, "loss": 0.1127, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.473632335662842, "rewards/margins": 9.577296257019043, "rewards/rejected": -12.050928115844727, "step": 3260 }, { "epoch": 0.8904016337644656, "grad_norm": 13.431074173165474, "learning_rate": 7.216847489105161e-06, "logits/chosen": 0.5438786149024963, "logits/rejected": 0.6403484344482422, "logps/chosen": -599.2393798828125, "logps/rejected": -1532.2991943359375, "loss": 0.2052, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9075636863708496, "rewards/margins": 9.528191566467285, "rewards/rejected": -12.43575382232666, "step": 3270 }, { "epoch": 0.8931245745405038, "grad_norm": 6.918619567772649, "learning_rate": 6.866382254766157e-06, "logits/chosen": 0.619216799736023, "logits/rejected": 0.8395193815231323, "logps/chosen": -658.65283203125, "logps/rejected": -1515.1939697265625, "loss": 0.1629, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2254257202148438, "rewards/margins": 8.87635326385498, "rewards/rejected": -12.10177993774414, "step": 3280 }, { "epoch": 0.8958475153165418, "grad_norm": 6.648028443759932, "learning_rate": 6.5243372293878e-06, "logits/chosen": 0.49252304434776306, "logits/rejected": 0.8068822026252747, "logps/chosen": -607.2750854492188, "logps/rejected": -1523.320556640625, "loss": 0.1617, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.453411817550659, "rewards/margins": 9.811650276184082, "rewards/rejected": -12.265061378479004, "step": 3290 }, { "epoch": 0.89857045609258, "grad_norm": 2.539210647381528, "learning_rate": 6.190743337255589e-06, "logits/chosen": 0.6140366792678833, "logits/rejected": 0.7768694162368774, "logps/chosen": -614.919921875, "logps/rejected": -1382.4276123046875, "loss": 0.0811, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.821394205093384, "rewards/margins": 8.077497482299805, "rewards/rejected": -10.89889144897461, "step": 3300 }, { "epoch": 0.9012933968686181, "grad_norm": 10.794465693165817, "learning_rate": 5.865630738588268e-06, "logits/chosen": 0.3767799735069275, "logits/rejected": 0.5967041850090027, "logps/chosen": -656.5824584960938, "logps/rejected": -1527.9437255859375, "loss": 0.1401, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.898848056793213, "rewards/margins": 9.388232231140137, "rewards/rejected": -12.287079811096191, "step": 3310 }, { "epoch": 0.9040163376446563, "grad_norm": 5.263799813316893, "learning_rate": 5.5490288268108866e-06, "logits/chosen": 0.4702087342739105, "logits/rejected": 0.6225636005401611, "logps/chosen": -599.8109130859375, "logps/rejected": -1474.5819091796875, "loss": 0.1201, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.4712748527526855, "rewards/margins": 9.180947303771973, "rewards/rejected": -11.652222633361816, "step": 3320 }, { "epoch": 0.9067392784206944, "grad_norm": 6.067392778625339, "learning_rate": 5.2409662258974925e-06, "logits/chosen": 0.4685233533382416, "logits/rejected": 0.7201848030090332, "logps/chosen": -561.8470458984375, "logps/rejected": -1426.10009765625, "loss": 0.1095, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.4505324363708496, "rewards/margins": 9.327180862426758, "rewards/rejected": -11.777711868286133, "step": 3330 }, { "epoch": 0.9094622191967324, "grad_norm": 4.0896710788723745, "learning_rate": 4.941470787783131e-06, "logits/chosen": 0.42010945081710815, "logits/rejected": 0.6404047012329102, "logps/chosen": -609.3140869140625, "logps/rejected": -1468.7435302734375, "loss": 0.1608, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.7196056842803955, "rewards/margins": 8.689676284790039, "rewards/rejected": -11.409282684326172, "step": 3340 }, { "epoch": 0.9121851599727706, "grad_norm": 2.4478379172465448, "learning_rate": 4.6505695898457655e-06, "logits/chosen": 0.4856113791465759, "logits/rejected": 0.6557452082633972, "logps/chosen": -576.8583984375, "logps/rejected": -1425.530517578125, "loss": 0.1166, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.381040096282959, "rewards/margins": 8.655874252319336, "rewards/rejected": -11.036913871765137, "step": 3350 }, { "epoch": 0.9149081007488087, "grad_norm": 6.006212378190146, "learning_rate": 4.368288932458309e-06, "logits/chosen": 0.4439225196838379, "logits/rejected": 0.6169265508651733, "logps/chosen": -598.0299072265625, "logps/rejected": -1448.4476318359375, "loss": 0.1014, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.50980544090271, "rewards/margins": 8.920835494995117, "rewards/rejected": -11.430641174316406, "step": 3360 }, { "epoch": 0.9176310415248469, "grad_norm": 2.801540016758875, "learning_rate": 4.09465433661067e-06, "logits/chosen": 0.417350709438324, "logits/rejected": 0.6509106755256653, "logps/chosen": -574.1402587890625, "logps/rejected": -1419.3201904296875, "loss": 0.1554, "rewards/accuracies": 0.9375, "rewards/chosen": -2.46280837059021, "rewards/margins": 9.459114074707031, "rewards/rejected": -11.92192268371582, "step": 3370 }, { "epoch": 0.9203539823008849, "grad_norm": 6.571082473335941, "learning_rate": 3.829690541602504e-06, "logits/chosen": 0.3773556351661682, "logits/rejected": 0.6057129502296448, "logps/chosen": -554.6580200195312, "logps/rejected": -1534.7188720703125, "loss": 0.1205, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.384308338165283, "rewards/margins": 10.432950973510742, "rewards/rejected": -12.817258834838867, "step": 3380 }, { "epoch": 0.9230769230769231, "grad_norm": 10.724095904793385, "learning_rate": 3.573421502806462e-06, "logits/chosen": 0.43141230940818787, "logits/rejected": 0.5827267169952393, "logps/chosen": -629.783935546875, "logps/rejected": -1560.9981689453125, "loss": 0.1687, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6314046382904053, "rewards/margins": 9.693445205688477, "rewards/rejected": -12.324849128723145, "step": 3390 }, { "epoch": 0.9257998638529612, "grad_norm": 3.3756489869976214, "learning_rate": 3.325870389502439e-06, "logits/chosen": 0.4125029444694519, "logits/rejected": 0.6377438902854919, "logps/chosen": -630.9293823242188, "logps/rejected": -1469.781005859375, "loss": 0.1312, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.6700332164764404, "rewards/margins": 9.24914836883545, "rewards/rejected": -11.919180870056152, "step": 3400 }, { "epoch": 0.9285228046289993, "grad_norm": 7.788178458703957, "learning_rate": 3.0870595827828365e-06, "logits/chosen": 0.3888288140296936, "logits/rejected": 0.6043727993965149, "logps/chosen": -689.7025146484375, "logps/rejected": -1592.350341796875, "loss": 0.1529, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.963463068008423, "rewards/margins": 9.724987983703613, "rewards/rejected": -12.688450813293457, "step": 3410 }, { "epoch": 0.9312457454050375, "grad_norm": 7.147218526064257, "learning_rate": 2.857010673529015e-06, "logits/chosen": 0.44625091552734375, "logits/rejected": 0.6315649747848511, "logps/chosen": -627.4977416992188, "logps/rejected": -1486.74169921875, "loss": 0.1246, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8249270915985107, "rewards/margins": 8.886324882507324, "rewards/rejected": -11.711252212524414, "step": 3420 }, { "epoch": 0.9339686861810755, "grad_norm": 9.153788579430927, "learning_rate": 2.6357444604593662e-06, "logits/chosen": 0.34807997941970825, "logits/rejected": 0.5339373350143433, "logps/chosen": -595.520751953125, "logps/rejected": -1477.987548828125, "loss": 0.1449, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.6145668029785156, "rewards/margins": 9.307913780212402, "rewards/rejected": -11.922479629516602, "step": 3430 }, { "epoch": 0.9366916269571137, "grad_norm": 7.602358676825036, "learning_rate": 2.4232809482488406e-06, "logits/chosen": 0.23554039001464844, "logits/rejected": 0.47686901688575745, "logps/chosen": -558.9524536132812, "logps/rejected": -1546.791259765625, "loss": 0.1221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5332367420196533, "rewards/margins": 10.079355239868164, "rewards/rejected": -12.612590789794922, "step": 3440 }, { "epoch": 0.9394145677331518, "grad_norm": 16.49701246380974, "learning_rate": 2.219639345720359e-06, "logits/chosen": 0.35037535429000854, "logits/rejected": 0.5514084100723267, "logps/chosen": -589.4315185546875, "logps/rejected": -1492.39892578125, "loss": 0.2012, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.627023220062256, "rewards/margins": 9.265153884887695, "rewards/rejected": -11.892175674438477, "step": 3450 }, { "epoch": 0.94213750850919, "grad_norm": 6.307435314088876, "learning_rate": 2.02483806410807e-06, "logits/chosen": 0.451333612203598, "logits/rejected": 0.5924814343452454, "logps/chosen": -598.7140502929688, "logps/rejected": -1415.045654296875, "loss": 0.1003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.440220355987549, "rewards/margins": 8.552030563354492, "rewards/rejected": -10.992252349853516, "step": 3460 }, { "epoch": 0.944860449285228, "grad_norm": 6.338529057616752, "learning_rate": 1.8388947153929027e-06, "logits/chosen": 0.36880573630332947, "logits/rejected": 0.5762465596199036, "logps/chosen": -576.0185546875, "logps/rejected": -1487.611572265625, "loss": 0.1643, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8171238899230957, "rewards/margins": 9.56772518157959, "rewards/rejected": -12.384849548339844, "step": 3470 }, { "epoch": 0.9475833900612661, "grad_norm": 8.68745236022594, "learning_rate": 1.661826110710163e-06, "logits/chosen": 0.39139264822006226, "logits/rejected": 0.6284823417663574, "logps/chosen": -512.65380859375, "logps/rejected": -1482.1314697265625, "loss": 0.1754, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.0515763759613037, "rewards/margins": 9.894174575805664, "rewards/rejected": -11.94575023651123, "step": 3480 }, { "epoch": 0.9503063308373043, "grad_norm": 8.323294909328977, "learning_rate": 1.493648258829694e-06, "logits/chosen": 0.37607231736183167, "logits/rejected": 0.4850333333015442, "logps/chosen": -595.7239990234375, "logps/rejected": -1427.0201416015625, "loss": 0.1485, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6649580001831055, "rewards/margins": 8.330266952514648, "rewards/rejected": -10.99522590637207, "step": 3490 }, { "epoch": 0.9530292716133424, "grad_norm": 4.716570485344555, "learning_rate": 1.3343763647085339e-06, "logits/chosen": 0.33481714129447937, "logits/rejected": 0.5425733327865601, "logps/chosen": -607.2645263671875, "logps/rejected": -1544.1453857421875, "loss": 0.1252, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5669472217559814, "rewards/margins": 9.806219100952148, "rewards/rejected": -12.373165130615234, "step": 3500 }, { "epoch": 0.9530292716133424, "eval_logits/chosen": 0.30259910225868225, "eval_logits/rejected": 0.5002096891403198, "eval_logps/chosen": -609.8344116210938, "eval_logps/rejected": -1531.6849365234375, "eval_loss": 0.13275307416915894, "eval_rewards/accuracies": 0.9382640719413757, "eval_rewards/chosen": -2.7013959884643555, "eval_rewards/margins": 9.69622802734375, "eval_rewards/rejected": -12.397622108459473, "eval_runtime": 3747.3913, "eval_samples_per_second": 1.308, "eval_steps_per_second": 0.109, "step": 3500 }, { "epoch": 0.9557522123893806, "grad_norm": 10.506491372972281, "learning_rate": 1.1840248281162037e-06, "logits/chosen": 0.27225154638290405, "logits/rejected": 0.4373188018798828, "logps/chosen": -638.9410400390625, "logps/rejected": -1560.869873046875, "loss": 0.1312, "rewards/accuracies": 0.9375, "rewards/chosen": -2.927722454071045, "rewards/margins": 9.307165145874023, "rewards/rejected": -12.234888076782227, "step": 3510 }, { "epoch": 0.9584751531654186, "grad_norm": 8.343867171985208, "learning_rate": 1.0426072423328382e-06, "logits/chosen": 0.341633141040802, "logits/rejected": 0.48726290464401245, "logps/chosen": -583.33837890625, "logps/rejected": -1415.044677734375, "loss": 0.1578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5397531986236572, "rewards/margins": 8.36491870880127, "rewards/rejected": -10.904672622680664, "step": 3520 }, { "epoch": 0.9611980939414567, "grad_norm": 11.471597562026771, "learning_rate": 9.101363929201911e-07, "logits/chosen": 0.24860472977161407, "logits/rejected": 0.4540013372898102, "logps/chosen": -581.4928588867188, "logps/rejected": -1512.778076171875, "loss": 0.1095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.657729148864746, "rewards/margins": 9.633256912231445, "rewards/rejected": -12.290987014770508, "step": 3530 }, { "epoch": 0.9639210347174949, "grad_norm": 4.763516291437868, "learning_rate": 7.8662425656576e-07, "logits/chosen": 0.25861018896102905, "logits/rejected": 0.4950624406337738, "logps/chosen": -541.2784423828125, "logps/rejected": -1484.020263671875, "loss": 0.1069, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.7152905464172363, "rewards/margins": 9.658266067504883, "rewards/rejected": -12.373555183410645, "step": 3540 }, { "epoch": 0.966643975493533, "grad_norm": 9.11040786986144, "learning_rate": 6.720819999999073e-07, "logits/chosen": 0.3221897482872009, "logits/rejected": 0.49063223600387573, "logps/chosen": -623.95703125, "logps/rejected": -1448.3956298828125, "loss": 0.1387, "rewards/accuracies": 0.9375, "rewards/chosen": -2.757344961166382, "rewards/margins": 8.484246253967285, "rewards/rejected": -11.241591453552246, "step": 3550 }, { "epoch": 0.9693669162695712, "grad_norm": 3.065251251534584, "learning_rate": 5.665199789862907e-07, "logits/chosen": 0.39759618043899536, "logits/rejected": 0.5658711791038513, "logps/chosen": -583.7330932617188, "logps/rejected": -1471.863037109375, "loss": 0.1435, "rewards/accuracies": 0.9375, "rewards/chosen": -2.722357988357544, "rewards/margins": 8.768855094909668, "rewards/rejected": -11.491212844848633, "step": 3560 }, { "epoch": 0.9720898570456092, "grad_norm": 5.12401841638688, "learning_rate": 4.6994773738563424e-07, "logits/chosen": 0.3610488176345825, "logits/rejected": 0.6028701663017273, "logps/chosen": -619.63916015625, "logps/rejected": -1411.327392578125, "loss": 0.1369, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.48909068107605, "rewards/margins": 8.581472396850586, "rewards/rejected": -11.070563316345215, "step": 3570 }, { "epoch": 0.9748127978216474, "grad_norm": 8.810544444486831, "learning_rate": 3.823740062928072e-07, "logits/chosen": 0.4455558657646179, "logits/rejected": 0.6806224584579468, "logps/chosen": -567.6795654296875, "logps/rejected": -1397.3406982421875, "loss": 0.1164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.5231568813323975, "rewards/margins": 8.459074020385742, "rewards/rejected": -10.982232093811035, "step": 3580 }, { "epoch": 0.9775357385976855, "grad_norm": 6.610645146397345, "learning_rate": 3.0380670324752227e-07, "logits/chosen": 0.3287045657634735, "logits/rejected": 0.5522847771644592, "logps/chosen": -532.489501953125, "logps/rejected": -1528.7987060546875, "loss": 0.1497, "rewards/accuracies": 0.9375, "rewards/chosen": -2.387678623199463, "rewards/margins": 10.268462181091309, "rewards/rejected": -12.65614128112793, "step": 3590 }, { "epoch": 0.9802586793737236, "grad_norm": 9.186519711207744, "learning_rate": 2.3425293151845273e-07, "logits/chosen": 0.2782616913318634, "logits/rejected": 0.4124705195426941, "logps/chosen": -625.5531005859375, "logps/rejected": -1455.448974609375, "loss": 0.1652, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.585552453994751, "rewards/margins": 9.286867141723633, "rewards/rejected": -11.872419357299805, "step": 3600 }, { "epoch": 0.9829816201497618, "grad_norm": 6.4113152609936455, "learning_rate": 1.73718979461035e-07, "logits/chosen": 0.335416316986084, "logits/rejected": 0.5422466397285461, "logps/chosen": -594.4679565429688, "logps/rejected": -1484.909423828125, "loss": 0.1066, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.729745864868164, "rewards/margins": 9.36650562286377, "rewards/rejected": -12.096250534057617, "step": 3610 }, { "epoch": 0.9857045609257998, "grad_norm": 10.555918036454049, "learning_rate": 1.222103199489455e-07, "logits/chosen": 0.3650972247123718, "logits/rejected": 0.5400117635726929, "logps/chosen": -631.2184448242188, "logps/rejected": -1561.655029296875, "loss": 0.1368, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8069605827331543, "rewards/margins": 9.688605308532715, "rewards/rejected": -12.495567321777344, "step": 3620 }, { "epoch": 0.988427501701838, "grad_norm": 9.85027247573772, "learning_rate": 7.973160987931883e-08, "logits/chosen": 0.3716769516468048, "logits/rejected": 0.5796463489532471, "logps/chosen": -593.5032958984375, "logps/rejected": -1529.673095703125, "loss": 0.1738, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.664257049560547, "rewards/margins": 9.706274032592773, "rewards/rejected": -12.370530128479004, "step": 3630 }, { "epoch": 0.9911504424778761, "grad_norm": 7.384129391564848, "learning_rate": 4.6286689751662285e-08, "logits/chosen": 0.2735041081905365, "logits/rejected": 0.4770600199699402, "logps/chosen": -557.375, "logps/rejected": -1553.71142578125, "loss": 1.5963, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.3559460639953613, "rewards/margins": 10.080205917358398, "rewards/rejected": -12.436152458190918, "step": 3640 }, { "epoch": 0.9938733832539143, "grad_norm": 2.0640752724213027, "learning_rate": 2.1878583320722457e-08, "logits/chosen": 0.3391203284263611, "logits/rejected": 0.5715040564537048, "logps/chosen": -543.0274658203125, "logps/rejected": -1483.306640625, "loss": 0.1335, "rewards/accuracies": 0.9375, "rewards/chosen": -2.347174644470215, "rewards/margins": 9.779848098754883, "rewards/rejected": -12.127023696899414, "step": 3650 }, { "epoch": 0.9965963240299524, "grad_norm": 16.99590231865479, "learning_rate": 6.50949732301509e-09, "logits/chosen": 0.3131711483001709, "logits/rejected": 0.47414079308509827, "logps/chosen": -641.8659057617188, "logps/rejected": -1475.5911865234375, "loss": 0.1154, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.771031141281128, "rewards/margins": 8.959924697875977, "rewards/rejected": -11.730956077575684, "step": 3660 }, { "epoch": 0.9993192648059904, "grad_norm": 4.2674518887673685, "learning_rate": 1.8082127736240851e-10, "logits/chosen": 0.28179001808166504, "logits/rejected": 0.5392038226127625, "logps/chosen": -625.225830078125, "logps/rejected": -1491.476806640625, "loss": 0.1415, "rewards/accuracies": 0.9375, "rewards/chosen": -2.782802104949951, "rewards/margins": 9.378962516784668, "rewards/rejected": -12.161764144897461, "step": 3670 }, { "epoch": 0.9998638529611981, "step": 3672, "total_flos": 0.0, "train_loss": 0.32278433931516665, "train_runtime": 141933.118, "train_samples_per_second": 0.621, "train_steps_per_second": 0.026 } ], "logging_steps": 10, "max_steps": 3672, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }