{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9959925193694897, "eval_steps": 400, "global_step": 233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02137323002938819, "grad_norm": 0.4608515202999115, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.7747037410736084, "logits/rejected": -1.6486629247665405, "logps/chosen": -247.47836303710938, "logps/ref_chosen": -247.4757537841797, "logps/ref_rejected": -250.2177734375, "logps/rejected": -250.17874145507812, "loss": 0.5, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -2.605724148452282e-05, "rewards/margins": -0.00041639525443315506, "rewards/rejected": 0.0003903379547409713, "step": 5 }, { "epoch": 0.04274646005877638, "grad_norm": 0.426495224237442, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.7335236072540283, "logits/rejected": -1.6989978551864624, "logps/chosen": -222.6909637451172, "logps/ref_chosen": -222.6491241455078, "logps/ref_rejected": -223.95663452148438, "logps/rejected": -223.9930877685547, "loss": 0.5, "rewards/accuracies": 0.5, "rewards/chosen": -0.00041838129982352257, "rewards/margins": -5.400222653406672e-05, "rewards/rejected": -0.00036437893868424, "step": 10 }, { "epoch": 0.06411969008816458, "grad_norm": 0.4453659653663635, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.9023773670196533, "logits/rejected": -1.789849042892456, "logps/chosen": -218.5724334716797, "logps/ref_chosen": -218.7084503173828, "logps/ref_rejected": -224.755615234375, "logps/rejected": -224.6824493408203, "loss": 0.5, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0013600520323961973, "rewards/margins": 0.0006284656701609492, "rewards/rejected": 0.000731586420442909, "step": 15 }, { "epoch": 0.08549292011755276, "grad_norm": 0.5101017951965332, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.7127611637115479, "logits/rejected": -1.6293315887451172, "logps/chosen": -226.1074676513672, "logps/ref_chosen": -226.7457275390625, "logps/ref_rejected": -235.77908325195312, "logps/rejected": -235.2657928466797, "loss": 0.4999, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.006382433231920004, "rewards/margins": 0.0012494683032855392, "rewards/rejected": 0.005132964812219143, "step": 20 }, { "epoch": 0.10686615014694095, "grad_norm": 0.4738335609436035, "learning_rate": 4.999717571181741e-07, "logits/chosen": -1.6099249124526978, "logits/rejected": -1.5539109706878662, "logps/chosen": -229.36843872070312, "logps/ref_chosen": -230.34494018554688, "logps/ref_rejected": -231.64236450195312, "logps/rejected": -230.74813842773438, "loss": 0.4999, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.009765096008777618, "rewards/margins": 0.000822968955617398, "rewards/rejected": 0.008942126296460629, "step": 25 }, { "epoch": 0.12823938017632916, "grad_norm": 0.4367460608482361, "learning_rate": 4.98983926127519e-07, "logits/chosen": -1.6448577642440796, "logits/rejected": -1.560329794883728, "logps/chosen": -239.9384002685547, "logps/ref_chosen": -241.2040557861328, "logps/ref_rejected": -253.18862915039062, "logps/rejected": -251.95547485351562, "loss": 0.4998, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.012656150385737419, "rewards/margins": 0.00032438611378893256, "rewards/rejected": 0.012331764213740826, "step": 30 }, { "epoch": 0.14961261020571734, "grad_norm": 0.5036317706108093, "learning_rate": 4.965903258506806e-07, "logits/chosen": -1.65009343624115, "logits/rejected": -1.6685165166854858, "logps/chosen": -240.6787109375, "logps/ref_chosen": -242.33291625976562, "logps/ref_rejected": -237.6911163330078, "logps/rejected": -236.1189422607422, "loss": 0.4997, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.016541726887226105, "rewards/margins": 0.0008201012387871742, "rewards/rejected": 0.015721624717116356, "step": 35 }, { "epoch": 0.17098584023510552, "grad_norm": 0.5212914347648621, "learning_rate": 4.928044706128802e-07, "logits/chosen": -1.6572792530059814, "logits/rejected": -1.6342990398406982, "logps/chosen": -224.078857421875, "logps/ref_chosen": -226.43637084960938, "logps/ref_rejected": -224.00546264648438, "logps/rejected": -221.7003173828125, "loss": 0.4996, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.02357516996562481, "rewards/margins": 0.0005238516023382545, "rewards/rejected": 0.023051317781209946, "step": 40 }, { "epoch": 0.19235907026449373, "grad_norm": 0.5110143423080444, "learning_rate": 4.876477354446189e-07, "logits/chosen": -1.4905364513397217, "logits/rejected": -1.3957011699676514, "logps/chosen": -216.25308227539062, "logps/ref_chosen": -219.16494750976562, "logps/ref_rejected": -227.38040161132812, "logps/rejected": -224.87564086914062, "loss": 0.4994, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.02911846712231636, "rewards/margins": 0.004071122966706753, "rewards/rejected": 0.025047341361641884, "step": 45 }, { "epoch": 0.2137323002938819, "grad_norm": 0.48523762822151184, "learning_rate": 4.811492353977365e-07, "logits/chosen": -1.7010364532470703, "logits/rejected": -1.6736198663711548, "logps/chosen": -218.8837127685547, "logps/ref_chosen": -221.23171997070312, "logps/ref_rejected": -223.6177215576172, "logps/rejected": -221.6636199951172, "loss": 0.4993, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.023480093106627464, "rewards/margins": 0.0039388458244502544, "rewards/rejected": 0.019541248679161072, "step": 50 }, { "epoch": 0.2351055303232701, "grad_norm": 0.4816797971725464, "learning_rate": 4.7334566116112327e-07, "logits/chosen": -1.62349534034729, "logits/rejected": -1.5281016826629639, "logps/chosen": -237.206787109375, "logps/ref_chosen": -239.38412475585938, "logps/ref_rejected": -245.71304321289062, "logps/rejected": -244.2113800048828, "loss": 0.4989, "rewards/accuracies": 0.625, "rewards/chosen": 0.021773329004645348, "rewards/margins": 0.006756873335689306, "rewards/rejected": 0.015016456134617329, "step": 55 }, { "epoch": 0.2564787603526583, "grad_norm": 0.5273976922035217, "learning_rate": 4.6428107190419983e-07, "logits/chosen": -1.6468950510025024, "logits/rejected": -1.599461317062378, "logps/chosen": -228.3268585205078, "logps/ref_chosen": -231.1789093017578, "logps/ref_rejected": -231.9095001220703, "logps/rejected": -229.9440460205078, "loss": 0.4988, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.028520625084638596, "rewards/margins": 0.008865959011018276, "rewards/rejected": 0.019654670730233192, "step": 60 }, { "epoch": 0.2778519903820465, "grad_norm": 0.47698166966438293, "learning_rate": 4.540066465177783e-07, "logits/chosen": -1.7030376195907593, "logits/rejected": -1.7270011901855469, "logps/chosen": -218.37466430664062, "logps/ref_chosen": -222.1732635498047, "logps/ref_rejected": -221.90371704101562, "logps/rejected": -219.0262451171875, "loss": 0.4985, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03798612207174301, "rewards/margins": 0.009211419150233269, "rewards/rejected": 0.028774702921509743, "step": 65 }, { "epoch": 0.2992252204114347, "grad_norm": 0.4908115863800049, "learning_rate": 4.425803946568032e-07, "logits/chosen": -1.701042890548706, "logits/rejected": -1.642853021621704, "logps/chosen": -237.1160430908203, "logps/ref_chosen": -241.13235473632812, "logps/ref_rejected": -247.3893585205078, "logps/rejected": -243.56692504882812, "loss": 0.4985, "rewards/accuracies": 0.5625, "rewards/chosen": 0.040162790566682816, "rewards/margins": 0.0019384495681151748, "rewards/rejected": 0.038224343210458755, "step": 70 }, { "epoch": 0.32059845044082286, "grad_norm": 0.48811107873916626, "learning_rate": 4.300668292164329e-07, "logits/chosen": -1.6175544261932373, "logits/rejected": -1.6155774593353271, "logps/chosen": -223.8777618408203, "logps/ref_chosen": -228.91860961914062, "logps/ref_rejected": -227.78170776367188, "logps/rejected": -223.22732543945312, "loss": 0.4981, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05040856450796127, "rewards/margins": 0.004864625167101622, "rewards/rejected": 0.04554395005106926, "step": 75 }, { "epoch": 0.34197168047021104, "grad_norm": 0.5498376488685608, "learning_rate": 4.165366020906683e-07, "logits/chosen": -1.721421480178833, "logits/rejected": -1.6703542470932007, "logps/chosen": -220.573486328125, "logps/ref_chosen": -226.90060424804688, "logps/ref_rejected": -232.0827178955078, "logps/rejected": -227.0341339111328, "loss": 0.4975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06327112019062042, "rewards/margins": 0.012785114347934723, "rewards/rejected": 0.0504860058426857, "step": 80 }, { "epoch": 0.36334491049959927, "grad_norm": 0.5343174338340759, "learning_rate": 4.0206610527004607e-07, "logits/chosen": -1.630051612854004, "logits/rejected": -1.571542739868164, "logps/chosen": -231.68496704101562, "logps/ref_chosen": -237.4697723388672, "logps/ref_rejected": -240.751953125, "logps/rejected": -236.31600952148438, "loss": 0.4978, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05784807354211807, "rewards/margins": 0.013488592579960823, "rewards/rejected": 0.0443594828248024, "step": 85 }, { "epoch": 0.38471814052898745, "grad_norm": 0.5112692713737488, "learning_rate": 3.867370395306068e-07, "logits/chosen": -1.7595088481903076, "logits/rejected": -1.7580636739730835, "logps/chosen": -211.63906860351562, "logps/ref_chosen": -217.63436889648438, "logps/ref_rejected": -222.6137237548828, "logps/rejected": -217.2650909423828, "loss": 0.4977, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05995314195752144, "rewards/margins": 0.00646712351590395, "rewards/rejected": 0.053486019372940063, "step": 90 }, { "epoch": 0.40609137055837563, "grad_norm": 0.4654058516025543, "learning_rate": 3.7063595314933156e-07, "logits/chosen": -1.8619199991226196, "logits/rejected": -1.786892294883728, "logps/chosen": -208.5725555419922, "logps/ref_chosen": -213.7164306640625, "logps/ref_rejected": -228.556396484375, "logps/rejected": -224.4815216064453, "loss": 0.498, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.051438819617033005, "rewards/margins": 0.010690188966691494, "rewards/rejected": 0.04074862599372864, "step": 95 }, { "epoch": 0.4274646005877638, "grad_norm": 0.5265087485313416, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -1.6727230548858643, "logits/rejected": -1.677062749862671, "logps/chosen": -239.5093994140625, "logps/ref_chosen": -245.71194458007812, "logps/ref_rejected": -240.1134490966797, "logps/rejected": -235.6671142578125, "loss": 0.4967, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.06202547624707222, "rewards/margins": 0.01756184920668602, "rewards/rejected": 0.0444636233150959, "step": 100 }, { "epoch": 0.448837830617152, "grad_norm": 0.53775554895401, "learning_rate": 3.36485192541719e-07, "logits/chosen": -1.8463099002838135, "logits/rejected": -1.7264705896377563, "logps/chosen": -224.50320434570312, "logps/ref_chosen": -232.00527954101562, "logps/ref_rejected": -232.0154266357422, "logps/rejected": -225.75454711914062, "loss": 0.4968, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0750209242105484, "rewards/margins": 0.012411920353770256, "rewards/rejected": 0.062609001994133, "step": 105 }, { "epoch": 0.4702110606465402, "grad_norm": 0.5438077449798584, "learning_rate": 3.186283343381213e-07, "logits/chosen": -1.7997539043426514, "logits/rejected": -1.7138378620147705, "logps/chosen": -220.4825897216797, "logps/ref_chosen": -229.9724578857422, "logps/ref_rejected": -238.1800079345703, "logps/rejected": -230.29736328125, "loss": 0.4966, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.09489865601062775, "rewards/margins": 0.016072329133749008, "rewards/rejected": 0.07882632315158844, "step": 110 }, { "epoch": 0.4915842906759284, "grad_norm": 0.5453912019729614, "learning_rate": 3.003839988942255e-07, "logits/chosen": -1.8438644409179688, "logits/rejected": -1.7028881311416626, "logps/chosen": -203.79205322265625, "logps/ref_chosen": -214.1478729248047, "logps/ref_rejected": -226.24618530273438, "logps/rejected": -217.4800567626953, "loss": 0.4968, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1035580188035965, "rewards/margins": 0.015896398574113846, "rewards/rejected": 0.08766160905361176, "step": 115 }, { "epoch": 0.5129575207053166, "grad_norm": 0.5030398964881897, "learning_rate": 2.8185519417047623e-07, "logits/chosen": -1.8514922857284546, "logits/rejected": -1.7740070819854736, "logps/chosen": -214.818359375, "logps/ref_chosen": -227.9495086669922, "logps/ref_rejected": -230.5752410888672, "logps/rejected": -218.9449005126953, "loss": 0.496, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.13131138682365417, "rewards/margins": 0.015008069574832916, "rewards/rejected": 0.11630330979824066, "step": 120 }, { "epoch": 0.5343307507347048, "grad_norm": 0.5339066982269287, "learning_rate": 2.631465342477719e-07, "logits/chosen": -1.9007892608642578, "logits/rejected": -1.8334102630615234, "logps/chosen": -218.14743041992188, "logps/ref_chosen": -232.6212158203125, "logps/ref_rejected": -234.5932159423828, "logps/rejected": -222.1468505859375, "loss": 0.4958, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.1447378695011139, "rewards/margins": 0.020274382084608078, "rewards/rejected": 0.12446349859237671, "step": 125 }, { "epoch": 0.555703980764093, "grad_norm": 0.5313855409622192, "learning_rate": 2.44363648673827e-07, "logits/chosen": -1.7636210918426514, "logits/rejected": -1.7406389713287354, "logps/chosen": -211.9698944091797, "logps/ref_chosen": -226.790771484375, "logps/ref_rejected": -231.8648223876953, "logps/rejected": -219.543212890625, "loss": 0.4945, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.1482090801000595, "rewards/margins": 0.024992961436510086, "rewards/rejected": 0.12321610748767853, "step": 130 }, { "epoch": 0.5770772107934812, "grad_norm": 0.5537051558494568, "learning_rate": 2.2561258607618294e-07, "logits/chosen": -1.8008477687835693, "logits/rejected": -1.8080832958221436, "logps/chosen": -234.68893432617188, "logps/ref_chosen": -247.26119995117188, "logps/ref_rejected": -241.82345581054688, "logps/rejected": -231.585693359375, "loss": 0.4949, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.12572243809700012, "rewards/margins": 0.023345012217760086, "rewards/rejected": 0.10237739980220795, "step": 135 }, { "epoch": 0.5984504408228694, "grad_norm": 0.5528976321220398, "learning_rate": 2.069992154090854e-07, "logits/chosen": -1.775397539138794, "logits/rejected": -1.6931631565093994, "logps/chosen": -219.74072265625, "logps/ref_chosen": -230.71826171875, "logps/ref_rejected": -227.7001953125, "logps/rejected": -218.38241577148438, "loss": 0.495, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.10977540910243988, "rewards/margins": 0.01659761555492878, "rewards/rejected": 0.09317778795957565, "step": 140 }, { "epoch": 0.6198236708522575, "grad_norm": 0.5473525524139404, "learning_rate": 1.886286282148002e-07, "logits/chosen": -1.7711913585662842, "logits/rejected": -1.7026926279067993, "logps/chosen": -195.3854217529297, "logps/ref_chosen": -208.07254028320312, "logps/ref_rejected": -210.4279022216797, "logps/rejected": -199.79165649414062, "loss": 0.4946, "rewards/accuracies": 0.65625, "rewards/chosen": 0.12687113881111145, "rewards/margins": 0.020508771762251854, "rewards/rejected": 0.10636236518621445, "step": 145 }, { "epoch": 0.6411969008816457, "grad_norm": 0.5966719388961792, "learning_rate": 1.7060454527421686e-07, "logits/chosen": -1.8688771724700928, "logits/rejected": -1.810694932937622, "logps/chosen": -211.9062042236328, "logps/ref_chosen": -224.8968505859375, "logps/ref_rejected": -226.1548309326172, "logps/rejected": -215.7084503173828, "loss": 0.4943, "rewards/accuracies": 0.625, "rewards/chosen": 0.12990659475326538, "rewards/margins": 0.02544253133237362, "rewards/rejected": 0.10446406900882721, "step": 150 }, { "epoch": 0.6625701309110339, "grad_norm": 0.5334843993186951, "learning_rate": 1.5302873099680374e-07, "logits/chosen": -1.786595344543457, "logits/rejected": -1.7971456050872803, "logps/chosen": -225.0083465576172, "logps/ref_chosen": -237.4626922607422, "logps/ref_rejected": -234.39547729492188, "logps/rejected": -223.2943572998047, "loss": 0.4955, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.12454362213611603, "rewards/margins": 0.013532285578548908, "rewards/rejected": 0.1110113263130188, "step": 155 }, { "epoch": 0.6839433609404221, "grad_norm": 0.5639063715934753, "learning_rate": 1.360004188562841e-07, "logits/chosen": -2.0527145862579346, "logits/rejected": -1.9811140298843384, "logps/chosen": -217.0570068359375, "logps/ref_chosen": -231.03369140625, "logps/ref_rejected": -232.6383819580078, "logps/rejected": -220.0625457763672, "loss": 0.4952, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.1397666186094284, "rewards/margins": 0.014008410274982452, "rewards/rejected": 0.12575821578502655, "step": 160 }, { "epoch": 0.7053165909698104, "grad_norm": 0.5417853593826294, "learning_rate": 1.1961575111603586e-07, "logits/chosen": -1.8371235132217407, "logits/rejected": -1.7954612970352173, "logps/chosen": -220.7694854736328, "logps/ref_chosen": -234.5041046142578, "logps/ref_rejected": -235.61181640625, "logps/rejected": -224.56640625, "loss": 0.4944, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.1373465657234192, "rewards/margins": 0.026892542839050293, "rewards/rejected": 0.1104540079832077, "step": 165 }, { "epoch": 0.7266898209991985, "grad_norm": 0.565830409526825, "learning_rate": 1.0396723600754143e-07, "logits/chosen": -1.8288425207138062, "logits/rejected": -1.83499276638031, "logps/chosen": -213.2861785888672, "logps/ref_chosen": -227.1809844970703, "logps/ref_rejected": -230.8953094482422, "logps/rejected": -218.4414520263672, "loss": 0.4954, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.13894793391227722, "rewards/margins": 0.014409348368644714, "rewards/rejected": 0.12453857809305191, "step": 170 }, { "epoch": 0.7480630510285867, "grad_norm": 0.5855058431625366, "learning_rate": 8.914322542666822e-08, "logits/chosen": -1.8145122528076172, "logits/rejected": -1.7646887302398682, "logps/chosen": -212.070068359375, "logps/ref_chosen": -224.17794799804688, "logps/ref_rejected": -225.526123046875, "logps/rejected": -214.7656707763672, "loss": 0.4947, "rewards/accuracies": 0.5625, "rewards/chosen": 0.12107895314693451, "rewards/margins": 0.013474419713020325, "rewards/rejected": 0.10760452598333359, "step": 175 }, { "epoch": 0.7694362810579749, "grad_norm": 0.6223751902580261, "learning_rate": 7.522741609672193e-08, "logits/chosen": -1.8675405979156494, "logits/rejected": -1.8476943969726562, "logps/chosen": -216.3776092529297, "logps/ref_chosen": -230.77182006835938, "logps/ref_rejected": -227.00619506835938, "logps/rejected": -214.32931518554688, "loss": 0.4945, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1439422070980072, "rewards/margins": 0.017173700034618378, "rewards/rejected": 0.12676851451396942, "step": 180 }, { "epoch": 0.7908095110873631, "grad_norm": 0.5778200030326843, "learning_rate": 6.229837701471644e-08, "logits/chosen": -1.9124794006347656, "logits/rejected": -1.8135532140731812, "logps/chosen": -216.97702026367188, "logps/ref_chosen": -229.8362274169922, "logps/ref_rejected": -233.65390014648438, "logps/rejected": -222.93417358398438, "loss": 0.4945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1285921037197113, "rewards/margins": 0.021394768729805946, "rewards/rejected": 0.10719730705022812, "step": 185 }, { "epoch": 0.8121827411167513, "grad_norm": 0.5558175444602966, "learning_rate": 5.0429105848910996e-08, "logits/chosen": -1.9621855020523071, "logits/rejected": -1.9175077676773071, "logps/chosen": -215.39450073242188, "logps/ref_chosen": -229.72836303710938, "logps/ref_rejected": -233.65237426757812, "logps/rejected": -222.21798706054688, "loss": 0.4937, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.14333853125572205, "rewards/margins": 0.028994807973504066, "rewards/rejected": 0.11434372514486313, "step": 190 }, { "epoch": 0.8335559711461394, "grad_norm": 0.5308636426925659, "learning_rate": 3.968661679220467e-08, "logits/chosen": -1.971208930015564, "logits/rejected": -1.9112732410430908, "logps/chosen": -210.79598999023438, "logps/ref_chosen": -224.2023468017578, "logps/ref_rejected": -224.3248748779297, "logps/rejected": -212.8175811767578, "loss": 0.4932, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1340634524822235, "rewards/margins": 0.018990488722920418, "rewards/rejected": 0.11507296562194824, "step": 195 }, { "epoch": 0.8549292011755276, "grad_norm": 0.615912675857544, "learning_rate": 3.013156219837776e-08, "logits/chosen": -1.7899879217147827, "logits/rejected": -1.6696176528930664, "logps/chosen": -215.92288208007812, "logps/ref_chosen": -228.88381958007812, "logps/ref_rejected": -231.0583953857422, "logps/rejected": -220.5959930419922, "loss": 0.4932, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.12960924208164215, "rewards/margins": 0.024985069409012794, "rewards/rejected": 0.1046241745352745, "step": 200 }, { "epoch": 0.8763024312049158, "grad_norm": 0.590220034122467, "learning_rate": 2.1817890137430932e-08, "logits/chosen": -1.81471848487854, "logits/rejected": -1.714023232460022, "logps/chosen": -205.69888305664062, "logps/ref_chosen": -221.30752563476562, "logps/ref_rejected": -224.98486328125, "logps/rejected": -211.78884887695312, "loss": 0.4937, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.15608620643615723, "rewards/margins": 0.024126073345541954, "rewards/rejected": 0.13196012377738953, "step": 205 }, { "epoch": 0.897675661234304, "grad_norm": 0.5369106531143188, "learning_rate": 1.479253980347392e-08, "logits/chosen": -1.8037662506103516, "logits/rejected": -1.7787643671035767, "logps/chosen": -225.9608612060547, "logps/ref_chosen": -241.4657440185547, "logps/ref_rejected": -241.3707733154297, "logps/rejected": -228.4087371826172, "loss": 0.4931, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15504886209964752, "rewards/margins": 0.025428583845496178, "rewards/rejected": 0.1296202689409256, "step": 210 }, { "epoch": 0.9190488912636923, "grad_norm": 0.5737273097038269, "learning_rate": 9.095176494896661e-09, "logits/chosen": -1.8023388385772705, "logits/rejected": -1.7160924673080444, "logps/chosen": -218.32034301757812, "logps/ref_chosen": -231.6717071533203, "logps/ref_rejected": -236.741943359375, "logps/rejected": -225.2128448486328, "loss": 0.4933, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.13351376354694366, "rewards/margins": 0.018222931772470474, "rewards/rejected": 0.11529083549976349, "step": 215 }, { "epoch": 0.9404221212930804, "grad_norm": 0.6087775826454163, "learning_rate": 4.757967663132689e-09, "logits/chosen": -1.833620309829712, "logits/rejected": -1.7870299816131592, "logps/chosen": -221.86032104492188, "logps/ref_chosen": -236.0878448486328, "logps/ref_rejected": -230.54141235351562, "logps/rejected": -218.8464813232422, "loss": 0.4935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14227530360221863, "rewards/margins": 0.025325754657387733, "rewards/rejected": 0.11694953590631485, "step": 220 }, { "epoch": 0.9617953513224686, "grad_norm": 0.6274195909500122, "learning_rate": 1.8054012944479224e-09, "logits/chosen": -1.7650978565216064, "logits/rejected": -1.7383601665496826, "logps/chosen": -231.64111328125, "logps/ref_chosen": -244.44155883789062, "logps/ref_rejected": -240.8953094482422, "logps/rejected": -230.3839874267578, "loss": 0.4932, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12800416350364685, "rewards/margins": 0.022890925407409668, "rewards/rejected": 0.10511324554681778, "step": 225 }, { "epoch": 0.9831685813518568, "grad_norm": 0.5350868105888367, "learning_rate": 2.541476501764228e-10, "logits/chosen": -1.8503191471099854, "logits/rejected": -1.878313660621643, "logps/chosen": -206.16665649414062, "logps/ref_chosen": -219.6629638671875, "logps/ref_rejected": -212.42172241210938, "logps/rejected": -200.54551696777344, "loss": 0.494, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.13496311008930206, "rewards/margins": 0.016201000660657883, "rewards/rejected": 0.11876209825277328, "step": 230 }, { "epoch": 0.9959925193694897, "step": 233, "total_flos": 0.0, "train_loss": 0.49642937480124283, "train_runtime": 16410.2083, "train_samples_per_second": 3.649, "train_steps_per_second": 0.014 } ], "logging_steps": 5, "max_steps": 233, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }