{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 500, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.578385353088379, "logits/rejected": -2.53226900100708, "logps/chosen": -286.13739013671875, "logps/rejected": -212.73016357421875, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.208333333333334e-07, "logits/chosen": -2.468435525894165, "logits/rejected": -2.5060648918151855, "logps/chosen": -258.7095947265625, "logps/rejected": -233.5037384033203, "loss": 0.5, "rewards/accuracies": 0.3819444477558136, "rewards/chosen": -7.664680015295744e-05, "rewards/margins": 6.3225775193131994e-06, "rewards/rejected": -8.296939631691203e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.346015214920044, "logits/rejected": -2.4067437648773193, "logps/chosen": -196.97122192382812, "logps/rejected": -193.7008056640625, "loss": 0.5, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 6.571458652615547e-05, "rewards/margins": -3.4166391742473934e-06, "rewards/rejected": 6.913123070262372e-05, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": -2.4858970642089844, "logits/rejected": -2.451706886291504, "logps/chosen": -236.32901000976562, "logps/rejected": -208.12997436523438, "loss": 0.5, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0012510574888437986, "rewards/margins": 0.00014562405704054981, "rewards/rejected": 0.0011054335627704859, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.4558634757995605, "logits/rejected": -2.477804183959961, "logps/chosen": -242.181640625, "logps/rejected": -234.55661010742188, "loss": 0.4999, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0018630999838933349, "rewards/margins": 0.0003440978180151433, "rewards/rejected": 0.001519002253189683, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-06, "logits/chosen": -2.489288806915283, "logits/rejected": -2.5008156299591064, "logps/chosen": -235.43099975585938, "logps/rejected": -222.29641723632812, "loss": 0.4998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0019815764389932156, "rewards/margins": 0.0009142985800281167, "rewards/rejected": 0.0010672778589650989, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -2.450463056564331, "logits/rejected": -2.443624258041382, "logps/chosen": -256.23590087890625, "logps/rejected": -230.0203094482422, "loss": 0.4997, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004073253367096186, "rewards/margins": 0.001231834408827126, "rewards/rejected": 0.002841418841853738, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.6458333333333333e-06, "logits/chosen": -2.459900379180908, "logits/rejected": -2.4804420471191406, "logps/chosen": -251.94174194335938, "logps/rejected": -234.4181671142578, "loss": 0.4995, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004830378107726574, "rewards/margins": 0.0021367089357227087, "rewards/rejected": 0.002693668706342578, "step": 70 }, { "epoch": 0.08, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.4172418117523193, "logits/rejected": -2.3813605308532715, "logps/chosen": -235.5308074951172, "logps/rejected": -215.6710968017578, "loss": 0.4992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006930059753358364, "rewards/margins": 0.002805978525429964, "rewards/rejected": 0.004124081693589687, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -2.4015233516693115, "logits/rejected": -2.3940534591674805, "logps/chosen": -226.13137817382812, "logps/rejected": -213.4936981201172, "loss": 0.4991, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.00892677903175354, "rewards/margins": 0.004470665007829666, "rewards/rejected": 0.0044561149552464485, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.9997324926814375e-06, "logits/chosen": -2.382424831390381, "logits/rejected": -2.3642446994781494, "logps/chosen": -204.63381958007812, "logps/rejected": -217.92910766601562, "loss": 0.4989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010236050002276897, "rewards/margins": 0.00466396939009428, "rewards/rejected": 0.005572080612182617, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.996723692767927e-06, "logits/chosen": -2.395820140838623, "logits/rejected": -2.407099485397339, "logps/chosen": -215.1260223388672, "logps/rejected": -210.58309936523438, "loss": 0.4986, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.010834300890564919, "rewards/margins": 0.006746213883161545, "rewards/rejected": 0.004088086076080799, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9903757462135984e-06, "logits/chosen": -2.361361503601074, "logits/rejected": -2.4017128944396973, "logps/chosen": -208.5069122314453, "logps/rejected": -193.96817016601562, "loss": 0.4984, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009557174518704414, "rewards/margins": 0.00605000089854002, "rewards/rejected": 0.0035071733873337507, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.980697142834315e-06, "logits/chosen": -2.3745360374450684, "logits/rejected": -2.3689522743225098, "logps/chosen": -226.9114990234375, "logps/rejected": -210.3325653076172, "loss": 0.4982, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.010273845866322517, "rewards/margins": 0.005562370643019676, "rewards/rejected": 0.004711476154625416, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.967700826904229e-06, "logits/chosen": -2.39690899848938, "logits/rejected": -2.4031527042388916, "logps/chosen": -207.76968383789062, "logps/rejected": -206.6008758544922, "loss": 0.4978, "rewards/accuracies": 0.65625, "rewards/chosen": 0.011523631401360035, "rewards/margins": 0.009559462778270245, "rewards/rejected": 0.0019641686230897903, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.951404179843963e-06, "logits/chosen": -2.2971677780151367, "logits/rejected": -2.3256301879882812, "logps/chosen": -223.7987823486328, "logps/rejected": -211.0409393310547, "loss": 0.498, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.007091984152793884, "rewards/margins": 0.009901536628603935, "rewards/rejected": -0.002809552475810051, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.931828996974498e-06, "logits/chosen": -2.3667495250701904, "logits/rejected": -2.2750391960144043, "logps/chosen": -207.93814086914062, "logps/rejected": -217.41806030273438, "loss": 0.497, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00839292537420988, "rewards/margins": 0.015968123450875282, "rewards/rejected": -0.0075751966796815395, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.909001458367867e-06, "logits/chosen": -2.3504929542541504, "logits/rejected": -2.328986644744873, "logps/chosen": -262.8653259277344, "logps/rejected": -238.43017578125, "loss": 0.4971, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005286640953272581, "rewards/margins": 0.01585621014237404, "rewards/rejected": -0.010569569654762745, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.882952093833628e-06, "logits/chosen": -2.2022526264190674, "logits/rejected": -2.163339138031006, "logps/chosen": -211.5063018798828, "logps/rejected": -248.37081909179688, "loss": 0.4967, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00028603168902918696, "rewards/margins": 0.02056037448346615, "rewards/rejected": -0.020274341106414795, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.853715742087947e-06, "logits/chosen": -2.2854294776916504, "logits/rejected": -2.230767011642456, "logps/chosen": -295.3899230957031, "logps/rejected": -293.7907409667969, "loss": 0.4955, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.026039790362119675, "rewards/margins": 0.04283389076590538, "rewards/rejected": -0.06887368112802505, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.821331504159906e-06, "logits/chosen": -2.1649932861328125, "logits/rejected": -2.122584819793701, "logps/chosen": -251.9384307861328, "logps/rejected": -293.52923583984375, "loss": 0.4952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030307698994874954, "rewards/margins": 0.03713207319378853, "rewards/rejected": -0.06743976473808289, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.7858426910973435e-06, "logits/chosen": -2.0010428428649902, "logits/rejected": -1.9664274454116821, "logps/chosen": -352.85986328125, "logps/rejected": -415.43768310546875, "loss": 0.492, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11694659292697906, "rewards/margins": 0.0868750587105751, "rewards/rejected": -0.20382165908813477, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.747296766042161e-06, "logits/chosen": -1.5235364437103271, "logits/rejected": -1.551948070526123, "logps/chosen": -565.6112060546875, "logps/rejected": -687.0833129882812, "loss": 0.4871, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3442026376724243, "rewards/margins": 0.13101640343666077, "rewards/rejected": -0.47521907091140747, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.705745280752586e-06, "logits/chosen": -1.5635735988616943, "logits/rejected": -1.5089839696884155, "logps/chosen": -867.3118286132812, "logps/rejected": -959.4519653320312, "loss": 0.4819, "rewards/accuracies": 0.5, "rewards/chosen": -0.6232264041900635, "rewards/margins": 0.11986882984638214, "rewards/rejected": -0.7430952787399292, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.661243806657256e-06, "logits/chosen": -1.8420759439468384, "logits/rejected": -1.7493212223052979, "logps/chosen": -739.1602783203125, "logps/rejected": -1018.7802734375, "loss": 0.4822, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5061613321304321, "rewards/margins": 0.31740203499794006, "rewards/rejected": -0.8235633969306946, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.613851860533367e-06, "logits/chosen": -1.7359821796417236, "logits/rejected": -1.664820909500122, "logps/chosen": -867.6565551757812, "logps/rejected": -1373.392333984375, "loss": 0.4804, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6370395421981812, "rewards/margins": 0.5256737470626831, "rewards/rejected": -1.1627132892608643, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.563632824908252e-06, "logits/chosen": -1.8760831356048584, "logits/rejected": -1.84027898311615, "logps/chosen": -652.6282958984375, "logps/rejected": -1129.7745361328125, "loss": 0.4724, "rewards/accuracies": 0.53125, "rewards/chosen": -0.47100549936294556, "rewards/margins": 0.4714561402797699, "rewards/rejected": -0.9424616694450378, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.510653863290871e-06, "logits/chosen": -1.8029680252075195, "logits/rejected": -1.7324800491333008, "logps/chosen": -1249.8055419921875, "logps/rejected": -1807.9056396484375, "loss": 0.4742, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0133898258209229, "rewards/margins": 0.5813573002815247, "rewards/rejected": -1.5947470664978027, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.454985830346574e-06, "logits/chosen": -1.9399795532226562, "logits/rejected": -1.8146251440048218, "logps/chosen": -902.8073120117188, "logps/rejected": -1436.665283203125, "loss": 0.469, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6662728786468506, "rewards/margins": 0.5495506525039673, "rewards/rejected": -1.2158234119415283, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.396703177135262e-06, "logits/chosen": -1.8842103481292725, "logits/rejected": -1.7202155590057373, "logps/chosen": -1762.968994140625, "logps/rejected": -1725.986572265625, "loss": 0.4717, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.510768175125122, "rewards/margins": 0.030063262209296227, "rewards/rejected": -1.540831446647644, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.335883851539693e-06, "logits/chosen": -2.0325398445129395, "logits/rejected": -1.8330217599868774, "logps/chosen": -1008.1095581054688, "logps/rejected": -1700.26171875, "loss": 0.4671, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7875405550003052, "rewards/margins": 0.731080174446106, "rewards/rejected": -1.5186206102371216, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.2726091940171055e-06, "logits/chosen": -1.969412088394165, "logits/rejected": -1.8437814712524414, "logps/chosen": -798.6140747070312, "logps/rejected": -1872.577392578125, "loss": 0.4564, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5798918604850769, "rewards/margins": 1.0649895668029785, "rewards/rejected": -1.6448814868927002, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.206963828813555e-06, "logits/chosen": -2.004281759262085, "logits/rejected": -1.8325812816619873, "logps/chosen": -1020.1710205078125, "logps/rejected": -2015.131591796875, "loss": 0.4614, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8064894676208496, "rewards/margins": 0.9903135299682617, "rewards/rejected": -1.7968031167984009, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.139035550786495e-06, "logits/chosen": -2.0901331901550293, "logits/rejected": -1.9698684215545654, "logps/chosen": -915.0389404296875, "logps/rejected": -1435.4764404296875, "loss": 0.4679, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6817190051078796, "rewards/margins": 0.5508195161819458, "rewards/rejected": -1.2325387001037598, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.068915207986931e-06, "logits/chosen": -1.9757936000823975, "logits/rejected": -1.897470474243164, "logps/chosen": -1229.423828125, "logps/rejected": -1853.947265625, "loss": 0.4667, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.007495641708374, "rewards/margins": 0.6511304974555969, "rewards/rejected": -1.6586261987686157, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.996696580158211e-06, "logits/chosen": -1.968062162399292, "logits/rejected": -1.8332984447479248, "logps/chosen": -1563.551025390625, "logps/rejected": -2647.005615234375, "loss": 0.4615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3222496509552002, "rewards/margins": 1.0978131294250488, "rewards/rejected": -2.42006254196167, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.922476253313921e-06, "logits/chosen": -2.2060952186584473, "logits/rejected": -2.1268954277038574, "logps/chosen": -1001.3084106445312, "logps/rejected": -1724.416015625, "loss": 0.4584, "rewards/accuracies": 0.625, "rewards/chosen": -0.7821061611175537, "rewards/margins": 0.7285407781600952, "rewards/rejected": -1.5106468200683594, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.846353490562664e-06, "logits/chosen": -2.1300292015075684, "logits/rejected": -2.000924587249756, "logps/chosen": -1243.8800048828125, "logps/rejected": -2480.47021484375, "loss": 0.4514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0223863124847412, "rewards/margins": 1.2320432662963867, "rewards/rejected": -2.254429578781128, "step": 370 }, { "epoch": 0.4, "learning_rate": 3.768430099352445e-06, "logits/chosen": -2.221879243850708, "logits/rejected": -2.128418207168579, "logps/chosen": -875.8338623046875, "logps/rejected": -1593.1365966796875, "loss": 0.4588, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.6446818113327026, "rewards/margins": 0.7323214411735535, "rewards/rejected": -1.3770033121109009, "step": 380 }, { "epoch": 0.41, "learning_rate": 3.6888102953122307e-06, "logits/chosen": -2.0890746116638184, "logits/rejected": -1.9689449071884155, "logps/chosen": -1064.7686767578125, "logps/rejected": -1700.5394287109375, "loss": 0.4656, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.8423658609390259, "rewards/margins": 0.6700539588928223, "rewards/rejected": -1.5124199390411377, "step": 390 }, { "epoch": 0.42, "learning_rate": 3.607600562872785e-06, "logits/chosen": -2.2589426040649414, "logits/rejected": -2.160431385040283, "logps/chosen": -959.44140625, "logps/rejected": -1434.484130859375, "loss": 0.4572, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.69480961561203, "rewards/margins": 0.5226942896842957, "rewards/rejected": -1.2175039052963257, "step": 400 }, { "epoch": 0.43, "learning_rate": 3.5249095128531863e-06, "logits/chosen": -2.2115917205810547, "logits/rejected": -2.127436399459839, "logps/chosen": -1019.97509765625, "logps/rejected": -1477.9644775390625, "loss": 0.4597, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7589401602745056, "rewards/margins": 0.49645981192588806, "rewards/rejected": -1.2553999423980713, "step": 410 }, { "epoch": 0.44, "learning_rate": 3.4408477372034743e-06, "logits/chosen": -1.9758100509643555, "logits/rejected": -1.8129494190216064, "logps/chosen": -1256.257568359375, "logps/rejected": -2412.14794921875, "loss": 0.4568, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0521903038024902, "rewards/margins": 1.142812728881836, "rewards/rejected": -2.1950032711029053, "step": 420 }, { "epoch": 0.45, "learning_rate": 3.355527661097728e-06, "logits/chosen": -2.1546552181243896, "logits/rejected": -2.0900943279266357, "logps/chosen": -718.0090942382812, "logps/rejected": -1487.015380859375, "loss": 0.4635, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5029612183570862, "rewards/margins": 0.7773032188415527, "rewards/rejected": -1.2802644968032837, "step": 430 }, { "epoch": 0.46, "learning_rate": 3.269063392575352e-06, "logits/chosen": -2.1480519771575928, "logits/rejected": -2.071498394012451, "logps/chosen": -1282.3341064453125, "logps/rejected": -2054.326171875, "loss": 0.4626, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.0578222274780273, "rewards/margins": 0.7977155447006226, "rewards/rejected": -1.855538010597229, "step": 440 }, { "epoch": 0.47, "learning_rate": 3.181570569931697e-06, "logits/chosen": -1.928739309310913, "logits/rejected": -1.8378665447235107, "logps/chosen": -1254.6744384765625, "logps/rejected": -2223.1025390625, "loss": 0.4614, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.060748815536499, "rewards/margins": 0.9665641784667969, "rewards/rejected": -2.027312755584717, "step": 450 }, { "epoch": 0.48, "learning_rate": 3.09316620706208e-06, "logits/chosen": -2.2325401306152344, "logits/rejected": -2.123627185821533, "logps/chosen": -882.6212768554688, "logps/rejected": -1591.8580322265625, "loss": 0.459, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6440384387969971, "rewards/margins": 0.7315788269042969, "rewards/rejected": -1.3756173849105835, "step": 460 }, { "epoch": 0.49, "learning_rate": 3.0039685369660785e-06, "logits/chosen": -2.02402663230896, "logits/rejected": -1.9017149209976196, "logps/chosen": -1743.1324462890625, "logps/rejected": -2763.71923828125, "loss": 0.454, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5182123184204102, "rewards/margins": 1.0456020832061768, "rewards/rejected": -2.563814401626587, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.91409685362137e-06, "logits/chosen": -2.046326160430908, "logits/rejected": -1.985815405845642, "logps/chosen": -1439.734130859375, "logps/rejected": -2124.520263671875, "loss": 0.4659, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2466154098510742, "rewards/margins": 0.6888442039489746, "rewards/rejected": -1.935459852218628, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.8236713524386085e-06, "logits/chosen": -2.134103775024414, "logits/rejected": -2.0179543495178223, "logps/chosen": -1013.251953125, "logps/rejected": -2033.077392578125, "loss": 0.4508, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.815003514289856, "rewards/margins": 1.0394479036331177, "rewards/rejected": -1.8544514179229736, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.7328129695107205e-06, "logits/chosen": -2.16344952583313, "logits/rejected": -1.9862359762191772, "logps/chosen": -1314.334228515625, "logps/rejected": -2926.594970703125, "loss": 0.458, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0824334621429443, "rewards/margins": 1.6397478580474854, "rewards/rejected": -2.7221813201904297, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.641643219871597e-06, "logits/chosen": -2.2062978744506836, "logits/rejected": -2.057356357574463, "logps/chosen": -1038.0162353515625, "logps/rejected": -2059.95458984375, "loss": 0.4491, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8147695660591125, "rewards/margins": 1.0522905588150024, "rewards/rejected": -1.8670603036880493, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.5502840349805074e-06, "logits/chosen": -2.2159011363983154, "logits/rejected": -2.0826644897460938, "logps/chosen": -936.4302978515625, "logps/rejected": -1992.089599609375, "loss": 0.4483, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6953409910202026, "rewards/margins": 1.08708918094635, "rewards/rejected": -1.7824300527572632, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.4588575996495797e-06, "logits/chosen": -2.2215633392333984, "logits/rejected": -2.053880214691162, "logps/chosen": -846.26611328125, "logps/rejected": -2448.76416015625, "loss": 0.4513, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5850510001182556, "rewards/margins": 1.622568130493164, "rewards/rejected": -2.2076191902160645, "step": 530 }, { "epoch": 0.57, "learning_rate": 2.367486188632446e-06, "logits/chosen": -2.221585273742676, "logits/rejected": -2.0297319889068604, "logps/chosen": -1184.786865234375, "logps/rejected": -2758.436767578125, "loss": 0.4455, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9226329922676086, "rewards/margins": 1.6065568923950195, "rewards/rejected": -2.5291898250579834, "step": 540 }, { "epoch": 0.58, "learning_rate": 2.276292003092593e-06, "logits/chosen": -2.196733236312866, "logits/rejected": -2.057121992111206, "logps/chosen": -1445.136962890625, "logps/rejected": -2547.5927734375, "loss": 0.4602, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2020736932754517, "rewards/margins": 1.1371889114379883, "rewards/rejected": -2.3392627239227295, "step": 550 }, { "epoch": 0.59, "learning_rate": 2.1853970071701415e-06, "logits/chosen": -2.14131498336792, "logits/rejected": -2.0334861278533936, "logps/chosen": -913.9793090820312, "logps/rejected": -1585.7960205078125, "loss": 0.4608, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7021154165267944, "rewards/margins": 0.7157109975814819, "rewards/rejected": -1.417826533317566, "step": 560 }, { "epoch": 0.6, "learning_rate": 2.0949227648656194e-06, "logits/chosen": -2.150709629058838, "logits/rejected": -2.051652431488037, "logps/chosen": -930.1282348632812, "logps/rejected": -1828.182861328125, "loss": 0.4531, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7185366749763489, "rewards/margins": 0.9112586975097656, "rewards/rejected": -1.6297954320907593, "step": 570 }, { "epoch": 0.61, "learning_rate": 2.00499027745888e-06, "logits/chosen": -2.165860891342163, "logits/rejected": -1.990290641784668, "logps/chosen": -1457.848388671875, "logps/rejected": -3095.86181640625, "loss": 0.4532, "rewards/accuracies": 0.625, "rewards/chosen": -1.2225459814071655, "rewards/margins": 1.659259557723999, "rewards/rejected": -2.881805658340454, "step": 580 }, { "epoch": 0.62, "learning_rate": 1.915719821680624e-06, "logits/chosen": -2.033405303955078, "logits/rejected": -1.9788004159927368, "logps/chosen": -1339.5030517578125, "logps/rejected": -1871.1273193359375, "loss": 0.4542, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1169803142547607, "rewards/margins": 0.5490958094596863, "rewards/rejected": -1.6660760641098022, "step": 590 }, { "epoch": 0.63, "learning_rate": 1.8272307888529276e-06, "logits/chosen": -2.2021541595458984, "logits/rejected": -2.0622100830078125, "logps/chosen": -1060.39404296875, "logps/rejected": -2731.3662109375, "loss": 0.4515, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8206619024276733, "rewards/margins": 1.6797775030136108, "rewards/rejected": -2.5004396438598633, "step": 600 }, { "epoch": 0.64, "learning_rate": 1.739641525213929e-06, "logits/chosen": -2.200084686279297, "logits/rejected": -2.0629191398620605, "logps/chosen": -1195.5421142578125, "logps/rejected": -2445.993408203125, "loss": 0.4489, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9617778658866882, "rewards/margins": 1.294762134552002, "rewards/rejected": -2.256540298461914, "step": 610 }, { "epoch": 0.65, "learning_rate": 1.6530691736402317e-06, "logits/chosen": -2.1470463275909424, "logits/rejected": -2.028573513031006, "logps/chosen": -1541.4970703125, "logps/rejected": -2454.397705078125, "loss": 0.4469, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3200973272323608, "rewards/margins": 0.9341692924499512, "rewards/rejected": -2.2542667388916016, "step": 620 }, { "epoch": 0.66, "learning_rate": 1.5676295169786864e-06, "logits/chosen": -2.1975948810577393, "logits/rejected": -2.060920476913452, "logps/chosen": -1427.9593505859375, "logps/rejected": -3144.806884765625, "loss": 0.4485, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.176941156387329, "rewards/margins": 1.7532637119293213, "rewards/rejected": -2.9302048683166504, "step": 630 }, { "epoch": 0.67, "learning_rate": 1.4834368231970922e-06, "logits/chosen": -2.156165838241577, "logits/rejected": -2.045762777328491, "logps/chosen": -1413.2603759765625, "logps/rejected": -3491.219970703125, "loss": 0.4488, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1946003437042236, "rewards/margins": 2.0828354358673096, "rewards/rejected": -3.2774360179901123, "step": 640 }, { "epoch": 0.68, "learning_rate": 1.4006036925609245e-06, "logits/chosen": -2.2174124717712402, "logits/rejected": -2.088347911834717, "logps/chosen": -1448.646484375, "logps/rejected": -2385.66259765625, "loss": 0.4474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2005541324615479, "rewards/margins": 0.9563320875167847, "rewards/rejected": -2.156886339187622, "step": 650 }, { "epoch": 0.69, "learning_rate": 1.3192409070404582e-06, "logits/chosen": -2.2460074424743652, "logits/rejected": -2.1426172256469727, "logps/chosen": -1455.069580078125, "logps/rejected": -2169.728515625, "loss": 0.4531, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.21938955783844, "rewards/margins": 0.7492297887802124, "rewards/rejected": -1.9686193466186523, "step": 660 }, { "epoch": 0.7, "learning_rate": 1.2394572821496953e-06, "logits/chosen": -2.229182481765747, "logits/rejected": -2.115177631378174, "logps/chosen": -1379.844482421875, "logps/rejected": -2796.632568359375, "loss": 0.4498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1495221853256226, "rewards/margins": 1.4514307975769043, "rewards/rejected": -2.6009533405303955, "step": 670 }, { "epoch": 0.71, "learning_rate": 1.1613595214152713e-06, "logits/chosen": -2.2372703552246094, "logits/rejected": -2.1571171283721924, "logps/chosen": -1203.8697509765625, "logps/rejected": -1857.9075927734375, "loss": 0.4523, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9586626291275024, "rewards/margins": 0.6804057359695435, "rewards/rejected": -1.6390682458877563, "step": 680 }, { "epoch": 0.72, "learning_rate": 1.0850520736699362e-06, "logits/chosen": -2.1900107860565186, "logits/rejected": -2.080841064453125, "logps/chosen": -1038.609619140625, "logps/rejected": -1972.8609619140625, "loss": 0.4589, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8293488621711731, "rewards/margins": 0.9573895335197449, "rewards/rejected": -1.7867381572723389, "step": 690 }, { "epoch": 0.73, "learning_rate": 1.0106369933615043e-06, "logits/chosen": -2.208099126815796, "logits/rejected": -2.082400321960449, "logps/chosen": -1469.6568603515625, "logps/rejected": -3010.545654296875, "loss": 0.4501, "rewards/accuracies": 0.53125, "rewards/chosen": -1.243025302886963, "rewards/margins": 1.5714446306228638, "rewards/rejected": -2.814469814300537, "step": 700 }, { "epoch": 0.74, "learning_rate": 9.382138040640714e-07, "logits/chosen": -2.220716953277588, "logits/rejected": -2.1422343254089355, "logps/chosen": -1299.326904296875, "logps/rejected": -2368.89111328125, "loss": 0.4548, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0415502786636353, "rewards/margins": 1.1175090074539185, "rewards/rejected": -2.1590590476989746, "step": 710 }, { "epoch": 0.75, "learning_rate": 8.678793653740633e-07, "logits/chosen": -2.222707509994507, "logits/rejected": -2.1047911643981934, "logps/chosen": -1158.9268798828125, "logps/rejected": -2246.68212890625, "loss": 0.4549, "rewards/accuracies": 0.625, "rewards/chosen": -0.9390741586685181, "rewards/margins": 1.1018383502960205, "rewards/rejected": -2.040912389755249, "step": 720 }, { "epoch": 0.76, "learning_rate": 7.997277433690984e-07, "logits/chosen": -2.187948226928711, "logits/rejected": -2.105868101119995, "logps/chosen": -1287.493896484375, "logps/rejected": -2305.227783203125, "loss": 0.453, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0323898792266846, "rewards/margins": 1.072412133216858, "rewards/rejected": -2.104801893234253, "step": 730 }, { "epoch": 0.77, "learning_rate": 7.338500848029603e-07, "logits/chosen": -2.212477445602417, "logits/rejected": -2.127330780029297, "logps/chosen": -960.9691162109375, "logps/rejected": -2517.481689453125, "loss": 0.4508, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7053281664848328, "rewards/margins": 1.5753790140151978, "rewards/rejected": -2.2807071208953857, "step": 740 }, { "epoch": 0.79, "learning_rate": 6.70334495204884e-07, "logits/chosen": -2.230347156524658, "logits/rejected": -2.119199275970459, "logps/chosen": -1013.6222534179688, "logps/rejected": -2389.435302734375, "loss": 0.4505, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7983914017677307, "rewards/margins": 1.369321346282959, "rewards/rejected": -2.167712688446045, "step": 750 }, { "epoch": 0.8, "learning_rate": 6.092659210462232e-07, "logits/chosen": -2.2451062202453613, "logits/rejected": -2.2009005546569824, "logps/chosen": -1014.0054931640625, "logps/rejected": -2509.18310546875, "loss": 0.4428, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7636991143226624, "rewards/margins": 1.5256783962249756, "rewards/rejected": -2.2893776893615723, "step": 760 }, { "epoch": 0.81, "learning_rate": 5.507260361320738e-07, "logits/chosen": -2.2545554637908936, "logits/rejected": -2.1696648597717285, "logps/chosen": -1256.4468994140625, "logps/rejected": -3355.20849609375, "loss": 0.4417, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9988948702812195, "rewards/margins": 2.1163055896759033, "rewards/rejected": -3.1152002811431885, "step": 770 }, { "epoch": 0.82, "learning_rate": 4.947931323697983e-07, "logits/chosen": -2.236833333969116, "logits/rejected": -2.116788625717163, "logps/chosen": -1157.8489990234375, "logps/rejected": -2627.938720703125, "loss": 0.4404, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9315347671508789, "rewards/margins": 1.4900939464569092, "rewards/rejected": -2.421628475189209, "step": 780 }, { "epoch": 0.83, "learning_rate": 4.4154201506053985e-07, "logits/chosen": -2.2656142711639404, "logits/rejected": -2.1838631629943848, "logps/chosen": -1444.336181640625, "logps/rejected": -2568.44873046875, "loss": 0.4565, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2109791040420532, "rewards/margins": 1.1344887018203735, "rewards/rejected": -2.345468044281006, "step": 790 }, { "epoch": 0.84, "learning_rate": 3.910439028537638e-07, "logits/chosen": -2.1305015087127686, "logits/rejected": -2.0338492393493652, "logps/chosen": -1384.163330078125, "logps/rejected": -3120.740234375, "loss": 0.44, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1574374437332153, "rewards/margins": 1.7521774768829346, "rewards/rejected": -2.9096148014068604, "step": 800 }, { "epoch": 0.85, "learning_rate": 3.4336633249862084e-07, "logits/chosen": -2.1786255836486816, "logits/rejected": -2.1090734004974365, "logps/chosen": -1811.431640625, "logps/rejected": -1879.762939453125, "loss": 0.4564, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5657621622085571, "rewards/margins": 0.12173604965209961, "rewards/rejected": -1.6874980926513672, "step": 810 }, { "epoch": 0.86, "learning_rate": 2.98573068519539e-07, "logits/chosen": -2.204667091369629, "logits/rejected": -2.1335368156433105, "logps/chosen": -1094.656982421875, "logps/rejected": -2983.633056640625, "loss": 0.4524, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8493936657905579, "rewards/margins": 1.935779333114624, "rewards/rejected": -2.785172939300537, "step": 820 }, { "epoch": 0.87, "learning_rate": 2.5672401793681854e-07, "logits/chosen": -2.2261288166046143, "logits/rejected": -2.156919002532959, "logps/chosen": -1517.5009765625, "logps/rejected": -2918.010986328125, "loss": 0.4484, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2897742986679077, "rewards/margins": 1.39667546749115, "rewards/rejected": -2.6864495277404785, "step": 830 }, { "epoch": 0.88, "learning_rate": 2.178751501463036e-07, "logits/chosen": -2.181530475616455, "logits/rejected": -2.1404881477355957, "logps/chosen": -1569.401611328125, "logps/rejected": -2147.50927734375, "loss": 0.4607, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3682358264923096, "rewards/margins": 0.5883899331092834, "rewards/rejected": -1.9566256999969482, "step": 840 }, { "epoch": 0.89, "learning_rate": 1.820784220652766e-07, "logits/chosen": -2.213731050491333, "logits/rejected": -2.1309895515441895, "logps/chosen": -1591.1544189453125, "logps/rejected": -2607.8837890625, "loss": 0.4582, "rewards/accuracies": 0.59375, "rewards/chosen": -1.339202642440796, "rewards/margins": 1.0517059564590454, "rewards/rejected": -2.390908718109131, "step": 850 }, { "epoch": 0.9, "learning_rate": 1.4938170864468636e-07, "logits/chosen": -2.1618194580078125, "logits/rejected": -2.054898738861084, "logps/chosen": -1653.2855224609375, "logps/rejected": -3330.978515625, "loss": 0.453, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4334652423858643, "rewards/margins": 1.6885372400283813, "rewards/rejected": -3.122002601623535, "step": 860 }, { "epoch": 0.91, "learning_rate": 1.1982873884064466e-07, "logits/chosen": -2.2253754138946533, "logits/rejected": -2.132044792175293, "logps/chosen": -1268.3460693359375, "logps/rejected": -2814.267578125, "loss": 0.4513, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.069705843925476, "rewards/margins": 1.5451180934906006, "rewards/rejected": -2.614823818206787, "step": 870 }, { "epoch": 0.92, "learning_rate": 9.345903713082305e-08, "logits/chosen": -2.2452187538146973, "logits/rejected": -2.1340882778167725, "logps/chosen": -1735.882568359375, "logps/rejected": -2934.236328125, "loss": 0.4454, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4951032400131226, "rewards/margins": 1.2158784866333008, "rewards/rejected": -2.710981845855713, "step": 880 }, { "epoch": 0.93, "learning_rate": 7.030787065396866e-08, "logits/chosen": -2.2767229080200195, "logits/rejected": -2.1917612552642822, "logps/chosen": -1188.2750244140625, "logps/rejected": -2986.377197265625, "loss": 0.4469, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9747017025947571, "rewards/margins": 1.8013055324554443, "rewards/rejected": -2.7760071754455566, "step": 890 }, { "epoch": 0.94, "learning_rate": 5.0406202043228604e-08, "logits/chosen": -2.2052557468414307, "logits/rejected": -2.0551769733428955, "logps/chosen": -1045.6455078125, "logps/rejected": -2074.9453125, "loss": 0.4586, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8170153498649597, "rewards/margins": 1.0543075799942017, "rewards/rejected": -1.8713228702545166, "step": 900 }, { "epoch": 0.95, "learning_rate": 3.378064801637687e-08, "logits/chosen": -2.2373902797698975, "logits/rejected": -2.1471071243286133, "logps/chosen": -1441.9759521484375, "logps/rejected": -3050.029296875, "loss": 0.4416, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1936613321304321, "rewards/margins": 1.6291347742080688, "rewards/rejected": -2.822796106338501, "step": 910 }, { "epoch": 0.96, "learning_rate": 2.0453443778310766e-08, "logits/chosen": -2.1485986709594727, "logits/rejected": -2.0270209312438965, "logps/chosen": -1386.749755859375, "logps/rejected": -3182.661865234375, "loss": 0.4479, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.131797432899475, "rewards/margins": 1.841048002243042, "rewards/rejected": -2.9728455543518066, "step": 920 }, { "epoch": 0.97, "learning_rate": 1.0442413283435759e-08, "logits/chosen": -2.273899793624878, "logits/rejected": -2.1478586196899414, "logps/chosen": -800.504150390625, "logps/rejected": -2901.39501953125, "loss": 0.4385, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5838597416877747, "rewards/margins": 2.1213278770446777, "rewards/rejected": -2.7051875591278076, "step": 930 }, { "epoch": 0.98, "learning_rate": 3.760945397705828e-09, "logits/chosen": -2.2581982612609863, "logits/rejected": -2.1440868377685547, "logps/chosen": -1637.424072265625, "logps/rejected": -2622.310791015625, "loss": 0.4462, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.387995958328247, "rewards/margins": 1.0160502195358276, "rewards/rejected": -2.404046058654785, "step": 940 }, { "epoch": 0.99, "learning_rate": 4.1797599220405605e-10, "logits/chosen": -2.2077736854553223, "logits/rejected": -2.0697312355041504, "logps/chosen": -1709.1343994140625, "logps/rejected": -3128.599853515625, "loss": 0.4485, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.506415843963623, "rewards/margins": 1.4321677684783936, "rewards/rejected": -2.9385836124420166, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.46542558670043943, "train_runtime": 22527.5186, "train_samples_per_second": 2.714, "train_steps_per_second": 0.042 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }