{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9707428879600917, "eval_steps": 300, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005393016044222731, "grad_norm": 6.252899169921875, "learning_rate": 2.6881720430107528e-09, "logits/chosen": 1.2984580993652344, "logits/rejected": -0.1912941336631775, "logps/chosen": -291.1065979003906, "logps/rejected": -233.55352783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0010786032088445463, "grad_norm": 6.871326446533203, "learning_rate": 5.3763440860215056e-09, "logits/chosen": 0.318937212228775, "logits/rejected": -0.6181799173355103, "logps/chosen": -200.44281005859375, "logps/rejected": -189.28750610351562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0016179048132668194, "grad_norm": 6.596883773803711, "learning_rate": 8.064516129032257e-09, "logits/chosen": -0.20669524371623993, "logits/rejected": 0.18336334824562073, "logps/chosen": -200.93087768554688, "logps/rejected": -168.10914611816406, "loss": 0.6927, "rewards/accuracies": 0.25, "rewards/chosen": 0.0013637542724609375, "rewards/margins": 0.0009842871222645044, "rewards/rejected": 0.00037946703378111124, "step": 3 }, { "epoch": 0.0021572064176890926, "grad_norm": 7.821378231048584, "learning_rate": 1.0752688172043011e-08, "logits/chosen": 1.3870043754577637, "logits/rejected": 0.5697861909866333, "logps/chosen": -275.427001953125, "logps/rejected": -212.814453125, "loss": 0.6815, "rewards/accuracies": 0.375, "rewards/chosen": 0.019330786541104317, "rewards/margins": 0.02579803206026554, "rewards/rejected": -0.006467247381806374, "step": 4 }, { "epoch": 0.002696508022111366, "grad_norm": 6.9495320320129395, "learning_rate": 1.3440860215053764e-08, "logits/chosen": 0.3137480318546295, "logits/rejected": 1.1226252317428589, "logps/chosen": -284.2044982910156, "logps/rejected": -355.1482238769531, "loss": 0.7131, "rewards/accuracies": 0.375, "rewards/chosen": -0.0044082654640078545, "rewards/margins": -0.03476991504430771, "rewards/rejected": 0.030361652374267578, "step": 5 }, { "epoch": 0.003235809626533639, "grad_norm": 7.3719000816345215, "learning_rate": 1.6129032258064514e-08, "logits/chosen": 0.3381826877593994, "logits/rejected": 0.5982265472412109, "logps/chosen": -270.83221435546875, "logps/rejected": -284.4215087890625, "loss": 0.6803, "rewards/accuracies": 0.5, "rewards/chosen": 0.02286071702837944, "rewards/margins": 0.028599737212061882, "rewards/rejected": -0.005739020183682442, "step": 6 }, { "epoch": 0.003775111230955912, "grad_norm": 5.966428279876709, "learning_rate": 1.8817204301075268e-08, "logits/chosen": 0.254464328289032, "logits/rejected": -0.49474114179611206, "logps/chosen": -193.06185913085938, "logps/rejected": -197.18116760253906, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": 0.0006093983538448811, "rewards/margins": 0.0007163994014263153, "rewards/rejected": -0.00010700291022658348, "step": 7 }, { "epoch": 0.004314412835378185, "grad_norm": 7.3488030433654785, "learning_rate": 2.1505376344086022e-08, "logits/chosen": 0.17927154898643494, "logits/rejected": -0.21967613697052002, "logps/chosen": -248.5840301513672, "logps/rejected": -243.57781982421875, "loss": 0.6957, "rewards/accuracies": 0.5, "rewards/chosen": -0.03152122348546982, "rewards/margins": -0.0016973474994301796, "rewards/rejected": -0.029823873192071915, "step": 8 }, { "epoch": 0.0048537144398004585, "grad_norm": 10.78580093383789, "learning_rate": 2.4193548387096773e-08, "logits/chosen": 0.1201261579990387, "logits/rejected": -0.858873724937439, "logps/chosen": -239.93931579589844, "logps/rejected": -168.06329345703125, "loss": 0.6761, "rewards/accuracies": 0.75, "rewards/chosen": 0.048928167670965195, "rewards/margins": 0.03566570580005646, "rewards/rejected": 0.013262463733553886, "step": 9 }, { "epoch": 0.005393016044222732, "grad_norm": 6.802707195281982, "learning_rate": 2.6881720430107527e-08, "logits/chosen": 0.4103747308254242, "logits/rejected": -0.16655509173870087, "logps/chosen": -252.72149658203125, "logps/rejected": -226.48175048828125, "loss": 0.6852, "rewards/accuracies": 0.875, "rewards/chosen": -0.006937218829989433, "rewards/margins": 0.016777608543634415, "rewards/rejected": -0.023714829236268997, "step": 10 }, { "epoch": 0.005932317648645004, "grad_norm": 7.461426734924316, "learning_rate": 2.956989247311828e-08, "logits/chosen": 0.00017065182328224182, "logits/rejected": -0.1477288156747818, "logps/chosen": -385.95098876953125, "logps/rejected": -391.9120178222656, "loss": 0.7036, "rewards/accuracies": 0.625, "rewards/chosen": -0.0032224655151367188, "rewards/margins": -0.010845658369362354, "rewards/rejected": 0.007623197045177221, "step": 11 }, { "epoch": 0.006471619253067278, "grad_norm": 6.88601541519165, "learning_rate": 3.225806451612903e-08, "logits/chosen": -0.8481518030166626, "logits/rejected": -0.7560526728630066, "logps/chosen": -207.43414306640625, "logps/rejected": -260.845703125, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": -0.018787002190947533, "rewards/margins": -0.0049405088648200035, "rewards/rejected": -0.013846494257450104, "step": 12 }, { "epoch": 0.007010920857489551, "grad_norm": 8.370579719543457, "learning_rate": 3.494623655913978e-08, "logits/chosen": 0.005434185266494751, "logits/rejected": 0.9164117574691772, "logps/chosen": -132.9949493408203, "logps/rejected": -243.84381103515625, "loss": 0.7314, "rewards/accuracies": 0.25, "rewards/chosen": -0.02009115368127823, "rewards/margins": -0.0737515464425087, "rewards/rejected": 0.05366039648652077, "step": 13 }, { "epoch": 0.007550222461911824, "grad_norm": 6.3016462326049805, "learning_rate": 3.7634408602150537e-08, "logits/chosen": 1.1794731616973877, "logits/rejected": 0.32728123664855957, "logps/chosen": -205.55726623535156, "logps/rejected": -180.31341552734375, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": 0.033798977732658386, "rewards/margins": 0.008153151720762253, "rewards/rejected": 0.025645826011896133, "step": 14 }, { "epoch": 0.008089524066334097, "grad_norm": 7.588430881500244, "learning_rate": 4.032258064516129e-08, "logits/chosen": 0.48337727785110474, "logits/rejected": 0.04023141413927078, "logps/chosen": -325.51275634765625, "logps/rejected": -252.53240966796875, "loss": 0.7114, "rewards/accuracies": 0.375, "rewards/chosen": 0.011207770556211472, "rewards/margins": -0.032468557357788086, "rewards/rejected": 0.043676331639289856, "step": 15 }, { "epoch": 0.00862882567075637, "grad_norm": 6.887419700622559, "learning_rate": 4.3010752688172045e-08, "logits/chosen": 0.3844994902610779, "logits/rejected": -0.07714763283729553, "logps/chosen": -256.3138732910156, "logps/rejected": -254.5594024658203, "loss": 0.6879, "rewards/accuracies": 0.5, "rewards/chosen": 0.005513000302016735, "rewards/margins": 0.013078592717647552, "rewards/rejected": -0.007565593346953392, "step": 16 }, { "epoch": 0.009168127275178644, "grad_norm": 7.771223545074463, "learning_rate": 4.5698924731182795e-08, "logits/chosen": 0.6665542721748352, "logits/rejected": -0.7292720079421997, "logps/chosen": -299.11993408203125, "logps/rejected": -258.806884765625, "loss": 0.6979, "rewards/accuracies": 0.5, "rewards/chosen": -0.025622179731726646, "rewards/margins": -0.006968118250370026, "rewards/rejected": -0.01865405961871147, "step": 17 }, { "epoch": 0.009707428879600917, "grad_norm": 6.7880539894104, "learning_rate": 4.8387096774193546e-08, "logits/chosen": 0.45089083909988403, "logits/rejected": -0.11152000725269318, "logps/chosen": -252.4178009033203, "logps/rejected": -217.87432861328125, "loss": 0.705, "rewards/accuracies": 0.375, "rewards/chosen": -0.003479194827377796, "rewards/margins": -0.022620679810643196, "rewards/rejected": 0.019141484051942825, "step": 18 }, { "epoch": 0.01024673048402319, "grad_norm": 5.98990535736084, "learning_rate": 5.1075268817204303e-08, "logits/chosen": 0.03037811815738678, "logits/rejected": -0.3154934048652649, "logps/chosen": -260.148193359375, "logps/rejected": -174.72232055664062, "loss": 0.6938, "rewards/accuracies": 0.625, "rewards/chosen": -0.006490801461040974, "rewards/margins": -0.00034084171056747437, "rewards/rejected": -0.006149959284812212, "step": 19 }, { "epoch": 0.010786032088445464, "grad_norm": 6.795949935913086, "learning_rate": 5.3763440860215054e-08, "logits/chosen": -1.5993205308914185, "logits/rejected": -1.250319480895996, "logps/chosen": -236.34591674804688, "logps/rejected": -251.27810668945312, "loss": 0.725, "rewards/accuracies": 0.25, "rewards/chosen": -0.058860018849372864, "rewards/margins": -0.06102304905653, "rewards/rejected": 0.0021630283445119858, "step": 20 }, { "epoch": 0.011325333692867737, "grad_norm": 6.1405744552612305, "learning_rate": 5.6451612903225805e-08, "logits/chosen": 0.280826210975647, "logits/rejected": -0.09767374396324158, "logps/chosen": -189.00433349609375, "logps/rejected": -178.18157958984375, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": 0.041825104504823685, "rewards/margins": 0.036451250314712524, "rewards/rejected": 0.005373857915401459, "step": 21 }, { "epoch": 0.011864635297290009, "grad_norm": 5.927632808685303, "learning_rate": 5.913978494623656e-08, "logits/chosen": 0.35737723112106323, "logits/rejected": -0.42781418561935425, "logps/chosen": -196.00181579589844, "logps/rejected": -225.70233154296875, "loss": 0.7126, "rewards/accuracies": 0.25, "rewards/chosen": -0.014331914484500885, "rewards/margins": -0.03530912473797798, "rewards/rejected": 0.020977213978767395, "step": 22 }, { "epoch": 0.012403936901712282, "grad_norm": 6.510446071624756, "learning_rate": 6.18279569892473e-08, "logits/chosen": 0.8567652702331543, "logits/rejected": 0.27756407856941223, "logps/chosen": -295.59271240234375, "logps/rejected": -320.3628234863281, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.02345123328268528, "rewards/margins": 0.011982059106230736, "rewards/rejected": 0.011469174176454544, "step": 23 }, { "epoch": 0.012943238506134555, "grad_norm": 5.837512016296387, "learning_rate": 6.451612903225806e-08, "logits/chosen": 0.12894143164157867, "logits/rejected": -0.8767312169075012, "logps/chosen": -253.52415466308594, "logps/rejected": -216.59677124023438, "loss": 0.6953, "rewards/accuracies": 0.375, "rewards/chosen": 0.01358013041317463, "rewards/margins": -0.0011703483760356903, "rewards/rejected": 0.014750480651855469, "step": 24 }, { "epoch": 0.013482540110556829, "grad_norm": 6.731097221374512, "learning_rate": 6.720430107526881e-08, "logits/chosen": 0.7273112535476685, "logits/rejected": 0.05637746304273605, "logps/chosen": -301.1533203125, "logps/rejected": -262.58575439453125, "loss": 0.6649, "rewards/accuracies": 0.875, "rewards/chosen": -0.004298402927815914, "rewards/margins": 0.05871295928955078, "rewards/rejected": -0.06301136314868927, "step": 25 }, { "epoch": 0.014021841714979102, "grad_norm": 8.112701416015625, "learning_rate": 6.989247311827956e-08, "logits/chosen": 1.5232702493667603, "logits/rejected": -1.0203065872192383, "logps/chosen": -330.11358642578125, "logps/rejected": -215.6923828125, "loss": 0.6803, "rewards/accuracies": 0.75, "rewards/chosen": 0.009792422875761986, "rewards/margins": 0.029491184279322624, "rewards/rejected": -0.01969876140356064, "step": 26 }, { "epoch": 0.014561143319401375, "grad_norm": 7.133888244628906, "learning_rate": 7.258064516129032e-08, "logits/chosen": -0.2510586977005005, "logits/rejected": 0.24271687865257263, "logps/chosen": -211.10690307617188, "logps/rejected": -258.077880859375, "loss": 0.7122, "rewards/accuracies": 0.5, "rewards/chosen": 0.006464624777436256, "rewards/margins": -0.03486265987157822, "rewards/rejected": 0.041327282786369324, "step": 27 }, { "epoch": 0.015100444923823649, "grad_norm": 7.1719818115234375, "learning_rate": 7.526881720430107e-08, "logits/chosen": 0.08920338749885559, "logits/rejected": 0.3937070369720459, "logps/chosen": -264.3328857421875, "logps/rejected": -291.1400146484375, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.03729715198278427, "rewards/margins": 0.027529526501893997, "rewards/rejected": 0.009767627343535423, "step": 28 }, { "epoch": 0.015639746528245922, "grad_norm": 6.114221096038818, "learning_rate": 7.795698924731182e-08, "logits/chosen": 0.6845852732658386, "logits/rejected": -0.15464790165424347, "logps/chosen": -203.93817138671875, "logps/rejected": -168.6993408203125, "loss": 0.6949, "rewards/accuracies": 0.625, "rewards/chosen": -0.004928397946059704, "rewards/margins": -0.00048751384019851685, "rewards/rejected": -0.0044408803805708885, "step": 29 }, { "epoch": 0.016179048132668194, "grad_norm": 5.770440101623535, "learning_rate": 8.064516129032257e-08, "logits/chosen": 0.6736056804656982, "logits/rejected": -0.48543646931648254, "logps/chosen": -182.6506805419922, "logps/rejected": -179.58619689941406, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.04098071902990341, "rewards/margins": 0.0048411390744149685, "rewards/rejected": -0.04582185298204422, "step": 30 }, { "epoch": 0.01671834973709047, "grad_norm": 6.825628280639648, "learning_rate": 8.333333333333333e-08, "logits/chosen": -0.4072284996509552, "logits/rejected": -1.3094217777252197, "logps/chosen": -243.5831298828125, "logps/rejected": -202.86892700195312, "loss": 0.7275, "rewards/accuracies": 0.125, "rewards/chosen": -0.06486912071704865, "rewards/margins": -0.06618175655603409, "rewards/rejected": 0.0013126367703080177, "step": 31 }, { "epoch": 0.01725765134151274, "grad_norm": 6.562854766845703, "learning_rate": 8.602150537634409e-08, "logits/chosen": -0.6606875658035278, "logits/rejected": 0.291075199842453, "logps/chosen": -176.05813598632812, "logps/rejected": -246.6948699951172, "loss": 0.6989, "rewards/accuracies": 0.375, "rewards/chosen": 0.028722286224365234, "rewards/margins": -0.010805511847138405, "rewards/rejected": 0.03952779620885849, "step": 32 }, { "epoch": 0.017796952945935016, "grad_norm": 6.5901384353637695, "learning_rate": 8.870967741935484e-08, "logits/chosen": 0.13207417726516724, "logits/rejected": -0.2679581344127655, "logps/chosen": -318.0742492675781, "logps/rejected": -392.0603942871094, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": 0.014474867843091488, "rewards/margins": 0.018151283264160156, "rewards/rejected": -0.0036764144897460938, "step": 33 }, { "epoch": 0.018336254550357287, "grad_norm": 5.869822025299072, "learning_rate": 9.139784946236559e-08, "logits/chosen": 0.7116552591323853, "logits/rejected": -0.4246397614479065, "logps/chosen": -233.56243896484375, "logps/rejected": -227.7888946533203, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": -0.0068277353420853615, "rewards/margins": 0.029880236834287643, "rewards/rejected": -0.03670797497034073, "step": 34 }, { "epoch": 0.01887555615477956, "grad_norm": 7.054579257965088, "learning_rate": 9.408602150537634e-08, "logits/chosen": 0.05984312668442726, "logits/rejected": -0.9079179763793945, "logps/chosen": -260.44696044921875, "logps/rejected": -295.9472351074219, "loss": 0.7026, "rewards/accuracies": 0.5, "rewards/chosen": 0.0015808101743459702, "rewards/margins": -0.013959886506199837, "rewards/rejected": 0.015540696680545807, "step": 35 }, { "epoch": 0.019414857759201834, "grad_norm": 5.787746906280518, "learning_rate": 9.677419354838709e-08, "logits/chosen": -0.5715674161911011, "logits/rejected": -0.38944101333618164, "logps/chosen": -207.68028259277344, "logps/rejected": -202.13314819335938, "loss": 0.713, "rewards/accuracies": 0.5, "rewards/chosen": -0.02358236350119114, "rewards/margins": -0.0359317772090435, "rewards/rejected": 0.012349415570497513, "step": 36 }, { "epoch": 0.019954159363624106, "grad_norm": 6.566524982452393, "learning_rate": 9.946236559139784e-08, "logits/chosen": 0.6607987880706787, "logits/rejected": 0.26872193813323975, "logps/chosen": -300.42901611328125, "logps/rejected": -248.55531311035156, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": 0.00663738464936614, "rewards/margins": 0.011898614466190338, "rewards/rejected": -0.00526123121380806, "step": 37 }, { "epoch": 0.02049346096804638, "grad_norm": 6.399261474609375, "learning_rate": 1.0215053763440861e-07, "logits/chosen": 1.101385235786438, "logits/rejected": 0.16744400560855865, "logps/chosen": -328.1324462890625, "logps/rejected": -251.16419982910156, "loss": 0.6735, "rewards/accuracies": 0.75, "rewards/chosen": 0.026068303734064102, "rewards/margins": 0.042938232421875, "rewards/rejected": -0.01686992682516575, "step": 38 }, { "epoch": 0.021032762572468652, "grad_norm": 6.77836275100708, "learning_rate": 1.0483870967741936e-07, "logits/chosen": 0.43789803981781006, "logits/rejected": -0.4721353352069855, "logps/chosen": -225.98641967773438, "logps/rejected": -169.00540161132812, "loss": 0.7003, "rewards/accuracies": 0.625, "rewards/chosen": 0.01696815714240074, "rewards/margins": -0.0071161240339279175, "rewards/rejected": 0.02408428303897381, "step": 39 }, { "epoch": 0.021572064176890927, "grad_norm": 6.664007663726807, "learning_rate": 1.0752688172043011e-07, "logits/chosen": 0.7623529434204102, "logits/rejected": 1.1377607583999634, "logps/chosen": -262.68359375, "logps/rejected": -263.8930969238281, "loss": 0.7058, "rewards/accuracies": 0.5, "rewards/chosen": -0.015323924832046032, "rewards/margins": -0.02272767946124077, "rewards/rejected": 0.0074037546291947365, "step": 40 }, { "epoch": 0.0221113657813132, "grad_norm": 6.571826457977295, "learning_rate": 1.1021505376344086e-07, "logits/chosen": 0.4096890985965729, "logits/rejected": 0.23215217888355255, "logps/chosen": -279.9482116699219, "logps/rejected": -305.8799743652344, "loss": 0.677, "rewards/accuracies": 0.5, "rewards/chosen": -0.02642669901251793, "rewards/margins": 0.03439588472247124, "rewards/rejected": -0.060822583734989166, "step": 41 }, { "epoch": 0.022650667385735474, "grad_norm": 6.09522008895874, "learning_rate": 1.1290322580645161e-07, "logits/chosen": -0.32576605677604675, "logits/rejected": -0.9540883302688599, "logps/chosen": -241.72024536132812, "logps/rejected": -253.72756958007812, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -0.016807176172733307, "rewards/margins": 0.015630245208740234, "rewards/rejected": -0.03243741765618324, "step": 42 }, { "epoch": 0.023189968990157746, "grad_norm": 6.928330898284912, "learning_rate": 1.1559139784946236e-07, "logits/chosen": 0.7320435047149658, "logits/rejected": -0.356320858001709, "logps/chosen": -226.54708862304688, "logps/rejected": -198.48422241210938, "loss": 0.7024, "rewards/accuracies": 0.5, "rewards/chosen": 0.005367279052734375, "rewards/margins": -0.01707020215690136, "rewards/rejected": 0.022437477484345436, "step": 43 }, { "epoch": 0.023729270594580017, "grad_norm": 7.092703819274902, "learning_rate": 1.1827956989247312e-07, "logits/chosen": -0.37553584575653076, "logits/rejected": -0.754245400428772, "logps/chosen": -248.12188720703125, "logps/rejected": -238.4471435546875, "loss": 0.7154, "rewards/accuracies": 0.375, "rewards/chosen": -0.008257055655121803, "rewards/margins": -0.041106365621089935, "rewards/rejected": 0.03284931182861328, "step": 44 }, { "epoch": 0.024268572199002292, "grad_norm": 7.550450801849365, "learning_rate": 1.2096774193548387e-07, "logits/chosen": -0.07781332731246948, "logits/rejected": 0.15177196264266968, "logps/chosen": -184.72357177734375, "logps/rejected": -249.59002685546875, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": -0.0266158115118742, "rewards/margins": 0.025844382122159004, "rewards/rejected": -0.0524601936340332, "step": 45 }, { "epoch": 0.024807873803424564, "grad_norm": 6.410195350646973, "learning_rate": 1.236559139784946e-07, "logits/chosen": 0.1117473766207695, "logits/rejected": 0.5558474063873291, "logps/chosen": -219.80604553222656, "logps/rejected": -298.1690673828125, "loss": 0.7112, "rewards/accuracies": 0.375, "rewards/chosen": -0.02552332915365696, "rewards/margins": -0.02865471877157688, "rewards/rejected": 0.0031313886865973473, "step": 46 }, { "epoch": 0.02534717540784684, "grad_norm": 6.09147834777832, "learning_rate": 1.2634408602150538e-07, "logits/chosen": 0.09431290626525879, "logits/rejected": -1.1196831464767456, "logps/chosen": -207.61712646484375, "logps/rejected": -150.15847778320312, "loss": 0.7035, "rewards/accuracies": 0.5, "rewards/chosen": 0.031181860715150833, "rewards/margins": -0.018988370895385742, "rewards/rejected": 0.050170235335826874, "step": 47 }, { "epoch": 0.02588647701226911, "grad_norm": 7.817967891693115, "learning_rate": 1.2903225806451611e-07, "logits/chosen": -0.9153267741203308, "logits/rejected": -0.34308600425720215, "logps/chosen": -354.1080017089844, "logps/rejected": -599.1622314453125, "loss": 0.7025, "rewards/accuracies": 0.375, "rewards/chosen": -0.016875457018613815, "rewards/margins": -0.013425637036561966, "rewards/rejected": -0.0034498218446969986, "step": 48 }, { "epoch": 0.026425778616691386, "grad_norm": 8.489771842956543, "learning_rate": 1.3172043010752688e-07, "logits/chosen": -0.32484811544418335, "logits/rejected": -1.047135591506958, "logps/chosen": -246.77365112304688, "logps/rejected": -181.40234375, "loss": 0.7541, "rewards/accuracies": 0.0, "rewards/chosen": -0.1079128310084343, "rewards/margins": -0.1177031546831131, "rewards/rejected": 0.009790323674678802, "step": 49 }, { "epoch": 0.026965080221113658, "grad_norm": 6.74281120300293, "learning_rate": 1.3440860215053762e-07, "logits/chosen": 0.20485439896583557, "logits/rejected": -0.22419406473636627, "logps/chosen": -196.95712280273438, "logps/rejected": -183.84579467773438, "loss": 0.7145, "rewards/accuracies": 0.25, "rewards/chosen": -0.04666566848754883, "rewards/margins": -0.04094552993774414, "rewards/rejected": -0.005720139481127262, "step": 50 }, { "epoch": 0.027504381825535933, "grad_norm": 6.660191535949707, "learning_rate": 1.3709677419354838e-07, "logits/chosen": 0.616850733757019, "logits/rejected": -0.308997243642807, "logps/chosen": -325.8602294921875, "logps/rejected": -307.119873046875, "loss": 0.695, "rewards/accuracies": 0.375, "rewards/chosen": -0.034990884363651276, "rewards/margins": 0.00037708133459091187, "rewards/rejected": -0.03536796197295189, "step": 51 }, { "epoch": 0.028043683429958204, "grad_norm": 5.875840187072754, "learning_rate": 1.3978494623655912e-07, "logits/chosen": 0.33974993228912354, "logits/rejected": 0.7493408918380737, "logps/chosen": -192.40802001953125, "logps/rejected": -208.03094482421875, "loss": 0.7179, "rewards/accuracies": 0.25, "rewards/chosen": -0.05251732096076012, "rewards/margins": -0.04642849043011665, "rewards/rejected": -0.006088827736675739, "step": 52 }, { "epoch": 0.028582985034380476, "grad_norm": 7.2490763664245605, "learning_rate": 1.424731182795699e-07, "logits/chosen": 0.08137709647417068, "logits/rejected": 0.5771705508232117, "logps/chosen": -253.27505493164062, "logps/rejected": -251.87545776367188, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": 0.02718525007367134, "rewards/margins": 0.02553110383450985, "rewards/rejected": 0.0016541481018066406, "step": 53 }, { "epoch": 0.02912228663880275, "grad_norm": 7.557590961456299, "learning_rate": 1.4516129032258064e-07, "logits/chosen": -0.3037206530570984, "logits/rejected": 0.061176326125860214, "logps/chosen": -231.7301025390625, "logps/rejected": -309.5451354980469, "loss": 0.7068, "rewards/accuracies": 0.375, "rewards/chosen": -0.0116729736328125, "rewards/margins": -0.025312330573797226, "rewards/rejected": 0.013639355078339577, "step": 54 }, { "epoch": 0.029661588243225023, "grad_norm": 8.07595157623291, "learning_rate": 1.478494623655914e-07, "logits/chosen": -0.36183544993400574, "logits/rejected": -0.7174750566482544, "logps/chosen": -177.78619384765625, "logps/rejected": -156.283447265625, "loss": 0.7244, "rewards/accuracies": 0.25, "rewards/chosen": -0.008991813287138939, "rewards/margins": -0.057465650141239166, "rewards/rejected": 0.04847383499145508, "step": 55 }, { "epoch": 0.030200889847647298, "grad_norm": 7.856146812438965, "learning_rate": 1.5053763440860215e-07, "logits/chosen": 0.01964491605758667, "logits/rejected": -0.9375946521759033, "logps/chosen": -262.23358154296875, "logps/rejected": -222.6016082763672, "loss": 0.7085, "rewards/accuracies": 0.375, "rewards/chosen": -0.037127114832401276, "rewards/margins": -0.023748017847537994, "rewards/rejected": -0.013379099778831005, "step": 56 }, { "epoch": 0.03074019145206957, "grad_norm": 5.8801445960998535, "learning_rate": 1.532258064516129e-07, "logits/chosen": 1.1437751054763794, "logits/rejected": 0.4657767713069916, "logps/chosen": -220.7467803955078, "logps/rejected": -251.58139038085938, "loss": 0.677, "rewards/accuracies": 0.75, "rewards/chosen": 0.046050261706113815, "rewards/margins": 0.03767061606049538, "rewards/rejected": 0.008379649370908737, "step": 57 }, { "epoch": 0.031279493056491844, "grad_norm": 6.616484642028809, "learning_rate": 1.5591397849462365e-07, "logits/chosen": -0.23883965611457825, "logits/rejected": 0.3297547698020935, "logps/chosen": -248.76290893554688, "logps/rejected": -204.7056427001953, "loss": 0.7185, "rewards/accuracies": 0.375, "rewards/chosen": -0.04432382807135582, "rewards/margins": -0.04567241668701172, "rewards/rejected": 0.001348591409623623, "step": 58 }, { "epoch": 0.03181879466091412, "grad_norm": 6.607509613037109, "learning_rate": 1.586021505376344e-07, "logits/chosen": 0.22468450665473938, "logits/rejected": 0.15196377038955688, "logps/chosen": -250.70733642578125, "logps/rejected": -243.27609252929688, "loss": 0.7169, "rewards/accuracies": 0.375, "rewards/chosen": -0.010921287350356579, "rewards/margins": -0.044898226857185364, "rewards/rejected": 0.03397693857550621, "step": 59 }, { "epoch": 0.03235809626533639, "grad_norm": 6.218328952789307, "learning_rate": 1.6129032258064515e-07, "logits/chosen": 0.03569874167442322, "logits/rejected": -0.15164443850517273, "logps/chosen": -181.34950256347656, "logps/rejected": -180.65032958984375, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -0.017594050616025925, "rewards/margins": 0.020270822569727898, "rewards/rejected": -0.03786487504839897, "step": 60 }, { "epoch": 0.03289739786975866, "grad_norm": 6.776982307434082, "learning_rate": 1.639784946236559e-07, "logits/chosen": 0.6675748229026794, "logits/rejected": 0.2737099826335907, "logps/chosen": -235.38961791992188, "logps/rejected": -262.43328857421875, "loss": 0.7197, "rewards/accuracies": 0.375, "rewards/chosen": -0.015114114619791508, "rewards/margins": -0.04507589340209961, "rewards/rejected": 0.029961779713630676, "step": 61 }, { "epoch": 0.03343669947418094, "grad_norm": 6.265476226806641, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.5479084849357605, "logits/rejected": 0.24066512286663055, "logps/chosen": -179.8935546875, "logps/rejected": -220.2181396484375, "loss": 0.7097, "rewards/accuracies": 0.25, "rewards/chosen": 0.0043235779739916325, "rewards/margins": -0.03188037872314453, "rewards/rejected": 0.036203958094120026, "step": 62 }, { "epoch": 0.033976001078603206, "grad_norm": 6.758687496185303, "learning_rate": 1.693548387096774e-07, "logits/chosen": 0.3813549280166626, "logits/rejected": 0.052206262946128845, "logps/chosen": -252.13259887695312, "logps/rejected": -217.37841796875, "loss": 0.7081, "rewards/accuracies": 0.5, "rewards/chosen": -0.03189907222986221, "rewards/margins": -0.0279159527271986, "rewards/rejected": -0.003983113914728165, "step": 63 }, { "epoch": 0.03451530268302548, "grad_norm": 6.5390944480896, "learning_rate": 1.7204301075268818e-07, "logits/chosen": 1.0999422073364258, "logits/rejected": 0.6911662817001343, "logps/chosen": -274.45123291015625, "logps/rejected": -308.85345458984375, "loss": 0.7272, "rewards/accuracies": 0.25, "rewards/chosen": -0.019794844090938568, "rewards/margins": -0.06538476794958115, "rewards/rejected": 0.04558992385864258, "step": 64 }, { "epoch": 0.035054604287447756, "grad_norm": 6.859392166137695, "learning_rate": 1.7473118279569892e-07, "logits/chosen": 0.2836797833442688, "logits/rejected": -0.25728103518486023, "logps/chosen": -193.159912109375, "logps/rejected": -166.3253173828125, "loss": 0.673, "rewards/accuracies": 0.625, "rewards/chosen": 0.0008709430694580078, "rewards/margins": 0.04356284439563751, "rewards/rejected": -0.042691901326179504, "step": 65 }, { "epoch": 0.03559390589187003, "grad_norm": 6.81648588180542, "learning_rate": 1.7741935483870968e-07, "logits/chosen": 0.340016633272171, "logits/rejected": -0.3237540125846863, "logps/chosen": -228.1221466064453, "logps/rejected": -189.7011260986328, "loss": 0.7096, "rewards/accuracies": 0.375, "rewards/chosen": -0.01624593697488308, "rewards/margins": -0.030686475336551666, "rewards/rejected": 0.014440536499023438, "step": 66 }, { "epoch": 0.0361332074962923, "grad_norm": 6.571608066558838, "learning_rate": 1.8010752688172042e-07, "logits/chosen": -0.5837182402610779, "logits/rejected": -1.1038627624511719, "logps/chosen": -198.23443603515625, "logps/rejected": -196.94436645507812, "loss": 0.6974, "rewards/accuracies": 0.5, "rewards/chosen": -0.01607208326458931, "rewards/margins": -0.0069235824048519135, "rewards/rejected": -0.009148502722382545, "step": 67 }, { "epoch": 0.036672509100714575, "grad_norm": 7.06835412979126, "learning_rate": 1.8279569892473118e-07, "logits/chosen": -0.02401360124349594, "logits/rejected": -0.051684994250535965, "logps/chosen": -207.46688842773438, "logps/rejected": -227.0677490234375, "loss": 0.7278, "rewards/accuracies": 0.25, "rewards/chosen": -0.016698170453310013, "rewards/margins": -0.06531276553869247, "rewards/rejected": 0.04861459881067276, "step": 68 }, { "epoch": 0.03721181070513685, "grad_norm": 7.030075550079346, "learning_rate": 1.8548387096774192e-07, "logits/chosen": 0.8457868099212646, "logits/rejected": 0.07027164101600647, "logps/chosen": -199.57554626464844, "logps/rejected": -218.30075073242188, "loss": 0.7097, "rewards/accuracies": 0.5, "rewards/chosen": -0.05245237052440643, "rewards/margins": -0.028759188950061798, "rewards/rejected": -0.023693181574344635, "step": 69 }, { "epoch": 0.03775111230955912, "grad_norm": 6.275753974914551, "learning_rate": 1.8817204301075268e-07, "logits/chosen": 0.3617232143878937, "logits/rejected": 0.12509778141975403, "logps/chosen": -237.53164672851562, "logps/rejected": -252.2608642578125, "loss": 0.6804, "rewards/accuracies": 0.75, "rewards/chosen": -0.011961555108428001, "rewards/margins": 0.02681140974164009, "rewards/rejected": -0.03877296298742294, "step": 70 }, { "epoch": 0.03829041391398139, "grad_norm": 7.245884895324707, "learning_rate": 1.9086021505376342e-07, "logits/chosen": 0.9231241941452026, "logits/rejected": 0.07187989354133606, "logps/chosen": -228.5129852294922, "logps/rejected": -183.2341766357422, "loss": 0.7302, "rewards/accuracies": 0.375, "rewards/chosen": -0.061576176434755325, "rewards/margins": -0.07026262581348419, "rewards/rejected": 0.008686447516083717, "step": 71 }, { "epoch": 0.03882971551840367, "grad_norm": 7.275542736053467, "learning_rate": 1.9354838709677418e-07, "logits/chosen": 0.41939058899879456, "logits/rejected": -1.180080771446228, "logps/chosen": -216.55479431152344, "logps/rejected": -172.12850952148438, "loss": 0.6681, "rewards/accuracies": 0.625, "rewards/chosen": 0.011012649163603783, "rewards/margins": 0.052765846252441406, "rewards/rejected": -0.041753195226192474, "step": 72 }, { "epoch": 0.03936901712282594, "grad_norm": 6.812000274658203, "learning_rate": 1.9623655913978492e-07, "logits/chosen": 1.4965109825134277, "logits/rejected": -1.0670374631881714, "logps/chosen": -235.3369140625, "logps/rejected": -149.12481689453125, "loss": 0.6942, "rewards/accuracies": 0.375, "rewards/chosen": -0.00852670893073082, "rewards/margins": -0.0008061425760388374, "rewards/rejected": -0.00772056495770812, "step": 73 }, { "epoch": 0.03990831872724821, "grad_norm": 5.960543632507324, "learning_rate": 1.9892473118279569e-07, "logits/chosen": 0.05810299515724182, "logits/rejected": 1.0625340938568115, "logps/chosen": -147.47366333007812, "logps/rejected": -203.74288940429688, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.04593963921070099, "rewards/margins": 0.03228893131017685, "rewards/rejected": 0.013650703243911266, "step": 74 }, { "epoch": 0.040447620331670486, "grad_norm": 7.112579822540283, "learning_rate": 2.0161290322580642e-07, "logits/chosen": 0.713447093963623, "logits/rejected": 0.8585648536682129, "logps/chosen": -243.99659729003906, "logps/rejected": -277.8067932128906, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": -0.04820976033806801, "rewards/margins": 0.004776567220687866, "rewards/rejected": -0.05298633873462677, "step": 75 }, { "epoch": 0.04098692193609276, "grad_norm": 6.378844261169434, "learning_rate": 2.0430107526881721e-07, "logits/chosen": 0.3981945216655731, "logits/rejected": -0.5262216329574585, "logps/chosen": -268.714599609375, "logps/rejected": -260.88726806640625, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": -0.02797842212021351, "rewards/margins": -0.006366348825395107, "rewards/rejected": -0.021612072363495827, "step": 76 }, { "epoch": 0.04152622354051503, "grad_norm": 6.93662691116333, "learning_rate": 2.0698924731182795e-07, "logits/chosen": 0.5951217412948608, "logits/rejected": -0.8857496976852417, "logps/chosen": -222.0436248779297, "logps/rejected": -180.20875549316406, "loss": 0.6567, "rewards/accuracies": 0.75, "rewards/chosen": 0.04595623165369034, "rewards/margins": 0.07680139690637589, "rewards/rejected": -0.030845165252685547, "step": 77 }, { "epoch": 0.042065525144937305, "grad_norm": 6.994305610656738, "learning_rate": 2.0967741935483871e-07, "logits/chosen": 0.39744284749031067, "logits/rejected": -0.19846820831298828, "logps/chosen": -200.98851013183594, "logps/rejected": -184.7799835205078, "loss": 0.6547, "rewards/accuracies": 0.875, "rewards/chosen": 0.06869182735681534, "rewards/margins": 0.07977552711963654, "rewards/rejected": -0.011083699762821198, "step": 78 }, { "epoch": 0.04260482674935958, "grad_norm": 6.051930904388428, "learning_rate": 2.1236559139784945e-07, "logits/chosen": 0.15097030997276306, "logits/rejected": -0.04082271456718445, "logps/chosen": -228.77828979492188, "logps/rejected": -274.97723388671875, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.033980656415224075, "rewards/margins": 0.032660868018865585, "rewards/rejected": 0.00131978839635849, "step": 79 }, { "epoch": 0.043144128353781855, "grad_norm": 6.941885948181152, "learning_rate": 2.1505376344086022e-07, "logits/chosen": 0.2920362651348114, "logits/rejected": -1.0275171995162964, "logps/chosen": -359.3815612792969, "logps/rejected": -331.26251220703125, "loss": 0.676, "rewards/accuracies": 0.625, "rewards/chosen": 0.035140231251716614, "rewards/margins": 0.03662919998168945, "rewards/rejected": -0.0014889724552631378, "step": 80 }, { "epoch": 0.04368342995820412, "grad_norm": 7.439138889312744, "learning_rate": 2.1774193548387095e-07, "logits/chosen": -0.03226907551288605, "logits/rejected": -0.7014745473861694, "logps/chosen": -252.13601684570312, "logps/rejected": -224.40049743652344, "loss": 0.704, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011888500303030014, "rewards/margins": -0.01923222653567791, "rewards/rejected": 0.01804337650537491, "step": 81 }, { "epoch": 0.0442227315626264, "grad_norm": 7.1660685539245605, "learning_rate": 2.2043010752688172e-07, "logits/chosen": 0.7280354499816895, "logits/rejected": 0.41155508160591125, "logps/chosen": -259.4278564453125, "logps/rejected": -294.70123291015625, "loss": 0.7249, "rewards/accuracies": 0.5, "rewards/chosen": -0.006667994428426027, "rewards/margins": -0.058919813483953476, "rewards/rejected": 0.05225181579589844, "step": 82 }, { "epoch": 0.04476203316704867, "grad_norm": 6.503385543823242, "learning_rate": 2.2311827956989246e-07, "logits/chosen": -0.4545082449913025, "logits/rejected": -0.3899765908718109, "logps/chosen": -192.58226013183594, "logps/rejected": -193.42739868164062, "loss": 0.7157, "rewards/accuracies": 0.375, "rewards/chosen": -0.006682205945253372, "rewards/margins": -0.04236126318573952, "rewards/rejected": 0.035679057240486145, "step": 83 }, { "epoch": 0.04530133477147095, "grad_norm": 7.5285515785217285, "learning_rate": 2.2580645161290322e-07, "logits/chosen": 0.16105994582176208, "logits/rejected": -0.22474607825279236, "logps/chosen": -316.07183837890625, "logps/rejected": -278.63165283203125, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": -0.03343396261334419, "rewards/margins": 0.019646599888801575, "rewards/rejected": -0.05308055877685547, "step": 84 }, { "epoch": 0.045840636375893216, "grad_norm": 7.229227066040039, "learning_rate": 2.2849462365591396e-07, "logits/chosen": 1.2716115713119507, "logits/rejected": 0.1929684281349182, "logps/chosen": -254.26141357421875, "logps/rejected": -202.3281707763672, "loss": 0.6534, "rewards/accuracies": 0.75, "rewards/chosen": 0.04087209701538086, "rewards/margins": 0.08400765061378479, "rewards/rejected": -0.043135546147823334, "step": 85 }, { "epoch": 0.04637993798031549, "grad_norm": 6.786709308624268, "learning_rate": 2.3118279569892472e-07, "logits/chosen": 0.29079633951187134, "logits/rejected": -0.8417977094650269, "logps/chosen": -334.629150390625, "logps/rejected": -285.68536376953125, "loss": 0.7054, "rewards/accuracies": 0.5, "rewards/chosen": -0.016495514661073685, "rewards/margins": -0.020731162279844284, "rewards/rejected": 0.004235649481415749, "step": 86 }, { "epoch": 0.04691923958473777, "grad_norm": 6.214145660400391, "learning_rate": 2.3387096774193546e-07, "logits/chosen": 0.8285027742385864, "logits/rejected": 0.2622811198234558, "logps/chosen": -307.2441101074219, "logps/rejected": -223.85598754882812, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.031632233411073685, "rewards/margins": -0.003519054502248764, "rewards/rejected": -0.028113175183534622, "step": 87 }, { "epoch": 0.047458541189160035, "grad_norm": 5.84558629989624, "learning_rate": 2.3655913978494625e-07, "logits/chosen": 0.2914958596229553, "logits/rejected": 0.632845938205719, "logps/chosen": -207.36947631835938, "logps/rejected": -238.92324829101562, "loss": 0.6453, "rewards/accuracies": 0.875, "rewards/chosen": 0.02484140358865261, "rewards/margins": 0.11552038788795471, "rewards/rejected": -0.09067897498607635, "step": 88 }, { "epoch": 0.04799784279358231, "grad_norm": 6.764530181884766, "learning_rate": 2.39247311827957e-07, "logits/chosen": 1.1057637929916382, "logits/rejected": 0.04926629364490509, "logps/chosen": -340.84881591796875, "logps/rejected": -237.26190185546875, "loss": 0.6956, "rewards/accuracies": 0.625, "rewards/chosen": -0.013971135020256042, "rewards/margins": -0.0018226616084575653, "rewards/rejected": -0.012148475274443626, "step": 89 }, { "epoch": 0.048537144398004585, "grad_norm": 9.524235725402832, "learning_rate": 2.4193548387096775e-07, "logits/chosen": -1.4545323848724365, "logits/rejected": -1.2386066913604736, "logps/chosen": -219.86215209960938, "logps/rejected": -274.50927734375, "loss": 0.7021, "rewards/accuracies": 0.375, "rewards/chosen": -0.05379285663366318, "rewards/margins": -0.015750790014863014, "rewards/rejected": -0.03804206848144531, "step": 90 }, { "epoch": 0.04907644600242686, "grad_norm": 7.042910575866699, "learning_rate": 2.446236559139785e-07, "logits/chosen": 0.28195562958717346, "logits/rejected": -0.936808705329895, "logps/chosen": -286.7412109375, "logps/rejected": -227.28851318359375, "loss": 0.7218, "rewards/accuracies": 0.5, "rewards/chosen": -0.03816976398229599, "rewards/margins": -0.05110578611493111, "rewards/rejected": 0.012936020269989967, "step": 91 }, { "epoch": 0.04961574760684913, "grad_norm": 6.891688823699951, "learning_rate": 2.473118279569892e-07, "logits/chosen": 0.11737596988677979, "logits/rejected": 0.22559818625450134, "logps/chosen": -263.0616149902344, "logps/rejected": -319.7288513183594, "loss": 0.6761, "rewards/accuracies": 0.5, "rewards/chosen": 0.03290767967700958, "rewards/margins": 0.03960227966308594, "rewards/rejected": -0.006694602780044079, "step": 92 }, { "epoch": 0.0501550492112714, "grad_norm": 8.492754936218262, "learning_rate": 2.5e-07, "logits/chosen": -0.12978282570838928, "logits/rejected": -0.5343829393386841, "logps/chosen": -295.5472412109375, "logps/rejected": -260.46307373046875, "loss": 0.7041, "rewards/accuracies": 0.375, "rewards/chosen": -0.012467382475733757, "rewards/margins": -0.019531821832060814, "rewards/rejected": 0.007064437493681908, "step": 93 }, { "epoch": 0.05069435081569368, "grad_norm": 6.969022274017334, "learning_rate": 2.5268817204301075e-07, "logits/chosen": 0.7934816479682922, "logits/rejected": -0.2739827334880829, "logps/chosen": -325.30224609375, "logps/rejected": -342.4911193847656, "loss": 0.6621, "rewards/accuracies": 0.625, "rewards/chosen": 0.03208637237548828, "rewards/margins": 0.06666012108325958, "rewards/rejected": -0.0345737487077713, "step": 94 }, { "epoch": 0.05123365242011595, "grad_norm": 6.731186389923096, "learning_rate": 2.5537634408602146e-07, "logits/chosen": 1.1664433479309082, "logits/rejected": 1.0414949655532837, "logps/chosen": -266.1744689941406, "logps/rejected": -255.62445068359375, "loss": 0.7255, "rewards/accuracies": 0.125, "rewards/chosen": -0.014419556595385075, "rewards/margins": -0.06231231987476349, "rewards/rejected": 0.04789276048541069, "step": 95 }, { "epoch": 0.05177295402453822, "grad_norm": 6.407916069030762, "learning_rate": 2.5806451612903223e-07, "logits/chosen": -0.23704829812049866, "logits/rejected": 0.3088780641555786, "logps/chosen": -211.31671142578125, "logps/rejected": -191.4375762939453, "loss": 0.7011, "rewards/accuracies": 0.375, "rewards/chosen": 0.0140228271484375, "rewards/margins": -0.01306676771491766, "rewards/rejected": 0.027089595794677734, "step": 96 }, { "epoch": 0.0523122556289605, "grad_norm": 8.21934986114502, "learning_rate": 2.60752688172043e-07, "logits/chosen": 1.304435133934021, "logits/rejected": -1.2772505283355713, "logps/chosen": -309.25750732421875, "logps/rejected": -187.74191284179688, "loss": 0.6702, "rewards/accuracies": 0.5, "rewards/chosen": 0.039145328104496, "rewards/margins": 0.04989008978009224, "rewards/rejected": -0.010744764469563961, "step": 97 }, { "epoch": 0.05285155723338277, "grad_norm": 6.676281929016113, "learning_rate": 2.6344086021505376e-07, "logits/chosen": 0.0056991130113601685, "logits/rejected": -0.8222281336784363, "logps/chosen": -233.87188720703125, "logps/rejected": -164.72801208496094, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": 0.03338785097002983, "rewards/margins": 0.007143593393266201, "rewards/rejected": 0.026244260370731354, "step": 98 }, { "epoch": 0.05339085883780504, "grad_norm": 6.824514389038086, "learning_rate": 2.6612903225806447e-07, "logits/chosen": 0.7878564596176147, "logits/rejected": 1.0159544944763184, "logps/chosen": -288.6258850097656, "logps/rejected": -341.9578552246094, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": 0.0322759672999382, "rewards/margins": 0.0412508025765419, "rewards/rejected": -0.008974839001893997, "step": 99 }, { "epoch": 0.053930160442227315, "grad_norm": 7.486489772796631, "learning_rate": 2.6881720430107523e-07, "logits/chosen": 0.377770334482193, "logits/rejected": -0.5051119923591614, "logps/chosen": -235.06405639648438, "logps/rejected": -193.7758026123047, "loss": 0.7256, "rewards/accuracies": 0.375, "rewards/chosen": -0.044617846608161926, "rewards/margins": -0.06121082231402397, "rewards/rejected": 0.016592979431152344, "step": 100 }, { "epoch": 0.05446946204664959, "grad_norm": 6.521635055541992, "learning_rate": 2.71505376344086e-07, "logits/chosen": 1.4169116020202637, "logits/rejected": 0.39273306727409363, "logps/chosen": -349.4755554199219, "logps/rejected": -311.147705078125, "loss": 0.668, "rewards/accuracies": 0.625, "rewards/chosen": -0.0069047920405864716, "rewards/margins": 0.06009388715028763, "rewards/rejected": -0.0669986754655838, "step": 101 }, { "epoch": 0.055008763651071865, "grad_norm": 8.732634544372559, "learning_rate": 2.7419354838709676e-07, "logits/chosen": 0.05418995022773743, "logits/rejected": -0.7306313514709473, "logps/chosen": -231.05140686035156, "logps/rejected": -175.26766967773438, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": 0.06324291229248047, "rewards/margins": 0.022243406623601913, "rewards/rejected": 0.040999509394168854, "step": 102 }, { "epoch": 0.05554806525549413, "grad_norm": 8.241607666015625, "learning_rate": 2.7688172043010747e-07, "logits/chosen": 1.1655319929122925, "logits/rejected": -0.02740282565355301, "logps/chosen": -235.9327850341797, "logps/rejected": -192.56002807617188, "loss": 0.7168, "rewards/accuracies": 0.125, "rewards/chosen": -0.0008462914265692234, "rewards/margins": -0.045664120465517044, "rewards/rejected": 0.04481782764196396, "step": 103 }, { "epoch": 0.05608736685991641, "grad_norm": 6.2211151123046875, "learning_rate": 2.7956989247311823e-07, "logits/chosen": 0.670514702796936, "logits/rejected": -0.199213445186615, "logps/chosen": -189.42759704589844, "logps/rejected": -123.34770965576172, "loss": 0.6973, "rewards/accuracies": 0.5, "rewards/chosen": 0.00715055363252759, "rewards/margins": -0.0072193630039691925, "rewards/rejected": 0.014369918033480644, "step": 104 }, { "epoch": 0.056626668464338684, "grad_norm": 9.328225135803223, "learning_rate": 2.8225806451612905e-07, "logits/chosen": -0.23505732417106628, "logits/rejected": -1.0476220846176147, "logps/chosen": -186.72921752929688, "logps/rejected": -148.9830322265625, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": 0.04107093811035156, "rewards/margins": 0.005870914086699486, "rewards/rejected": 0.03520002216100693, "step": 105 }, { "epoch": 0.05716597006876095, "grad_norm": 6.453006744384766, "learning_rate": 2.849462365591398e-07, "logits/chosen": 0.04456393048167229, "logits/rejected": -0.8567557334899902, "logps/chosen": -348.1103820800781, "logps/rejected": -313.3924560546875, "loss": 0.6386, "rewards/accuracies": 0.75, "rewards/chosen": 0.022690391167998314, "rewards/margins": 0.11818485707044601, "rewards/rejected": -0.09549446403980255, "step": 106 }, { "epoch": 0.05770527167318323, "grad_norm": 6.792304515838623, "learning_rate": 2.876344086021505e-07, "logits/chosen": 0.6860268115997314, "logits/rejected": -1.010695457458496, "logps/chosen": -251.83677673339844, "logps/rejected": -196.83531188964844, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": 0.005495740100741386, "rewards/margins": 0.008941937237977982, "rewards/rejected": -0.003446197137236595, "step": 107 }, { "epoch": 0.0582445732776055, "grad_norm": 6.789505958557129, "learning_rate": 2.903225806451613e-07, "logits/chosen": -0.037228986620903015, "logits/rejected": -0.6453852653503418, "logps/chosen": -236.27919006347656, "logps/rejected": -290.3861083984375, "loss": 0.7291, "rewards/accuracies": 0.5, "rewards/chosen": -0.03332825005054474, "rewards/margins": -0.06437931209802628, "rewards/rejected": 0.031051063910126686, "step": 108 }, { "epoch": 0.05878387488202778, "grad_norm": 6.414456844329834, "learning_rate": 2.9301075268817205e-07, "logits/chosen": 1.3427150249481201, "logits/rejected": 0.8081392049789429, "logps/chosen": -287.13232421875, "logps/rejected": -317.9248962402344, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": 0.013022040948271751, "rewards/margins": 0.030969617888331413, "rewards/rejected": -0.017947576940059662, "step": 109 }, { "epoch": 0.059323176486450045, "grad_norm": 5.754662990570068, "learning_rate": 2.956989247311828e-07, "logits/chosen": 0.41133150458335876, "logits/rejected": 0.26850926876068115, "logps/chosen": -234.97900390625, "logps/rejected": -241.73138427734375, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.05818986892700195, "rewards/margins": 0.0179627425968647, "rewards/rejected": 0.04022712633013725, "step": 110 }, { "epoch": 0.05986247809087232, "grad_norm": 6.649877071380615, "learning_rate": 2.9838709677419353e-07, "logits/chosen": -0.21105515956878662, "logits/rejected": -0.24641764163970947, "logps/chosen": -226.55075073242188, "logps/rejected": -199.36167907714844, "loss": 0.7009, "rewards/accuracies": 0.375, "rewards/chosen": -0.037195682525634766, "rewards/margins": -0.010514930821955204, "rewards/rejected": -0.026680754497647285, "step": 111 }, { "epoch": 0.060401779695294595, "grad_norm": 7.069550514221191, "learning_rate": 3.010752688172043e-07, "logits/chosen": -0.6333608627319336, "logits/rejected": -0.16163334250450134, "logps/chosen": -206.1290283203125, "logps/rejected": -254.71484375, "loss": 0.7047, "rewards/accuracies": 0.5, "rewards/chosen": -0.025586986914277077, "rewards/margins": -0.02041616663336754, "rewards/rejected": -0.005170824006199837, "step": 112 }, { "epoch": 0.060941081299716864, "grad_norm": 6.409275531768799, "learning_rate": 3.0376344086021506e-07, "logits/chosen": 0.5884038805961609, "logits/rejected": -0.7619173526763916, "logps/chosen": -296.135498046875, "logps/rejected": -279.05426025390625, "loss": 0.6967, "rewards/accuracies": 0.5, "rewards/chosen": -5.5598560720682144e-05, "rewards/margins": -0.005325605161488056, "rewards/rejected": 0.005270007066428661, "step": 113 }, { "epoch": 0.06148038290413914, "grad_norm": 7.077654838562012, "learning_rate": 3.064516129032258e-07, "logits/chosen": 0.8743955492973328, "logits/rejected": 0.359328031539917, "logps/chosen": -329.2562561035156, "logps/rejected": -270.51824951171875, "loss": 0.6823, "rewards/accuracies": 0.5, "rewards/chosen": -0.0049301134422421455, "rewards/margins": 0.026774410158395767, "rewards/rejected": -0.03170452266931534, "step": 114 }, { "epoch": 0.062019684508561414, "grad_norm": 6.591875076293945, "learning_rate": 3.0913978494623653e-07, "logits/chosen": 0.7048566341400146, "logits/rejected": -0.15581953525543213, "logps/chosen": -270.206298828125, "logps/rejected": -285.62701416015625, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": 0.0212465301156044, "rewards/margins": 0.02586841583251953, "rewards/rejected": -0.0046218885108828545, "step": 115 }, { "epoch": 0.06255898611298369, "grad_norm": 7.538413047790527, "learning_rate": 3.118279569892473e-07, "logits/chosen": -1.1158511638641357, "logits/rejected": -1.054720401763916, "logps/chosen": -161.6815643310547, "logps/rejected": -147.43911743164062, "loss": 0.6995, "rewards/accuracies": 0.25, "rewards/chosen": -0.019862651824951172, "rewards/margins": -0.011600733734667301, "rewards/rejected": -0.00826191995292902, "step": 116 }, { "epoch": 0.06309828771740596, "grad_norm": 6.2653374671936035, "learning_rate": 3.1451612903225806e-07, "logits/chosen": 0.39432305097579956, "logits/rejected": -0.345259428024292, "logps/chosen": -177.6084747314453, "logps/rejected": -152.01564025878906, "loss": 0.7134, "rewards/accuracies": 0.25, "rewards/chosen": -0.04515037685632706, "rewards/margins": -0.038474276661872864, "rewards/rejected": -0.006676102057099342, "step": 117 }, { "epoch": 0.06363758932182824, "grad_norm": 6.109826564788818, "learning_rate": 3.172043010752688e-07, "logits/chosen": 0.4674108028411865, "logits/rejected": 0.6075350642204285, "logps/chosen": -212.23764038085938, "logps/rejected": -228.42544555664062, "loss": 0.6823, "rewards/accuracies": 0.375, "rewards/chosen": 0.010671425610780716, "rewards/margins": 0.023857498541474342, "rewards/rejected": -0.013186073862016201, "step": 118 }, { "epoch": 0.06417689092625051, "grad_norm": 7.684591770172119, "learning_rate": 3.1989247311827953e-07, "logits/chosen": 0.6843189001083374, "logits/rejected": -1.2212755680084229, "logps/chosen": -243.41221618652344, "logps/rejected": -184.9658660888672, "loss": 0.6609, "rewards/accuracies": 0.875, "rewards/chosen": 0.05152969807386398, "rewards/margins": 0.0665821060538292, "rewards/rejected": -0.015052413567900658, "step": 119 }, { "epoch": 0.06471619253067278, "grad_norm": 6.508977890014648, "learning_rate": 3.225806451612903e-07, "logits/chosen": -0.5903627872467041, "logits/rejected": -0.987180769443512, "logps/chosen": -235.05068969726562, "logps/rejected": -215.9358673095703, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.003882691264152527, "rewards/margins": 0.0048673637211322784, "rewards/rejected": -0.0009846691973507404, "step": 120 }, { "epoch": 0.06525549413509506, "grad_norm": 6.314504623413086, "learning_rate": 3.2526881720430106e-07, "logits/chosen": 0.8837164044380188, "logits/rejected": 0.5317541360855103, "logps/chosen": -235.8477020263672, "logps/rejected": -205.37924194335938, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": 0.024384401738643646, "rewards/margins": 0.017165949568152428, "rewards/rejected": 0.007218454964458942, "step": 121 }, { "epoch": 0.06579479573951733, "grad_norm": 6.692798137664795, "learning_rate": 3.279569892473118e-07, "logits/chosen": 0.11567951738834381, "logits/rejected": -0.5181595087051392, "logps/chosen": -231.08778381347656, "logps/rejected": -164.87640380859375, "loss": 0.6954, "rewards/accuracies": 0.375, "rewards/chosen": 0.014585687778890133, "rewards/margins": -0.001723096240311861, "rewards/rejected": 0.01630878634750843, "step": 122 }, { "epoch": 0.0663340973439396, "grad_norm": 6.869106292724609, "learning_rate": 3.3064516129032254e-07, "logits/chosen": 0.08426624536514282, "logits/rejected": 0.22467142343521118, "logps/chosen": -247.82705688476562, "logps/rejected": -239.68765258789062, "loss": 0.6968, "rewards/accuracies": 0.625, "rewards/chosen": 0.0067397113889455795, "rewards/margins": -0.004282906651496887, "rewards/rejected": 0.011022615246474743, "step": 123 }, { "epoch": 0.06687339894836188, "grad_norm": 6.134688377380371, "learning_rate": 3.333333333333333e-07, "logits/chosen": 0.30210328102111816, "logits/rejected": 0.3367750942707062, "logps/chosen": -198.6446533203125, "logps/rejected": -242.53306579589844, "loss": 0.685, "rewards/accuracies": 0.75, "rewards/chosen": -0.010391048155725002, "rewards/margins": 0.018226146697998047, "rewards/rejected": -0.028617193922400475, "step": 124 }, { "epoch": 0.06741270055278414, "grad_norm": 6.907747745513916, "learning_rate": 3.3602150537634406e-07, "logits/chosen": -0.17631223797798157, "logits/rejected": -1.5633583068847656, "logps/chosen": -403.77288818359375, "logps/rejected": -223.87344360351562, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": -0.02837219275534153, "rewards/margins": 0.010586069896817207, "rewards/rejected": -0.03895825892686844, "step": 125 }, { "epoch": 0.06795200215720641, "grad_norm": 6.383999824523926, "learning_rate": 3.387096774193548e-07, "logits/chosen": 1.227532148361206, "logits/rejected": -0.04487735033035278, "logps/chosen": -270.8870849609375, "logps/rejected": -208.62008666992188, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": 0.0014595985412597656, "rewards/margins": -0.0015542032197117805, "rewards/rejected": 0.003013802692294121, "step": 126 }, { "epoch": 0.0684913037616287, "grad_norm": 6.353526592254639, "learning_rate": 3.4139784946236554e-07, "logits/chosen": 0.41332340240478516, "logits/rejected": 0.39624616503715515, "logps/chosen": -173.9806671142578, "logps/rejected": -169.4077606201172, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 0.0024746907874941826, "rewards/margins": 0.004561522975564003, "rewards/rejected": -0.002086831256747246, "step": 127 }, { "epoch": 0.06903060536605096, "grad_norm": 7.105887413024902, "learning_rate": 3.4408602150537636e-07, "logits/chosen": -0.5127610564231873, "logits/rejected": -0.44838762283325195, "logps/chosen": -162.35450744628906, "logps/rejected": -180.99191284179688, "loss": 0.7268, "rewards/accuracies": 0.125, "rewards/chosen": -0.03522186353802681, "rewards/margins": -0.06390209496021271, "rewards/rejected": 0.028680231422185898, "step": 128 }, { "epoch": 0.06956990697047323, "grad_norm": 7.621587753295898, "learning_rate": 3.467741935483871e-07, "logits/chosen": 0.8591537475585938, "logits/rejected": -0.16453510522842407, "logps/chosen": -223.06219482421875, "logps/rejected": -214.5289306640625, "loss": 0.7148, "rewards/accuracies": 0.375, "rewards/chosen": -0.05422839894890785, "rewards/margins": -0.03879585862159729, "rewards/rejected": -0.015432552434504032, "step": 129 }, { "epoch": 0.07010920857489551, "grad_norm": 6.224758148193359, "learning_rate": 3.4946236559139783e-07, "logits/chosen": 0.7909731864929199, "logits/rejected": 0.11295413970947266, "logps/chosen": -215.50811767578125, "logps/rejected": -203.31240844726562, "loss": 0.7005, "rewards/accuracies": 0.375, "rewards/chosen": 0.003505515865981579, "rewards/margins": -0.012995148077607155, "rewards/rejected": 0.016500666737556458, "step": 130 }, { "epoch": 0.07064851017931778, "grad_norm": 7.668251037597656, "learning_rate": 3.521505376344086e-07, "logits/chosen": -0.2360706329345703, "logits/rejected": -1.7903826236724854, "logps/chosen": -287.41143798828125, "logps/rejected": -186.87477111816406, "loss": 0.7076, "rewards/accuracies": 0.25, "rewards/chosen": -0.020603179931640625, "rewards/margins": -0.023633189499378204, "rewards/rejected": 0.0030300114303827286, "step": 131 }, { "epoch": 0.07118781178374006, "grad_norm": 7.104033470153809, "learning_rate": 3.5483870967741936e-07, "logits/chosen": 0.020370274782180786, "logits/rejected": 0.5299338698387146, "logps/chosen": -226.24685668945312, "logps/rejected": -253.16207885742188, "loss": 0.6604, "rewards/accuracies": 0.75, "rewards/chosen": 0.03772468864917755, "rewards/margins": 0.0723239928483963, "rewards/rejected": -0.03459930419921875, "step": 132 }, { "epoch": 0.07172711338816233, "grad_norm": 8.742920875549316, "learning_rate": 3.575268817204301e-07, "logits/chosen": -0.3816109299659729, "logits/rejected": -0.23530805110931396, "logps/chosen": -188.6468963623047, "logps/rejected": -259.71929931640625, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": 0.010636236518621445, "rewards/margins": 0.028217032551765442, "rewards/rejected": -0.017580796033143997, "step": 133 }, { "epoch": 0.0722664149925846, "grad_norm": 6.721354961395264, "learning_rate": 3.6021505376344083e-07, "logits/chosen": 0.5197598934173584, "logits/rejected": -0.4381166994571686, "logps/chosen": -300.8050231933594, "logps/rejected": -288.0959777832031, "loss": 0.7065, "rewards/accuracies": 0.375, "rewards/chosen": -0.010857867076992989, "rewards/margins": -0.021790120750665665, "rewards/rejected": 0.010932255536317825, "step": 134 }, { "epoch": 0.07280571659700688, "grad_norm": 7.106774806976318, "learning_rate": 3.629032258064516e-07, "logits/chosen": 0.9276412725448608, "logits/rejected": -0.0704178437590599, "logps/chosen": -279.3321838378906, "logps/rejected": -210.67568969726562, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": -0.01120452955365181, "rewards/margins": 0.028209494426846504, "rewards/rejected": -0.039414022117853165, "step": 135 }, { "epoch": 0.07334501820142915, "grad_norm": 7.032986640930176, "learning_rate": 3.6559139784946236e-07, "logits/chosen": 0.4905902147293091, "logits/rejected": -0.7251518368721008, "logps/chosen": -202.99661254882812, "logps/rejected": -205.9475860595703, "loss": 0.7255, "rewards/accuracies": 0.0, "rewards/chosen": -0.0351109504699707, "rewards/margins": -0.06287498027086258, "rewards/rejected": 0.027764035388827324, "step": 136 }, { "epoch": 0.07388431980585142, "grad_norm": 7.219090938568115, "learning_rate": 3.682795698924731e-07, "logits/chosen": 0.35861077904701233, "logits/rejected": -0.731515645980835, "logps/chosen": -227.3217010498047, "logps/rejected": -176.83651733398438, "loss": 0.7503, "rewards/accuracies": 0.25, "rewards/chosen": -0.047393035143613815, "rewards/margins": -0.10621261596679688, "rewards/rejected": 0.05881958454847336, "step": 137 }, { "epoch": 0.0744236214102737, "grad_norm": 6.475521564483643, "learning_rate": 3.7096774193548384e-07, "logits/chosen": 0.1530953347682953, "logits/rejected": 0.3627970814704895, "logps/chosen": -241.57369995117188, "logps/rejected": -250.31597900390625, "loss": 0.681, "rewards/accuracies": 0.75, "rewards/chosen": 0.025486279278993607, "rewards/margins": 0.025332259014248848, "rewards/rejected": 0.00015401793643832207, "step": 138 }, { "epoch": 0.07496292301469597, "grad_norm": 7.006613254547119, "learning_rate": 3.736559139784946e-07, "logits/chosen": 0.32085859775543213, "logits/rejected": -1.5040405988693237, "logps/chosen": -279.540771484375, "logps/rejected": -230.58998107910156, "loss": 0.7224, "rewards/accuracies": 0.125, "rewards/chosen": -0.05338096618652344, "rewards/margins": -0.05634450912475586, "rewards/rejected": 0.002963542938232422, "step": 139 }, { "epoch": 0.07550222461911824, "grad_norm": 7.12169075012207, "learning_rate": 3.7634408602150537e-07, "logits/chosen": 0.6614222526550293, "logits/rejected": -0.844718873500824, "logps/chosen": -314.2617492675781, "logps/rejected": -219.2525634765625, "loss": 0.7095, "rewards/accuracies": 0.5, "rewards/chosen": -0.007498360704630613, "rewards/margins": -0.030226515606045723, "rewards/rejected": 0.02272815629839897, "step": 140 }, { "epoch": 0.07604152622354052, "grad_norm": 6.555506229400635, "learning_rate": 3.7903225806451613e-07, "logits/chosen": -0.04018716514110565, "logits/rejected": -0.7562347054481506, "logps/chosen": -300.09320068359375, "logps/rejected": -289.5304260253906, "loss": 0.6716, "rewards/accuracies": 0.5, "rewards/chosen": 0.014808939769864082, "rewards/margins": 0.045479677617549896, "rewards/rejected": -0.030670735985040665, "step": 141 }, { "epoch": 0.07658082782796279, "grad_norm": 6.105082035064697, "learning_rate": 3.8172043010752684e-07, "logits/chosen": -0.13942235708236694, "logits/rejected": 0.10701143741607666, "logps/chosen": -284.3111877441406, "logps/rejected": -286.4670104980469, "loss": 0.705, "rewards/accuracies": 0.5, "rewards/chosen": -0.02376117743551731, "rewards/margins": -0.021942613646388054, "rewards/rejected": -0.0018185600638389587, "step": 142 }, { "epoch": 0.07712012943238507, "grad_norm": 6.004771709442139, "learning_rate": 3.844086021505376e-07, "logits/chosen": 0.5929513573646545, "logits/rejected": -1.4950335025787354, "logps/chosen": -298.653564453125, "logps/rejected": -137.4801025390625, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": 0.05566377937793732, "rewards/margins": 0.024037599563598633, "rewards/rejected": 0.031626179814338684, "step": 143 }, { "epoch": 0.07765943103680734, "grad_norm": 9.072623252868652, "learning_rate": 3.8709677419354837e-07, "logits/chosen": 0.23016899824142456, "logits/rejected": -0.23964515328407288, "logps/chosen": -201.08721923828125, "logps/rejected": -188.3890380859375, "loss": 0.7026, "rewards/accuracies": 0.375, "rewards/chosen": -0.021442223340272903, "rewards/margins": -0.018214799463748932, "rewards/rejected": -0.0032274238765239716, "step": 144 }, { "epoch": 0.0781987326412296, "grad_norm": 7.537434101104736, "learning_rate": 3.8978494623655913e-07, "logits/chosen": 0.9641353487968445, "logits/rejected": -0.22897176444530487, "logps/chosen": -388.0416564941406, "logps/rejected": -245.7544708251953, "loss": 0.7325, "rewards/accuracies": 0.25, "rewards/chosen": -0.036130812019109726, "rewards/margins": -0.07364273071289062, "rewards/rejected": 0.0375119224190712, "step": 145 }, { "epoch": 0.07873803424565189, "grad_norm": 7.207355499267578, "learning_rate": 3.9247311827956984e-07, "logits/chosen": -0.9681949615478516, "logits/rejected": -0.949472963809967, "logps/chosen": -275.3939208984375, "logps/rejected": -222.48867797851562, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": -0.004505349788814783, "rewards/margins": -0.00585327111184597, "rewards/rejected": 0.0013479236513376236, "step": 146 }, { "epoch": 0.07927733585007415, "grad_norm": 5.468898773193359, "learning_rate": 3.951612903225806e-07, "logits/chosen": 0.2360256463289261, "logits/rejected": -0.3465800881385803, "logps/chosen": -190.43582153320312, "logps/rejected": -159.0408935546875, "loss": 0.711, "rewards/accuracies": 0.25, "rewards/chosen": 0.009691428393125534, "rewards/margins": -0.032691195607185364, "rewards/rejected": 0.042382627725601196, "step": 147 }, { "epoch": 0.07981663745449642, "grad_norm": 6.753269672393799, "learning_rate": 3.9784946236559137e-07, "logits/chosen": 0.7971314191818237, "logits/rejected": -1.3638408184051514, "logps/chosen": -206.69964599609375, "logps/rejected": -167.602294921875, "loss": 0.6775, "rewards/accuracies": 0.875, "rewards/chosen": 0.00840530451387167, "rewards/margins": 0.03323831409215927, "rewards/rejected": -0.024833012372255325, "step": 148 }, { "epoch": 0.0803559390589187, "grad_norm": 7.281928539276123, "learning_rate": 4.0053763440860213e-07, "logits/chosen": 0.7305809259414673, "logits/rejected": -0.9924468994140625, "logps/chosen": -342.36383056640625, "logps/rejected": -297.43121337890625, "loss": 0.6512, "rewards/accuracies": 0.75, "rewards/chosen": 0.078095443546772, "rewards/margins": 0.08769045770168304, "rewards/rejected": -0.009595012292265892, "step": 149 }, { "epoch": 0.08089524066334097, "grad_norm": 6.312039375305176, "learning_rate": 4.0322580645161285e-07, "logits/chosen": -0.1042090505361557, "logits/rejected": 0.4732080399990082, "logps/chosen": -225.95968627929688, "logps/rejected": -244.32716369628906, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": -0.019490908831357956, "rewards/margins": 0.0031888969242572784, "rewards/rejected": -0.022679805755615234, "step": 150 }, { "epoch": 0.08143454226776324, "grad_norm": 7.59611701965332, "learning_rate": 4.059139784946236e-07, "logits/chosen": 0.8620892763137817, "logits/rejected": -0.5592824816703796, "logps/chosen": -211.33226013183594, "logps/rejected": -170.144287109375, "loss": 0.7258, "rewards/accuracies": 0.125, "rewards/chosen": 0.023562908172607422, "rewards/margins": -0.06263504177331924, "rewards/rejected": 0.08619795739650726, "step": 151 }, { "epoch": 0.08197384387218552, "grad_norm": 7.186800956726074, "learning_rate": 4.0860215053763443e-07, "logits/chosen": 0.17595016956329346, "logits/rejected": 0.01772180199623108, "logps/chosen": -190.59886169433594, "logps/rejected": -260.2695007324219, "loss": 0.6974, "rewards/accuracies": 0.375, "rewards/chosen": 0.01728792116045952, "rewards/margins": -0.005357556045055389, "rewards/rejected": 0.02264546975493431, "step": 152 }, { "epoch": 0.08251314547660779, "grad_norm": 5.681769847869873, "learning_rate": 4.112903225806452e-07, "logits/chosen": 1.5119154453277588, "logits/rejected": 0.061738021671772, "logps/chosen": -328.6064147949219, "logps/rejected": -178.64486694335938, "loss": 0.6688, "rewards/accuracies": 0.625, "rewards/chosen": 0.01863708719611168, "rewards/margins": 0.05297217518091202, "rewards/rejected": -0.03433508798480034, "step": 153 }, { "epoch": 0.08305244708103006, "grad_norm": 7.231444835662842, "learning_rate": 4.139784946236559e-07, "logits/chosen": 0.07486224919557571, "logits/rejected": -1.1814764738082886, "logps/chosen": -244.68421936035156, "logps/rejected": -169.36212158203125, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.013127517886459827, "rewards/margins": 0.005053566303104162, "rewards/rejected": -0.01818108558654785, "step": 154 }, { "epoch": 0.08359174868545234, "grad_norm": 6.000890254974365, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 0.3227558434009552, "logits/rejected": 0.05958421528339386, "logps/chosen": -199.31895446777344, "logps/rejected": -161.9635772705078, "loss": 0.678, "rewards/accuracies": 0.875, "rewards/chosen": 0.004298115149140358, "rewards/margins": 0.031194305047392845, "rewards/rejected": -0.026896189898252487, "step": 155 }, { "epoch": 0.08413105028987461, "grad_norm": 6.108272075653076, "learning_rate": 4.1935483870967743e-07, "logits/chosen": -1.2782829999923706, "logits/rejected": -1.336998462677002, "logps/chosen": -237.4818115234375, "logps/rejected": -247.7439727783203, "loss": 0.7247, "rewards/accuracies": 0.25, "rewards/chosen": -0.027077438309788704, "rewards/margins": -0.058408692479133606, "rewards/rejected": 0.03133125603199005, "step": 156 }, { "epoch": 0.08467035189429689, "grad_norm": 7.470032215118408, "learning_rate": 4.2204301075268814e-07, "logits/chosen": 1.0806255340576172, "logits/rejected": -0.12295845150947571, "logps/chosen": -292.5794677734375, "logps/rejected": -194.2822265625, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": -0.030491545796394348, "rewards/margins": 0.016788672655820847, "rewards/rejected": -0.047280214726924896, "step": 157 }, { "epoch": 0.08520965349871916, "grad_norm": 6.907007217407227, "learning_rate": 4.247311827956989e-07, "logits/chosen": -0.01464160904288292, "logits/rejected": -0.7349470853805542, "logps/chosen": -327.63641357421875, "logps/rejected": -297.59600830078125, "loss": 0.6997, "rewards/accuracies": 0.5, "rewards/chosen": 0.028275679796934128, "rewards/margins": -0.006446455605328083, "rewards/rejected": 0.034722138196229935, "step": 158 }, { "epoch": 0.08574895510314143, "grad_norm": 7.9193010330200195, "learning_rate": 4.2741935483870967e-07, "logits/chosen": 0.7603756785392761, "logits/rejected": -0.4012250602245331, "logps/chosen": -462.7437744140625, "logps/rejected": -423.2889709472656, "loss": 0.7073, "rewards/accuracies": 0.375, "rewards/chosen": -0.008969686925411224, "rewards/margins": -0.026636315509676933, "rewards/rejected": 0.01766662672162056, "step": 159 }, { "epoch": 0.08628825670756371, "grad_norm": 7.91565465927124, "learning_rate": 4.3010752688172043e-07, "logits/chosen": -0.10991892218589783, "logits/rejected": -0.7663592100143433, "logps/chosen": -314.01861572265625, "logps/rejected": -247.52029418945312, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": 0.03101653978228569, "rewards/margins": 0.0042503345757722855, "rewards/rejected": 0.026766207069158554, "step": 160 }, { "epoch": 0.08682755831198598, "grad_norm": 7.735986709594727, "learning_rate": 4.3279569892473114e-07, "logits/chosen": -0.10577099025249481, "logits/rejected": -0.5776668787002563, "logps/chosen": -259.29876708984375, "logps/rejected": -215.3236083984375, "loss": 0.7173, "rewards/accuracies": 0.375, "rewards/chosen": -0.007855987176299095, "rewards/margins": -0.04503984749317169, "rewards/rejected": 0.03718385472893715, "step": 161 }, { "epoch": 0.08736685991640825, "grad_norm": 6.7582197189331055, "learning_rate": 4.354838709677419e-07, "logits/chosen": -0.8174583911895752, "logits/rejected": 0.010134726762771606, "logps/chosen": -231.94252014160156, "logps/rejected": -222.84352111816406, "loss": 0.7041, "rewards/accuracies": 0.375, "rewards/chosen": -0.030335618183016777, "rewards/margins": -0.015979193150997162, "rewards/rejected": -0.014356421306729317, "step": 162 }, { "epoch": 0.08790616152083053, "grad_norm": 5.199538707733154, "learning_rate": 4.3817204301075267e-07, "logits/chosen": 0.27147430181503296, "logits/rejected": -0.060762494802474976, "logps/chosen": -221.62904357910156, "logps/rejected": -253.1787872314453, "loss": 0.7076, "rewards/accuracies": 0.375, "rewards/chosen": 0.033156588673591614, "rewards/margins": -0.026799391955137253, "rewards/rejected": 0.05995597690343857, "step": 163 }, { "epoch": 0.0884454631252528, "grad_norm": 6.822409629821777, "learning_rate": 4.4086021505376344e-07, "logits/chosen": 0.8659541606903076, "logits/rejected": -0.003930576145648956, "logps/chosen": -265.7403259277344, "logps/rejected": -158.7683563232422, "loss": 0.6799, "rewards/accuracies": 0.875, "rewards/chosen": 0.005611513741314411, "rewards/margins": 0.02819099649786949, "rewards/rejected": -0.022579479962587357, "step": 164 }, { "epoch": 0.08898476472967506, "grad_norm": 7.557977676391602, "learning_rate": 4.4354838709677415e-07, "logits/chosen": 1.3694324493408203, "logits/rejected": -1.388077974319458, "logps/chosen": -270.0377197265625, "logps/rejected": -183.71273803710938, "loss": 0.7021, "rewards/accuracies": 0.375, "rewards/chosen": -0.012463188730180264, "rewards/margins": -0.011100389063358307, "rewards/rejected": -0.0013628005981445312, "step": 165 }, { "epoch": 0.08952406633409735, "grad_norm": 7.4241414070129395, "learning_rate": 4.462365591397849e-07, "logits/chosen": -0.823911190032959, "logits/rejected": -0.07198643684387207, "logps/chosen": -203.17630004882812, "logps/rejected": -252.27032470703125, "loss": 0.6794, "rewards/accuracies": 0.5, "rewards/chosen": 0.0339476615190506, "rewards/margins": 0.031577207148075104, "rewards/rejected": 0.0023704534396529198, "step": 166 }, { "epoch": 0.09006336793851961, "grad_norm": 7.98874568939209, "learning_rate": 4.489247311827957e-07, "logits/chosen": 1.5631698369979858, "logits/rejected": -0.8693991899490356, "logps/chosen": -394.08404541015625, "logps/rejected": -335.13543701171875, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": 0.03447513282299042, "rewards/margins": 0.018830684944987297, "rewards/rejected": 0.015644455328583717, "step": 167 }, { "epoch": 0.0906026695429419, "grad_norm": 6.797164440155029, "learning_rate": 4.5161290322580644e-07, "logits/chosen": 0.29159510135650635, "logits/rejected": 0.6974357962608337, "logps/chosen": -167.06689453125, "logps/rejected": -200.53298950195312, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": -0.0015314100310206413, "rewards/margins": 0.004843136295676231, "rewards/rejected": -0.00637455191463232, "step": 168 }, { "epoch": 0.09114197114736416, "grad_norm": 7.178526401519775, "learning_rate": 4.5430107526881715e-07, "logits/chosen": 0.38148263096809387, "logits/rejected": 0.5559564232826233, "logps/chosen": -250.43603515625, "logps/rejected": -249.7041015625, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 0.036711499094963074, "rewards/margins": 0.0073720961809158325, "rewards/rejected": 0.02933940850198269, "step": 169 }, { "epoch": 0.09168127275178643, "grad_norm": 6.532577037811279, "learning_rate": 4.569892473118279e-07, "logits/chosen": -0.6895228028297424, "logits/rejected": -0.838682234287262, "logps/chosen": -198.482421875, "logps/rejected": -212.0919189453125, "loss": 0.6898, "rewards/accuracies": 0.25, "rewards/chosen": 0.0416560173034668, "rewards/margins": 0.00894155353307724, "rewards/rejected": 0.03271446377038956, "step": 170 }, { "epoch": 0.09222057435620871, "grad_norm": 6.3342366218566895, "learning_rate": 4.596774193548387e-07, "logits/chosen": -0.4784749746322632, "logits/rejected": 0.26199638843536377, "logps/chosen": -181.67379760742188, "logps/rejected": -208.9215087890625, "loss": 0.7496, "rewards/accuracies": 0.0, "rewards/chosen": -0.026695631444454193, "rewards/margins": -0.1090383529663086, "rewards/rejected": 0.082342728972435, "step": 171 }, { "epoch": 0.09275987596063098, "grad_norm": 7.206198692321777, "learning_rate": 4.6236559139784944e-07, "logits/chosen": 0.3985270857810974, "logits/rejected": -1.1762737035751343, "logps/chosen": -279.3482971191406, "logps/rejected": -282.6239013671875, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": 0.03280820697546005, "rewards/margins": 0.019608881324529648, "rewards/rejected": 0.013199331238865852, "step": 172 }, { "epoch": 0.09329917756505325, "grad_norm": 8.360557556152344, "learning_rate": 4.6505376344086015e-07, "logits/chosen": 0.7562321424484253, "logits/rejected": -0.501426637172699, "logps/chosen": -227.8045654296875, "logps/rejected": -201.94688415527344, "loss": 0.7502, "rewards/accuracies": 0.25, "rewards/chosen": -0.027507973834872246, "rewards/margins": -0.10779093950986862, "rewards/rejected": 0.08028297871351242, "step": 173 }, { "epoch": 0.09383847916947553, "grad_norm": 6.047730922698975, "learning_rate": 4.677419354838709e-07, "logits/chosen": 0.9722182750701904, "logits/rejected": -0.019637882709503174, "logps/chosen": -238.19517517089844, "logps/rejected": -237.8121337890625, "loss": 0.68, "rewards/accuracies": 0.75, "rewards/chosen": 0.019765377044677734, "rewards/margins": 0.028975771740078926, "rewards/rejected": -0.009210396558046341, "step": 174 }, { "epoch": 0.0943777807738978, "grad_norm": 6.275174617767334, "learning_rate": 4.7043010752688173e-07, "logits/chosen": 0.6219586730003357, "logits/rejected": -0.34657323360443115, "logps/chosen": -223.41522216796875, "logps/rejected": -173.36373901367188, "loss": 0.6973, "rewards/accuracies": 0.5, "rewards/chosen": 0.031577013432979584, "rewards/margins": -0.004710676148533821, "rewards/rejected": 0.036287691444158554, "step": 175 }, { "epoch": 0.09491708237832007, "grad_norm": 6.156719207763672, "learning_rate": 4.731182795698925e-07, "logits/chosen": 0.9043510556221008, "logits/rejected": 0.673556923866272, "logps/chosen": -171.89395141601562, "logps/rejected": -190.29348754882812, "loss": 0.6843, "rewards/accuracies": 0.625, "rewards/chosen": 0.011883927509188652, "rewards/margins": 0.01932363584637642, "rewards/rejected": -0.007439708337187767, "step": 176 }, { "epoch": 0.09545638398274235, "grad_norm": 7.2206926345825195, "learning_rate": 4.758064516129032e-07, "logits/chosen": 0.3551584780216217, "logits/rejected": 0.10711896419525146, "logps/chosen": -254.1627197265625, "logps/rejected": -290.2313537597656, "loss": 0.6563, "rewards/accuracies": 0.75, "rewards/chosen": 0.06117973476648331, "rewards/margins": 0.07615766674280167, "rewards/rejected": -0.01497793197631836, "step": 177 }, { "epoch": 0.09599568558716462, "grad_norm": 7.849745750427246, "learning_rate": 4.78494623655914e-07, "logits/chosen": 1.0484585762023926, "logits/rejected": -1.208599328994751, "logps/chosen": -298.900634765625, "logps/rejected": -179.2528076171875, "loss": 0.7039, "rewards/accuracies": 0.5, "rewards/chosen": -0.0076864250004291534, "rewards/margins": -0.014931391924619675, "rewards/rejected": 0.007244969718158245, "step": 178 }, { "epoch": 0.0965349871915869, "grad_norm": 6.574667930603027, "learning_rate": 4.811827956989247e-07, "logits/chosen": 0.19276052713394165, "logits/rejected": 0.2879297137260437, "logps/chosen": -279.83612060546875, "logps/rejected": -280.78216552734375, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": 0.05282783508300781, "rewards/margins": 0.023767944425344467, "rewards/rejected": 0.029059890657663345, "step": 179 }, { "epoch": 0.09707428879600917, "grad_norm": 6.539738178253174, "learning_rate": 4.838709677419355e-07, "logits/chosen": -0.3295314311981201, "logits/rejected": -0.24328632652759552, "logps/chosen": -207.35916137695312, "logps/rejected": -259.35498046875, "loss": 0.7141, "rewards/accuracies": 0.5, "rewards/chosen": -0.0024603381752967834, "rewards/margins": -0.032614000141620636, "rewards/rejected": 0.030153660103678703, "step": 180 }, { "epoch": 0.09761359040043144, "grad_norm": 6.968689918518066, "learning_rate": 4.865591397849462e-07, "logits/chosen": -0.24601441621780396, "logits/rejected": -1.439511775970459, "logps/chosen": -276.75604248046875, "logps/rejected": -219.68165588378906, "loss": 0.6974, "rewards/accuracies": 0.125, "rewards/chosen": -0.02377147600054741, "rewards/margins": -0.0028133802115917206, "rewards/rejected": -0.02095808833837509, "step": 181 }, { "epoch": 0.09815289200485372, "grad_norm": 6.143064498901367, "learning_rate": 4.89247311827957e-07, "logits/chosen": 0.9326776266098022, "logits/rejected": 0.21904674172401428, "logps/chosen": -263.0152587890625, "logps/rejected": -244.75741577148438, "loss": 0.7078, "rewards/accuracies": 0.25, "rewards/chosen": -0.016509246081113815, "rewards/margins": -0.026605987921357155, "rewards/rejected": 0.010096740908920765, "step": 182 }, { "epoch": 0.09869219360927599, "grad_norm": 7.741256237030029, "learning_rate": 4.919354838709677e-07, "logits/chosen": 0.7792307138442993, "logits/rejected": 0.3426993489265442, "logps/chosen": -222.84432983398438, "logps/rejected": -267.7077331542969, "loss": 0.7127, "rewards/accuracies": 0.5, "rewards/chosen": -0.007165621966123581, "rewards/margins": -0.03417348861694336, "rewards/rejected": 0.02700786665081978, "step": 183 }, { "epoch": 0.09923149521369826, "grad_norm": 6.0976338386535645, "learning_rate": 4.946236559139784e-07, "logits/chosen": 0.38987305760383606, "logits/rejected": 0.3257530927658081, "logps/chosen": -239.9996337890625, "logps/rejected": -270.5169372558594, "loss": 0.6687, "rewards/accuracies": 0.875, "rewards/chosen": 0.028687190264463425, "rewards/margins": 0.05121263861656189, "rewards/rejected": -0.022525452077388763, "step": 184 }, { "epoch": 0.09977079681812054, "grad_norm": 7.034929275512695, "learning_rate": 4.973118279569893e-07, "logits/chosen": -0.1917126327753067, "logits/rejected": -0.934265673160553, "logps/chosen": -243.61903381347656, "logps/rejected": -308.2969970703125, "loss": 0.6996, "rewards/accuracies": 0.625, "rewards/chosen": 0.0031244270503520966, "rewards/margins": -0.006550407037138939, "rewards/rejected": 0.00967483315616846, "step": 185 }, { "epoch": 0.1003100984225428, "grad_norm": 6.405734539031982, "learning_rate": 5e-07, "logits/chosen": -0.9696601033210754, "logits/rejected": -1.4748178720474243, "logps/chosen": -253.4070587158203, "logps/rejected": -237.560546875, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": 0.04361715540289879, "rewards/margins": 0.04063558578491211, "rewards/rejected": 0.0029815668240189552, "step": 186 }, { "epoch": 0.10084940002696507, "grad_norm": 7.137935638427734, "learning_rate": 4.999995565776927e-07, "logits/chosen": 0.32337796688079834, "logits/rejected": -0.001250341534614563, "logps/chosen": -243.45118713378906, "logps/rejected": -209.0843505859375, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": 0.045314401388168335, "rewards/margins": 0.050467394292354584, "rewards/rejected": -0.005152988247573376, "step": 187 }, { "epoch": 0.10138870163138736, "grad_norm": 6.8028483390808105, "learning_rate": 4.999982263123439e-07, "logits/chosen": -0.46957704424858093, "logits/rejected": -0.19437621533870697, "logps/chosen": -219.54592895507812, "logps/rejected": -239.01675415039062, "loss": 0.6744, "rewards/accuracies": 0.75, "rewards/chosen": 0.006217004265636206, "rewards/margins": 0.0421321876347065, "rewards/rejected": -0.03591518476605415, "step": 188 }, { "epoch": 0.10192800323580962, "grad_norm": 6.405226707458496, "learning_rate": 4.999960092086724e-07, "logits/chosen": 0.09505641460418701, "logits/rejected": -0.49824753403663635, "logps/chosen": -279.49188232421875, "logps/rejected": -234.56768798828125, "loss": 0.6652, "rewards/accuracies": 0.5, "rewards/chosen": 0.010203838348388672, "rewards/margins": 0.06002025678753853, "rewards/rejected": -0.04981641471385956, "step": 189 }, { "epoch": 0.1024673048402319, "grad_norm": 7.036262035369873, "learning_rate": 4.999929052745433e-07, "logits/chosen": 1.4189238548278809, "logits/rejected": 0.012433469295501709, "logps/chosen": -253.06494140625, "logps/rejected": -170.58248901367188, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.005678797140717506, "rewards/margins": -0.0010786540806293488, "rewards/rejected": 0.006757451221346855, "step": 190 }, { "epoch": 0.10300660644465418, "grad_norm": 7.62278413772583, "learning_rate": 4.999889145209673e-07, "logits/chosen": 0.02668188512325287, "logits/rejected": -0.8062188625335693, "logps/chosen": -219.44329833984375, "logps/rejected": -252.56329345703125, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": 0.02746877819299698, "rewards/margins": 0.031227588653564453, "rewards/rejected": -0.003758814185857773, "step": 191 }, { "epoch": 0.10354590804907644, "grad_norm": 6.097692966461182, "learning_rate": 4.999840369621011e-07, "logits/chosen": -0.014339327812194824, "logits/rejected": -0.8894832134246826, "logps/chosen": -214.70794677734375, "logps/rejected": -193.51980590820312, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.012210848741233349, "rewards/margins": 0.06311655044555664, "rewards/rejected": -0.050905704498291016, "step": 192 }, { "epoch": 0.10408520965349873, "grad_norm": 6.88072395324707, "learning_rate": 4.999782726152473e-07, "logits/chosen": 0.7941837906837463, "logits/rejected": 0.283910870552063, "logps/chosen": -271.1532287597656, "logps/rejected": -293.555419921875, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": -0.010912514291703701, "rewards/margins": 0.04935913532972336, "rewards/rejected": -0.060271646827459335, "step": 193 }, { "epoch": 0.104624511257921, "grad_norm": 6.610561847686768, "learning_rate": 4.999716215008541e-07, "logits/chosen": 0.17930558323860168, "logits/rejected": -0.24814483523368835, "logps/chosen": -227.82180786132812, "logps/rejected": -215.54769897460938, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": 0.029262639582157135, "rewards/margins": 0.034816838800907135, "rewards/rejected": -0.005554199684411287, "step": 194 }, { "epoch": 0.10516381286234326, "grad_norm": 8.332368850708008, "learning_rate": 4.999640836425158e-07, "logits/chosen": 0.756824254989624, "logits/rejected": -0.6484280228614807, "logps/chosen": -286.78106689453125, "logps/rejected": -277.13201904296875, "loss": 0.6626, "rewards/accuracies": 0.5, "rewards/chosen": -0.002952384762465954, "rewards/margins": 0.07001467049121857, "rewards/rejected": -0.0729670524597168, "step": 195 }, { "epoch": 0.10570311446676554, "grad_norm": 6.5538330078125, "learning_rate": 4.999556590669718e-07, "logits/chosen": 1.076865792274475, "logits/rejected": -0.13935823738574982, "logps/chosen": -261.78802490234375, "logps/rejected": -227.82969665527344, "loss": 0.7418, "rewards/accuracies": 0.25, "rewards/chosen": -0.012325858697295189, "rewards/margins": -0.09296751022338867, "rewards/rejected": 0.08064165711402893, "step": 196 }, { "epoch": 0.10624241607118781, "grad_norm": 6.386448860168457, "learning_rate": 4.999463478041073e-07, "logits/chosen": -0.4803362488746643, "logits/rejected": 0.17489734292030334, "logps/chosen": -227.22853088378906, "logps/rejected": -256.0918884277344, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.005980108864605427, "rewards/margins": 0.01783571019768715, "rewards/rejected": -0.011855602264404297, "step": 197 }, { "epoch": 0.10678171767561008, "grad_norm": 6.634751319885254, "learning_rate": 4.999361498869529e-07, "logits/chosen": 0.6870744228363037, "logits/rejected": -0.6563974618911743, "logps/chosen": -267.78753662109375, "logps/rejected": -223.43617248535156, "loss": 0.707, "rewards/accuracies": 0.25, "rewards/chosen": -0.02819690853357315, "rewards/margins": -0.024837777018547058, "rewards/rejected": -0.0033591287210583687, "step": 198 }, { "epoch": 0.10732101928003236, "grad_norm": 6.97332239151001, "learning_rate": 4.999250653516845e-07, "logits/chosen": 0.41431358456611633, "logits/rejected": -0.22208243608474731, "logps/chosen": -235.4289093017578, "logps/rejected": -210.34988403320312, "loss": 0.6987, "rewards/accuracies": 0.375, "rewards/chosen": 0.0024736421182751656, "rewards/margins": -0.009373282082378864, "rewards/rejected": 0.011846923269331455, "step": 199 }, { "epoch": 0.10786032088445463, "grad_norm": 6.267891883850098, "learning_rate": 4.999130942376231e-07, "logits/chosen": 1.0571568012237549, "logits/rejected": 1.8908782005310059, "logps/chosen": -177.09030151367188, "logps/rejected": -208.27386474609375, "loss": 0.7075, "rewards/accuracies": 0.375, "rewards/chosen": 0.005918025970458984, "rewards/margins": -0.02726755291223526, "rewards/rejected": 0.033185575157403946, "step": 200 }, { "epoch": 0.1083996224888769, "grad_norm": 6.359236717224121, "learning_rate": 4.999002365872348e-07, "logits/chosen": -0.14352934062480927, "logits/rejected": 0.5862343907356262, "logps/chosen": -155.13284301757812, "logps/rejected": -227.6934814453125, "loss": 0.7008, "rewards/accuracies": 0.25, "rewards/chosen": -0.00041017378680408, "rewards/margins": -0.010064603760838509, "rewards/rejected": 0.009654426947236061, "step": 201 }, { "epoch": 0.10893892409329918, "grad_norm": 8.27083683013916, "learning_rate": 4.998864924461304e-07, "logits/chosen": 1.4799309968948364, "logits/rejected": -0.4924044907093048, "logps/chosen": -232.93711853027344, "logps/rejected": -170.4285888671875, "loss": 0.6696, "rewards/accuracies": 0.75, "rewards/chosen": 0.04558706283569336, "rewards/margins": 0.051835350692272186, "rewards/rejected": -0.006248282268643379, "step": 202 }, { "epoch": 0.10947822569772145, "grad_norm": 6.436233043670654, "learning_rate": 4.998718618630659e-07, "logits/chosen": -0.12273624539375305, "logits/rejected": -0.48034727573394775, "logps/chosen": -193.4567413330078, "logps/rejected": -194.326171875, "loss": 0.7142, "rewards/accuracies": 0.5, "rewards/chosen": 0.012621021829545498, "rewards/margins": -0.0387820228934288, "rewards/rejected": 0.05140303820371628, "step": 203 }, { "epoch": 0.11001752730214373, "grad_norm": 6.63557243347168, "learning_rate": 4.998563448899412e-07, "logits/chosen": 0.7062223553657532, "logits/rejected": -0.24172890186309814, "logps/chosen": -299.349365234375, "logps/rejected": -309.4092712402344, "loss": 0.6985, "rewards/accuracies": 0.625, "rewards/chosen": -0.06661748886108398, "rewards/margins": -0.007557202130556107, "rewards/rejected": -0.05906028673052788, "step": 204 }, { "epoch": 0.110556828906566, "grad_norm": 6.784613132476807, "learning_rate": 4.99839941581801e-07, "logits/chosen": 0.7669838666915894, "logits/rejected": -0.16963039338588715, "logps/chosen": -244.7215576171875, "logps/rejected": -218.2976531982422, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": 0.008421897888183594, "rewards/margins": 0.0012912750244140625, "rewards/rejected": 0.007130622863769531, "step": 205 }, { "epoch": 0.11109613051098827, "grad_norm": 6.878082275390625, "learning_rate": 4.998226519968341e-07, "logits/chosen": 0.581382691860199, "logits/rejected": -0.08390523493289948, "logps/chosen": -274.3362731933594, "logps/rejected": -239.4439239501953, "loss": 0.6816, "rewards/accuracies": 0.625, "rewards/chosen": 0.0321081168949604, "rewards/margins": 0.024597741663455963, "rewards/rejected": 0.007510376162827015, "step": 206 }, { "epoch": 0.11163543211541055, "grad_norm": 5.746057987213135, "learning_rate": 4.99804476196373e-07, "logits/chosen": 0.6537402868270874, "logits/rejected": 0.6574373841285706, "logps/chosen": -236.6945037841797, "logps/rejected": -213.69647216796875, "loss": 0.6934, "rewards/accuracies": 0.625, "rewards/chosen": 0.014635181054472923, "rewards/margins": 0.00023584533482789993, "rewards/rejected": 0.014399336650967598, "step": 207 }, { "epoch": 0.11217473371983282, "grad_norm": 6.943594932556152, "learning_rate": 4.997854142448944e-07, "logits/chosen": 0.3459778130054474, "logits/rejected": -0.09422534704208374, "logps/chosen": -210.82469177246094, "logps/rejected": -230.1856689453125, "loss": 0.6903, "rewards/accuracies": 0.375, "rewards/chosen": 0.049956321716308594, "rewards/margins": 0.007041167467832565, "rewards/rejected": 0.04291515424847603, "step": 208 }, { "epoch": 0.11271403532425509, "grad_norm": 7.947775363922119, "learning_rate": 4.997654662100181e-07, "logits/chosen": 0.011326480656862259, "logits/rejected": -1.4494271278381348, "logps/chosen": -306.7877197265625, "logps/rejected": -229.54562377929688, "loss": 0.7205, "rewards/accuracies": 0.375, "rewards/chosen": 0.02002897299826145, "rewards/margins": -0.04409770667552948, "rewards/rejected": 0.06412668526172638, "step": 209 }, { "epoch": 0.11325333692867737, "grad_norm": 7.874186038970947, "learning_rate": 4.997446321625073e-07, "logits/chosen": 1.5685031414031982, "logits/rejected": 0.566070020198822, "logps/chosen": -294.9813232421875, "logps/rejected": -332.3543701171875, "loss": 0.6938, "rewards/accuracies": 0.375, "rewards/chosen": -0.0400637686252594, "rewards/margins": 0.0016251513734459877, "rewards/rejected": -0.04168891906738281, "step": 210 }, { "epoch": 0.11379263853309964, "grad_norm": 6.207097053527832, "learning_rate": 4.997229121762684e-07, "logits/chosen": 0.32999634742736816, "logits/rejected": 0.17090024054050446, "logps/chosen": -243.92213439941406, "logps/rejected": -196.0301513671875, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.033377982676029205, "rewards/margins": 0.016135741025209427, "rewards/rejected": 0.01724224165081978, "step": 211 }, { "epoch": 0.1143319401375219, "grad_norm": 5.998737335205078, "learning_rate": 4.997003063283502e-07, "logits/chosen": 0.3810163736343384, "logits/rejected": -0.8127812147140503, "logps/chosen": -257.32440185546875, "logps/rejected": -211.43621826171875, "loss": 0.664, "rewards/accuracies": 0.875, "rewards/chosen": 0.0796486884355545, "rewards/margins": 0.0614534392952919, "rewards/rejected": 0.018195247277617455, "step": 212 }, { "epoch": 0.11487124174194419, "grad_norm": 7.4709367752075195, "learning_rate": 4.996768146989445e-07, "logits/chosen": 0.1569305956363678, "logits/rejected": -1.1725553274154663, "logps/chosen": -235.70167541503906, "logps/rejected": -189.87998962402344, "loss": 0.7223, "rewards/accuracies": 0.25, "rewards/chosen": -0.06597356498241425, "rewards/margins": -0.05419092997908592, "rewards/rejected": -0.01178264245390892, "step": 213 }, { "epoch": 0.11541054334636645, "grad_norm": 5.617773056030273, "learning_rate": 4.996524373713848e-07, "logits/chosen": -0.030721306800842285, "logits/rejected": -1.3299014568328857, "logps/chosen": -233.33273315429688, "logps/rejected": -170.2500762939453, "loss": 0.7032, "rewards/accuracies": 0.375, "rewards/chosen": 0.030399128794670105, "rewards/margins": -0.017949961125850677, "rewards/rejected": 0.04834909364581108, "step": 214 }, { "epoch": 0.11594984495078872, "grad_norm": 6.4985198974609375, "learning_rate": 4.996271744321466e-07, "logits/chosen": 0.6587644815444946, "logits/rejected": 1.5982177257537842, "logps/chosen": -139.60458374023438, "logps/rejected": -175.57179260253906, "loss": 0.6945, "rewards/accuracies": 0.375, "rewards/chosen": -0.023651979863643646, "rewards/margins": -0.0012149792164564133, "rewards/rejected": -0.022437000647187233, "step": 215 }, { "epoch": 0.116489146555211, "grad_norm": 6.767206192016602, "learning_rate": 4.996010259708473e-07, "logits/chosen": 0.4772403836250305, "logits/rejected": -1.0584102869033813, "logps/chosen": -279.1919250488281, "logps/rejected": -192.27236938476562, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.005828283727169037, "rewards/margins": 0.0037091271951794624, "rewards/rejected": -0.00953740905970335, "step": 216 }, { "epoch": 0.11702844815963327, "grad_norm": 6.8078742027282715, "learning_rate": 4.995739920802454e-07, "logits/chosen": 0.6478524804115295, "logits/rejected": -0.7123546600341797, "logps/chosen": -268.1871337890625, "logps/rejected": -281.73077392578125, "loss": 0.696, "rewards/accuracies": 0.375, "rewards/chosen": 0.03046436235308647, "rewards/margins": -0.0015881545841693878, "rewards/rejected": 0.03205251693725586, "step": 217 }, { "epoch": 0.11756774976405555, "grad_norm": 6.457679748535156, "learning_rate": 4.995460728562402e-07, "logits/chosen": 0.9126384258270264, "logits/rejected": -0.31486743688583374, "logps/chosen": -260.236572265625, "logps/rejected": -205.59991455078125, "loss": 0.6663, "rewards/accuracies": 0.5, "rewards/chosen": 0.09608058631420135, "rewards/margins": 0.05854463577270508, "rewards/rejected": 0.037535957992076874, "step": 218 }, { "epoch": 0.11810705136847782, "grad_norm": 6.880275249481201, "learning_rate": 4.995172683978719e-07, "logits/chosen": 0.2728205621242523, "logits/rejected": 0.9785915613174438, "logps/chosen": -292.6724853515625, "logps/rejected": -345.31976318359375, "loss": 0.7087, "rewards/accuracies": 0.375, "rewards/chosen": -0.028146076947450638, "rewards/margins": -0.024281978607177734, "rewards/rejected": -0.0038640983402729034, "step": 219 }, { "epoch": 0.11864635297290009, "grad_norm": 6.455776691436768, "learning_rate": 4.994875788073206e-07, "logits/chosen": 0.9185144305229187, "logits/rejected": 0.976534366607666, "logps/chosen": -269.01104736328125, "logps/rejected": -247.5106201171875, "loss": 0.659, "rewards/accuracies": 0.75, "rewards/chosen": 0.08281288295984268, "rewards/margins": 0.0744701400399208, "rewards/rejected": 0.00834274385124445, "step": 220 }, { "epoch": 0.11918565457732237, "grad_norm": 7.356355667114258, "learning_rate": 4.994570041899067e-07, "logits/chosen": 1.010449767112732, "logits/rejected": -0.3558628261089325, "logps/chosen": -340.7996826171875, "logps/rejected": -233.06402587890625, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": 0.045015525072813034, "rewards/margins": -0.007821086794137955, "rewards/rejected": 0.05283661186695099, "step": 221 }, { "epoch": 0.11972495618174464, "grad_norm": 7.158155918121338, "learning_rate": 4.994255446540899e-07, "logits/chosen": 0.8658974170684814, "logits/rejected": -0.35585254430770874, "logps/chosen": -283.4674377441406, "logps/rejected": -209.93832397460938, "loss": 0.6755, "rewards/accuracies": 0.75, "rewards/chosen": 0.05005664750933647, "rewards/margins": 0.03867883235216141, "rewards/rejected": 0.011377811431884766, "step": 222 }, { "epoch": 0.12026425778616691, "grad_norm": 6.71658182144165, "learning_rate": 4.993932003114691e-07, "logits/chosen": -0.6589486598968506, "logits/rejected": -0.6097522974014282, "logps/chosen": -305.1965637207031, "logps/rejected": -255.4315643310547, "loss": 0.6996, "rewards/accuracies": 0.25, "rewards/chosen": 0.03106517717242241, "rewards/margins": -0.011605741456151009, "rewards/rejected": 0.04267092049121857, "step": 223 }, { "epoch": 0.12080355939058919, "grad_norm": 6.22304105758667, "learning_rate": 4.993599712767819e-07, "logits/chosen": 1.000281810760498, "logits/rejected": 0.18969373404979706, "logps/chosen": -316.17083740234375, "logps/rejected": -211.33438110351562, "loss": 0.6581, "rewards/accuracies": 0.75, "rewards/chosen": 0.08975353091955185, "rewards/margins": 0.07358789443969727, "rewards/rejected": 0.016165640205144882, "step": 224 }, { "epoch": 0.12134286099501146, "grad_norm": 6.141915321350098, "learning_rate": 4.993258576679042e-07, "logits/chosen": -0.18809056282043457, "logits/rejected": -0.337306946516037, "logps/chosen": -186.1077117919922, "logps/rejected": -153.03172302246094, "loss": 0.6831, "rewards/accuracies": 0.625, "rewards/chosen": 0.04728851467370987, "rewards/margins": 0.021488238126039505, "rewards/rejected": 0.025800276547670364, "step": 225 }, { "epoch": 0.12188216259943373, "grad_norm": 7.281726360321045, "learning_rate": 4.992908596058501e-07, "logits/chosen": -0.500576913356781, "logits/rejected": -1.3929693698883057, "logps/chosen": -188.73147583007812, "logps/rejected": -169.9917755126953, "loss": 0.7157, "rewards/accuracies": 0.375, "rewards/chosen": -0.002253390848636627, "rewards/margins": -0.04139256477355957, "rewards/rejected": 0.03913917392492294, "step": 226 }, { "epoch": 0.12242146420385601, "grad_norm": 6.203949451446533, "learning_rate": 4.992549772147706e-07, "logits/chosen": 0.40049973130226135, "logits/rejected": -0.10271699726581573, "logps/chosen": -198.89724731445312, "logps/rejected": -158.23289489746094, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 0.023543167859315872, "rewards/margins": 0.007886359468102455, "rewards/rejected": 0.015656806528568268, "step": 227 }, { "epoch": 0.12296076580827828, "grad_norm": 6.84366512298584, "learning_rate": 4.992182106219544e-07, "logits/chosen": 0.4908100664615631, "logits/rejected": -0.9280210733413696, "logps/chosen": -225.98712158203125, "logps/rejected": -146.47006225585938, "loss": 0.6775, "rewards/accuracies": 0.5, "rewards/chosen": 0.04104919731616974, "rewards/margins": 0.03267717361450195, "rewards/rejected": 0.008372019976377487, "step": 228 }, { "epoch": 0.12350006741270056, "grad_norm": 7.485854148864746, "learning_rate": 4.991805599578265e-07, "logits/chosen": 0.25295037031173706, "logits/rejected": -1.0862444639205933, "logps/chosen": -226.8338623046875, "logps/rejected": -199.894775390625, "loss": 0.6801, "rewards/accuracies": 0.625, "rewards/chosen": -0.015378187410533428, "rewards/margins": 0.030617710202932358, "rewards/rejected": -0.04599590227007866, "step": 229 }, { "epoch": 0.12403936901712283, "grad_norm": 7.3164849281311035, "learning_rate": 4.991420253559479e-07, "logits/chosen": 1.3231216669082642, "logits/rejected": 0.8034830093383789, "logps/chosen": -257.4018859863281, "logps/rejected": -204.0052490234375, "loss": 0.7527, "rewards/accuracies": 0.125, "rewards/chosen": -0.06591653823852539, "rewards/margins": -0.11462211608886719, "rewards/rejected": 0.048705581575632095, "step": 230 }, { "epoch": 0.1245786706215451, "grad_norm": 6.717327117919922, "learning_rate": 4.991026069530156e-07, "logits/chosen": -0.5272752642631531, "logits/rejected": -0.755237340927124, "logps/chosen": -231.23297119140625, "logps/rejected": -196.69200134277344, "loss": 0.6935, "rewards/accuracies": 0.375, "rewards/chosen": -0.014805985614657402, "rewards/margins": 0.0015865322202444077, "rewards/rejected": -0.01639251783490181, "step": 231 }, { "epoch": 0.12511797222596738, "grad_norm": 7.555964946746826, "learning_rate": 4.990623048888615e-07, "logits/chosen": 0.7022457718849182, "logits/rejected": -0.6594908237457275, "logps/chosen": -213.15562438964844, "logps/rejected": -170.78782653808594, "loss": 0.7047, "rewards/accuracies": 0.375, "rewards/chosen": -0.016031742095947266, "rewards/margins": -0.02069377899169922, "rewards/rejected": 0.004662036895751953, "step": 232 }, { "epoch": 0.12565727383038963, "grad_norm": 10.815020561218262, "learning_rate": 4.990211193064522e-07, "logits/chosen": 0.1610153764486313, "logits/rejected": 0.23612573742866516, "logps/chosen": -237.4895477294922, "logps/rejected": -241.71177673339844, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": 0.01094884891062975, "rewards/margins": -0.008545495569705963, "rewards/rejected": 0.019494343549013138, "step": 233 }, { "epoch": 0.12619657543481191, "grad_norm": 7.316073894500732, "learning_rate": 4.989790503518888e-07, "logits/chosen": 1.1823902130126953, "logits/rejected": -0.06588304042816162, "logps/chosen": -229.919921875, "logps/rejected": -201.7113494873047, "loss": 0.7244, "rewards/accuracies": 0.25, "rewards/chosen": -0.023639675229787827, "rewards/margins": -0.05851345509290695, "rewards/rejected": 0.03487377241253853, "step": 234 }, { "epoch": 0.1267358770392342, "grad_norm": 8.677364349365234, "learning_rate": 4.989360981744055e-07, "logits/chosen": 0.2745320796966553, "logits/rejected": -0.6499941349029541, "logps/chosen": -349.76336669921875, "logps/rejected": -306.7011413574219, "loss": 0.6737, "rewards/accuracies": 0.5, "rewards/chosen": -0.0070585282519459724, "rewards/margins": 0.0417669340968132, "rewards/rejected": -0.0488254576921463, "step": 235 }, { "epoch": 0.12727517864365648, "grad_norm": 7.247702598571777, "learning_rate": 4.988922629263701e-07, "logits/chosen": -0.415775328874588, "logits/rejected": -0.004965547472238541, "logps/chosen": -251.12445068359375, "logps/rejected": -302.75714111328125, "loss": 0.6802, "rewards/accuracies": 0.875, "rewards/chosen": 0.01622171513736248, "rewards/margins": 0.028058622032403946, "rewards/rejected": -0.011836908757686615, "step": 236 }, { "epoch": 0.12781448024807873, "grad_norm": 6.583347797393799, "learning_rate": 4.988475447632829e-07, "logits/chosen": 0.5492333173751831, "logits/rejected": 0.16161176562309265, "logps/chosen": -186.57760620117188, "logps/rejected": -185.44888305664062, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.017728421837091446, "rewards/margins": 0.020546436309814453, "rewards/rejected": -0.002818011213093996, "step": 237 }, { "epoch": 0.12835378185250101, "grad_norm": 7.710158824920654, "learning_rate": 4.988019438437758e-07, "logits/chosen": -0.5364099740982056, "logits/rejected": -0.6832141280174255, "logps/chosen": -127.75313568115234, "logps/rejected": -153.7159423828125, "loss": 0.7056, "rewards/accuracies": 0.5, "rewards/chosen": 0.003597356379032135, "rewards/margins": -0.021053316071629524, "rewards/rejected": 0.02465066872537136, "step": 238 }, { "epoch": 0.1288930834569233, "grad_norm": 5.9386396408081055, "learning_rate": 4.987554603296129e-07, "logits/chosen": 0.8669990301132202, "logits/rejected": 0.17842529714107513, "logps/chosen": -183.70924377441406, "logps/rejected": -180.80966186523438, "loss": 0.7053, "rewards/accuracies": 0.375, "rewards/chosen": -0.003401852212846279, "rewards/margins": -0.023873044177889824, "rewards/rejected": 0.02047118917107582, "step": 239 }, { "epoch": 0.12943238506134555, "grad_norm": 7.245938777923584, "learning_rate": 4.987080943856886e-07, "logits/chosen": -1.1078895330429077, "logits/rejected": -0.7074912190437317, "logps/chosen": -161.60081481933594, "logps/rejected": -182.39317321777344, "loss": 0.6948, "rewards/accuracies": 0.625, "rewards/chosen": -0.004701994825154543, "rewards/margins": 0.0003358852118253708, "rewards/rejected": -0.005037880502641201, "step": 240 }, { "epoch": 0.12997168666576783, "grad_norm": 6.800535678863525, "learning_rate": 4.98659846180028e-07, "logits/chosen": -0.39338237047195435, "logits/rejected": -0.6595628261566162, "logps/chosen": -260.41357421875, "logps/rejected": -228.61856079101562, "loss": 0.7003, "rewards/accuracies": 0.375, "rewards/chosen": 0.04360198974609375, "rewards/margins": -0.012204170227050781, "rewards/rejected": 0.05580615997314453, "step": 241 }, { "epoch": 0.13051098827019011, "grad_norm": 7.6972880363464355, "learning_rate": 4.986107158837856e-07, "logits/chosen": 0.7222284078598022, "logits/rejected": 0.941322922706604, "logps/chosen": -239.0098876953125, "logps/rejected": -233.32351684570312, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": 0.062361814081668854, "rewards/margins": 0.024181842803955078, "rewards/rejected": 0.038179971277713776, "step": 242 }, { "epoch": 0.13105028987461237, "grad_norm": 7.45583963394165, "learning_rate": 4.985607036712452e-07, "logits/chosen": 0.03608454763889313, "logits/rejected": -0.7260949611663818, "logps/chosen": -301.29150390625, "logps/rejected": -291.7923583984375, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": -0.010033415630459785, "rewards/margins": 0.0206285510212183, "rewards/rejected": -0.030661962926387787, "step": 243 }, { "epoch": 0.13158959147903465, "grad_norm": 6.106156826019287, "learning_rate": 4.985098097198191e-07, "logits/chosen": 0.3672005832195282, "logits/rejected": -0.15952204167842865, "logps/chosen": -291.18603515625, "logps/rejected": -250.9359893798828, "loss": 0.6843, "rewards/accuracies": 0.625, "rewards/chosen": 0.01730194129049778, "rewards/margins": 0.019738387316465378, "rewards/rejected": -0.0024364469572901726, "step": 244 }, { "epoch": 0.13212889308345693, "grad_norm": 7.053607940673828, "learning_rate": 4.984580342100472e-07, "logits/chosen": 0.47961220145225525, "logits/rejected": -1.878825306892395, "logps/chosen": -223.56512451171875, "logps/rejected": -159.57623291015625, "loss": 0.7076, "rewards/accuracies": 0.375, "rewards/chosen": 0.025583554059267044, "rewards/margins": -0.024090243503451347, "rewards/rejected": 0.04967379570007324, "step": 245 }, { "epoch": 0.1326681946878792, "grad_norm": 7.0573554039001465, "learning_rate": 4.98405377325597e-07, "logits/chosen": 0.8123376965522766, "logits/rejected": 0.2774495482444763, "logps/chosen": -305.7408142089844, "logps/rejected": -235.2213134765625, "loss": 0.6851, "rewards/accuracies": 0.625, "rewards/chosen": 0.01801929622888565, "rewards/margins": 0.01817178726196289, "rewards/rejected": -0.0001524919643998146, "step": 246 }, { "epoch": 0.13320749629230147, "grad_norm": 6.690532207489014, "learning_rate": 4.983518392532625e-07, "logits/chosen": -0.34496867656707764, "logits/rejected": -0.46460530161857605, "logps/chosen": -216.6454620361328, "logps/rejected": -219.90728759765625, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": 0.02452860027551651, "rewards/margins": 0.005881068296730518, "rewards/rejected": 0.018647529184818268, "step": 247 }, { "epoch": 0.13374679789672375, "grad_norm": 8.130005836486816, "learning_rate": 4.982974201829633e-07, "logits/chosen": -1.1294013261795044, "logits/rejected": -0.9965440034866333, "logps/chosen": -209.34469604492188, "logps/rejected": -253.40505981445312, "loss": 0.7254, "rewards/accuracies": 0.375, "rewards/chosen": -0.019893646240234375, "rewards/margins": -0.06184587627649307, "rewards/rejected": 0.0419522300362587, "step": 248 }, { "epoch": 0.134286099501146, "grad_norm": 7.277283668518066, "learning_rate": 4.982421203077445e-07, "logits/chosen": -0.03627751022577286, "logits/rejected": -0.0241776704788208, "logps/chosen": -318.4494934082031, "logps/rejected": -339.3452453613281, "loss": 0.6709, "rewards/accuracies": 0.75, "rewards/chosen": 0.04390597343444824, "rewards/margins": 0.04830441623926163, "rewards/rejected": -0.0043984404765069485, "step": 249 }, { "epoch": 0.1348254011055683, "grad_norm": 8.035740852355957, "learning_rate": 4.981859398237757e-07, "logits/chosen": -0.5078524351119995, "logits/rejected": -0.6244974136352539, "logps/chosen": -292.7715759277344, "logps/rejected": -262.8336181640625, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": -0.03625240549445152, "rewards/margins": -0.032906532287597656, "rewards/rejected": -0.0033458704128861427, "step": 250 }, { "epoch": 0.13536470270999057, "grad_norm": 7.083550453186035, "learning_rate": 4.981288789303504e-07, "logits/chosen": -0.5907931923866272, "logits/rejected": -1.1230666637420654, "logps/chosen": -278.2828369140625, "logps/rejected": -207.8726348876953, "loss": 0.7149, "rewards/accuracies": 0.375, "rewards/chosen": 0.024218274280428886, "rewards/margins": -0.039838217198848724, "rewards/rejected": 0.06405648589134216, "step": 251 }, { "epoch": 0.13590400431441282, "grad_norm": 6.382159233093262, "learning_rate": 4.980709378298851e-07, "logits/chosen": 0.09374283254146576, "logits/rejected": -0.8206794857978821, "logps/chosen": -191.2978973388672, "logps/rejected": -180.0096435546875, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": 0.07540302723646164, "rewards/margins": 0.0697658583521843, "rewards/rejected": 0.0056371670216321945, "step": 252 }, { "epoch": 0.1364433059188351, "grad_norm": 7.469005107879639, "learning_rate": 4.980121167279188e-07, "logits/chosen": 0.9769697189331055, "logits/rejected": 0.21317024528980255, "logps/chosen": -276.62896728515625, "logps/rejected": -234.5221405029297, "loss": 0.7043, "rewards/accuracies": 0.375, "rewards/chosen": 0.013007547706365585, "rewards/margins": -0.019245335832238197, "rewards/rejected": 0.03225288540124893, "step": 253 }, { "epoch": 0.1369826075232574, "grad_norm": 8.205622673034668, "learning_rate": 4.979524158331123e-07, "logits/chosen": -0.3944055438041687, "logits/rejected": 0.07147693634033203, "logps/chosen": -267.31494140625, "logps/rejected": -261.1741638183594, "loss": 0.7172, "rewards/accuracies": 0.5, "rewards/chosen": -0.03220653906464577, "rewards/margins": -0.04531297832727432, "rewards/rejected": 0.013106439262628555, "step": 254 }, { "epoch": 0.13752190912767964, "grad_norm": 6.626962184906006, "learning_rate": 4.978918353572471e-07, "logits/chosen": -0.7329603433609009, "logits/rejected": -0.4787536859512329, "logps/chosen": -229.0244140625, "logps/rejected": -300.67352294921875, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": -0.00849170982837677, "rewards/margins": 0.00796956941485405, "rewards/rejected": -0.01646127738058567, "step": 255 }, { "epoch": 0.13806121073210192, "grad_norm": 6.935237884521484, "learning_rate": 4.978303755152254e-07, "logits/chosen": 0.03560969978570938, "logits/rejected": -1.1999659538269043, "logps/chosen": -228.10165405273438, "logps/rejected": -181.79171752929688, "loss": 0.7008, "rewards/accuracies": 0.375, "rewards/chosen": -0.023307038471102715, "rewards/margins": -0.0124327652156353, "rewards/rejected": -0.010874273255467415, "step": 256 }, { "epoch": 0.1386005123365242, "grad_norm": 7.650475025177002, "learning_rate": 4.977680365250681e-07, "logits/chosen": -1.3546632528305054, "logits/rejected": -0.3423352837562561, "logps/chosen": -211.08380126953125, "logps/rejected": -374.2903137207031, "loss": 0.699, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011602388694882393, "rewards/margins": -0.008600998669862747, "rewards/rejected": 0.007440757006406784, "step": 257 }, { "epoch": 0.13913981394094646, "grad_norm": 9.429962158203125, "learning_rate": 4.977048186079155e-07, "logits/chosen": -0.45951443910598755, "logits/rejected": 0.32782310247421265, "logps/chosen": -311.54681396484375, "logps/rejected": -325.0628662109375, "loss": 0.7145, "rewards/accuracies": 0.5, "rewards/chosen": 0.0207340270280838, "rewards/margins": -0.03849802166223526, "rewards/rejected": 0.059232041239738464, "step": 258 }, { "epoch": 0.13967911554536874, "grad_norm": 5.403780937194824, "learning_rate": 4.976407219880253e-07, "logits/chosen": 0.7418658137321472, "logits/rejected": 1.404748558998108, "logps/chosen": -144.71226501464844, "logps/rejected": -232.55511474609375, "loss": 0.703, "rewards/accuracies": 0.25, "rewards/chosen": 0.008006097748875618, "rewards/margins": -0.017682263627648354, "rewards/rejected": 0.02568836137652397, "step": 259 }, { "epoch": 0.14021841714979102, "grad_norm": 6.005681991577148, "learning_rate": 4.975757468927726e-07, "logits/chosen": -0.44473379850387573, "logits/rejected": -0.9610626101493835, "logps/chosen": -250.09066772460938, "logps/rejected": -241.6405029296875, "loss": 0.6643, "rewards/accuracies": 0.625, "rewards/chosen": 0.049166202545166016, "rewards/margins": 0.061343200504779816, "rewards/rejected": -0.012176993303000927, "step": 260 }, { "epoch": 0.1407577187542133, "grad_norm": 10.016715049743652, "learning_rate": 4.975098935526487e-07, "logits/chosen": -0.18471336364746094, "logits/rejected": -1.1552635431289673, "logps/chosen": -295.13739013671875, "logps/rejected": -247.22970581054688, "loss": 0.7147, "rewards/accuracies": 0.25, "rewards/chosen": -0.005094719119369984, "rewards/margins": -0.04035167768597603, "rewards/rejected": 0.03525695949792862, "step": 261 }, { "epoch": 0.14129702035863556, "grad_norm": 6.479100704193115, "learning_rate": 4.974431622012601e-07, "logits/chosen": 0.13275699317455292, "logits/rejected": -1.5483543872833252, "logps/chosen": -243.15255737304688, "logps/rejected": -224.72824096679688, "loss": 0.6806, "rewards/accuracies": 0.625, "rewards/chosen": -0.002630426548421383, "rewards/margins": 0.026785947382450104, "rewards/rejected": -0.029416371136903763, "step": 262 }, { "epoch": 0.14183632196305784, "grad_norm": 7.741325855255127, "learning_rate": 4.973755530753283e-07, "logits/chosen": -0.6653844714164734, "logits/rejected": -0.8162959814071655, "logps/chosen": -270.0834655761719, "logps/rejected": -323.9447326660156, "loss": 0.6904, "rewards/accuracies": 0.25, "rewards/chosen": 0.00294761685654521, "rewards/margins": 0.0075589194893836975, "rewards/rejected": -0.0046113021671772, "step": 263 }, { "epoch": 0.14237562356748013, "grad_norm": 6.2527360916137695, "learning_rate": 4.973070664146885e-07, "logits/chosen": 0.11969435214996338, "logits/rejected": 0.7288479804992676, "logps/chosen": -201.0423583984375, "logps/rejected": -218.999755859375, "loss": 0.6785, "rewards/accuracies": 0.5, "rewards/chosen": 0.05385284498333931, "rewards/margins": 0.03451243042945862, "rewards/rejected": 0.01934041827917099, "step": 264 }, { "epoch": 0.14291492517190238, "grad_norm": 6.975738525390625, "learning_rate": 4.972377024622886e-07, "logits/chosen": 0.940865159034729, "logits/rejected": 0.5018800497055054, "logps/chosen": -251.44015502929688, "logps/rejected": -282.7532043457031, "loss": 0.7182, "rewards/accuracies": 0.25, "rewards/chosen": 0.034502219408750534, "rewards/margins": -0.048924922943115234, "rewards/rejected": 0.08342714607715607, "step": 265 }, { "epoch": 0.14345422677632466, "grad_norm": 7.5015130043029785, "learning_rate": 4.97167461464189e-07, "logits/chosen": 0.7512156963348389, "logits/rejected": -1.2460778951644897, "logps/chosen": -295.0986022949219, "logps/rejected": -163.7998046875, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.03022785298526287, "rewards/margins": 0.022235438227653503, "rewards/rejected": 0.007992411032319069, "step": 266 }, { "epoch": 0.14399352838074694, "grad_norm": 6.968935966491699, "learning_rate": 4.970963436695611e-07, "logits/chosen": 0.9236302971839905, "logits/rejected": 0.02937990054488182, "logps/chosen": -255.0477752685547, "logps/rejected": -255.16616821289062, "loss": 0.7133, "rewards/accuracies": 0.5, "rewards/chosen": 0.019567489624023438, "rewards/margins": -0.03641472011804581, "rewards/rejected": 0.055982209742069244, "step": 267 }, { "epoch": 0.1445328299851692, "grad_norm": 6.783729553222656, "learning_rate": 4.970243493306865e-07, "logits/chosen": 0.7659680843353271, "logits/rejected": -1.0100088119506836, "logps/chosen": -232.73007202148438, "logps/rejected": -158.12939453125, "loss": 0.699, "rewards/accuracies": 0.5, "rewards/chosen": 0.028786085546016693, "rewards/margins": -0.009608175605535507, "rewards/rejected": 0.0383942611515522, "step": 268 }, { "epoch": 0.14507213158959148, "grad_norm": 7.027829170227051, "learning_rate": 4.969514787029565e-07, "logits/chosen": 0.5048552751541138, "logits/rejected": 0.04873324930667877, "logps/chosen": -331.6884765625, "logps/rejected": -293.0816650390625, "loss": 0.7249, "rewards/accuracies": 0.375, "rewards/chosen": 0.012289999052882195, "rewards/margins": -0.05680961161851883, "rewards/rejected": 0.06909961998462677, "step": 269 }, { "epoch": 0.14561143319401376, "grad_norm": 7.164796352386475, "learning_rate": 4.968777320448706e-07, "logits/chosen": -0.15304520726203918, "logits/rejected": -0.32819420099258423, "logps/chosen": -211.45181274414062, "logps/rejected": -194.14015197753906, "loss": 0.6771, "rewards/accuracies": 0.5, "rewards/chosen": 0.06357584148645401, "rewards/margins": 0.03708639368414879, "rewards/rejected": 0.02648944780230522, "step": 270 }, { "epoch": 0.14615073479843602, "grad_norm": 7.394932270050049, "learning_rate": 4.968031096180363e-07, "logits/chosen": 0.670941174030304, "logits/rejected": -1.016091227531433, "logps/chosen": -288.781982421875, "logps/rejected": -248.1809844970703, "loss": 0.6964, "rewards/accuracies": 0.375, "rewards/chosen": 0.026927758008241653, "rewards/margins": -0.0028684623539447784, "rewards/rejected": 0.029796216636896133, "step": 271 }, { "epoch": 0.1466900364028583, "grad_norm": 7.611814975738525, "learning_rate": 4.967276116871676e-07, "logits/chosen": -1.1393463611602783, "logits/rejected": 0.12975525856018066, "logps/chosen": -201.95114135742188, "logps/rejected": -260.907958984375, "loss": 0.6767, "rewards/accuracies": 0.5, "rewards/chosen": 0.05418205261230469, "rewards/margins": 0.03866462782025337, "rewards/rejected": 0.01551742572337389, "step": 272 }, { "epoch": 0.14722933800728058, "grad_norm": 6.956820011138916, "learning_rate": 4.96651238520084e-07, "logits/chosen": 1.4748153686523438, "logits/rejected": 1.1508678197860718, "logps/chosen": -298.02294921875, "logps/rejected": -305.9983215332031, "loss": 0.6766, "rewards/accuracies": 0.75, "rewards/chosen": 0.0374729186296463, "rewards/margins": 0.03598995506763458, "rewards/rejected": 0.0014829644933342934, "step": 273 }, { "epoch": 0.14776863961170283, "grad_norm": 6.799295425415039, "learning_rate": 4.965739903877104e-07, "logits/chosen": -0.44682908058166504, "logits/rejected": 0.23527950048446655, "logps/chosen": -227.02761840820312, "logps/rejected": -274.05096435546875, "loss": 0.6773, "rewards/accuracies": 0.875, "rewards/chosen": 0.004237745888531208, "rewards/margins": 0.032274533063173294, "rewards/rejected": -0.02803678810596466, "step": 274 }, { "epoch": 0.14830794121612512, "grad_norm": 7.150338649749756, "learning_rate": 4.964958675640748e-07, "logits/chosen": 0.42999449372291565, "logits/rejected": 0.46935519576072693, "logps/chosen": -291.2611999511719, "logps/rejected": -318.8696594238281, "loss": 0.6588, "rewards/accuracies": 0.875, "rewards/chosen": 0.06511078029870987, "rewards/margins": 0.07343735545873642, "rewards/rejected": -0.008326577953994274, "step": 275 }, { "epoch": 0.1488472428205474, "grad_norm": 7.595340728759766, "learning_rate": 4.964168703263086e-07, "logits/chosen": 0.3639776408672333, "logits/rejected": 0.15390892326831818, "logps/chosen": -163.7259063720703, "logps/rejected": -176.75889587402344, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.02870645374059677, "rewards/margins": 0.014700697734951973, "rewards/rejected": 0.014005756005644798, "step": 276 }, { "epoch": 0.14938654442496965, "grad_norm": 7.055171966552734, "learning_rate": 4.963369989546449e-07, "logits/chosen": -0.46584421396255493, "logits/rejected": 0.39590978622436523, "logps/chosen": -211.33746337890625, "logps/rejected": -277.5067138671875, "loss": 0.7129, "rewards/accuracies": 0.25, "rewards/chosen": -0.019206620752811432, "rewards/margins": -0.0351293571293354, "rewards/rejected": 0.01592273637652397, "step": 277 }, { "epoch": 0.14992584602939193, "grad_norm": 7.625853538513184, "learning_rate": 4.962562537324176e-07, "logits/chosen": 0.28806841373443604, "logits/rejected": -0.7278885841369629, "logps/chosen": -175.975341796875, "logps/rejected": -118.99304962158203, "loss": 0.6946, "rewards/accuracies": 0.375, "rewards/chosen": 0.0036772731691598892, "rewards/margins": -0.0007590753957629204, "rewards/rejected": 0.0044363499619066715, "step": 278 }, { "epoch": 0.15046514763381422, "grad_norm": 9.51193618774414, "learning_rate": 4.961746349460606e-07, "logits/chosen": 0.7691066861152649, "logits/rejected": -0.004334405064582825, "logps/chosen": -256.93023681640625, "logps/rejected": -211.7227783203125, "loss": 0.7082, "rewards/accuracies": 0.5, "rewards/chosen": 0.05664978176355362, "rewards/margins": -0.026592634618282318, "rewards/rejected": 0.08324241638183594, "step": 279 }, { "epoch": 0.15100444923823647, "grad_norm": 6.497079849243164, "learning_rate": 4.960921428851066e-07, "logits/chosen": 0.9420140981674194, "logits/rejected": 0.3662492334842682, "logps/chosen": -255.5016326904297, "logps/rejected": -252.77618408203125, "loss": 0.7044, "rewards/accuracies": 0.375, "rewards/chosen": 0.012600135058164597, "rewards/margins": -0.01933479495346546, "rewards/rejected": 0.03193493187427521, "step": 280 }, { "epoch": 0.15154375084265875, "grad_norm": 7.17887020111084, "learning_rate": 4.960087778421863e-07, "logits/chosen": 0.21909961104393005, "logits/rejected": -1.7199742794036865, "logps/chosen": -239.9576873779297, "logps/rejected": -142.47885131835938, "loss": 0.6791, "rewards/accuracies": 0.625, "rewards/chosen": 0.05174989998340607, "rewards/margins": 0.03003067895770073, "rewards/rejected": 0.02171921730041504, "step": 281 }, { "epoch": 0.15208305244708104, "grad_norm": 6.124083995819092, "learning_rate": 4.959245401130269e-07, "logits/chosen": 1.271176815032959, "logits/rejected": 0.5631200075149536, "logps/chosen": -232.3654327392578, "logps/rejected": -204.30230712890625, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": 0.05189714580774307, "rewards/margins": 0.04403677210211754, "rewards/rejected": 0.007860374636948109, "step": 282 }, { "epoch": 0.1526223540515033, "grad_norm": 7.0963873863220215, "learning_rate": 4.958394299964515e-07, "logits/chosen": -0.49488410353660583, "logits/rejected": -0.3159482777118683, "logps/chosen": -251.44754028320312, "logps/rejected": -276.7538757324219, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.05500964820384979, "rewards/margins": 0.010825539007782936, "rewards/rejected": 0.044184111058712006, "step": 283 }, { "epoch": 0.15316165565592557, "grad_norm": 6.302227973937988, "learning_rate": 4.957534477943781e-07, "logits/chosen": 0.32056769728660583, "logits/rejected": -0.5712308883666992, "logps/chosen": -249.69906616210938, "logps/rejected": -212.4783935546875, "loss": 0.7016, "rewards/accuracies": 0.5, "rewards/chosen": 0.0001766197383403778, "rewards/margins": -0.016330815851688385, "rewards/rejected": 0.016507435590028763, "step": 284 }, { "epoch": 0.15370095726034785, "grad_norm": 7.3974928855896, "learning_rate": 4.956665938118179e-07, "logits/chosen": -0.1926823854446411, "logits/rejected": -0.20581987500190735, "logps/chosen": -290.00933837890625, "logps/rejected": -242.93768310546875, "loss": 0.7051, "rewards/accuracies": 0.375, "rewards/chosen": 0.02474212646484375, "rewards/margins": -0.022595787420868874, "rewards/rejected": 0.047337912023067474, "step": 285 }, { "epoch": 0.15424025886477014, "grad_norm": 6.19757604598999, "learning_rate": 4.955788683568749e-07, "logits/chosen": 0.4595363438129425, "logits/rejected": 0.09717714786529541, "logps/chosen": -248.51939392089844, "logps/rejected": -264.4388122558594, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": 0.08515110611915588, "rewards/margins": 0.08466282486915588, "rewards/rejected": 0.00048828125, "step": 286 }, { "epoch": 0.1547795604691924, "grad_norm": 6.852083206176758, "learning_rate": 4.954902717407445e-07, "logits/chosen": -0.46771180629730225, "logits/rejected": 0.35952478647232056, "logps/chosen": -178.62408447265625, "logps/rejected": -196.06698608398438, "loss": 0.7006, "rewards/accuracies": 0.5, "rewards/chosen": 0.0633634552359581, "rewards/margins": -0.013229179196059704, "rewards/rejected": 0.07659263908863068, "step": 287 }, { "epoch": 0.15531886207361467, "grad_norm": 6.85208797454834, "learning_rate": 4.954008042777125e-07, "logits/chosen": -1.069166660308838, "logits/rejected": -1.4164541959762573, "logps/chosen": -216.59228515625, "logps/rejected": -246.2570343017578, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.014000415802001953, "rewards/margins": 0.021703623235225677, "rewards/rejected": -0.007703209295868874, "step": 288 }, { "epoch": 0.15585816367803695, "grad_norm": 6.859569072723389, "learning_rate": 4.953104662851536e-07, "logits/chosen": -0.016961753368377686, "logits/rejected": 0.3308089077472687, "logps/chosen": -236.31045532226562, "logps/rejected": -305.4233703613281, "loss": 0.7123, "rewards/accuracies": 0.375, "rewards/chosen": 0.04699096828699112, "rewards/margins": -0.03661050647497177, "rewards/rejected": 0.08360147476196289, "step": 289 }, { "epoch": 0.1563974652824592, "grad_norm": 6.841517925262451, "learning_rate": 4.952192580835312e-07, "logits/chosen": 0.7645677328109741, "logits/rejected": 0.20929038524627686, "logps/chosen": -195.912353515625, "logps/rejected": -178.1552734375, "loss": 0.704, "rewards/accuracies": 0.375, "rewards/chosen": 0.033678628504276276, "rewards/margins": -0.02026071399450302, "rewards/rejected": 0.0539393424987793, "step": 290 }, { "epoch": 0.1569367668868815, "grad_norm": 6.722567558288574, "learning_rate": 4.951271799963951e-07, "logits/chosen": -0.32621827721595764, "logits/rejected": -1.4214487075805664, "logps/chosen": -237.9565887451172, "logps/rejected": -175.15277099609375, "loss": 0.6832, "rewards/accuracies": 0.875, "rewards/chosen": 0.031136898323893547, "rewards/margins": 0.021662618964910507, "rewards/rejected": 0.009474278427660465, "step": 291 }, { "epoch": 0.15747606849130377, "grad_norm": 7.189018726348877, "learning_rate": 4.950342323503811e-07, "logits/chosen": 0.14038458466529846, "logits/rejected": -1.4970124959945679, "logps/chosen": -248.1014862060547, "logps/rejected": -211.6341552734375, "loss": 0.6683, "rewards/accuracies": 0.75, "rewards/chosen": 0.05682992935180664, "rewards/margins": 0.051987361162900925, "rewards/rejected": 0.0048425691202282906, "step": 292 }, { "epoch": 0.15801537009572603, "grad_norm": 7.112308502197266, "learning_rate": 4.949404154752098e-07, "logits/chosen": 0.1918865144252777, "logits/rejected": -0.4937649667263031, "logps/chosen": -276.3597106933594, "logps/rejected": -284.0098876953125, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 0.004932403564453125, "rewards/margins": 0.02468395233154297, "rewards/rejected": -0.019751548767089844, "step": 293 }, { "epoch": 0.1585546717001483, "grad_norm": 8.482094764709473, "learning_rate": 4.948457297036851e-07, "logits/chosen": -0.3843969702720642, "logits/rejected": -0.9220460057258606, "logps/chosen": -302.621826171875, "logps/rejected": -281.5071105957031, "loss": 0.7309, "rewards/accuracies": 0.375, "rewards/chosen": -0.00798807106912136, "rewards/margins": -0.07209462672472, "rewards/rejected": 0.06410656124353409, "step": 294 }, { "epoch": 0.1590939733045706, "grad_norm": 6.009162425994873, "learning_rate": 4.947501753716933e-07, "logits/chosen": -0.12672489881515503, "logits/rejected": -0.4962383508682251, "logps/chosen": -231.71585083007812, "logps/rejected": -236.89358520507812, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": 0.0275739673525095, "rewards/margins": 0.04227294772863388, "rewards/rejected": -0.014698982238769531, "step": 295 }, { "epoch": 0.15963327490899284, "grad_norm": 6.866663455963135, "learning_rate": 4.946537528182017e-07, "logits/chosen": 0.0818101316690445, "logits/rejected": -0.5251861810684204, "logps/chosen": -248.1862335205078, "logps/rejected": -207.19595336914062, "loss": 0.6295, "rewards/accuracies": 0.875, "rewards/chosen": 0.10463066399097443, "rewards/margins": 0.1355222761631012, "rewards/rejected": -0.03089160844683647, "step": 296 }, { "epoch": 0.16017257651341513, "grad_norm": 7.0490264892578125, "learning_rate": 4.945564623852576e-07, "logits/chosen": 0.3640037775039673, "logits/rejected": 0.05723738670349121, "logps/chosen": -247.6593017578125, "logps/rejected": -204.70999145507812, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": 0.0067681786604225636, "rewards/margins": 0.013862036168575287, "rewards/rejected": -0.007093856111168861, "step": 297 }, { "epoch": 0.1607118781178374, "grad_norm": 6.879843711853027, "learning_rate": 4.944583044179871e-07, "logits/chosen": -0.19362081587314606, "logits/rejected": -0.827705979347229, "logps/chosen": -213.74229431152344, "logps/rejected": -191.89678955078125, "loss": 0.7172, "rewards/accuracies": 0.375, "rewards/chosen": -0.02147235907614231, "rewards/margins": -0.042739108204841614, "rewards/rejected": 0.021266747266054153, "step": 298 }, { "epoch": 0.16125117972225966, "grad_norm": 7.343256950378418, "learning_rate": 4.943592792645935e-07, "logits/chosen": 0.44100916385650635, "logits/rejected": -0.1896933615207672, "logps/chosen": -199.1534881591797, "logps/rejected": -241.2955780029297, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 0.051564499735832214, "rewards/margins": 0.034538887441158295, "rewards/rejected": 0.01702561415731907, "step": 299 }, { "epoch": 0.16179048132668195, "grad_norm": 7.9021477699279785, "learning_rate": 4.942593872763566e-07, "logits/chosen": 0.9316891431808472, "logits/rejected": -0.8143026828765869, "logps/chosen": -328.4246826171875, "logps/rejected": -215.57089233398438, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": 0.030696485191583633, "rewards/margins": 0.018820764496922493, "rewards/rejected": 0.011875724419951439, "step": 300 }, { "epoch": 0.16179048132668195, "eval_logits/chosen": 1.36769700050354, "eval_logits/rejected": 1.0921075344085693, "eval_logps/chosen": -251.16909790039062, "eval_logps/rejected": -235.67300415039062, "eval_loss": 0.6902619004249573, "eval_rewards/accuracies": 0.5254658460617065, "eval_rewards/chosen": 0.028236668556928635, "eval_rewards/margins": 0.009282215498387814, "eval_rewards/rejected": 0.018954452127218246, "eval_runtime": 836.2795, "eval_samples_per_second": 1.925, "eval_steps_per_second": 0.963, "step": 300 }, { "epoch": 0.16232978293110423, "grad_norm": 6.120275497436523, "learning_rate": 4.94158628807631e-07, "logits/chosen": 1.0370820760726929, "logits/rejected": 0.23097610473632812, "logps/chosen": -234.31834411621094, "logps/rejected": -223.43011474609375, "loss": 0.6709, "rewards/accuracies": 0.75, "rewards/chosen": 0.04415950924158096, "rewards/margins": 0.04793262854218483, "rewards/rejected": -0.0037731174379587173, "step": 301 }, { "epoch": 0.16286908453552648, "grad_norm": 7.1938886642456055, "learning_rate": 4.940570042158453e-07, "logits/chosen": 0.8276575803756714, "logits/rejected": -0.5541470050811768, "logps/chosen": -249.00186157226562, "logps/rejected": -211.97348022460938, "loss": 0.7315, "rewards/accuracies": 0.375, "rewards/chosen": -0.004935741424560547, "rewards/margins": -0.07120447605848312, "rewards/rejected": 0.06626872718334198, "step": 302 }, { "epoch": 0.16340838613994876, "grad_norm": 7.518136978149414, "learning_rate": 4.939545138615002e-07, "logits/chosen": -0.6760913729667664, "logits/rejected": -0.6192342042922974, "logps/chosen": -249.30502319335938, "logps/rejected": -265.447021484375, "loss": 0.7218, "rewards/accuracies": 0.25, "rewards/chosen": -0.018178176134824753, "rewards/margins": -0.0556582473218441, "rewards/rejected": 0.03748006746172905, "step": 303 }, { "epoch": 0.16394768774437105, "grad_norm": 7.457335948944092, "learning_rate": 4.938511581081679e-07, "logits/chosen": 1.05903959274292, "logits/rejected": -0.20604082942008972, "logps/chosen": -283.94989013671875, "logps/rejected": -222.2242431640625, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": 0.04851188883185387, "rewards/margins": 0.04025869444012642, "rewards/rejected": 0.008253194391727448, "step": 304 }, { "epoch": 0.1644869893487933, "grad_norm": 6.771284580230713, "learning_rate": 4.937469373224903e-07, "logits/chosen": 0.9652900695800781, "logits/rejected": 0.35728466510772705, "logps/chosen": -203.68081665039062, "logps/rejected": -245.9359130859375, "loss": 0.6724, "rewards/accuracies": 0.875, "rewards/chosen": 0.04848422855138779, "rewards/margins": 0.044826894998550415, "rewards/rejected": 0.0036573405377566814, "step": 305 }, { "epoch": 0.16502629095321558, "grad_norm": 7.255806922912598, "learning_rate": 4.936418518741779e-07, "logits/chosen": 1.2456285953521729, "logits/rejected": -0.6015874147415161, "logps/chosen": -309.8321533203125, "logps/rejected": -223.2439727783203, "loss": 0.6977, "rewards/accuracies": 0.375, "rewards/chosen": 0.06496906280517578, "rewards/margins": -0.007153510116040707, "rewards/rejected": 0.07212257385253906, "step": 306 }, { "epoch": 0.16556559255763786, "grad_norm": 7.47069787979126, "learning_rate": 4.935359021360088e-07, "logits/chosen": 0.47164538502693176, "logits/rejected": -0.12956440448760986, "logps/chosen": -318.53411865234375, "logps/rejected": -281.558349609375, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.02053089067339897, "rewards/margins": 0.00937824510037899, "rewards/rejected": 0.01115264743566513, "step": 307 }, { "epoch": 0.16610489416206012, "grad_norm": 7.144047260284424, "learning_rate": 4.934290884838266e-07, "logits/chosen": 0.2183067500591278, "logits/rejected": -1.4005802869796753, "logps/chosen": -251.5963134765625, "logps/rejected": -190.5490264892578, "loss": 0.6924, "rewards/accuracies": 0.375, "rewards/chosen": 0.011474991217255592, "rewards/margins": 0.0044061653316020966, "rewards/rejected": 0.007068825885653496, "step": 308 }, { "epoch": 0.1666441957664824, "grad_norm": 7.174031734466553, "learning_rate": 4.933214112965398e-07, "logits/chosen": 0.2931312918663025, "logits/rejected": -1.880274772644043, "logps/chosen": -264.8380432128906, "logps/rejected": -199.34259033203125, "loss": 0.6711, "rewards/accuracies": 0.5, "rewards/chosen": 0.07019510865211487, "rewards/margins": 0.04914502799510956, "rewards/rejected": 0.021050073206424713, "step": 309 }, { "epoch": 0.16718349737090468, "grad_norm": 5.5080718994140625, "learning_rate": 4.932128709561202e-07, "logits/chosen": 1.4152647256851196, "logits/rejected": 0.5885125398635864, "logps/chosen": -221.46920776367188, "logps/rejected": -188.47328186035156, "loss": 0.6642, "rewards/accuracies": 0.875, "rewards/chosen": 0.09029064327478409, "rewards/margins": 0.06055183336138725, "rewards/rejected": 0.029738808050751686, "step": 310 }, { "epoch": 0.16772279897532696, "grad_norm": 7.5732951164245605, "learning_rate": 4.931034678476014e-07, "logits/chosen": -0.5779528021812439, "logits/rejected": 0.05726081505417824, "logps/chosen": -353.75445556640625, "logps/rejected": -341.21368408203125, "loss": 0.7204, "rewards/accuracies": 0.5, "rewards/chosen": -0.03344077989459038, "rewards/margins": -0.04994068294763565, "rewards/rejected": 0.016499899327754974, "step": 311 }, { "epoch": 0.16826210057974922, "grad_norm": 9.011028289794922, "learning_rate": 4.929932023590776e-07, "logits/chosen": -0.22219768166542053, "logits/rejected": -1.3508055210113525, "logps/chosen": -344.6303405761719, "logps/rejected": -263.29180908203125, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": -0.0038212770596146584, "rewards/margins": 0.003760622814297676, "rewards/rejected": -0.007581901736557484, "step": 312 }, { "epoch": 0.1688014021841715, "grad_norm": 7.376155376434326, "learning_rate": 4.928820748817022e-07, "logits/chosen": 0.6935895681381226, "logits/rejected": -1.4672214984893799, "logps/chosen": -250.90499877929688, "logps/rejected": -124.50170135498047, "loss": 0.6731, "rewards/accuracies": 0.75, "rewards/chosen": 0.057933565229177475, "rewards/margins": 0.043183375149965286, "rewards/rejected": 0.014750193804502487, "step": 313 }, { "epoch": 0.16934070378859378, "grad_norm": 6.258421897888184, "learning_rate": 4.927700858096866e-07, "logits/chosen": 0.8714083433151245, "logits/rejected": 0.7730660438537598, "logps/chosen": -269.2637023925781, "logps/rejected": -256.07098388671875, "loss": 0.6577, "rewards/accuracies": 0.875, "rewards/chosen": 0.06649618595838547, "rewards/margins": 0.0741482824087143, "rewards/rejected": -0.007652090862393379, "step": 314 }, { "epoch": 0.16988000539301604, "grad_norm": 6.904129505157471, "learning_rate": 4.926572355402982e-07, "logits/chosen": -0.3181900978088379, "logits/rejected": 0.1865605264902115, "logps/chosen": -308.642578125, "logps/rejected": -282.591552734375, "loss": 0.6923, "rewards/accuracies": 0.375, "rewards/chosen": 0.014144515618681908, "rewards/margins": 0.005718378350138664, "rewards/rejected": 0.008426143787801266, "step": 315 }, { "epoch": 0.17041930699743832, "grad_norm": 6.827880382537842, "learning_rate": 4.925435244738598e-07, "logits/chosen": 0.9978060722351074, "logits/rejected": 1.1492372751235962, "logps/chosen": -243.276123046875, "logps/rejected": -238.74229431152344, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 0.03729381784796715, "rewards/margins": 0.032686617225408554, "rewards/rejected": 0.004607198294252157, "step": 316 }, { "epoch": 0.1709586086018606, "grad_norm": 6.58591890335083, "learning_rate": 4.924289530137475e-07, "logits/chosen": 0.43028929829597473, "logits/rejected": 0.1454145759344101, "logps/chosen": -207.32395935058594, "logps/rejected": -173.64927673339844, "loss": 0.715, "rewards/accuracies": 0.375, "rewards/chosen": 0.050754353404045105, "rewards/margins": -0.039485834538936615, "rewards/rejected": 0.09024018794298172, "step": 317 }, { "epoch": 0.17149791020628286, "grad_norm": 7.428851127624512, "learning_rate": 4.923135215663896e-07, "logits/chosen": -0.05843360722064972, "logits/rejected": 0.9195795655250549, "logps/chosen": -236.404296875, "logps/rejected": -321.2883605957031, "loss": 0.7248, "rewards/accuracies": 0.25, "rewards/chosen": 0.0005330101121217012, "rewards/margins": -0.0600464791059494, "rewards/rejected": 0.06057949736714363, "step": 318 }, { "epoch": 0.17203721181070514, "grad_norm": 6.758139133453369, "learning_rate": 4.921972305412652e-07, "logits/chosen": -0.2088940441608429, "logits/rejected": 0.11192432045936584, "logps/chosen": -194.5166015625, "logps/rejected": -211.46156311035156, "loss": 0.7026, "rewards/accuracies": 0.25, "rewards/chosen": 0.006413553841412067, "rewards/margins": -0.017165662720799446, "rewards/rejected": 0.023579217493534088, "step": 319 }, { "epoch": 0.17257651341512742, "grad_norm": 9.099769592285156, "learning_rate": 4.920800803509025e-07, "logits/chosen": -0.26491162180900574, "logits/rejected": -1.4534555673599243, "logps/chosen": -240.38812255859375, "logps/rejected": -174.0598602294922, "loss": 0.6976, "rewards/accuracies": 0.375, "rewards/chosen": 0.027118302881717682, "rewards/margins": -0.007027913816273212, "rewards/rejected": 0.03414621204137802, "step": 320 }, { "epoch": 0.17311581501954967, "grad_norm": 7.086970329284668, "learning_rate": 4.919620714108776e-07, "logits/chosen": 0.28310921788215637, "logits/rejected": 0.0014587640762329102, "logps/chosen": -273.52874755859375, "logps/rejected": -270.67388916015625, "loss": 0.7175, "rewards/accuracies": 0.25, "rewards/chosen": 0.011449910700321198, "rewards/margins": -0.04702701419591904, "rewards/rejected": 0.058476924896240234, "step": 321 }, { "epoch": 0.17365511662397196, "grad_norm": 6.7428388595581055, "learning_rate": 4.91843204139813e-07, "logits/chosen": 0.08477144688367844, "logits/rejected": 0.7117512226104736, "logps/chosen": -314.7152099609375, "logps/rejected": -248.56326293945312, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": 0.04163064807653427, "rewards/margins": 0.005458921194076538, "rewards/rejected": 0.036171723157167435, "step": 322 }, { "epoch": 0.17419441822839424, "grad_norm": 7.0141520500183105, "learning_rate": 4.917234789593757e-07, "logits/chosen": 0.2400125116109848, "logits/rejected": 0.37906965613365173, "logps/chosen": -246.68507385253906, "logps/rejected": -274.499267578125, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": 0.030530642718076706, "rewards/margins": 0.02805623598396778, "rewards/rejected": 0.002474403940141201, "step": 323 }, { "epoch": 0.1747337198328165, "grad_norm": 6.228954315185547, "learning_rate": 4.916028962942762e-07, "logits/chosen": -0.1633884608745575, "logits/rejected": 0.3135414719581604, "logps/chosen": -165.658447265625, "logps/rejected": -217.794677734375, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": 0.04798975586891174, "rewards/margins": -0.018690109252929688, "rewards/rejected": 0.06667985767126083, "step": 324 }, { "epoch": 0.17527302143723877, "grad_norm": 7.310187339782715, "learning_rate": 4.91481456572267e-07, "logits/chosen": 1.6092603206634521, "logits/rejected": -0.0040544793009757996, "logps/chosen": -297.8753967285156, "logps/rejected": -189.53672790527344, "loss": 0.6648, "rewards/accuracies": 0.75, "rewards/chosen": 0.11985740810632706, "rewards/margins": 0.059430792927742004, "rewards/rejected": 0.06042661890387535, "step": 325 }, { "epoch": 0.17581232304166106, "grad_norm": 6.311262130737305, "learning_rate": 4.913591602241409e-07, "logits/chosen": 1.1048481464385986, "logits/rejected": 0.9923666715621948, "logps/chosen": -215.1932373046875, "logps/rejected": -261.67974853515625, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": 0.03792552649974823, "rewards/margins": 0.03330173343420029, "rewards/rejected": 0.004623793996870518, "step": 326 }, { "epoch": 0.1763516246460833, "grad_norm": 5.707093238830566, "learning_rate": 4.912360076837288e-07, "logits/chosen": 1.4086729288101196, "logits/rejected": 0.8638795018196106, "logps/chosen": -234.33419799804688, "logps/rejected": -213.75001525878906, "loss": 0.6568, "rewards/accuracies": 0.875, "rewards/chosen": 0.05860719457268715, "rewards/margins": 0.0790342390537262, "rewards/rejected": -0.02042703703045845, "step": 327 }, { "epoch": 0.1768909262505056, "grad_norm": 6.620418071746826, "learning_rate": 4.911119993878999e-07, "logits/chosen": -0.552768349647522, "logits/rejected": -1.0339789390563965, "logps/chosen": -212.3636474609375, "logps/rejected": -244.8211669921875, "loss": 0.7014, "rewards/accuracies": 0.375, "rewards/chosen": 0.06139860302209854, "rewards/margins": -0.01404170785099268, "rewards/rejected": 0.07544031739234924, "step": 328 }, { "epoch": 0.17743022785492787, "grad_norm": 6.482674598693848, "learning_rate": 4.909871357765583e-07, "logits/chosen": 0.02032032608985901, "logits/rejected": -0.19802314043045044, "logps/chosen": -211.67776489257812, "logps/rejected": -207.19070434570312, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.038132958114147186, "rewards/margins": 0.0015455279499292374, "rewards/rejected": 0.0365874283015728, "step": 329 }, { "epoch": 0.17796952945935013, "grad_norm": 8.004170417785645, "learning_rate": 4.908614172926425e-07, "logits/chosen": 0.35088056325912476, "logits/rejected": -0.9160122275352478, "logps/chosen": -434.55194091796875, "logps/rejected": -238.55728149414062, "loss": 0.6474, "rewards/accuracies": 0.75, "rewards/chosen": 0.12042771279811859, "rewards/margins": 0.10106439888477325, "rewards/rejected": 0.01936330646276474, "step": 330 }, { "epoch": 0.1785088310637724, "grad_norm": 7.896081447601318, "learning_rate": 4.907348443821237e-07, "logits/chosen": 0.01562672108411789, "logits/rejected": -0.6701593399047852, "logps/chosen": -236.2156219482422, "logps/rejected": -280.30792236328125, "loss": 0.668, "rewards/accuracies": 0.75, "rewards/chosen": 0.10293684154748917, "rewards/margins": 0.05548830330371857, "rewards/rejected": 0.0474485382437706, "step": 331 }, { "epoch": 0.1790481326681947, "grad_norm": 7.543493270874023, "learning_rate": 4.906074174940037e-07, "logits/chosen": 1.4010642766952515, "logits/rejected": -1.2164325714111328, "logps/chosen": -363.7342529296875, "logps/rejected": -213.28598022460938, "loss": 0.6576, "rewards/accuracies": 0.75, "rewards/chosen": 0.108606718480587, "rewards/margins": 0.07441139221191406, "rewards/rejected": 0.03419532626867294, "step": 332 }, { "epoch": 0.17958743427261697, "grad_norm": 7.610644817352295, "learning_rate": 4.904791370803141e-07, "logits/chosen": 0.13164830207824707, "logits/rejected": -0.07849752902984619, "logps/chosen": -236.9071044921875, "logps/rejected": -322.5400085449219, "loss": 0.7281, "rewards/accuracies": 0.25, "rewards/chosen": -0.016489602625370026, "rewards/margins": -0.06675548851490021, "rewards/rejected": 0.05026588588953018, "step": 333 }, { "epoch": 0.18012673587703923, "grad_norm": 7.665918827056885, "learning_rate": 4.903500035961138e-07, "logits/chosen": 0.21236580610275269, "logits/rejected": 0.13031135499477386, "logps/chosen": -212.68338012695312, "logps/rejected": -222.82850646972656, "loss": 0.7321, "rewards/accuracies": 0.25, "rewards/chosen": -0.017282579094171524, "rewards/margins": -0.07476139068603516, "rewards/rejected": 0.057478807866573334, "step": 334 }, { "epoch": 0.1806660374814615, "grad_norm": 6.563136577606201, "learning_rate": 4.902200174994885e-07, "logits/chosen": -0.5555384159088135, "logits/rejected": -1.6604230403900146, "logps/chosen": -211.4749755859375, "logps/rejected": -144.16897583007812, "loss": 0.6973, "rewards/accuracies": 0.375, "rewards/chosen": 0.013179492205381393, "rewards/margins": -0.006801414303481579, "rewards/rejected": 0.019980909302830696, "step": 335 }, { "epoch": 0.1812053390858838, "grad_norm": 7.02855920791626, "learning_rate": 4.900891792515478e-07, "logits/chosen": -0.2692101299762726, "logits/rejected": -0.30474478006362915, "logps/chosen": -304.3692626953125, "logps/rejected": -305.10888671875, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.03754234313964844, "rewards/margins": 0.005287172272801399, "rewards/rejected": 0.03225517272949219, "step": 336 }, { "epoch": 0.18174464069030605, "grad_norm": 6.892582416534424, "learning_rate": 4.899574893164246e-07, "logits/chosen": -0.9864411950111389, "logits/rejected": 0.11105549335479736, "logps/chosen": -222.7388916015625, "logps/rejected": -266.42376708984375, "loss": 0.7116, "rewards/accuracies": 0.25, "rewards/chosen": 0.047051530331373215, "rewards/margins": -0.03475913777947426, "rewards/rejected": 0.08181066811084747, "step": 337 }, { "epoch": 0.18228394229472833, "grad_norm": 5.762548923492432, "learning_rate": 4.89824948161273e-07, "logits/chosen": -0.512043297290802, "logits/rejected": -0.5357567071914673, "logps/chosen": -228.158203125, "logps/rejected": -198.63720703125, "loss": 0.6662, "rewards/accuracies": 0.625, "rewards/chosen": 0.0463777594268322, "rewards/margins": 0.05963430553674698, "rewards/rejected": -0.013256549835205078, "step": 338 }, { "epoch": 0.1828232438991506, "grad_norm": 8.227638244628906, "learning_rate": 4.896915562562664e-07, "logits/chosen": 0.017780199646949768, "logits/rejected": -0.1509542614221573, "logps/chosen": -190.68243408203125, "logps/rejected": -244.82339477539062, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": 0.009334946051239967, "rewards/margins": 0.0014528259634971619, "rewards/rejected": 0.007882120087742805, "step": 339 }, { "epoch": 0.18336254550357287, "grad_norm": 6.874902725219727, "learning_rate": 4.895573140745967e-07, "logits/chosen": -0.9079676866531372, "logits/rejected": -0.20568592846393585, "logps/chosen": -197.887451171875, "logps/rejected": -259.86688232421875, "loss": 0.7071, "rewards/accuracies": 0.375, "rewards/chosen": 0.028684522956609726, "rewards/margins": -0.02470693364739418, "rewards/rejected": 0.053391456604003906, "step": 340 }, { "epoch": 0.18390184710799515, "grad_norm": 7.905405521392822, "learning_rate": 4.894222220924714e-07, "logits/chosen": -0.9034596681594849, "logits/rejected": 0.6786455512046814, "logps/chosen": -274.13629150390625, "logps/rejected": -268.5071716308594, "loss": 0.7269, "rewards/accuracies": 0.125, "rewards/chosen": 0.0009898170828819275, "rewards/margins": -0.05828113108873367, "rewards/rejected": 0.0592709481716156, "step": 341 }, { "epoch": 0.18444114871241743, "grad_norm": 6.54630184173584, "learning_rate": 4.892862807891131e-07, "logits/chosen": 0.19631776213645935, "logits/rejected": 0.28801339864730835, "logps/chosen": -210.48802185058594, "logps/rejected": -166.89776611328125, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": 0.00026359595358371735, "rewards/margins": 0.006461523473262787, "rewards/rejected": -0.006197928451001644, "step": 342 }, { "epoch": 0.18498045031683968, "grad_norm": 6.227613925933838, "learning_rate": 4.891494906467569e-07, "logits/chosen": -0.4274633526802063, "logits/rejected": 0.14253485202789307, "logps/chosen": -212.43397521972656, "logps/rejected": -269.8636779785156, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.060703568160533905, "rewards/margins": 0.031422339379787445, "rewards/rejected": 0.029281234368681908, "step": 343 }, { "epoch": 0.18551975192126197, "grad_norm": 6.577676296234131, "learning_rate": 4.890118521506494e-07, "logits/chosen": -0.587358832359314, "logits/rejected": 0.07917607575654984, "logps/chosen": -253.23019409179688, "logps/rejected": -241.08599853515625, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.03272990882396698, "rewards/margins": 0.014605714939534664, "rewards/rejected": 0.018124200403690338, "step": 344 }, { "epoch": 0.18605905352568425, "grad_norm": 7.7639570236206055, "learning_rate": 4.888733657890463e-07, "logits/chosen": -0.23355908691883087, "logits/rejected": 0.030055224895477295, "logps/chosen": -228.00674438476562, "logps/rejected": -320.0171813964844, "loss": 0.703, "rewards/accuracies": 0.625, "rewards/chosen": 0.02257823944091797, "rewards/margins": -0.01605663076043129, "rewards/rejected": 0.03863487392663956, "step": 345 }, { "epoch": 0.1865983551301065, "grad_norm": 7.585418701171875, "learning_rate": 4.887340320532111e-07, "logits/chosen": 0.9668365120887756, "logits/rejected": -0.664452075958252, "logps/chosen": -271.9278564453125, "logps/rejected": -226.81329345703125, "loss": 0.6977, "rewards/accuracies": 0.625, "rewards/chosen": 0.040564537048339844, "rewards/margins": -0.0030744560062885284, "rewards/rejected": 0.043638989329338074, "step": 346 }, { "epoch": 0.18713765673452878, "grad_norm": 8.333971977233887, "learning_rate": 4.885938514374134e-07, "logits/chosen": 1.270597219467163, "logits/rejected": 0.610538125038147, "logps/chosen": -318.060546875, "logps/rejected": -280.641845703125, "loss": 0.6557, "rewards/accuracies": 0.75, "rewards/chosen": 0.07430315762758255, "rewards/margins": 0.07834368199110031, "rewards/rejected": -0.004040526691824198, "step": 347 }, { "epoch": 0.18767695833895107, "grad_norm": 8.306584358215332, "learning_rate": 4.884528244389268e-07, "logits/chosen": -0.28998616337776184, "logits/rejected": -1.217628836631775, "logps/chosen": -201.44302368164062, "logps/rejected": -249.29669189453125, "loss": 0.6781, "rewards/accuracies": 0.625, "rewards/chosen": 0.05270424112677574, "rewards/margins": 0.03135652467608452, "rewards/rejected": 0.021347712725400925, "step": 348 }, { "epoch": 0.18821625994337332, "grad_norm": 7.021573066711426, "learning_rate": 4.883109515580275e-07, "logits/chosen": 2.0263586044311523, "logits/rejected": 0.38609421253204346, "logps/chosen": -289.29339599609375, "logps/rejected": -242.59625244140625, "loss": 0.6886, "rewards/accuracies": 0.375, "rewards/chosen": 0.024378012865781784, "rewards/margins": 0.014161685481667519, "rewards/rejected": 0.01021633017808199, "step": 349 }, { "epoch": 0.1887555615477956, "grad_norm": 8.338702201843262, "learning_rate": 4.881682332979924e-07, "logits/chosen": -0.009876757860183716, "logits/rejected": -1.422835111618042, "logps/chosen": -362.1077880859375, "logps/rejected": -284.6011962890625, "loss": 0.7352, "rewards/accuracies": 0.125, "rewards/chosen": -0.032071303576231, "rewards/margins": -0.08141174167394638, "rewards/rejected": 0.049340441823005676, "step": 350 }, { "epoch": 0.18929486315221788, "grad_norm": 6.947585582733154, "learning_rate": 4.880246701650969e-07, "logits/chosen": -0.08935344964265823, "logits/rejected": -0.6580111980438232, "logps/chosen": -232.67886352539062, "logps/rejected": -208.05714416503906, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": 0.04487104341387749, "rewards/margins": 0.020653918385505676, "rewards/rejected": 0.02421712875366211, "step": 351 }, { "epoch": 0.18983416475664014, "grad_norm": 7.071460723876953, "learning_rate": 4.878802626686141e-07, "logits/chosen": 0.6866927146911621, "logits/rejected": 0.3640771508216858, "logps/chosen": -173.05859375, "logps/rejected": -227.92922973632812, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.01878700591623783, "rewards/margins": 0.01570892333984375, "rewards/rejected": 0.0030780797824263573, "step": 352 }, { "epoch": 0.19037346636106242, "grad_norm": 7.694014072418213, "learning_rate": 4.877350113208119e-07, "logits/chosen": -0.9667006731033325, "logits/rejected": -0.4678923189640045, "logps/chosen": -256.6067199707031, "logps/rejected": -266.2572937011719, "loss": 0.6901, "rewards/accuracies": 0.375, "rewards/chosen": 0.07502011954784393, "rewards/margins": 0.008666230365633965, "rewards/rejected": 0.06635389477014542, "step": 353 }, { "epoch": 0.1909127679654847, "grad_norm": 6.880246639251709, "learning_rate": 4.875889166369516e-07, "logits/chosen": 0.1462407112121582, "logits/rejected": 0.47718092799186707, "logps/chosen": -237.5365753173828, "logps/rejected": -276.1529235839844, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.006990052759647369, "rewards/margins": 0.018410593271255493, "rewards/rejected": -0.0114205377176404, "step": 354 }, { "epoch": 0.19145206956990696, "grad_norm": 5.889229774475098, "learning_rate": 4.874419791352866e-07, "logits/chosen": 0.21845223009586334, "logits/rejected": 0.24205327033996582, "logps/chosen": -212.35934448242188, "logps/rejected": -298.28692626953125, "loss": 0.7036, "rewards/accuracies": 0.625, "rewards/chosen": 0.013100244104862213, "rewards/margins": -0.014768218621611595, "rewards/rejected": 0.02786846272647381, "step": 355 }, { "epoch": 0.19199137117432924, "grad_norm": 7.8910956382751465, "learning_rate": 4.872941993370597e-07, "logits/chosen": -0.48658061027526855, "logits/rejected": 0.7095248699188232, "logps/chosen": -333.105224609375, "logps/rejected": -409.37640380859375, "loss": 0.7506, "rewards/accuracies": 0.25, "rewards/chosen": -0.01883563958108425, "rewards/margins": -0.10961122810840607, "rewards/rejected": 0.09077558666467667, "step": 356 }, { "epoch": 0.19253067277875152, "grad_norm": 6.694991588592529, "learning_rate": 4.871455777665019e-07, "logits/chosen": 1.3582202196121216, "logits/rejected": 0.8436437249183655, "logps/chosen": -193.4810333251953, "logps/rejected": -185.2890625, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": -0.010372066870331764, "rewards/margins": 0.020956896245479584, "rewards/rejected": -0.0313289649784565, "step": 357 }, { "epoch": 0.1930699743831738, "grad_norm": 6.864197731018066, "learning_rate": 4.8699611495083e-07, "logits/chosen": -0.09530555456876755, "logits/rejected": -1.468320369720459, "logps/chosen": -258.1050109863281, "logps/rejected": -305.2944030761719, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": 0.07539310306310654, "rewards/margins": 0.002826213836669922, "rewards/rejected": 0.07256689667701721, "step": 358 }, { "epoch": 0.19360927598759606, "grad_norm": 6.490684986114502, "learning_rate": 4.868458114202454e-07, "logits/chosen": 0.589430034160614, "logits/rejected": 0.4312489926815033, "logps/chosen": -280.6070556640625, "logps/rejected": -242.93081665039062, "loss": 0.7162, "rewards/accuracies": 0.375, "rewards/chosen": 0.048340413719415665, "rewards/margins": -0.042814452201128006, "rewards/rejected": 0.09115486592054367, "step": 359 }, { "epoch": 0.19414857759201834, "grad_norm": 8.789266586303711, "learning_rate": 4.866946677079314e-07, "logits/chosen": 0.28693586587905884, "logits/rejected": -0.19853529334068298, "logps/chosen": -239.0048828125, "logps/rejected": -212.35409545898438, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.06635399162769318, "rewards/margins": 0.005817793309688568, "rewards/rejected": 0.06053619086742401, "step": 360 }, { "epoch": 0.19468787919644062, "grad_norm": 7.261064529418945, "learning_rate": 4.865426843500519e-07, "logits/chosen": -0.4050039052963257, "logits/rejected": -0.06253436207771301, "logps/chosen": -272.9020690917969, "logps/rejected": -273.347900390625, "loss": 0.6762, "rewards/accuracies": 0.5, "rewards/chosen": 0.07946338504552841, "rewards/margins": 0.03965158388018608, "rewards/rejected": 0.03981180116534233, "step": 361 }, { "epoch": 0.19522718080086288, "grad_norm": 7.862057209014893, "learning_rate": 4.863898618857495e-07, "logits/chosen": 0.2294633686542511, "logits/rejected": -0.39347198605537415, "logps/chosen": -257.12664794921875, "logps/rejected": -267.1776428222656, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": 0.05620694160461426, "rewards/margins": 0.0077588558197021484, "rewards/rejected": 0.04844808578491211, "step": 362 }, { "epoch": 0.19576648240528516, "grad_norm": 6.824700355529785, "learning_rate": 4.862362008571434e-07, "logits/chosen": 0.39338570833206177, "logits/rejected": 0.20731596648693085, "logps/chosen": -274.81243896484375, "logps/rejected": -255.22630310058594, "loss": 0.697, "rewards/accuracies": 0.375, "rewards/chosen": 0.06244230270385742, "rewards/margins": -0.006356715690344572, "rewards/rejected": 0.06879901885986328, "step": 363 }, { "epoch": 0.19630578400970744, "grad_norm": 7.012058734893799, "learning_rate": 4.860817018093272e-07, "logits/chosen": -0.024379879236221313, "logits/rejected": -0.31938666105270386, "logps/chosen": -223.982421875, "logps/rejected": -197.7273712158203, "loss": 0.6909, "rewards/accuracies": 0.5, "rewards/chosen": 0.05071048438549042, "rewards/margins": 0.005840395577251911, "rewards/rejected": 0.04487009346485138, "step": 364 }, { "epoch": 0.1968450856141297, "grad_norm": 7.037452697753906, "learning_rate": 4.859263652903677e-07, "logits/chosen": -0.586898684501648, "logits/rejected": -0.08542472124099731, "logps/chosen": -207.648193359375, "logps/rejected": -223.47705078125, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": 0.045174501836299896, "rewards/margins": 0.03182726353406906, "rewards/rejected": 0.013347243890166283, "step": 365 }, { "epoch": 0.19738438721855198, "grad_norm": 6.694331645965576, "learning_rate": 4.857701918513023e-07, "logits/chosen": 0.6872453093528748, "logits/rejected": -1.2225091457366943, "logps/chosen": -280.53076171875, "logps/rejected": -196.17208862304688, "loss": 0.6644, "rewards/accuracies": 0.875, "rewards/chosen": 0.06640224903821945, "rewards/margins": 0.0591372475028038, "rewards/rejected": 0.007264995947480202, "step": 366 }, { "epoch": 0.19792368882297426, "grad_norm": 6.654465198516846, "learning_rate": 4.856131820461371e-07, "logits/chosen": 0.06569398939609528, "logits/rejected": -0.6365481615066528, "logps/chosen": -178.68167114257812, "logps/rejected": -152.829345703125, "loss": 0.6763, "rewards/accuracies": 0.625, "rewards/chosen": 0.1112266555428505, "rewards/margins": 0.03528766334056854, "rewards/rejected": 0.07593898475170135, "step": 367 }, { "epoch": 0.1984629904273965, "grad_norm": 7.20933723449707, "learning_rate": 4.854553364318456e-07, "logits/chosen": 0.3929041922092438, "logits/rejected": -0.054650574922561646, "logps/chosen": -279.51556396484375, "logps/rejected": -295.52685546875, "loss": 0.6894, "rewards/accuracies": 0.625, "rewards/chosen": 0.03098144754767418, "rewards/margins": 0.008625603280961514, "rewards/rejected": 0.02235584333539009, "step": 368 }, { "epoch": 0.1990022920318188, "grad_norm": 7.553473472595215, "learning_rate": 4.852966555683656e-07, "logits/chosen": -0.345457524061203, "logits/rejected": -0.10828505456447601, "logps/chosen": -233.84849548339844, "logps/rejected": -212.67381286621094, "loss": 0.7103, "rewards/accuracies": 0.5, "rewards/chosen": 0.05922040715813637, "rewards/margins": -0.031231693923473358, "rewards/rejected": 0.09045210480690002, "step": 369 }, { "epoch": 0.19954159363624108, "grad_norm": 7.9647064208984375, "learning_rate": 4.851371400185985e-07, "logits/chosen": -0.9080172777175903, "logits/rejected": 1.0535495281219482, "logps/chosen": -198.04737854003906, "logps/rejected": -420.6893005371094, "loss": 0.6393, "rewards/accuracies": 0.875, "rewards/chosen": 0.0630594789981842, "rewards/margins": 0.114474818110466, "rewards/rejected": -0.051415350288152695, "step": 370 }, { "epoch": 0.20008089524066333, "grad_norm": 6.6969313621521, "learning_rate": 4.849767903484061e-07, "logits/chosen": 0.6119149923324585, "logits/rejected": -0.06911474466323853, "logps/chosen": -337.4703063964844, "logps/rejected": -331.059814453125, "loss": 0.7037, "rewards/accuracies": 0.375, "rewards/chosen": 0.04825344309210777, "rewards/margins": -0.019562721252441406, "rewards/rejected": 0.06781616806983948, "step": 371 }, { "epoch": 0.2006201968450856, "grad_norm": 6.257099628448486, "learning_rate": 4.848156071266095e-07, "logits/chosen": 0.17630332708358765, "logits/rejected": -1.0538339614868164, "logps/chosen": -238.9163818359375, "logps/rejected": -233.01763916015625, "loss": 0.6803, "rewards/accuracies": 0.625, "rewards/chosen": 0.0027381889522075653, "rewards/margins": 0.02811412699520588, "rewards/rejected": -0.025375939905643463, "step": 372 }, { "epoch": 0.2011594984495079, "grad_norm": 7.333095550537109, "learning_rate": 4.846535909249865e-07, "logits/chosen": 0.13442370295524597, "logits/rejected": -0.19318868219852448, "logps/chosen": -226.5239715576172, "logps/rejected": -241.2504425048828, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.006523228716105223, "rewards/margins": 0.0033516897819936275, "rewards/rejected": 0.003171537071466446, "step": 373 }, { "epoch": 0.20169880005393015, "grad_norm": 8.75149917602539, "learning_rate": 4.844907423182699e-07, "logits/chosen": -1.3418128490447998, "logits/rejected": -1.0790444612503052, "logps/chosen": -324.30047607421875, "logps/rejected": -259.147705078125, "loss": 0.6766, "rewards/accuracies": 0.625, "rewards/chosen": 0.05356750637292862, "rewards/margins": 0.03784551844000816, "rewards/rejected": 0.015721989795565605, "step": 374 }, { "epoch": 0.20223810165835243, "grad_norm": 7.315005779266357, "learning_rate": 4.843270618841454e-07, "logits/chosen": -0.08521449565887451, "logits/rejected": 0.5157385468482971, "logps/chosen": -242.92575073242188, "logps/rejected": -239.68736267089844, "loss": 0.737, "rewards/accuracies": 0.125, "rewards/chosen": 0.010310650803148746, "rewards/margins": -0.08372659981250763, "rewards/rejected": 0.0940372496843338, "step": 375 }, { "epoch": 0.2027774032627747, "grad_norm": 7.705508708953857, "learning_rate": 4.841625502032493e-07, "logits/chosen": 0.28022608160972595, "logits/rejected": -0.6532328128814697, "logps/chosen": -271.14654541015625, "logps/rejected": -241.96910095214844, "loss": 0.6708, "rewards/accuracies": 0.75, "rewards/chosen": 0.06811809539794922, "rewards/margins": 0.047731682658195496, "rewards/rejected": 0.020386409014463425, "step": 376 }, { "epoch": 0.20331670486719697, "grad_norm": 7.394308567047119, "learning_rate": 4.83997207859167e-07, "logits/chosen": -0.6237463355064392, "logits/rejected": -0.02248181588947773, "logps/chosen": -257.9358215332031, "logps/rejected": -338.0342712402344, "loss": 0.7032, "rewards/accuracies": 0.5, "rewards/chosen": 0.014166546985507011, "rewards/margins": -0.013532452285289764, "rewards/rejected": 0.027698995545506477, "step": 377 }, { "epoch": 0.20385600647161925, "grad_norm": 7.170080184936523, "learning_rate": 4.838310354384302e-07, "logits/chosen": 0.3075699210166931, "logits/rejected": -1.1153572797775269, "logps/chosen": -220.01168823242188, "logps/rejected": -136.18179321289062, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.012643338181078434, "rewards/margins": 0.01454601064324379, "rewards/rejected": -0.0019026752561330795, "step": 378 }, { "epoch": 0.20439530807604153, "grad_norm": 6.9135236740112305, "learning_rate": 4.836640335305155e-07, "logits/chosen": 0.5800840854644775, "logits/rejected": -0.13924072682857513, "logps/chosen": -222.55722045898438, "logps/rejected": -195.75408935546875, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": 0.06051664426922798, "rewards/margins": 0.041275881230831146, "rewards/rejected": 0.019240763038396835, "step": 379 }, { "epoch": 0.2049346096804638, "grad_norm": 7.134164333343506, "learning_rate": 4.834962027278417e-07, "logits/chosen": 0.3195402920246124, "logits/rejected": -1.3845407962799072, "logps/chosen": -225.51820373535156, "logps/rejected": -151.24868774414062, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": 0.04896283522248268, "rewards/margins": 0.022966958582401276, "rewards/rejected": 0.025995874777436256, "step": 380 }, { "epoch": 0.20547391128488607, "grad_norm": 7.92099142074585, "learning_rate": 4.833275436257683e-07, "logits/chosen": 0.2880376875400543, "logits/rejected": -0.08119255304336548, "logps/chosen": -227.3815460205078, "logps/rejected": -204.02304077148438, "loss": 0.688, "rewards/accuracies": 0.5, "rewards/chosen": 0.037465475499629974, "rewards/margins": 0.014240551739931107, "rewards/rejected": 0.023224927484989166, "step": 381 }, { "epoch": 0.20601321288930835, "grad_norm": 7.457998752593994, "learning_rate": 4.831580568225931e-07, "logits/chosen": 0.40409785509109497, "logits/rejected": -1.1974906921386719, "logps/chosen": -349.36883544921875, "logps/rejected": -206.65383911132812, "loss": 0.6575, "rewards/accuracies": 0.625, "rewards/chosen": 0.018580913543701172, "rewards/margins": 0.0769832655787468, "rewards/rejected": -0.058402348309755325, "step": 382 }, { "epoch": 0.20655251449373063, "grad_norm": 7.227019786834717, "learning_rate": 4.829877429195495e-07, "logits/chosen": -0.33691883087158203, "logits/rejected": -0.9526770710945129, "logps/chosen": -197.16403198242188, "logps/rejected": -262.2566833496094, "loss": 0.6762, "rewards/accuracies": 0.5, "rewards/chosen": 0.09052705764770508, "rewards/margins": 0.03643112629652023, "rewards/rejected": 0.05409593507647514, "step": 383 }, { "epoch": 0.2070918160981529, "grad_norm": 6.181154251098633, "learning_rate": 4.828166025208058e-07, "logits/chosen": 0.9656166434288025, "logits/rejected": -0.3977818489074707, "logps/chosen": -261.3162841796875, "logps/rejected": -216.73683166503906, "loss": 0.6625, "rewards/accuracies": 0.75, "rewards/chosen": 0.0653596818447113, "rewards/margins": 0.06364163756370544, "rewards/rejected": 0.0017180456779897213, "step": 384 }, { "epoch": 0.20763111770257517, "grad_norm": 9.104305267333984, "learning_rate": 4.826446362334616e-07, "logits/chosen": -0.5054339170455933, "logits/rejected": 0.2349514365196228, "logps/chosen": -253.461181640625, "logps/rejected": -357.6591796875, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": 0.076080322265625, "rewards/margins": 0.01825847290456295, "rewards/rejected": 0.0578218474984169, "step": 385 }, { "epoch": 0.20817041930699745, "grad_norm": 7.909277439117432, "learning_rate": 4.824718446675463e-07, "logits/chosen": 0.6166569590568542, "logits/rejected": -0.23691615462303162, "logps/chosen": -224.68399047851562, "logps/rejected": -219.62884521484375, "loss": 0.7076, "rewards/accuracies": 0.5, "rewards/chosen": 0.049036405980587006, "rewards/margins": -0.02614116482436657, "rewards/rejected": 0.07517757266759872, "step": 386 }, { "epoch": 0.2087097209114197, "grad_norm": 7.3198699951171875, "learning_rate": 4.822982284360173e-07, "logits/chosen": 0.6852301359176636, "logits/rejected": -0.7365946769714355, "logps/chosen": -151.46847534179688, "logps/rejected": -128.29425048828125, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.07754340767860413, "rewards/margins": 0.03954648971557617, "rewards/rejected": 0.037996914237737656, "step": 387 }, { "epoch": 0.209249022515842, "grad_norm": 7.419281482696533, "learning_rate": 4.821237881547566e-07, "logits/chosen": 0.9026352763175964, "logits/rejected": 0.17987105250358582, "logps/chosen": -377.96319580078125, "logps/rejected": -315.16668701171875, "loss": 0.6571, "rewards/accuracies": 0.875, "rewards/chosen": 0.07115983963012695, "rewards/margins": 0.07581186294555664, "rewards/rejected": -0.004652022384107113, "step": 388 }, { "epoch": 0.20978832412026427, "grad_norm": 6.708138465881348, "learning_rate": 4.819485244425704e-07, "logits/chosen": -0.27074235677719116, "logits/rejected": -1.1693611145019531, "logps/chosen": -299.74908447265625, "logps/rejected": -253.1536865234375, "loss": 0.7027, "rewards/accuracies": 0.5, "rewards/chosen": 0.01133027020841837, "rewards/margins": -0.01564817503094673, "rewards/rejected": 0.026978446170687675, "step": 389 }, { "epoch": 0.21032762572468652, "grad_norm": 6.3415937423706055, "learning_rate": 4.81772437921185e-07, "logits/chosen": -0.6160057783126831, "logits/rejected": 0.2888645529747009, "logps/chosen": -252.66558837890625, "logps/rejected": -249.78231811523438, "loss": 0.7259, "rewards/accuracies": 0.25, "rewards/chosen": 0.02338705211877823, "rewards/margins": -0.060472771525382996, "rewards/rejected": 0.08385982364416122, "step": 390 }, { "epoch": 0.2108669273291088, "grad_norm": 7.047018527984619, "learning_rate": 4.815955292152463e-07, "logits/chosen": 1.6989691257476807, "logits/rejected": -0.411396861076355, "logps/chosen": -337.050537109375, "logps/rejected": -241.0614013671875, "loss": 0.6806, "rewards/accuracies": 0.625, "rewards/chosen": 0.10074672102928162, "rewards/margins": 0.02888297848403454, "rewards/rejected": 0.07186374813318253, "step": 391 }, { "epoch": 0.2114062289335311, "grad_norm": 6.930529594421387, "learning_rate": 4.814177989523161e-07, "logits/chosen": 0.20047977566719055, "logits/rejected": -1.1591991186141968, "logps/chosen": -301.0263671875, "logps/rejected": -172.48342895507812, "loss": 0.7132, "rewards/accuracies": 0.5, "rewards/chosen": 0.0846126526594162, "rewards/margins": -0.03280916437506676, "rewards/rejected": 0.11742182075977325, "step": 392 }, { "epoch": 0.21194553053795334, "grad_norm": 7.272785186767578, "learning_rate": 4.812392477628711e-07, "logits/chosen": -0.23182687163352966, "logits/rejected": -1.040536642074585, "logps/chosen": -209.7711181640625, "logps/rejected": -205.3397979736328, "loss": 0.6632, "rewards/accuracies": 0.75, "rewards/chosen": 0.1411784142255783, "rewards/margins": 0.06424690037965775, "rewards/rejected": 0.07693152129650116, "step": 393 }, { "epoch": 0.21248483214237562, "grad_norm": 6.953366279602051, "learning_rate": 4.810598762802999e-07, "logits/chosen": 0.2825128138065338, "logits/rejected": 0.14199307560920715, "logps/chosen": -225.98223876953125, "logps/rejected": -224.84701538085938, "loss": 0.7095, "rewards/accuracies": 0.375, "rewards/chosen": -0.0014404281973838806, "rewards/margins": -0.02740802802145481, "rewards/rejected": 0.02596759982407093, "step": 394 }, { "epoch": 0.2130241337467979, "grad_norm": 8.671500205993652, "learning_rate": 4.808796851409011e-07, "logits/chosen": -0.4263255000114441, "logits/rejected": -1.42429780960083, "logps/chosen": -309.5006408691406, "logps/rejected": -226.78758239746094, "loss": 0.6858, "rewards/accuracies": 0.625, "rewards/chosen": 0.09152308106422424, "rewards/margins": 0.021492861211299896, "rewards/rejected": 0.07003021240234375, "step": 395 }, { "epoch": 0.21356343535122016, "grad_norm": 6.883856773376465, "learning_rate": 4.806986749838806e-07, "logits/chosen": -0.30329054594039917, "logits/rejected": -1.0244797468185425, "logps/chosen": -234.00308227539062, "logps/rejected": -182.92510986328125, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": 0.026663687080144882, "rewards/margins": 0.012723876163363457, "rewards/rejected": 0.013939805328845978, "step": 396 }, { "epoch": 0.21410273695564244, "grad_norm": 7.319916725158691, "learning_rate": 4.805168464513503e-07, "logits/chosen": 0.7687097787857056, "logits/rejected": -0.7790307402610779, "logps/chosen": -302.0040283203125, "logps/rejected": -163.4280242919922, "loss": 0.6688, "rewards/accuracies": 0.625, "rewards/chosen": 0.06818065792322159, "rewards/margins": 0.05539340898394585, "rewards/rejected": 0.012787248939275742, "step": 397 }, { "epoch": 0.21464203856006472, "grad_norm": 8.066868782043457, "learning_rate": 4.803342001883246e-07, "logits/chosen": -0.2550211548805237, "logits/rejected": -0.3846513628959656, "logps/chosen": -308.55584716796875, "logps/rejected": -259.9303283691406, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": 0.0454343780875206, "rewards/margins": 0.029129642993211746, "rewards/rejected": 0.016304735094308853, "step": 398 }, { "epoch": 0.21518134016448698, "grad_norm": 5.978872776031494, "learning_rate": 4.80150736842719e-07, "logits/chosen": -0.014196164906024933, "logits/rejected": -0.8099868893623352, "logps/chosen": -242.60888671875, "logps/rejected": -207.58432006835938, "loss": 0.6549, "rewards/accuracies": 0.625, "rewards/chosen": 0.10741758346557617, "rewards/margins": 0.08367462456226349, "rewards/rejected": 0.023742960765957832, "step": 399 }, { "epoch": 0.21572064176890926, "grad_norm": 7.8489484786987305, "learning_rate": 4.799664570653473e-07, "logits/chosen": 0.5577570796012878, "logits/rejected": -0.47654786705970764, "logps/chosen": -260.66595458984375, "logps/rejected": -199.458740234375, "loss": 0.6498, "rewards/accuracies": 0.625, "rewards/chosen": 0.12360820174217224, "rewards/margins": 0.09132710099220276, "rewards/rejected": 0.03228111192584038, "step": 400 }, { "epoch": 0.21625994337333154, "grad_norm": 7.658378601074219, "learning_rate": 4.797813615099197e-07, "logits/chosen": 0.6642405390739441, "logits/rejected": -0.803749680519104, "logps/chosen": -210.57830810546875, "logps/rejected": -162.27993774414062, "loss": 0.6814, "rewards/accuracies": 0.75, "rewards/chosen": 0.04240674898028374, "rewards/margins": 0.025582412257790565, "rewards/rejected": 0.01682434044778347, "step": 401 }, { "epoch": 0.2167992449777538, "grad_norm": 6.25300931930542, "learning_rate": 4.795954508330402e-07, "logits/chosen": 0.7864577174186707, "logits/rejected": -0.6060475707054138, "logps/chosen": -265.6658020019531, "logps/rejected": -207.28736877441406, "loss": 0.6589, "rewards/accuracies": 0.625, "rewards/chosen": 0.06607189774513245, "rewards/margins": 0.07308216392993927, "rewards/rejected": -0.007010270841419697, "step": 402 }, { "epoch": 0.21733854658217608, "grad_norm": 6.537905693054199, "learning_rate": 4.794087256942044e-07, "logits/chosen": 0.26068219542503357, "logits/rejected": -0.05424365773797035, "logps/chosen": -189.9384307861328, "logps/rejected": -203.94992065429688, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.10797318816184998, "rewards/margins": 0.06251630187034607, "rewards/rejected": 0.045456890016794205, "step": 403 }, { "epoch": 0.21787784818659836, "grad_norm": 7.706308364868164, "learning_rate": 4.792211867557969e-07, "logits/chosen": -0.17289988696575165, "logits/rejected": -0.3884209394454956, "logps/chosen": -303.15252685546875, "logps/rejected": -313.5324401855469, "loss": 0.6829, "rewards/accuracies": 0.625, "rewards/chosen": 0.07717571407556534, "rewards/margins": 0.023098183795809746, "rewards/rejected": 0.05407752841711044, "step": 404 }, { "epoch": 0.21841714979102062, "grad_norm": 7.041989326477051, "learning_rate": 4.790328346830893e-07, "logits/chosen": 0.11865111440420151, "logits/rejected": -1.735650658607483, "logps/chosen": -230.29400634765625, "logps/rejected": -163.25778198242188, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.05130377039313316, "rewards/margins": 0.010387231595814228, "rewards/rejected": 0.04091653972864151, "step": 405 }, { "epoch": 0.2189564513954429, "grad_norm": 8.238775253295898, "learning_rate": 4.788436701442377e-07, "logits/chosen": -0.8087069988250732, "logits/rejected": -0.34753119945526123, "logps/chosen": -189.05648803710938, "logps/rejected": -203.09100341796875, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": 0.03833422809839249, "rewards/margins": -0.01673169434070587, "rewards/rejected": 0.05506592243909836, "step": 406 }, { "epoch": 0.21949575299986518, "grad_norm": 12.492842674255371, "learning_rate": 4.786536938102804e-07, "logits/chosen": 0.36827796697616577, "logits/rejected": 0.3515341281890869, "logps/chosen": -216.50303649902344, "logps/rejected": -232.4757080078125, "loss": 0.6755, "rewards/accuracies": 0.75, "rewards/chosen": 0.0939735472202301, "rewards/margins": 0.03878974914550781, "rewards/rejected": 0.05518379434943199, "step": 407 }, { "epoch": 0.22003505460428746, "grad_norm": 7.9749979972839355, "learning_rate": 4.784629063551354e-07, "logits/chosen": -0.6909058690071106, "logits/rejected": -0.7014387249946594, "logps/chosen": -231.4223175048828, "logps/rejected": -165.01425170898438, "loss": 0.6819, "rewards/accuracies": 0.5, "rewards/chosen": 0.0522923469543457, "rewards/margins": 0.027119258418679237, "rewards/rejected": 0.025173092260956764, "step": 408 }, { "epoch": 0.22057435620870972, "grad_norm": 6.006499767303467, "learning_rate": 4.782713084555977e-07, "logits/chosen": 0.22519327700138092, "logits/rejected": -0.33126717805862427, "logps/chosen": -169.1832275390625, "logps/rejected": -164.9869384765625, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.05186624825000763, "rewards/margins": 0.009499266743659973, "rewards/rejected": 0.042366981506347656, "step": 409 }, { "epoch": 0.221113657813132, "grad_norm": 6.771450519561768, "learning_rate": 4.780789007913378e-07, "logits/chosen": -0.16915297508239746, "logits/rejected": 0.43882906436920166, "logps/chosen": -238.2813720703125, "logps/rejected": -226.19448852539062, "loss": 0.7085, "rewards/accuracies": 0.5, "rewards/chosen": 0.07892666012048721, "rewards/margins": -0.024120714515447617, "rewards/rejected": 0.10304737091064453, "step": 410 }, { "epoch": 0.22165295941755428, "grad_norm": 7.225942134857178, "learning_rate": 4.778856840448985e-07, "logits/chosen": 0.38493621349334717, "logits/rejected": -1.0390433073043823, "logps/chosen": -232.12351989746094, "logps/rejected": -223.66940307617188, "loss": 0.6914, "rewards/accuracies": 0.375, "rewards/chosen": 0.07561254501342773, "rewards/margins": 0.007748986594378948, "rewards/rejected": 0.06786356121301651, "step": 411 }, { "epoch": 0.22219226102197653, "grad_norm": 7.288419246673584, "learning_rate": 4.776916589016927e-07, "logits/chosen": 0.3390800356864929, "logits/rejected": -0.8051856756210327, "logps/chosen": -228.9473419189453, "logps/rejected": -180.05374145507812, "loss": 0.6602, "rewards/accuracies": 0.625, "rewards/chosen": 0.16151075065135956, "rewards/margins": 0.07199392467737198, "rewards/rejected": 0.08951682597398758, "step": 412 }, { "epoch": 0.22273156262639882, "grad_norm": 7.558515548706055, "learning_rate": 4.77496826050001e-07, "logits/chosen": 0.6302477717399597, "logits/rejected": 0.3762071430683136, "logps/chosen": -390.3526611328125, "logps/rejected": -348.01959228515625, "loss": 0.6643, "rewards/accuracies": 0.625, "rewards/chosen": 0.05222950130701065, "rewards/margins": 0.06178608536720276, "rewards/rejected": -0.009556584060192108, "step": 413 }, { "epoch": 0.2232708642308211, "grad_norm": 6.964341640472412, "learning_rate": 4.773011861809693e-07, "logits/chosen": 0.5892799496650696, "logits/rejected": 0.25635385513305664, "logps/chosen": -262.3338317871094, "logps/rejected": -220.86703491210938, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": 0.08779506385326385, "rewards/margins": 0.023922154679894447, "rewards/rejected": 0.06387291848659515, "step": 414 }, { "epoch": 0.22381016583524335, "grad_norm": 7.099416732788086, "learning_rate": 4.771047399886061e-07, "logits/chosen": -1.0317720174789429, "logits/rejected": -1.2030916213989258, "logps/chosen": -159.8741455078125, "logps/rejected": -204.43894958496094, "loss": 0.6753, "rewards/accuracies": 0.625, "rewards/chosen": 0.0791199654340744, "rewards/margins": 0.03729453310370445, "rewards/rejected": 0.04182543605566025, "step": 415 }, { "epoch": 0.22434946743966563, "grad_norm": 5.933521747589111, "learning_rate": 4.769074881697805e-07, "logits/chosen": 0.6817495226860046, "logits/rejected": -0.37808117270469666, "logps/chosen": -199.48602294921875, "logps/rejected": -157.6605224609375, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.11826782673597336, "rewards/margins": 0.04933681711554527, "rewards/rejected": 0.06893101334571838, "step": 416 }, { "epoch": 0.22488876904408792, "grad_norm": 7.5077595710754395, "learning_rate": 4.767094314242195e-07, "logits/chosen": -0.2822217345237732, "logits/rejected": -0.45044761896133423, "logps/chosen": -288.1246032714844, "logps/rejected": -260.7871398925781, "loss": 0.6937, "rewards/accuracies": 0.375, "rewards/chosen": 0.04142723232507706, "rewards/margins": 0.0028268760070204735, "rewards/rejected": 0.03860035538673401, "step": 417 }, { "epoch": 0.22542807064851017, "grad_norm": 7.126457214355469, "learning_rate": 4.7651057045450515e-07, "logits/chosen": 0.5030932426452637, "logits/rejected": 0.2637019753456116, "logps/chosen": -246.8565216064453, "logps/rejected": -327.64202880859375, "loss": 0.6819, "rewards/accuracies": 0.625, "rewards/chosen": 0.03527422249317169, "rewards/margins": 0.026005081832408905, "rewards/rejected": 0.009269140660762787, "step": 418 }, { "epoch": 0.22596737225293245, "grad_norm": 7.570833683013916, "learning_rate": 4.763109059660726e-07, "logits/chosen": 0.46531206369400024, "logits/rejected": 0.5101313591003418, "logps/chosen": -186.01087951660156, "logps/rejected": -235.08331298828125, "loss": 0.7128, "rewards/accuracies": 0.375, "rewards/chosen": 0.004204370081424713, "rewards/margins": -0.03709707409143448, "rewards/rejected": 0.04130144417285919, "step": 419 }, { "epoch": 0.22650667385735473, "grad_norm": 7.080228328704834, "learning_rate": 4.7611043866720737e-07, "logits/chosen": 0.20741486549377441, "logits/rejected": 0.28030556440353394, "logps/chosen": -173.58425903320312, "logps/rejected": -183.79971313476562, "loss": 0.712, "rewards/accuracies": 0.375, "rewards/chosen": 0.036680128425359726, "rewards/margins": -0.03534460440278053, "rewards/rejected": 0.07202473282814026, "step": 420 }, { "epoch": 0.227045975461777, "grad_norm": 8.097362518310547, "learning_rate": 4.759091692690428e-07, "logits/chosen": 0.9104297757148743, "logits/rejected": 0.6630874872207642, "logps/chosen": -242.09365844726562, "logps/rejected": -304.67633056640625, "loss": 0.6761, "rewards/accuracies": 0.75, "rewards/chosen": 0.06632184982299805, "rewards/margins": 0.03589048609137535, "rewards/rejected": 0.030431363731622696, "step": 421 }, { "epoch": 0.22758527706619927, "grad_norm": 6.8532867431640625, "learning_rate": 4.757070984855577e-07, "logits/chosen": 0.1871090531349182, "logits/rejected": 0.8479843735694885, "logps/chosen": -206.53988647460938, "logps/rejected": -268.5478515625, "loss": 0.7057, "rewards/accuracies": 0.5, "rewards/chosen": 0.08826122432947159, "rewards/margins": -0.021822165697813034, "rewards/rejected": 0.11008339375257492, "step": 422 }, { "epoch": 0.22812457867062155, "grad_norm": 7.2350687980651855, "learning_rate": 4.755042270335735e-07, "logits/chosen": -0.1260930597782135, "logits/rejected": -1.1617059707641602, "logps/chosen": -218.11415100097656, "logps/rejected": -192.3035430908203, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": 0.06942605972290039, "rewards/margins": 0.02646365389227867, "rewards/rejected": 0.04296240955591202, "step": 423 }, { "epoch": 0.2286638802750438, "grad_norm": 7.70059871673584, "learning_rate": 4.7530055563275217e-07, "logits/chosen": -0.5338404178619385, "logits/rejected": 0.149825319647789, "logps/chosen": -204.77023315429688, "logps/rejected": -239.98419189453125, "loss": 0.6866, "rewards/accuracies": 0.625, "rewards/chosen": 0.05291872099041939, "rewards/margins": 0.017859363928437233, "rewards/rejected": 0.03505935147404671, "step": 424 }, { "epoch": 0.2292031818794661, "grad_norm": 6.978697776794434, "learning_rate": 4.7509608500559317e-07, "logits/chosen": -0.18657398223876953, "logits/rejected": -0.8567643165588379, "logps/chosen": -266.6151123046875, "logps/rejected": -237.3177490234375, "loss": 0.6646, "rewards/accuracies": 0.625, "rewards/chosen": 0.12355288863182068, "rewards/margins": 0.06021442264318466, "rewards/rejected": 0.06333847343921661, "step": 425 }, { "epoch": 0.22974248348388837, "grad_norm": 6.839248180389404, "learning_rate": 4.748908158774312e-07, "logits/chosen": 0.7138242721557617, "logits/rejected": -0.0070875585079193115, "logps/chosen": -302.4195556640625, "logps/rejected": -363.30029296875, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": 0.10547275841236115, "rewards/margins": 0.014514345675706863, "rewards/rejected": 0.09095840156078339, "step": 426 }, { "epoch": 0.23028178508831063, "grad_norm": 8.237014770507812, "learning_rate": 4.746847489764335e-07, "logits/chosen": -1.4465854167938232, "logits/rejected": -0.37608182430267334, "logps/chosen": -182.14366149902344, "logps/rejected": -223.2122802734375, "loss": 0.7204, "rewards/accuracies": 0.25, "rewards/chosen": 0.05586891248822212, "rewards/margins": -0.050902847200632095, "rewards/rejected": 0.10677175968885422, "step": 427 }, { "epoch": 0.2308210866927329, "grad_norm": 7.912062644958496, "learning_rate": 4.7447788503359735e-07, "logits/chosen": -0.10998120903968811, "logits/rejected": 0.4261658787727356, "logps/chosen": -294.1578674316406, "logps/rejected": -279.168212890625, "loss": 0.7015, "rewards/accuracies": 0.375, "rewards/chosen": 0.09324941784143448, "rewards/margins": -0.007143307477235794, "rewards/rejected": 0.10039272904396057, "step": 428 }, { "epoch": 0.2313603882971552, "grad_norm": 6.947909355163574, "learning_rate": 4.742702247827476e-07, "logits/chosen": 0.43863388895988464, "logits/rejected": 0.3179701566696167, "logps/chosen": -336.0065612792969, "logps/rejected": -356.61248779296875, "loss": 0.6817, "rewards/accuracies": 0.5, "rewards/chosen": 0.10138149559497833, "rewards/margins": 0.025555038824677467, "rewards/rejected": 0.07582645863294601, "step": 429 }, { "epoch": 0.23189968990157744, "grad_norm": 6.278693675994873, "learning_rate": 4.740617689605335e-07, "logits/chosen": -0.2051234096288681, "logits/rejected": -0.3810882568359375, "logps/chosen": -231.0037078857422, "logps/rejected": -243.25054931640625, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": 0.09100799262523651, "rewards/margins": 0.018782712519168854, "rewards/rejected": 0.07222528755664825, "step": 430 }, { "epoch": 0.23243899150599973, "grad_norm": 6.786123275756836, "learning_rate": 4.7385251830642703e-07, "logits/chosen": -0.2255568653345108, "logits/rejected": -0.17252743244171143, "logps/chosen": -231.77330017089844, "logps/rejected": -177.88314819335938, "loss": 0.6793, "rewards/accuracies": 0.75, "rewards/chosen": 0.08765235543251038, "rewards/margins": 0.03473625332117081, "rewards/rejected": 0.05291609838604927, "step": 431 }, { "epoch": 0.232978293110422, "grad_norm": 7.433266639709473, "learning_rate": 4.7364247356271927e-07, "logits/chosen": 0.7243268489837646, "logits/rejected": -0.15356573462486267, "logps/chosen": -284.4349365234375, "logps/rejected": -293.39068603515625, "loss": 0.668, "rewards/accuracies": 0.625, "rewards/chosen": 0.07015438377857208, "rewards/margins": 0.058263398706912994, "rewards/rejected": 0.011890985071659088, "step": 432 }, { "epoch": 0.2335175947148443, "grad_norm": 8.147842407226562, "learning_rate": 4.7343163547451837e-07, "logits/chosen": 1.900984287261963, "logits/rejected": 0.646363377571106, "logps/chosen": -289.6388854980469, "logps/rejected": -255.48899841308594, "loss": 0.6758, "rewards/accuracies": 0.5, "rewards/chosen": 0.07785424590110779, "rewards/margins": 0.036707065999507904, "rewards/rejected": 0.04114718735218048, "step": 433 }, { "epoch": 0.23405689631926654, "grad_norm": 6.0299835205078125, "learning_rate": 4.73220004789747e-07, "logits/chosen": 0.5401232242584229, "logits/rejected": 0.9865555763244629, "logps/chosen": -264.91424560546875, "logps/rejected": -277.9501647949219, "loss": 0.7156, "rewards/accuracies": 0.25, "rewards/chosen": 0.07585716247558594, "rewards/margins": -0.042084503918886185, "rewards/rejected": 0.11794166266918182, "step": 434 }, { "epoch": 0.23459619792368883, "grad_norm": 7.255166530609131, "learning_rate": 4.730075822591392e-07, "logits/chosen": 1.3292572498321533, "logits/rejected": 0.0050951167941093445, "logps/chosen": -312.70953369140625, "logps/rejected": -181.95033264160156, "loss": 0.6641, "rewards/accuracies": 0.75, "rewards/chosen": 0.11225757747888565, "rewards/margins": 0.06250162422657013, "rewards/rejected": 0.04975595325231552, "step": 435 }, { "epoch": 0.2351354995281111, "grad_norm": 6.767951488494873, "learning_rate": 4.72794368636238e-07, "logits/chosen": 0.5447127223014832, "logits/rejected": -0.054179489612579346, "logps/chosen": -233.47076416015625, "logps/rejected": -224.56446838378906, "loss": 0.7048, "rewards/accuracies": 0.25, "rewards/chosen": 0.06902046501636505, "rewards/margins": -0.02281642146408558, "rewards/rejected": 0.09183688461780548, "step": 436 }, { "epoch": 0.23567480113253336, "grad_norm": 9.310495376586914, "learning_rate": 4.725803646773929e-07, "logits/chosen": -0.0932348370552063, "logits/rejected": -1.5040910243988037, "logps/chosen": -175.17123413085938, "logps/rejected": -142.4755859375, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": 0.12270031124353409, "rewards/margins": 0.11269865185022354, "rewards/rejected": 0.010001659393310547, "step": 437 }, { "epoch": 0.23621410273695564, "grad_norm": 8.22139835357666, "learning_rate": 4.72365571141757e-07, "logits/chosen": -0.0802992582321167, "logits/rejected": -0.8927159905433655, "logps/chosen": -408.2183837890625, "logps/rejected": -381.6817321777344, "loss": 0.671, "rewards/accuracies": 0.75, "rewards/chosen": 0.006046294700354338, "rewards/margins": 0.059876441955566406, "rewards/rejected": -0.05383014678955078, "step": 438 }, { "epoch": 0.23675340434137793, "grad_norm": 6.9446868896484375, "learning_rate": 4.721499887912841e-07, "logits/chosen": 0.040776729583740234, "logits/rejected": -0.8744984865188599, "logps/chosen": -302.64208984375, "logps/rejected": -231.75677490234375, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": 0.05539760738611221, "rewards/margins": 0.0698874443769455, "rewards/rejected": -0.01448984444141388, "step": 439 }, { "epoch": 0.23729270594580018, "grad_norm": 7.668523788452148, "learning_rate": 4.719336183907265e-07, "logits/chosen": -0.5980689525604248, "logits/rejected": 0.49965137243270874, "logps/chosen": -239.99461364746094, "logps/rejected": -265.11175537109375, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": 0.06864919513463974, "rewards/margins": 0.03696151077747345, "rewards/rejected": 0.03168769180774689, "step": 440 }, { "epoch": 0.23783200755022246, "grad_norm": 7.239579677581787, "learning_rate": 4.7171646070763193e-07, "logits/chosen": 0.1765495240688324, "logits/rejected": -0.45508408546447754, "logps/chosen": -220.1367645263672, "logps/rejected": -208.8322296142578, "loss": 0.6687, "rewards/accuracies": 0.625, "rewards/chosen": 0.10634814202785492, "rewards/margins": 0.058116715401411057, "rewards/rejected": 0.04823140799999237, "step": 441 }, { "epoch": 0.23837130915464474, "grad_norm": 7.300921440124512, "learning_rate": 4.714985165123408e-07, "logits/chosen": -0.10452218353748322, "logits/rejected": -0.8185518980026245, "logps/chosen": -236.78956604003906, "logps/rejected": -170.86245727539062, "loss": 0.7304, "rewards/accuracies": 0.125, "rewards/chosen": 0.03783464431762695, "rewards/margins": -0.06792192906141281, "rewards/rejected": 0.10575656592845917, "step": 442 }, { "epoch": 0.238910610759067, "grad_norm": 7.427123069763184, "learning_rate": 4.712797865779836e-07, "logits/chosen": 0.7553325891494751, "logits/rejected": 0.482425332069397, "logps/chosen": -281.103759765625, "logps/rejected": -358.413818359375, "loss": 0.6458, "rewards/accuracies": 0.75, "rewards/chosen": 0.10688381642103195, "rewards/margins": 0.09993896633386612, "rewards/rejected": 0.006944848224520683, "step": 443 }, { "epoch": 0.23944991236348928, "grad_norm": 6.079413890838623, "learning_rate": 4.7106027168047833e-07, "logits/chosen": 1.444748878479004, "logits/rejected": 0.3194524645805359, "logps/chosen": -310.47076416015625, "logps/rejected": -148.880615234375, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": 0.07565803825855255, "rewards/margins": 0.010638806968927383, "rewards/rejected": 0.06501922756433487, "step": 444 }, { "epoch": 0.23998921396791156, "grad_norm": 6.716400146484375, "learning_rate": 4.708399725985273e-07, "logits/chosen": 0.8786967992782593, "logits/rejected": -0.20565009117126465, "logps/chosen": -247.99293518066406, "logps/rejected": -194.5557403564453, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": 0.08329201489686966, "rewards/margins": 0.026914123445749283, "rewards/rejected": 0.05637789145112038, "step": 445 }, { "epoch": 0.24052851557233382, "grad_norm": 6.778453350067139, "learning_rate": 4.7061889011361476e-07, "logits/chosen": 1.0440999269485474, "logits/rejected": -0.8760435581207275, "logps/chosen": -234.63308715820312, "logps/rejected": -188.323486328125, "loss": 0.676, "rewards/accuracies": 0.75, "rewards/chosen": 0.12554818391799927, "rewards/margins": 0.04243632033467293, "rewards/rejected": 0.08311185240745544, "step": 446 }, { "epoch": 0.2410678171767561, "grad_norm": 7.602561950683594, "learning_rate": 4.70397025010004e-07, "logits/chosen": -0.5324638485908508, "logits/rejected": 0.23996248841285706, "logps/chosen": -318.2411804199219, "logps/rejected": -336.9674072265625, "loss": 0.7012, "rewards/accuracies": 0.375, "rewards/chosen": 0.03732948750257492, "rewards/margins": -0.015102384611964226, "rewards/rejected": 0.052431870251894, "step": 447 }, { "epoch": 0.24160711878117838, "grad_norm": 6.956057548522949, "learning_rate": 4.701743780747345e-07, "logits/chosen": 0.24663883447647095, "logits/rejected": 0.24115559458732605, "logps/chosen": -221.927001953125, "logps/rejected": -259.9642639160156, "loss": 0.7192, "rewards/accuracies": 0.25, "rewards/chosen": 0.04701528698205948, "rewards/margins": -0.05043087154626846, "rewards/rejected": 0.09744615107774734, "step": 448 }, { "epoch": 0.24214642038560064, "grad_norm": 7.3972930908203125, "learning_rate": 4.6995095009761907e-07, "logits/chosen": -0.111935555934906, "logits/rejected": -0.41162919998168945, "logps/chosen": -212.36451721191406, "logps/rejected": -222.6585693359375, "loss": 0.676, "rewards/accuracies": 0.375, "rewards/chosen": 0.08022002875804901, "rewards/margins": 0.043947841972112656, "rewards/rejected": 0.03627219423651695, "step": 449 }, { "epoch": 0.24268572199002292, "grad_norm": 8.340505599975586, "learning_rate": 4.6972674187124146e-07, "logits/chosen": 0.17032718658447266, "logits/rejected": 0.9198901653289795, "logps/chosen": -327.402099609375, "logps/rejected": -337.0530700683594, "loss": 0.675, "rewards/accuracies": 0.5, "rewards/chosen": 0.11209507286548615, "rewards/margins": 0.04228249192237854, "rewards/rejected": 0.0698125809431076, "step": 450 }, { "epoch": 0.2432250235944452, "grad_norm": 7.6588592529296875, "learning_rate": 4.69501754190953e-07, "logits/chosen": 0.27993103861808777, "logits/rejected": -1.3800134658813477, "logps/chosen": -392.64306640625, "logps/rejected": -181.62525939941406, "loss": 0.6654, "rewards/accuracies": 0.75, "rewards/chosen": 0.11260166019201279, "rewards/margins": 0.059601791203022, "rewards/rejected": 0.05299987643957138, "step": 451 }, { "epoch": 0.24376432519886745, "grad_norm": 10.277013778686523, "learning_rate": 4.692759878548702e-07, "logits/chosen": 0.6880121231079102, "logits/rejected": -0.29246193170547485, "logps/chosen": -288.58331298828125, "logps/rejected": -291.72674560546875, "loss": 0.665, "rewards/accuracies": 0.625, "rewards/chosen": 0.1479720026254654, "rewards/margins": 0.05865068733692169, "rewards/rejected": 0.0893213301897049, "step": 452 }, { "epoch": 0.24430362680328974, "grad_norm": 7.888538837432861, "learning_rate": 4.6904944366387173e-07, "logits/chosen": 0.2893691658973694, "logits/rejected": -1.1846729516983032, "logps/chosen": -277.0822448730469, "logps/rejected": -257.998046875, "loss": 0.7092, "rewards/accuracies": 0.5, "rewards/chosen": 0.06660366803407669, "rewards/margins": -0.02677593193948269, "rewards/rejected": 0.09337959438562393, "step": 453 }, { "epoch": 0.24484292840771202, "grad_norm": 8.006468772888184, "learning_rate": 4.6882212242159546e-07, "logits/chosen": 0.8142073154449463, "logits/rejected": -0.3104702830314636, "logps/chosen": -228.76565551757812, "logps/rejected": -192.13018798828125, "loss": 0.7009, "rewards/accuracies": 0.625, "rewards/chosen": 0.024773407727479935, "rewards/margins": -0.012520314194262028, "rewards/rejected": 0.03729372099041939, "step": 454 }, { "epoch": 0.24538223001213427, "grad_norm": 7.7854108810424805, "learning_rate": 4.6859402493443603e-07, "logits/chosen": 0.46380919218063354, "logits/rejected": 0.9948883652687073, "logps/chosen": -202.89300537109375, "logps/rejected": -246.4232940673828, "loss": 0.6769, "rewards/accuracies": 0.75, "rewards/chosen": 0.10287380963563919, "rewards/margins": 0.04309701919555664, "rewards/rejected": 0.05977678298950195, "step": 455 }, { "epoch": 0.24592153161655655, "grad_norm": 7.033171653747559, "learning_rate": 4.6836515201154135e-07, "logits/chosen": -0.14878606796264648, "logits/rejected": -0.8293918371200562, "logps/chosen": -225.2577362060547, "logps/rejected": -270.13067626953125, "loss": 0.6914, "rewards/accuracies": 0.375, "rewards/chosen": 0.0804838240146637, "rewards/margins": 0.004049303475767374, "rewards/rejected": 0.07643452286720276, "step": 456 }, { "epoch": 0.24646083322097884, "grad_norm": 6.34185266494751, "learning_rate": 4.681355044648105e-07, "logits/chosen": 0.34435540437698364, "logits/rejected": -0.9408673048019409, "logps/chosen": -218.24867248535156, "logps/rejected": -196.5282440185547, "loss": 0.6733, "rewards/accuracies": 0.625, "rewards/chosen": 0.07355594635009766, "rewards/margins": 0.0438992977142334, "rewards/rejected": 0.029656650498509407, "step": 457 }, { "epoch": 0.24700013482540112, "grad_norm": 8.315898895263672, "learning_rate": 4.6790508310889007e-07, "logits/chosen": 0.6986597180366516, "logits/rejected": -1.912509560585022, "logps/chosen": -303.97882080078125, "logps/rejected": -238.4620361328125, "loss": 0.6735, "rewards/accuracies": 0.625, "rewards/chosen": 0.08768653869628906, "rewards/margins": 0.04394569620490074, "rewards/rejected": 0.04374084621667862, "step": 458 }, { "epoch": 0.24753943642982337, "grad_norm": 7.125856399536133, "learning_rate": 4.6767388876117194e-07, "logits/chosen": 0.018315456807613373, "logits/rejected": -0.27896398305892944, "logps/chosen": -206.85861206054688, "logps/rejected": -212.33242797851562, "loss": 0.7011, "rewards/accuracies": 0.5, "rewards/chosen": 0.05674295499920845, "rewards/margins": -0.01142587698996067, "rewards/rejected": 0.06816883385181427, "step": 459 }, { "epoch": 0.24807873803424565, "grad_norm": 6.845198154449463, "learning_rate": 4.6744192224178984e-07, "logits/chosen": 0.01624882221221924, "logits/rejected": -1.5889636278152466, "logps/chosen": -221.8258819580078, "logps/rejected": -203.33990478515625, "loss": 0.6718, "rewards/accuracies": 0.75, "rewards/chosen": 0.08326683193445206, "rewards/margins": 0.04464368894696236, "rewards/rejected": 0.03862314671278, "step": 460 }, { "epoch": 0.24861803963866794, "grad_norm": 6.807131767272949, "learning_rate": 4.672091843736169e-07, "logits/chosen": 0.01757591962814331, "logits/rejected": -0.12911541759967804, "logps/chosen": -252.224365234375, "logps/rejected": -248.76559448242188, "loss": 0.6897, "rewards/accuracies": 0.375, "rewards/chosen": 0.12012500315904617, "rewards/margins": 0.009713554754853249, "rewards/rejected": 0.11041144281625748, "step": 461 }, { "epoch": 0.2491573412430902, "grad_norm": 6.623655319213867, "learning_rate": 4.669756759822624e-07, "logits/chosen": 0.1823408603668213, "logits/rejected": -0.09465713798999786, "logps/chosen": -260.7433776855469, "logps/rejected": -305.829345703125, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": 0.0784023329615593, "rewards/margins": 0.0462215431034565, "rewards/rejected": 0.0321807861328125, "step": 462 }, { "epoch": 0.24969664284751247, "grad_norm": 6.448901176452637, "learning_rate": 4.66741397896069e-07, "logits/chosen": -0.21277937293052673, "logits/rejected": -0.5466647148132324, "logps/chosen": -228.8004150390625, "logps/rejected": -209.8694610595703, "loss": 0.7077, "rewards/accuracies": 0.375, "rewards/chosen": 0.03997645527124405, "rewards/margins": -0.02513756975531578, "rewards/rejected": 0.06511402130126953, "step": 463 }, { "epoch": 0.25023594445193476, "grad_norm": 7.127066135406494, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.8645235300064087, "logits/rejected": -1.1424100399017334, "logps/chosen": -173.25140380859375, "logps/rejected": -167.64840698242188, "loss": 0.7012, "rewards/accuracies": 0.625, "rewards/chosen": 0.05579061806201935, "rewards/margins": -0.0017268694937229156, "rewards/rejected": 0.057517483830451965, "step": 464 }, { "epoch": 0.25077524605635704, "grad_norm": 6.481484889984131, "learning_rate": 4.6627053596618495e-07, "logits/chosen": 0.1939200460910797, "logits/rejected": 0.5022667646408081, "logps/chosen": -227.187255859375, "logps/rejected": -241.09854125976562, "loss": 0.688, "rewards/accuracies": 0.625, "rewards/chosen": 0.039253804832696915, "rewards/margins": 0.012539289891719818, "rewards/rejected": 0.026714514940977097, "step": 465 }, { "epoch": 0.25131454766077926, "grad_norm": 7.004006862640381, "learning_rate": 4.6603395379281976e-07, "logits/chosen": -0.15507948398590088, "logits/rejected": -0.8616396188735962, "logps/chosen": -221.08660888671875, "logps/rejected": -173.71456909179688, "loss": 0.7001, "rewards/accuracies": 0.75, "rewards/chosen": 0.05303439870476723, "rewards/margins": -0.01127595640718937, "rewards/rejected": 0.06431036442518234, "step": 466 }, { "epoch": 0.25185384926520155, "grad_norm": 7.756710052490234, "learning_rate": 4.6579660526526067e-07, "logits/chosen": 0.6073187589645386, "logits/rejected": 0.07498564571142197, "logps/chosen": -235.86358642578125, "logps/rejected": -235.2806396484375, "loss": 0.7105, "rewards/accuracies": 0.5, "rewards/chosen": 0.04865093529224396, "rewards/margins": -0.03045654296875, "rewards/rejected": 0.07910747826099396, "step": 467 }, { "epoch": 0.25239315086962383, "grad_norm": 9.019181251525879, "learning_rate": 4.6555849122547263e-07, "logits/chosen": 1.8389211893081665, "logits/rejected": 0.9436342120170593, "logps/chosen": -314.28143310546875, "logps/rejected": -261.494140625, "loss": 0.6859, "rewards/accuracies": 0.625, "rewards/chosen": 0.13690251111984253, "rewards/margins": 0.018134206533432007, "rewards/rejected": 0.11876831948757172, "step": 468 }, { "epoch": 0.2529324524740461, "grad_norm": 7.207802772521973, "learning_rate": 4.6531961251813647e-07, "logits/chosen": -0.33853116631507874, "logits/rejected": -0.43947988748550415, "logps/chosen": -194.7056884765625, "logps/rejected": -207.70384216308594, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": 0.0811224952340126, "rewards/margins": 0.06006702780723572, "rewards/rejected": 0.02105546183884144, "step": 469 }, { "epoch": 0.2534717540784684, "grad_norm": 6.922300815582275, "learning_rate": 4.6507996999064513e-07, "logits/chosen": 1.0596474409103394, "logits/rejected": 0.12666000425815582, "logps/chosen": -391.7001953125, "logps/rejected": -303.2344970703125, "loss": 0.7015, "rewards/accuracies": 0.375, "rewards/chosen": 0.08831120282411575, "rewards/margins": -0.006366349756717682, "rewards/rejected": 0.09467753767967224, "step": 470 }, { "epoch": 0.2540110556828907, "grad_norm": 7.267258167266846, "learning_rate": 4.6483956449310153e-07, "logits/chosen": -0.09377053380012512, "logits/rejected": 0.036060407757759094, "logps/chosen": -252.8148651123047, "logps/rejected": -236.51132202148438, "loss": 0.7125, "rewards/accuracies": 0.5, "rewards/chosen": 0.08277782797813416, "rewards/margins": -0.035025455057621, "rewards/rejected": 0.11780329048633575, "step": 471 }, { "epoch": 0.25455035728731296, "grad_norm": 6.425400257110596, "learning_rate": 4.645983968783148e-07, "logits/chosen": 0.5822290182113647, "logits/rejected": 0.15930478274822235, "logps/chosen": -217.17063903808594, "logps/rejected": -245.51031494140625, "loss": 0.7208, "rewards/accuracies": 0.125, "rewards/chosen": 0.04456320032477379, "rewards/margins": -0.05417947471141815, "rewards/rejected": 0.09874267131090164, "step": 472 }, { "epoch": 0.2550896588917352, "grad_norm": 6.564186096191406, "learning_rate": 4.6435646800179784e-07, "logits/chosen": 0.43850189447402954, "logits/rejected": -0.848135232925415, "logps/chosen": -193.87673950195312, "logps/rejected": -163.3260498046875, "loss": 0.6666, "rewards/accuracies": 0.625, "rewards/chosen": 0.06298589706420898, "rewards/margins": 0.06419382244348526, "rewards/rejected": -0.0012079253792762756, "step": 473 }, { "epoch": 0.25562896049615746, "grad_norm": 6.773420810699463, "learning_rate": 4.64113778721764e-07, "logits/chosen": 0.3528349995613098, "logits/rejected": -0.6309361457824707, "logps/chosen": -187.4700469970703, "logps/rejected": -161.04620361328125, "loss": 0.6652, "rewards/accuracies": 0.625, "rewards/chosen": 0.1130366325378418, "rewards/margins": 0.058067891746759415, "rewards/rejected": 0.05496874079108238, "step": 474 }, { "epoch": 0.25616826210057975, "grad_norm": 6.20838737487793, "learning_rate": 4.6387032989912385e-07, "logits/chosen": -0.8627215623855591, "logits/rejected": -1.4507941007614136, "logps/chosen": -196.92678833007812, "logps/rejected": -170.9830322265625, "loss": 0.6604, "rewards/accuracies": 0.625, "rewards/chosen": 0.10210895538330078, "rewards/margins": 0.07077455520629883, "rewards/rejected": 0.03133440017700195, "step": 475 }, { "epoch": 0.25670756370500203, "grad_norm": 6.788063049316406, "learning_rate": 4.6362612239748254e-07, "logits/chosen": 0.36536741256713867, "logits/rejected": -0.30294349789619446, "logps/chosen": -347.6169128417969, "logps/rejected": -334.212890625, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 0.11321373283863068, "rewards/margins": 0.026862338185310364, "rewards/rejected": 0.08635139465332031, "step": 476 }, { "epoch": 0.2572468653094243, "grad_norm": 8.6376314163208, "learning_rate": 4.6338115708313663e-07, "logits/chosen": 0.5009814500808716, "logits/rejected": -0.8296367526054382, "logps/chosen": -391.3541564941406, "logps/rejected": -198.30075073242188, "loss": 0.7176, "rewards/accuracies": 0.375, "rewards/chosen": 0.05142784118652344, "rewards/margins": -0.04593782499432564, "rewards/rejected": 0.09736566245555878, "step": 477 }, { "epoch": 0.2577861669138466, "grad_norm": 6.456984519958496, "learning_rate": 4.6313543482507056e-07, "logits/chosen": 0.17933909595012665, "logits/rejected": -0.6546423435211182, "logps/chosen": -223.8617706298828, "logps/rejected": -180.56118774414062, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.10823497921228409, "rewards/margins": 0.005049417726695538, "rewards/rejected": 0.10318556427955627, "step": 478 }, { "epoch": 0.2583254685182688, "grad_norm": 8.280733108520508, "learning_rate": 4.628889564949544e-07, "logits/chosen": 0.49050310254096985, "logits/rejected": -0.6479477286338806, "logps/chosen": -254.11756896972656, "logps/rejected": -209.74526977539062, "loss": 0.6585, "rewards/accuracies": 0.75, "rewards/chosen": 0.14264145493507385, "rewards/margins": 0.0767180472612381, "rewards/rejected": 0.06592340767383575, "step": 479 }, { "epoch": 0.2588647701226911, "grad_norm": 7.137206554412842, "learning_rate": 4.6264172296714e-07, "logits/chosen": 0.5555503964424133, "logits/rejected": 0.743564248085022, "logps/chosen": -241.19595336914062, "logps/rejected": -218.04942321777344, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": 0.09476719796657562, "rewards/margins": -0.020723532885313034, "rewards/rejected": 0.11549071967601776, "step": 480 }, { "epoch": 0.2594040717271134, "grad_norm": 6.2181220054626465, "learning_rate": 4.623937351186582e-07, "logits/chosen": -0.41029781103134155, "logits/rejected": 0.4492748975753784, "logps/chosen": -287.411865234375, "logps/rejected": -237.34970092773438, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": 0.09552697837352753, "rewards/margins": 0.021042540669441223, "rewards/rejected": 0.0744844377040863, "step": 481 }, { "epoch": 0.25994337333153567, "grad_norm": 7.211349010467529, "learning_rate": 4.621449938292158e-07, "logits/chosen": 0.3338015377521515, "logits/rejected": 0.6471433639526367, "logps/chosen": -364.65350341796875, "logps/rejected": -324.95062255859375, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": 0.04734516143798828, "rewards/margins": 0.02363872528076172, "rewards/rejected": 0.023706436157226562, "step": 482 }, { "epoch": 0.26048267493595795, "grad_norm": 6.697983741760254, "learning_rate": 4.618954999811923e-07, "logits/chosen": 0.06545805186033249, "logits/rejected": 0.8292796611785889, "logps/chosen": -273.1618347167969, "logps/rejected": -369.9500732421875, "loss": 0.6357, "rewards/accuracies": 0.75, "rewards/chosen": 0.18838366866111755, "rewards/margins": 0.1238628402352333, "rewards/rejected": 0.06452083587646484, "step": 483 }, { "epoch": 0.26102197654038023, "grad_norm": 6.967606067657471, "learning_rate": 4.6164525445963666e-07, "logits/chosen": -0.19935669004917145, "logits/rejected": -0.6955276131629944, "logps/chosen": -275.6037902832031, "logps/rejected": -242.71092224121094, "loss": 0.7161, "rewards/accuracies": 0.375, "rewards/chosen": 0.049130629748106, "rewards/margins": -0.043042901903390884, "rewards/rejected": 0.09217352420091629, "step": 484 }, { "epoch": 0.26156127814480246, "grad_norm": 7.310938835144043, "learning_rate": 4.613942581522646e-07, "logits/chosen": 0.5055464506149292, "logits/rejected": -0.18017929792404175, "logps/chosen": -380.3412170410156, "logps/rejected": -235.22882080078125, "loss": 0.7144, "rewards/accuracies": 0.375, "rewards/chosen": 0.08004283905029297, "rewards/margins": -0.038005828857421875, "rewards/rejected": 0.11804867535829544, "step": 485 }, { "epoch": 0.26210057974922474, "grad_norm": 7.847559452056885, "learning_rate": 4.611425119494551e-07, "logits/chosen": -0.19762486219406128, "logits/rejected": -0.4425765872001648, "logps/chosen": -198.66754150390625, "logps/rejected": -169.8974151611328, "loss": 0.6568, "rewards/accuracies": 0.625, "rewards/chosen": 0.11015758663415909, "rewards/margins": 0.08163246512413025, "rewards/rejected": 0.028525114059448242, "step": 486 }, { "epoch": 0.262639881353647, "grad_norm": 6.667302131652832, "learning_rate": 4.60890016744247e-07, "logits/chosen": 0.7275026440620422, "logits/rejected": -0.010479390621185303, "logps/chosen": -293.123046875, "logps/rejected": -218.7869415283203, "loss": 0.6902, "rewards/accuracies": 0.375, "rewards/chosen": 0.09082536399364471, "rewards/margins": 0.008562946692109108, "rewards/rejected": 0.08226242661476135, "step": 487 }, { "epoch": 0.2631791829580693, "grad_norm": 7.251954555511475, "learning_rate": 4.6063677343233644e-07, "logits/chosen": -0.8530893325805664, "logits/rejected": -1.2318122386932373, "logps/chosen": -241.80169677734375, "logps/rejected": -252.88978576660156, "loss": 0.6544, "rewards/accuracies": 0.75, "rewards/chosen": 0.16478300094604492, "rewards/margins": 0.08272619545459747, "rewards/rejected": 0.08205680549144745, "step": 488 }, { "epoch": 0.2637184845624916, "grad_norm": 7.839651584625244, "learning_rate": 4.6038278291207336e-07, "logits/chosen": 1.0050885677337646, "logits/rejected": 0.31807225942611694, "logps/chosen": -284.6713562011719, "logps/rejected": -208.2411346435547, "loss": 0.7109, "rewards/accuracies": 0.375, "rewards/chosen": 0.04279928281903267, "rewards/margins": -0.03278398886322975, "rewards/rejected": 0.07558327168226242, "step": 489 }, { "epoch": 0.26425778616691387, "grad_norm": 6.166024684906006, "learning_rate": 4.601280460844582e-07, "logits/chosen": 1.1343919038772583, "logits/rejected": 0.8591243028640747, "logps/chosen": -242.44955444335938, "logps/rejected": -309.0887451171875, "loss": 0.6581, "rewards/accuracies": 0.875, "rewards/chosen": 0.11627645045518875, "rewards/margins": 0.07434635609388351, "rewards/rejected": 0.04193010553717613, "step": 490 }, { "epoch": 0.2647970877713361, "grad_norm": 6.6571879386901855, "learning_rate": 4.5987256385313887e-07, "logits/chosen": 0.22793474793434143, "logits/rejected": -0.4236736595630646, "logps/chosen": -310.8161315917969, "logps/rejected": -266.61016845703125, "loss": 0.6638, "rewards/accuracies": 0.625, "rewards/chosen": 0.12313518673181534, "rewards/margins": 0.0630825087428093, "rewards/rejected": 0.06005268171429634, "step": 491 }, { "epoch": 0.2653363893757584, "grad_norm": 6.168181419372559, "learning_rate": 4.596163371244076e-07, "logits/chosen": 0.1704631894826889, "logits/rejected": -0.1016831174492836, "logps/chosen": -243.65225219726562, "logps/rejected": -244.79104614257812, "loss": 0.6869, "rewards/accuracies": 0.625, "rewards/chosen": 0.09348908066749573, "rewards/margins": 0.015112495981156826, "rewards/rejected": 0.07837657630443573, "step": 492 }, { "epoch": 0.26587569098018066, "grad_norm": 7.199425220489502, "learning_rate": 4.5935936680719745e-07, "logits/chosen": 0.39721664786338806, "logits/rejected": -0.39690327644348145, "logps/chosen": -277.7702941894531, "logps/rejected": -263.6070251464844, "loss": 0.6614, "rewards/accuracies": 0.625, "rewards/chosen": 0.12300454080104828, "rewards/margins": 0.06680937111377716, "rewards/rejected": 0.05619516596198082, "step": 493 }, { "epoch": 0.26641499258460294, "grad_norm": 7.200730800628662, "learning_rate": 4.591016538130795e-07, "logits/chosen": 0.745773196220398, "logits/rejected": -0.7921133637428284, "logps/chosen": -317.51904296875, "logps/rejected": -194.17327880859375, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": 0.0699453353881836, "rewards/margins": 0.016145994886755943, "rewards/rejected": 0.0537993386387825, "step": 494 }, { "epoch": 0.2669542941890252, "grad_norm": 6.737907409667969, "learning_rate": 4.5884319905625925e-07, "logits/chosen": 0.7443384528160095, "logits/rejected": 1.5788600444793701, "logps/chosen": -280.5777893066406, "logps/rejected": -295.47833251953125, "loss": 0.6719, "rewards/accuracies": 0.625, "rewards/chosen": 0.1440977156162262, "rewards/margins": 0.050081826746463776, "rewards/rejected": 0.09401588141918182, "step": 495 }, { "epoch": 0.2674935957934475, "grad_norm": 5.908520698547363, "learning_rate": 4.5858400345357353e-07, "logits/chosen": -0.36334267258644104, "logits/rejected": -1.2209806442260742, "logps/chosen": -151.4418182373047, "logps/rejected": -163.59866333007812, "loss": 0.6619, "rewards/accuracies": 0.625, "rewards/chosen": 0.11944837868213654, "rewards/margins": 0.06583166122436523, "rewards/rejected": 0.053616713732481, "step": 496 }, { "epoch": 0.2680328973978698, "grad_norm": 6.961855411529541, "learning_rate": 4.5832406792448725e-07, "logits/chosen": 0.15212324261665344, "logits/rejected": 0.307167649269104, "logps/chosen": -178.96044921875, "logps/rejected": -206.24371337890625, "loss": 0.7183, "rewards/accuracies": 0.5, "rewards/chosen": 0.03221549838781357, "rewards/margins": -0.04390105977654457, "rewards/rejected": 0.07611656188964844, "step": 497 }, { "epoch": 0.268572199002292, "grad_norm": 6.2520833015441895, "learning_rate": 4.580633933910901e-07, "logits/chosen": 0.5437613725662231, "logits/rejected": -0.4280567169189453, "logps/chosen": -353.7703857421875, "logps/rejected": -263.0242919921875, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": 0.11958561092615128, "rewards/margins": 0.012789439409971237, "rewards/rejected": 0.10679617524147034, "step": 498 }, { "epoch": 0.2691115006067143, "grad_norm": 8.157962799072266, "learning_rate": 4.578019807780932e-07, "logits/chosen": 0.5562778115272522, "logits/rejected": -0.022573281079530716, "logps/chosen": -304.6226501464844, "logps/rejected": -220.19036865234375, "loss": 0.6797, "rewards/accuracies": 0.625, "rewards/chosen": 0.0701436996459961, "rewards/margins": 0.03234148025512695, "rewards/rejected": 0.03780222311615944, "step": 499 }, { "epoch": 0.2696508022111366, "grad_norm": 7.093189239501953, "learning_rate": 4.575398310128262e-07, "logits/chosen": 1.792557954788208, "logits/rejected": 0.9535876512527466, "logps/chosen": -312.6296691894531, "logps/rejected": -269.78375244140625, "loss": 0.7148, "rewards/accuracies": 0.25, "rewards/chosen": 0.051151275634765625, "rewards/margins": -0.041470907628536224, "rewards/rejected": 0.09262219071388245, "step": 500 }, { "epoch": 0.27019010381555886, "grad_norm": 7.185142517089844, "learning_rate": 4.5727694502523344e-07, "logits/chosen": 0.8286672234535217, "logits/rejected": -0.4325653314590454, "logps/chosen": -250.57484436035156, "logps/rejected": -191.45053100585938, "loss": 0.7167, "rewards/accuracies": 0.5, "rewards/chosen": 0.04022340476512909, "rewards/margins": -0.0442267470061779, "rewards/rejected": 0.08445015549659729, "step": 501 }, { "epoch": 0.27072940541998114, "grad_norm": 7.429171085357666, "learning_rate": 4.57013323747871e-07, "logits/chosen": 0.6877495050430298, "logits/rejected": 0.04757431149482727, "logps/chosen": -237.89779663085938, "logps/rejected": -230.09393310546875, "loss": 0.6722, "rewards/accuracies": 0.5, "rewards/chosen": 0.16677379608154297, "rewards/margins": 0.050931937992572784, "rewards/rejected": 0.11584187299013138, "step": 502 }, { "epoch": 0.2712687070244034, "grad_norm": 7.7794508934021, "learning_rate": 4.5674896811590336e-07, "logits/chosen": 0.45763054490089417, "logits/rejected": -0.7522737979888916, "logps/chosen": -249.78518676757812, "logps/rejected": -203.1766357421875, "loss": 0.6502, "rewards/accuracies": 0.75, "rewards/chosen": 0.11529161036014557, "rewards/margins": 0.10207920521497726, "rewards/rejected": 0.013212397694587708, "step": 503 }, { "epoch": 0.27180800862882565, "grad_norm": 9.311572074890137, "learning_rate": 4.5648387906709995e-07, "logits/chosen": 0.551693856716156, "logits/rejected": -0.2344600111246109, "logps/chosen": -361.20989990234375, "logps/rejected": -375.52679443359375, "loss": 0.7473, "rewards/accuracies": 0.0, "rewards/chosen": -0.005584335420280695, "rewards/margins": -0.1040775328874588, "rewards/rejected": 0.09849319607019424, "step": 504 }, { "epoch": 0.27234731023324793, "grad_norm": 7.299941539764404, "learning_rate": 4.56218057541832e-07, "logits/chosen": -0.3259763717651367, "logits/rejected": 0.3734908998012543, "logps/chosen": -239.25013732910156, "logps/rejected": -319.7045593261719, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": 0.07623767852783203, "rewards/margins": 0.02930278889834881, "rewards/rejected": 0.04693489149212837, "step": 505 }, { "epoch": 0.2728866118376702, "grad_norm": 7.1865129470825195, "learning_rate": 4.55951504483069e-07, "logits/chosen": -1.339876413345337, "logits/rejected": 0.030967026948928833, "logps/chosen": -179.9476318359375, "logps/rejected": -214.17092895507812, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": 0.012676239013671875, "rewards/margins": -0.0401432029902935, "rewards/rejected": 0.05281944200396538, "step": 506 }, { "epoch": 0.2734259134420925, "grad_norm": 7.814990043640137, "learning_rate": 4.5568422083637555e-07, "logits/chosen": 0.6160321235656738, "logits/rejected": 0.3164424002170563, "logps/chosen": -201.16188049316406, "logps/rejected": -193.428955078125, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.07399788498878479, "rewards/margins": 0.009225942194461823, "rewards/rejected": 0.06477194279432297, "step": 507 }, { "epoch": 0.2739652150465148, "grad_norm": 6.246123790740967, "learning_rate": 4.5541620754990797e-07, "logits/chosen": 0.08677777647972107, "logits/rejected": -1.3809415102005005, "logps/chosen": -222.34915161132812, "logps/rejected": -151.52708435058594, "loss": 0.6725, "rewards/accuracies": 0.75, "rewards/chosen": 0.06351132690906525, "rewards/margins": 0.04555349797010422, "rewards/rejected": 0.017957832664251328, "step": 508 }, { "epoch": 0.27450451665093706, "grad_norm": 7.261342525482178, "learning_rate": 4.551474655744107e-07, "logits/chosen": 0.028408054262399673, "logits/rejected": -0.7542426586151123, "logps/chosen": -321.3362731933594, "logps/rejected": -321.86029052734375, "loss": 0.6629, "rewards/accuracies": 0.625, "rewards/chosen": 0.10726547241210938, "rewards/margins": 0.06627121567726135, "rewards/rejected": 0.04099426418542862, "step": 509 }, { "epoch": 0.2750438182553593, "grad_norm": 7.590214729309082, "learning_rate": 4.548779958632133e-07, "logits/chosen": 0.4339359402656555, "logits/rejected": -0.6814838647842407, "logps/chosen": -296.93780517578125, "logps/rejected": -214.11700439453125, "loss": 0.6509, "rewards/accuracies": 0.875, "rewards/chosen": 0.0895722433924675, "rewards/margins": 0.0891018882393837, "rewards/rejected": 0.0004703514277935028, "step": 510 }, { "epoch": 0.27558311985978157, "grad_norm": 7.327652931213379, "learning_rate": 4.546077993722268e-07, "logits/chosen": 1.4162437915802002, "logits/rejected": 0.02970665693283081, "logps/chosen": -259.075927734375, "logps/rejected": -221.54385375976562, "loss": 0.6959, "rewards/accuracies": 0.375, "rewards/chosen": 0.06558647751808167, "rewards/margins": -0.002466870006173849, "rewards/rejected": 0.06805334240198135, "step": 511 }, { "epoch": 0.27612242146420385, "grad_norm": 6.215558052062988, "learning_rate": 4.5433687705994053e-07, "logits/chosen": 0.09257781505584717, "logits/rejected": 0.1747371405363083, "logps/chosen": -200.38150024414062, "logps/rejected": -186.43475341796875, "loss": 0.7353, "rewards/accuracies": 0.125, "rewards/chosen": 0.07260351628065109, "rewards/margins": -0.08018694072961807, "rewards/rejected": 0.15279045701026917, "step": 512 }, { "epoch": 0.27666172306862613, "grad_norm": 7.193265438079834, "learning_rate": 4.5406522988741825e-07, "logits/chosen": 1.664480447769165, "logits/rejected": -0.02677369862794876, "logps/chosen": -310.5202941894531, "logps/rejected": -214.42755126953125, "loss": 0.6579, "rewards/accuracies": 0.75, "rewards/chosen": 0.10711383819580078, "rewards/margins": 0.07971544563770294, "rewards/rejected": 0.027398396283388138, "step": 513 }, { "epoch": 0.2772010246730484, "grad_norm": 6.551218509674072, "learning_rate": 4.537928588182955e-07, "logits/chosen": 0.010613195598125458, "logits/rejected": 0.4220626950263977, "logps/chosen": -189.34793090820312, "logps/rejected": -238.67578125, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": 0.07822684943675995, "rewards/margins": -0.019647125154733658, "rewards/rejected": 0.09787397086620331, "step": 514 }, { "epoch": 0.2777403262774707, "grad_norm": 7.7376933097839355, "learning_rate": 4.5351976481877534e-07, "logits/chosen": 0.478842556476593, "logits/rejected": -0.23967322707176208, "logps/chosen": -180.85047912597656, "logps/rejected": -184.91159057617188, "loss": 0.737, "rewards/accuracies": 0.25, "rewards/chosen": 0.014184094965457916, "rewards/margins": -0.07972431182861328, "rewards/rejected": 0.0939084067940712, "step": 515 }, { "epoch": 0.2782796278818929, "grad_norm": 8.47560977935791, "learning_rate": 4.5324594885762576e-07, "logits/chosen": 0.29644545912742615, "logits/rejected": -0.1019030511379242, "logps/chosen": -366.21673583984375, "logps/rejected": -341.0246887207031, "loss": 0.679, "rewards/accuracies": 0.875, "rewards/chosen": 0.02700214460492134, "rewards/margins": 0.03107595629990101, "rewards/rejected": -0.004073811694979668, "step": 516 }, { "epoch": 0.2788189294863152, "grad_norm": 7.032925128936768, "learning_rate": 4.529714119061754e-07, "logits/chosen": 0.1700747013092041, "logits/rejected": -0.5632091760635376, "logps/chosen": -290.310791015625, "logps/rejected": -246.93380737304688, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 0.1119704321026802, "rewards/margins": 0.02367411181330681, "rewards/rejected": 0.0882963240146637, "step": 517 }, { "epoch": 0.2793582310907375, "grad_norm": 7.034486770629883, "learning_rate": 4.526961549383108e-07, "logits/chosen": -0.2978000044822693, "logits/rejected": -0.22808454930782318, "logps/chosen": -286.1770935058594, "logps/rejected": -293.29010009765625, "loss": 0.7161, "rewards/accuracies": 0.375, "rewards/chosen": 0.07668371498584747, "rewards/margins": -0.03969907760620117, "rewards/rejected": 0.11638280004262924, "step": 518 }, { "epoch": 0.27989753269515977, "grad_norm": 7.200978755950928, "learning_rate": 4.5242017893047266e-07, "logits/chosen": -0.595328688621521, "logits/rejected": -1.0988932847976685, "logps/chosen": -264.6426086425781, "logps/rejected": -164.83602905273438, "loss": 0.71, "rewards/accuracies": 0.25, "rewards/chosen": -0.009674263186752796, "rewards/margins": -0.031828880310058594, "rewards/rejected": 0.02215461991727352, "step": 519 }, { "epoch": 0.28043683429958205, "grad_norm": 7.123920917510986, "learning_rate": 4.5214348486165227e-07, "logits/chosen": -0.324192613363266, "logits/rejected": -1.6090575456619263, "logps/chosen": -206.56289672851562, "logps/rejected": -176.34628295898438, "loss": 0.6396, "rewards/accuracies": 0.75, "rewards/chosen": 0.11579227447509766, "rewards/margins": 0.11992330849170685, "rewards/rejected": -0.004131030291318893, "step": 520 }, { "epoch": 0.28097613590400433, "grad_norm": 6.209903240203857, "learning_rate": 4.5186607371338817e-07, "logits/chosen": 0.3830757141113281, "logits/rejected": -0.1866755187511444, "logps/chosen": -235.16860961914062, "logps/rejected": -191.02557373046875, "loss": 0.6756, "rewards/accuracies": 0.75, "rewards/chosen": 0.10209731757640839, "rewards/margins": 0.03829565644264221, "rewards/rejected": 0.06380166858434677, "step": 521 }, { "epoch": 0.2815154375084266, "grad_norm": 7.067611217498779, "learning_rate": 4.5158794646976283e-07, "logits/chosen": 0.09728683531284332, "logits/rejected": -0.6607043743133545, "logps/chosen": -277.1498107910156, "logps/rejected": -235.18894958496094, "loss": 0.6763, "rewards/accuracies": 0.5, "rewards/chosen": 0.1057218536734581, "rewards/margins": 0.03760872036218643, "rewards/rejected": 0.06811313331127167, "step": 522 }, { "epoch": 0.28205473911284884, "grad_norm": 6.5906147956848145, "learning_rate": 4.513091041173986e-07, "logits/chosen": 1.1790670156478882, "logits/rejected": 0.3003544211387634, "logps/chosen": -363.6327819824219, "logps/rejected": -300.6464538574219, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": 0.15473727881908417, "rewards/margins": 0.1288326382637024, "rewards/rejected": 0.02590465545654297, "step": 523 }, { "epoch": 0.2825940407172711, "grad_norm": 7.257049083709717, "learning_rate": 4.510295476454552e-07, "logits/chosen": -0.7263363599777222, "logits/rejected": 0.17180195450782776, "logps/chosen": -250.1275177001953, "logps/rejected": -257.53216552734375, "loss": 0.7296, "rewards/accuracies": 0.375, "rewards/chosen": 0.05193805694580078, "rewards/margins": -0.06928940117359161, "rewards/rejected": 0.1212274581193924, "step": 524 }, { "epoch": 0.2831333423216934, "grad_norm": 7.5877556800842285, "learning_rate": 4.5074927804562486e-07, "logits/chosen": -1.172125220298767, "logits/rejected": -0.691711962223053, "logps/chosen": -194.5272216796875, "logps/rejected": -298.83502197265625, "loss": 0.6667, "rewards/accuracies": 0.75, "rewards/chosen": 0.1185576468706131, "rewards/margins": 0.05999382957816124, "rewards/rejected": 0.05856380984187126, "step": 525 }, { "epoch": 0.2836726439261157, "grad_norm": 8.002728462219238, "learning_rate": 4.504682963121301e-07, "logits/chosen": 0.7305630445480347, "logits/rejected": 0.23653535544872284, "logps/chosen": -322.79071044921875, "logps/rejected": -313.36016845703125, "loss": 0.6409, "rewards/accuracies": 0.75, "rewards/chosen": 0.15366192162036896, "rewards/margins": 0.11140661686658859, "rewards/rejected": 0.042255304753780365, "step": 526 }, { "epoch": 0.28421194553053797, "grad_norm": 6.87769079208374, "learning_rate": 4.5018660344171947e-07, "logits/chosen": -0.6256831288337708, "logits/rejected": -1.0025053024291992, "logps/chosen": -195.82199096679688, "logps/rejected": -169.24136352539062, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": 0.07403755187988281, "rewards/margins": 0.0506281852722168, "rewards/rejected": 0.023409368470311165, "step": 527 }, { "epoch": 0.28475124713496025, "grad_norm": 6.157224178314209, "learning_rate": 4.4990420043366415e-07, "logits/chosen": 0.4971553087234497, "logits/rejected": 0.534791886806488, "logps/chosen": -174.60150146484375, "logps/rejected": -170.55320739746094, "loss": 0.689, "rewards/accuracies": 0.5, "rewards/chosen": 0.0685497298836708, "rewards/margins": 0.012090682052075863, "rewards/rejected": 0.05645904690027237, "step": 528 }, { "epoch": 0.2852905487393825, "grad_norm": 7.323132514953613, "learning_rate": 4.4962108828975457e-07, "logits/chosen": 0.5848884582519531, "logits/rejected": -0.7058426737785339, "logps/chosen": -259.46356201171875, "logps/rejected": -172.37258911132812, "loss": 0.6947, "rewards/accuracies": 0.625, "rewards/chosen": 0.07527218014001846, "rewards/margins": 0.003465362824499607, "rewards/rejected": 0.07180681824684143, "step": 529 }, { "epoch": 0.28582985034380476, "grad_norm": 8.120479583740234, "learning_rate": 4.4933726801429655e-07, "logits/chosen": 0.4182187020778656, "logits/rejected": 0.8379035592079163, "logps/chosen": -260.4678955078125, "logps/rejected": -246.03067016601562, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.09033040702342987, "rewards/margins": 0.041741371154785156, "rewards/rejected": 0.04858903959393501, "step": 530 }, { "epoch": 0.28636915194822704, "grad_norm": 6.738630771636963, "learning_rate": 4.490527406141081e-07, "logits/chosen": 0.6110442280769348, "logits/rejected": 0.5129086971282959, "logps/chosen": -263.52471923828125, "logps/rejected": -249.03915405273438, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.0762203186750412, "rewards/margins": 0.009801864624023438, "rewards/rejected": 0.06641845405101776, "step": 531 }, { "epoch": 0.2869084535526493, "grad_norm": 6.326533317565918, "learning_rate": 4.487675070985155e-07, "logits/chosen": 0.6730331778526306, "logits/rejected": 0.03719504922628403, "logps/chosen": -202.3102569580078, "logps/rejected": -175.50906372070312, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.07556352764368057, "rewards/margins": 0.008326245471835136, "rewards/rejected": 0.06723728775978088, "step": 532 }, { "epoch": 0.2874477551570716, "grad_norm": 6.315502166748047, "learning_rate": 4.4848156847935005e-07, "logits/chosen": 0.6571420431137085, "logits/rejected": -0.019453376531600952, "logps/chosen": -267.8207702636719, "logps/rejected": -231.38197326660156, "loss": 0.6945, "rewards/accuracies": 0.375, "rewards/chosen": 0.10109902173280716, "rewards/margins": 0.005318263545632362, "rewards/rejected": 0.09578075259923935, "step": 533 }, { "epoch": 0.2879870567614939, "grad_norm": 6.275911808013916, "learning_rate": 4.481949257709442e-07, "logits/chosen": 0.1272127628326416, "logits/rejected": 0.14176279306411743, "logps/chosen": -248.7495574951172, "logps/rejected": -278.3167419433594, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": 0.1136406883597374, "rewards/margins": -0.020089052617549896, "rewards/rejected": 0.1337297409772873, "step": 534 }, { "epoch": 0.2885263583659161, "grad_norm": 6.643585205078125, "learning_rate": 4.479075799901282e-07, "logits/chosen": 0.22694310545921326, "logits/rejected": -0.9489772319793701, "logps/chosen": -299.3971862792969, "logps/rejected": -246.90283203125, "loss": 0.6812, "rewards/accuracies": 0.625, "rewards/chosen": 0.12239504605531693, "rewards/margins": 0.02997398003935814, "rewards/rejected": 0.09242105484008789, "step": 535 }, { "epoch": 0.2890656599703384, "grad_norm": 8.017497062683105, "learning_rate": 4.4761953215622615e-07, "logits/chosen": 0.3751024603843689, "logits/rejected": -0.044216930866241455, "logps/chosen": -216.73609924316406, "logps/rejected": -222.166015625, "loss": 0.6826, "rewards/accuracies": 0.625, "rewards/chosen": 0.08355875313282013, "rewards/margins": 0.02849598228931427, "rewards/rejected": 0.05506276711821556, "step": 536 }, { "epoch": 0.2896049615747607, "grad_norm": 7.786693572998047, "learning_rate": 4.473307832910529e-07, "logits/chosen": 0.2702327072620392, "logits/rejected": -0.5459766387939453, "logps/chosen": -196.53359985351562, "logps/rejected": -168.07199096679688, "loss": 0.7081, "rewards/accuracies": 0.625, "rewards/chosen": 0.0531800240278244, "rewards/margins": -0.025127792730927467, "rewards/rejected": 0.07830782234668732, "step": 537 }, { "epoch": 0.29014426317918296, "grad_norm": 7.344721794128418, "learning_rate": 4.470413344189098e-07, "logits/chosen": 0.13310639560222626, "logits/rejected": -0.9600553512573242, "logps/chosen": -283.4891357421875, "logps/rejected": -233.7626495361328, "loss": 0.683, "rewards/accuracies": 0.625, "rewards/chosen": 0.0953126922249794, "rewards/margins": 0.02386636659502983, "rewards/rejected": 0.07144632190465927, "step": 538 }, { "epoch": 0.29068356478360524, "grad_norm": 6.480898380279541, "learning_rate": 4.467511865665816e-07, "logits/chosen": 1.0764315128326416, "logits/rejected": 0.815995991230011, "logps/chosen": -252.09249877929688, "logps/rejected": -265.8923645019531, "loss": 0.7606, "rewards/accuracies": 0.25, "rewards/chosen": 0.06358508765697479, "rewards/margins": -0.12720128893852234, "rewards/rejected": 0.19078637659549713, "step": 539 }, { "epoch": 0.2912228663880275, "grad_norm": 7.857486724853516, "learning_rate": 4.4646034076333254e-07, "logits/chosen": 0.47655290365219116, "logits/rejected": 0.2380804419517517, "logps/chosen": -229.661376953125, "logps/rejected": -233.14630126953125, "loss": 0.6785, "rewards/accuracies": 0.5, "rewards/chosen": 0.12825261056423187, "rewards/margins": 0.03472500294446945, "rewards/rejected": 0.09352760016918182, "step": 540 }, { "epoch": 0.29176216799244975, "grad_norm": 7.698617458343506, "learning_rate": 4.461687980409028e-07, "logits/chosen": -0.41175538301467896, "logits/rejected": -0.8819028735160828, "logps/chosen": -262.73944091796875, "logps/rejected": -239.05145263671875, "loss": 0.6951, "rewards/accuracies": 0.625, "rewards/chosen": 0.10191507637500763, "rewards/margins": 0.002126786857843399, "rewards/rejected": 0.09978827834129333, "step": 541 }, { "epoch": 0.29230146959687203, "grad_norm": 5.998939514160156, "learning_rate": 4.458765594335047e-07, "logits/chosen": 0.025421470403671265, "logits/rejected": 0.455361008644104, "logps/chosen": -171.44503784179688, "logps/rejected": -228.13824462890625, "loss": 0.6659, "rewards/accuracies": 0.625, "rewards/chosen": 0.06549978256225586, "rewards/margins": 0.058152392506599426, "rewards/rejected": 0.007347390986979008, "step": 542 }, { "epoch": 0.2928407712012943, "grad_norm": 6.882619857788086, "learning_rate": 4.4558362597781925e-07, "logits/chosen": 0.8641858100891113, "logits/rejected": -0.9157354235649109, "logps/chosen": -403.128173828125, "logps/rejected": -246.20843505859375, "loss": 0.6527, "rewards/accuracies": 1.0, "rewards/chosen": 0.10536938160657883, "rewards/margins": 0.0835714340209961, "rewards/rejected": 0.021797940135002136, "step": 543 }, { "epoch": 0.2933800728057166, "grad_norm": 7.282421112060547, "learning_rate": 4.4528999871299224e-07, "logits/chosen": 0.5276063680648804, "logits/rejected": -0.31177347898483276, "logps/chosen": -228.54446411132812, "logps/rejected": -191.19056701660156, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": 0.13207073509693146, "rewards/margins": 0.04701319336891174, "rewards/rejected": 0.08505754172801971, "step": 544 }, { "epoch": 0.2939193744101389, "grad_norm": 6.507221221923828, "learning_rate": 4.449956786806306e-07, "logits/chosen": 0.3836410939693451, "logits/rejected": -0.3897758424282074, "logps/chosen": -255.3372039794922, "logps/rejected": -244.5368194580078, "loss": 0.6992, "rewards/accuracies": 0.375, "rewards/chosen": 0.0830455794930458, "rewards/margins": -0.009006500244140625, "rewards/rejected": 0.09205207973718643, "step": 545 }, { "epoch": 0.29445867601456116, "grad_norm": 7.858604431152344, "learning_rate": 4.4470066692479903e-07, "logits/chosen": 0.2546898126602173, "logits/rejected": -1.2093006372451782, "logps/chosen": -252.02565002441406, "logps/rejected": -171.18260192871094, "loss": 0.6664, "rewards/accuracies": 0.75, "rewards/chosen": 0.1611279547214508, "rewards/margins": 0.0595855787396431, "rewards/rejected": 0.10154237598180771, "step": 546 }, { "epoch": 0.29499797761898344, "grad_norm": 5.817309379577637, "learning_rate": 4.444049644920158e-07, "logits/chosen": 0.7483332753181458, "logits/rejected": 0.8423135876655579, "logps/chosen": -171.98341369628906, "logps/rejected": -209.59042358398438, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": 0.11968708038330078, "rewards/margins": 0.01936321333050728, "rewards/rejected": 0.1003238707780838, "step": 547 }, { "epoch": 0.29553727922340567, "grad_norm": 6.094029426574707, "learning_rate": 4.4410857243124933e-07, "logits/chosen": 0.418576717376709, "logits/rejected": 0.09928800910711288, "logps/chosen": -186.42652893066406, "logps/rejected": -181.73171997070312, "loss": 0.7081, "rewards/accuracies": 0.25, "rewards/chosen": 0.0593351349234581, "rewards/margins": -0.02921438217163086, "rewards/rejected": 0.08854952454566956, "step": 548 }, { "epoch": 0.29607658082782795, "grad_norm": 7.255157947540283, "learning_rate": 4.4381149179391446e-07, "logits/chosen": 0.23280055820941925, "logits/rejected": 0.6662101745605469, "logps/chosen": -283.9414367675781, "logps/rejected": -301.0223693847656, "loss": 0.6574, "rewards/accuracies": 0.75, "rewards/chosen": 0.11677560955286026, "rewards/margins": 0.07736597210168839, "rewards/rejected": 0.039409637451171875, "step": 549 }, { "epoch": 0.29661588243225023, "grad_norm": 7.723174571990967, "learning_rate": 4.435137236338687e-07, "logits/chosen": 0.2633057236671448, "logits/rejected": -0.38561707735061646, "logps/chosen": -250.8154296875, "logps/rejected": -218.99131774902344, "loss": 0.6537, "rewards/accuracies": 0.625, "rewards/chosen": 0.0925566628575325, "rewards/margins": 0.08366402983665466, "rewards/rejected": 0.008892633020877838, "step": 550 }, { "epoch": 0.2971551840366725, "grad_norm": 6.330657005310059, "learning_rate": 4.4321526900740837e-07, "logits/chosen": 0.2732391059398651, "logits/rejected": -0.6858182549476624, "logps/chosen": -223.95901489257812, "logps/rejected": -235.09783935546875, "loss": 0.66, "rewards/accuracies": 0.5, "rewards/chosen": 0.11571255326271057, "rewards/margins": 0.0735052078962326, "rewards/rejected": 0.04220733791589737, "step": 551 }, { "epoch": 0.2976944856410948, "grad_norm": 8.335531234741211, "learning_rate": 4.429161289732649e-07, "logits/chosen": 0.37265872955322266, "logits/rejected": -0.18157456815242767, "logps/chosen": -210.87738037109375, "logps/rejected": -170.09445190429688, "loss": 0.6694, "rewards/accuracies": 0.75, "rewards/chosen": 0.09210357815027237, "rewards/margins": 0.051414065062999725, "rewards/rejected": 0.040689513087272644, "step": 552 }, { "epoch": 0.2982337872455171, "grad_norm": 9.341516494750977, "learning_rate": 4.4261630459260136e-07, "logits/chosen": 0.14987236261367798, "logits/rejected": 0.49522367119789124, "logps/chosen": -327.8892822265625, "logps/rejected": -324.403076171875, "loss": 0.7344, "rewards/accuracies": 0.25, "rewards/chosen": 0.04268340766429901, "rewards/margins": -0.0771753340959549, "rewards/rejected": 0.1198587492108345, "step": 553 }, { "epoch": 0.2987730888499393, "grad_norm": 7.038036346435547, "learning_rate": 4.423157969290081e-07, "logits/chosen": 1.1664437055587769, "logits/rejected": 0.11163356900215149, "logps/chosen": -254.24278259277344, "logps/rejected": -234.70828247070312, "loss": 0.6553, "rewards/accuracies": 0.75, "rewards/chosen": 0.10684490203857422, "rewards/margins": 0.08251456916332245, "rewards/rejected": 0.02433033287525177, "step": 554 }, { "epoch": 0.2993123904543616, "grad_norm": 7.90248966217041, "learning_rate": 4.420146070484997e-07, "logits/chosen": 0.48804131150245667, "logits/rejected": -0.09489759802818298, "logps/chosen": -259.60595703125, "logps/rejected": -240.62701416015625, "loss": 0.7184, "rewards/accuracies": 0.625, "rewards/chosen": 0.03271027281880379, "rewards/margins": -0.044107627123594284, "rewards/rejected": 0.07681789249181747, "step": 555 }, { "epoch": 0.29985169205878387, "grad_norm": 6.984221935272217, "learning_rate": 4.4171273601951064e-07, "logits/chosen": -0.4022127389907837, "logits/rejected": 0.3127640187740326, "logps/chosen": -258.93603515625, "logps/rejected": -340.5013122558594, "loss": 0.7122, "rewards/accuracies": 0.375, "rewards/chosen": 0.07113686203956604, "rewards/margins": -0.03377962112426758, "rewards/rejected": 0.10491648316383362, "step": 556 }, { "epoch": 0.30039099366320615, "grad_norm": 7.234861850738525, "learning_rate": 4.414101849128916e-07, "logits/chosen": 0.793062686920166, "logits/rejected": -0.2701895236968994, "logps/chosen": -323.5763854980469, "logps/rejected": -336.86370849609375, "loss": 0.7109, "rewards/accuracies": 0.25, "rewards/chosen": 0.041594311594963074, "rewards/margins": -0.03340530768036842, "rewards/rejected": 0.0749996155500412, "step": 557 }, { "epoch": 0.30093029526762843, "grad_norm": 8.095574378967285, "learning_rate": 4.4110695480190597e-07, "logits/chosen": -0.3851381540298462, "logits/rejected": 1.0592715740203857, "logps/chosen": -201.73863220214844, "logps/rejected": -192.2958526611328, "loss": 0.7331, "rewards/accuracies": 0.375, "rewards/chosen": 0.04681835323572159, "rewards/margins": -0.07405701279640198, "rewards/rejected": 0.12087536603212357, "step": 558 }, { "epoch": 0.3014695968720507, "grad_norm": 7.561378002166748, "learning_rate": 4.408030467622256e-07, "logits/chosen": 0.6563804745674133, "logits/rejected": -0.6160975098609924, "logps/chosen": -327.71417236328125, "logps/rejected": -305.7525329589844, "loss": 0.6601, "rewards/accuracies": 0.75, "rewards/chosen": 0.12991124391555786, "rewards/margins": 0.07444705814123154, "rewards/rejected": 0.05546416714787483, "step": 559 }, { "epoch": 0.30200889847647294, "grad_norm": 7.280357837677002, "learning_rate": 4.404984618719274e-07, "logits/chosen": 0.6257022023200989, "logits/rejected": -1.3543320894241333, "logps/chosen": -276.3218994140625, "logps/rejected": -148.42047119140625, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.07648611068725586, "rewards/margins": 0.01708674430847168, "rewards/rejected": 0.05939936637878418, "step": 560 }, { "epoch": 0.3025482000808952, "grad_norm": 6.989957332611084, "learning_rate": 4.401932012114893e-07, "logits/chosen": 0.7971677780151367, "logits/rejected": 0.2752721309661865, "logps/chosen": -205.76568603515625, "logps/rejected": -223.87826538085938, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": 0.0902584120631218, "rewards/margins": 0.0053304703906178474, "rewards/rejected": 0.08492793887853622, "step": 561 }, { "epoch": 0.3030875016853175, "grad_norm": 6.920658588409424, "learning_rate": 4.398872658637863e-07, "logits/chosen": 0.4105834364891052, "logits/rejected": -0.8370625972747803, "logps/chosen": -247.56973266601562, "logps/rejected": -221.530029296875, "loss": 0.669, "rewards/accuracies": 0.625, "rewards/chosen": 0.11003074795007706, "rewards/margins": 0.05154590308666229, "rewards/rejected": 0.058484841138124466, "step": 562 }, { "epoch": 0.3036268032897398, "grad_norm": 7.472982883453369, "learning_rate": 4.3958065691408685e-07, "logits/chosen": 0.8332287073135376, "logits/rejected": 1.3058934211730957, "logps/chosen": -258.47100830078125, "logps/rejected": -247.1056365966797, "loss": 0.6581, "rewards/accuracies": 0.75, "rewards/chosen": 0.14005565643310547, "rewards/margins": 0.07417707145214081, "rewards/rejected": 0.06587857753038406, "step": 563 }, { "epoch": 0.30416610489416207, "grad_norm": 6.707991123199463, "learning_rate": 4.3927337545004894e-07, "logits/chosen": 0.153589129447937, "logits/rejected": -1.4013370275497437, "logps/chosen": -224.98828125, "logps/rejected": -179.00372314453125, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": 0.09586601704359055, "rewards/margins": 0.07279806584119797, "rewards/rejected": 0.023067951202392578, "step": 564 }, { "epoch": 0.30470540649858435, "grad_norm": 6.987203121185303, "learning_rate": 4.3896542256171634e-07, "logits/chosen": 0.23447948694229126, "logits/rejected": 0.029784195125102997, "logps/chosen": -222.7562713623047, "logps/rejected": -234.16835021972656, "loss": 0.7082, "rewards/accuracies": 0.625, "rewards/chosen": 0.08623190224170685, "rewards/margins": -0.023373892530798912, "rewards/rejected": 0.10960578918457031, "step": 565 }, { "epoch": 0.3052447081030066, "grad_norm": 6.302793979644775, "learning_rate": 4.3865679934151433e-07, "logits/chosen": 0.6880659461021423, "logits/rejected": 1.270774006843567, "logps/chosen": -184.70828247070312, "logps/rejected": -236.49386596679688, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": 0.05689048767089844, "rewards/margins": 0.013707926496863365, "rewards/rejected": 0.04318256676197052, "step": 566 }, { "epoch": 0.30578400970742886, "grad_norm": 7.176483631134033, "learning_rate": 4.383475068842464e-07, "logits/chosen": 0.4021707773208618, "logits/rejected": -0.816571831703186, "logps/chosen": -243.34881591796875, "logps/rejected": -210.88723754882812, "loss": 0.6978, "rewards/accuracies": 0.625, "rewards/chosen": 0.05366206169128418, "rewards/margins": -0.0024780742824077606, "rewards/rejected": 0.05614013597369194, "step": 567 }, { "epoch": 0.30632331131185114, "grad_norm": 7.851681232452393, "learning_rate": 4.3803754628708987e-07, "logits/chosen": 0.9754573106765747, "logits/rejected": -0.6699434518814087, "logps/chosen": -249.35946655273438, "logps/rejected": -218.40220642089844, "loss": 0.65, "rewards/accuracies": 0.75, "rewards/chosen": 0.10816384106874466, "rewards/margins": 0.09117288887500763, "rewards/rejected": 0.01699094846844673, "step": 568 }, { "epoch": 0.3068626129162734, "grad_norm": 8.945590019226074, "learning_rate": 4.377269186495923e-07, "logits/chosen": -0.14392106235027313, "logits/rejected": 0.12389056384563446, "logps/chosen": -358.9559326171875, "logps/rejected": -307.21759033203125, "loss": 0.7119, "rewards/accuracies": 0.375, "rewards/chosen": 0.1286056637763977, "rewards/margins": -0.032366178929805756, "rewards/rejected": 0.16097185015678406, "step": 569 }, { "epoch": 0.3074019145206957, "grad_norm": 6.884068489074707, "learning_rate": 4.374156250736675e-07, "logits/chosen": 0.137131005525589, "logits/rejected": -0.00422077439725399, "logps/chosen": -221.028564453125, "logps/rejected": -220.23178100585938, "loss": 0.7089, "rewards/accuracies": 0.25, "rewards/chosen": 0.09424476325511932, "rewards/margins": -0.02936229482293129, "rewards/rejected": 0.12360706180334091, "step": 570 }, { "epoch": 0.307941216125118, "grad_norm": 8.23255729675293, "learning_rate": 4.371036666635916e-07, "logits/chosen": -0.7547909021377563, "logits/rejected": -0.939976692199707, "logps/chosen": -187.43011474609375, "logps/rejected": -261.232177734375, "loss": 0.7284, "rewards/accuracies": 0.375, "rewards/chosen": 0.019048498943448067, "rewards/margins": -0.05963592603802681, "rewards/rejected": 0.07868442684412003, "step": 571 }, { "epoch": 0.30848051772954027, "grad_norm": 6.961707592010498, "learning_rate": 4.36791044525999e-07, "logits/chosen": 0.6033744812011719, "logits/rejected": -0.4668518900871277, "logps/chosen": -300.29913330078125, "logps/rejected": -209.8974609375, "loss": 0.6458, "rewards/accuracies": 0.75, "rewards/chosen": 0.14676788449287415, "rewards/margins": 0.11135177314281464, "rewards/rejected": 0.0354161262512207, "step": 572 }, { "epoch": 0.3090198193339625, "grad_norm": 7.363255977630615, "learning_rate": 4.36477759769879e-07, "logits/chosen": -0.4540369212627411, "logits/rejected": -1.1680140495300293, "logps/chosen": -318.94439697265625, "logps/rejected": -252.4499053955078, "loss": 0.7091, "rewards/accuracies": 0.375, "rewards/chosen": 0.05976381152868271, "rewards/margins": -0.029482368379831314, "rewards/rejected": 0.08924618363380432, "step": 573 }, { "epoch": 0.3095591209383848, "grad_norm": 6.865524768829346, "learning_rate": 4.3616381350657104e-07, "logits/chosen": 0.5423776507377625, "logits/rejected": -0.23473793268203735, "logps/chosen": -288.71453857421875, "logps/rejected": -228.84654235839844, "loss": 0.6927, "rewards/accuracies": 0.75, "rewards/chosen": 0.06900273263454437, "rewards/margins": 0.005478478968143463, "rewards/rejected": 0.06352424621582031, "step": 574 }, { "epoch": 0.31009842254280706, "grad_norm": 6.348721504211426, "learning_rate": 4.3584920684976135e-07, "logits/chosen": 1.2425332069396973, "logits/rejected": 1.3157280683517456, "logps/chosen": -219.5159912109375, "logps/rejected": -215.56961059570312, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": 0.11620121449232101, "rewards/margins": 0.020728489384055138, "rewards/rejected": 0.09547272324562073, "step": 575 }, { "epoch": 0.31063772414722934, "grad_norm": 6.481983661651611, "learning_rate": 4.355339409154788e-07, "logits/chosen": 1.2290172576904297, "logits/rejected": 0.6221218109130859, "logps/chosen": -257.6651306152344, "logps/rejected": -236.4681854248047, "loss": 0.6599, "rewards/accuracies": 0.75, "rewards/chosen": 0.16064205765724182, "rewards/margins": 0.0686735212802887, "rewards/rejected": 0.09196853637695312, "step": 576 }, { "epoch": 0.3111770257516516, "grad_norm": 6.600047588348389, "learning_rate": 4.352180168220909e-07, "logits/chosen": 0.6255033612251282, "logits/rejected": -0.8213228583335876, "logps/chosen": -207.70004272460938, "logps/rejected": -163.0606689453125, "loss": 0.6679, "rewards/accuracies": 0.375, "rewards/chosen": 0.1302211880683899, "rewards/margins": 0.054554034024477005, "rewards/rejected": 0.07566714286804199, "step": 577 }, { "epoch": 0.3117163273560739, "grad_norm": 6.959785461425781, "learning_rate": 4.3490143569030017e-07, "logits/chosen": 0.6958807706832886, "logits/rejected": -0.3824461102485657, "logps/chosen": -260.88201904296875, "logps/rejected": -296.7778625488281, "loss": 0.6321, "rewards/accuracies": 0.5, "rewards/chosen": 0.17501068115234375, "rewards/margins": 0.13710355758666992, "rewards/rejected": 0.03790712356567383, "step": 578 }, { "epoch": 0.31225562896049613, "grad_norm": 7.770034313201904, "learning_rate": 4.3458419864313957e-07, "logits/chosen": 0.6470165252685547, "logits/rejected": -0.7410808801651001, "logps/chosen": -298.60479736328125, "logps/rejected": -288.8153991699219, "loss": 0.6559, "rewards/accuracies": 0.5, "rewards/chosen": 0.10625743865966797, "rewards/margins": 0.08561259508132935, "rewards/rejected": 0.02064485475420952, "step": 579 }, { "epoch": 0.3127949305649184, "grad_norm": 6.701475620269775, "learning_rate": 4.342663068059689e-07, "logits/chosen": 0.40584051609039307, "logits/rejected": 0.1929965615272522, "logps/chosen": -259.96966552734375, "logps/rejected": -283.3058776855469, "loss": 0.7046, "rewards/accuracies": 0.625, "rewards/chosen": 0.10329237580299377, "rewards/margins": -0.014500336721539497, "rewards/rejected": 0.11779271066188812, "step": 580 }, { "epoch": 0.3133342321693407, "grad_norm": 6.065678596496582, "learning_rate": 4.33947761306471e-07, "logits/chosen": 0.6177975535392761, "logits/rejected": 0.031432297080755234, "logps/chosen": -269.6898193359375, "logps/rejected": -222.04489135742188, "loss": 0.7278, "rewards/accuracies": 0.375, "rewards/chosen": 0.05523720011115074, "rewards/margins": -0.06138649210333824, "rewards/rejected": 0.11662369221448898, "step": 581 }, { "epoch": 0.313873533773763, "grad_norm": 8.910660743713379, "learning_rate": 4.336285632746471e-07, "logits/chosen": -0.6462715268135071, "logits/rejected": -0.46227219700813293, "logps/chosen": -257.09527587890625, "logps/rejected": -285.19866943359375, "loss": 0.6988, "rewards/accuracies": 0.375, "rewards/chosen": 0.06765852123498917, "rewards/margins": -0.0044403038918972015, "rewards/rejected": 0.07209882885217667, "step": 582 }, { "epoch": 0.31441283537818526, "grad_norm": 7.055410385131836, "learning_rate": 4.3330871384281366e-07, "logits/chosen": -0.8362792134284973, "logits/rejected": -0.5395189523696899, "logps/chosen": -199.5119171142578, "logps/rejected": -181.02574157714844, "loss": 0.6915, "rewards/accuracies": 0.375, "rewards/chosen": 0.09136296063661575, "rewards/margins": 0.014379506930708885, "rewards/rejected": 0.07698345184326172, "step": 583 }, { "epoch": 0.31495213698260754, "grad_norm": 8.364042282104492, "learning_rate": 4.329882141455974e-07, "logits/chosen": 0.3895821273326874, "logits/rejected": -0.5845776200294495, "logps/chosen": -297.20928955078125, "logps/rejected": -358.896484375, "loss": 0.6692, "rewards/accuracies": 0.375, "rewards/chosen": 0.11437530815601349, "rewards/margins": 0.06135158985853195, "rewards/rejected": 0.05302371829748154, "step": 584 }, { "epoch": 0.31549143858702977, "grad_norm": 6.815564155578613, "learning_rate": 4.3266706531993225e-07, "logits/chosen": 0.7342793345451355, "logits/rejected": -0.29900017380714417, "logps/chosen": -309.8541564941406, "logps/rejected": -188.65296936035156, "loss": 0.6564, "rewards/accuracies": 0.625, "rewards/chosen": 0.1392311155796051, "rewards/margins": 0.08181505650281906, "rewards/rejected": 0.05741605907678604, "step": 585 }, { "epoch": 0.31603074019145205, "grad_norm": 7.343289852142334, "learning_rate": 4.323452685050545e-07, "logits/chosen": -0.39099931716918945, "logits/rejected": -0.2674649953842163, "logps/chosen": -203.62844848632812, "logps/rejected": -209.64959716796875, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": 0.09567814320325851, "rewards/margins": 0.00014925003051757812, "rewards/rejected": 0.09552889317274094, "step": 586 }, { "epoch": 0.31657004179587434, "grad_norm": 7.006906032562256, "learning_rate": 4.320228248424994e-07, "logits/chosen": -0.2351691722869873, "logits/rejected": -0.42485159635543823, "logps/chosen": -263.6348876953125, "logps/rejected": -267.14141845703125, "loss": 0.7088, "rewards/accuracies": 0.375, "rewards/chosen": 0.04219837114214897, "rewards/margins": -0.027815058827400208, "rewards/rejected": 0.07001343369483948, "step": 587 }, { "epoch": 0.3171093434002966, "grad_norm": 6.2134246826171875, "learning_rate": 4.3169973547609644e-07, "logits/chosen": -0.9432584047317505, "logits/rejected": -0.0910138338804245, "logps/chosen": -205.2262725830078, "logps/rejected": -238.2607421875, "loss": 0.6899, "rewards/accuracies": 0.375, "rewards/chosen": 0.13486257195472717, "rewards/margins": 0.009099729359149933, "rewards/rejected": 0.12576285004615784, "step": 588 }, { "epoch": 0.3176486450047189, "grad_norm": 7.204646587371826, "learning_rate": 4.313760015519661e-07, "logits/chosen": -0.020453665405511856, "logits/rejected": -0.6195744872093201, "logps/chosen": -226.78952026367188, "logps/rejected": -218.67794799804688, "loss": 0.661, "rewards/accuracies": 0.5, "rewards/chosen": 0.13076171278953552, "rewards/margins": 0.06985616683959961, "rewards/rejected": 0.06090555340051651, "step": 589 }, { "epoch": 0.3181879466091412, "grad_norm": 7.284700870513916, "learning_rate": 4.310516242185149e-07, "logits/chosen": -0.7305541038513184, "logits/rejected": -0.7711143493652344, "logps/chosen": -162.48251342773438, "logps/rejected": -188.6340789794922, "loss": 0.6702, "rewards/accuracies": 0.625, "rewards/chosen": 0.12299223244190216, "rewards/margins": 0.05215072259306908, "rewards/rejected": 0.07084150612354279, "step": 590 }, { "epoch": 0.3187272482135634, "grad_norm": 8.09624195098877, "learning_rate": 4.307266046264322e-07, "logits/chosen": 0.8073479533195496, "logits/rejected": 0.512790322303772, "logps/chosen": -255.85658264160156, "logps/rejected": -214.6768035888672, "loss": 0.728, "rewards/accuracies": 0.25, "rewards/chosen": 0.04457607492804527, "rewards/margins": -0.05694140866398811, "rewards/rejected": 0.10151748359203339, "step": 591 }, { "epoch": 0.3192665498179857, "grad_norm": 7.4116058349609375, "learning_rate": 4.304009439286854e-07, "logits/chosen": -0.02704434096813202, "logits/rejected": -0.8409602642059326, "logps/chosen": -205.52157592773438, "logps/rejected": -155.84332275390625, "loss": 0.72, "rewards/accuracies": 0.375, "rewards/chosen": 0.06656980514526367, "rewards/margins": -0.04906883090734482, "rewards/rejected": 0.11563863605260849, "step": 592 }, { "epoch": 0.31980585142240797, "grad_norm": 6.486231803894043, "learning_rate": 4.3007464328051634e-07, "logits/chosen": 0.12399131059646606, "logits/rejected": 0.03043264150619507, "logps/chosen": -187.19793701171875, "logps/rejected": -208.04769897460938, "loss": 0.6774, "rewards/accuracies": 0.5, "rewards/chosen": 0.15279167890548706, "rewards/margins": 0.03886737674474716, "rewards/rejected": 0.11392431706190109, "step": 593 }, { "epoch": 0.32034515302683025, "grad_norm": 7.031278133392334, "learning_rate": 4.297477038394368e-07, "logits/chosen": -0.4206901788711548, "logits/rejected": -0.5926955938339233, "logps/chosen": -191.99343872070312, "logps/rejected": -185.70166015625, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": 0.1418568193912506, "rewards/margins": 0.015319157391786575, "rewards/rejected": 0.12653765082359314, "step": 594 }, { "epoch": 0.32088445463125254, "grad_norm": 7.589864253997803, "learning_rate": 4.2942012676522473e-07, "logits/chosen": 0.12166494131088257, "logits/rejected": -0.10803680121898651, "logps/chosen": -255.91070556640625, "logps/rejected": -221.94158935546875, "loss": 0.6665, "rewards/accuracies": 0.625, "rewards/chosen": 0.10445347428321838, "rewards/margins": 0.06044946238398552, "rewards/rejected": 0.044004011899232864, "step": 595 }, { "epoch": 0.3214237562356748, "grad_norm": 11.895929336547852, "learning_rate": 4.2909191321992e-07, "logits/chosen": -0.25860512256622314, "logits/rejected": -0.756062924861908, "logps/chosen": -250.02450561523438, "logps/rejected": -241.09664916992188, "loss": 0.7259, "rewards/accuracies": 0.25, "rewards/chosen": 0.058535002171993256, "rewards/margins": -0.062489889562129974, "rewards/rejected": 0.12102489918470383, "step": 596 }, { "epoch": 0.3219630578400971, "grad_norm": 7.06307315826416, "learning_rate": 4.287630643678203e-07, "logits/chosen": -0.4696665406227112, "logits/rejected": 0.7127017974853516, "logps/chosen": -288.76458740234375, "logps/rejected": -266.11151123046875, "loss": 0.6992, "rewards/accuracies": 0.375, "rewards/chosen": 0.13915330171585083, "rewards/margins": -0.007753180339932442, "rewards/rejected": 0.14690646529197693, "step": 597 }, { "epoch": 0.3225023594445193, "grad_norm": 6.340827941894531, "learning_rate": 4.284335813754769e-07, "logits/chosen": 0.8562440276145935, "logits/rejected": 0.9728953242301941, "logps/chosen": -230.66815185546875, "logps/rejected": -265.36334228515625, "loss": 0.6582, "rewards/accuracies": 0.75, "rewards/chosen": 0.1507326066493988, "rewards/margins": 0.07255324721336365, "rewards/rejected": 0.07817936688661575, "step": 598 }, { "epoch": 0.3230416610489416, "grad_norm": 6.922592639923096, "learning_rate": 4.2810346541169073e-07, "logits/chosen": 0.8995894193649292, "logits/rejected": 1.06847083568573, "logps/chosen": -325.12945556640625, "logps/rejected": -381.759765625, "loss": 0.7022, "rewards/accuracies": 0.625, "rewards/chosen": 0.1275438368320465, "rewards/margins": -0.012306973338127136, "rewards/rejected": 0.13985081017017365, "step": 599 }, { "epoch": 0.3235809626533639, "grad_norm": 7.296640396118164, "learning_rate": 4.27772717647508e-07, "logits/chosen": 0.9847571849822998, "logits/rejected": -0.1995711326599121, "logps/chosen": -324.3867492675781, "logps/rejected": -249.35662841796875, "loss": 0.647, "rewards/accuracies": 0.625, "rewards/chosen": 0.14785461127758026, "rewards/margins": 0.10059948265552521, "rewards/rejected": 0.04725513607263565, "step": 600 }, { "epoch": 0.3235809626533639, "eval_logits/chosen": 1.3656604290008545, "eval_logits/rejected": 1.0919525623321533, "eval_logps/chosen": -250.2683868408203, "eval_logps/rejected": -234.98260498046875, "eval_loss": 0.6813110113143921, "eval_rewards/accuracies": 0.5844720602035522, "eval_rewards/chosen": 0.11830902844667435, "eval_rewards/margins": 0.030314499512314796, "eval_rewards/rejected": 0.0879945382475853, "eval_runtime": 836.6079, "eval_samples_per_second": 1.924, "eval_steps_per_second": 0.962, "step": 600 }, { "epoch": 0.3241202642577862, "grad_norm": 6.2786545753479, "learning_rate": 4.274413392562163e-07, "logits/chosen": -0.10899145901203156, "logits/rejected": -0.27682557702064514, "logps/chosen": -198.83245849609375, "logps/rejected": -181.95388793945312, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": 0.12034101039171219, "rewards/margins": 0.009901810437440872, "rewards/rejected": 0.11043921113014221, "step": 601 }, { "epoch": 0.32465956586220845, "grad_norm": 7.75433874130249, "learning_rate": 4.2710933141334003e-07, "logits/chosen": 0.15866199135780334, "logits/rejected": -1.2913832664489746, "logps/chosen": -273.6816711425781, "logps/rejected": -223.90399169921875, "loss": 0.6961, "rewards/accuracies": 0.375, "rewards/chosen": 0.11349543929100037, "rewards/margins": -0.00307393167167902, "rewards/rejected": 0.116569384932518, "step": 602 }, { "epoch": 0.32519886746663074, "grad_norm": 7.337603569030762, "learning_rate": 4.2677669529663686e-07, "logits/chosen": 0.8527324199676514, "logits/rejected": -0.8457676768302917, "logps/chosen": -260.04791259765625, "logps/rejected": -209.3777618408203, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": 0.12967529892921448, "rewards/margins": 0.04993715509772301, "rewards/rejected": 0.07973814010620117, "step": 603 }, { "epoch": 0.32573816907105296, "grad_norm": 8.81859302520752, "learning_rate": 4.2644343208609286e-07, "logits/chosen": -0.9052882194519043, "logits/rejected": -0.24009670317173004, "logps/chosen": -278.75604248046875, "logps/rejected": -212.80438232421875, "loss": 0.729, "rewards/accuracies": 0.25, "rewards/chosen": 0.04285307228565216, "rewards/margins": -0.06763296574354172, "rewards/rejected": 0.11048603057861328, "step": 604 }, { "epoch": 0.32627747067547525, "grad_norm": 7.766134262084961, "learning_rate": 4.2610954296391876e-07, "logits/chosen": 1.0273587703704834, "logits/rejected": 0.19947147369384766, "logps/chosen": -274.3766784667969, "logps/rejected": -300.0667724609375, "loss": 0.7108, "rewards/accuracies": 0.375, "rewards/chosen": 0.0786685049533844, "rewards/margins": -0.025431059300899506, "rewards/rejected": 0.10409955680370331, "step": 605 }, { "epoch": 0.3268167722798975, "grad_norm": 7.184971809387207, "learning_rate": 4.2577502911454565e-07, "logits/chosen": 0.4998660087585449, "logits/rejected": -0.21711230278015137, "logps/chosen": -211.7227325439453, "logps/rejected": -190.69171142578125, "loss": 0.7014, "rewards/accuracies": 0.75, "rewards/chosen": 0.1195157989859581, "rewards/margins": -0.006993204355239868, "rewards/rejected": 0.12650901079177856, "step": 606 }, { "epoch": 0.3273560738843198, "grad_norm": 7.109960556030273, "learning_rate": 4.254398917246208e-07, "logits/chosen": -0.24025708436965942, "logits/rejected": -0.12119320034980774, "logps/chosen": -221.9161834716797, "logps/rejected": -298.21710205078125, "loss": 0.695, "rewards/accuracies": 0.625, "rewards/chosen": 0.09527626633644104, "rewards/margins": -0.0019070645794272423, "rewards/rejected": 0.09718332439661026, "step": 607 }, { "epoch": 0.3278953754887421, "grad_norm": 7.040911674499512, "learning_rate": 4.251041319830033e-07, "logits/chosen": -0.5689091086387634, "logits/rejected": 0.30074524879455566, "logps/chosen": -232.92794799804688, "logps/rejected": -395.3621520996094, "loss": 0.6581, "rewards/accuracies": 0.75, "rewards/chosen": 0.13419896364212036, "rewards/margins": 0.07554512470960617, "rewards/rejected": 0.05865383520722389, "step": 608 }, { "epoch": 0.3284346770931644, "grad_norm": 6.6701436042785645, "learning_rate": 4.2476775108076015e-07, "logits/chosen": 0.7244387865066528, "logits/rejected": 0.43084338307380676, "logps/chosen": -238.638427734375, "logps/rejected": -213.0946502685547, "loss": 0.6781, "rewards/accuracies": 0.875, "rewards/chosen": 0.16019201278686523, "rewards/margins": 0.036214448511600494, "rewards/rejected": 0.12397757172584534, "step": 609 }, { "epoch": 0.3289739786975866, "grad_norm": 7.379875183105469, "learning_rate": 4.2443075021116157e-07, "logits/chosen": 0.40510687232017517, "logits/rejected": 0.23357123136520386, "logps/chosen": -203.8571014404297, "logps/rejected": -193.57681274414062, "loss": 0.712, "rewards/accuracies": 0.375, "rewards/chosen": 0.0806090384721756, "rewards/margins": -0.025533676147460938, "rewards/rejected": 0.10614272207021713, "step": 610 }, { "epoch": 0.3295132803020089, "grad_norm": 6.347879886627197, "learning_rate": 4.240931305696772e-07, "logits/chosen": 0.5368471145629883, "logits/rejected": -1.1083781719207764, "logps/chosen": -292.7464294433594, "logps/rejected": -204.46096801757812, "loss": 0.7076, "rewards/accuracies": 0.5, "rewards/chosen": 0.12701281905174255, "rewards/margins": -0.017159566283226013, "rewards/rejected": 0.14417238533496857, "step": 611 }, { "epoch": 0.33005258190643116, "grad_norm": 7.093725681304932, "learning_rate": 4.237548933539718e-07, "logits/chosen": -0.7645618915557861, "logits/rejected": 0.29170840978622437, "logps/chosen": -227.52655029296875, "logps/rejected": -355.31768798828125, "loss": 0.6602, "rewards/accuracies": 0.875, "rewards/chosen": 0.14677409827709198, "rewards/margins": 0.068273164331913, "rewards/rejected": 0.07850094139575958, "step": 612 }, { "epoch": 0.33059188351085345, "grad_norm": 7.37656307220459, "learning_rate": 4.2341603976390073e-07, "logits/chosen": -0.054111748933792114, "logits/rejected": -1.199847936630249, "logps/chosen": -202.8127899169922, "logps/rejected": -144.06170654296875, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.11235284805297852, "rewards/margins": 0.0346376858651638, "rewards/rejected": 0.07771515846252441, "step": 613 }, { "epoch": 0.33113118511527573, "grad_norm": 5.558141231536865, "learning_rate": 4.2307657100150576e-07, "logits/chosen": 0.0819002240896225, "logits/rejected": -0.8648250699043274, "logps/chosen": -199.13516235351562, "logps/rejected": -161.72369384765625, "loss": 0.6572, "rewards/accuracies": 0.625, "rewards/chosen": 0.1383434236049652, "rewards/margins": 0.08092079311609268, "rewards/rejected": 0.05742264166474342, "step": 614 }, { "epoch": 0.331670486719698, "grad_norm": 9.975825309753418, "learning_rate": 4.227364882710113e-07, "logits/chosen": 0.9833521842956543, "logits/rejected": -1.6807987689971924, "logps/chosen": -380.0235900878906, "logps/rejected": -185.38565063476562, "loss": 0.702, "rewards/accuracies": 0.625, "rewards/chosen": 0.0663909986615181, "rewards/margins": -0.0058607980608940125, "rewards/rejected": 0.07225179672241211, "step": 615 }, { "epoch": 0.33220978832412024, "grad_norm": 6.750873565673828, "learning_rate": 4.2239579277881943e-07, "logits/chosen": 1.1705106496810913, "logits/rejected": -0.3611442446708679, "logps/chosen": -305.0623474121094, "logps/rejected": -192.49449157714844, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": 0.11285534501075745, "rewards/margins": 0.0021207816898822784, "rewards/rejected": 0.11073455959558487, "step": 616 }, { "epoch": 0.3327490899285425, "grad_norm": 7.303081512451172, "learning_rate": 4.220544857335059e-07, "logits/chosen": -0.43156158924102783, "logits/rejected": -0.7975560426712036, "logps/chosen": -229.775634765625, "logps/rejected": -239.14781188964844, "loss": 0.6773, "rewards/accuracies": 0.5, "rewards/chosen": 0.1277034729719162, "rewards/margins": 0.03822498768568039, "rewards/rejected": 0.0894784927368164, "step": 617 }, { "epoch": 0.3332883915329648, "grad_norm": 8.150777816772461, "learning_rate": 4.217125683458161e-07, "logits/chosen": -0.7638305425643921, "logits/rejected": 1.224173665046692, "logps/chosen": -197.8125, "logps/rejected": -342.7564392089844, "loss": 0.7118, "rewards/accuracies": 0.5, "rewards/chosen": 0.060766879469156265, "rewards/margins": -0.028272632509469986, "rewards/rejected": 0.08903951197862625, "step": 618 }, { "epoch": 0.3338276931373871, "grad_norm": 7.570537090301514, "learning_rate": 4.2137004182866034e-07, "logits/chosen": 0.5080280303955078, "logits/rejected": -0.6863911151885986, "logps/chosen": -258.9812927246094, "logps/rejected": -260.62689208984375, "loss": 0.6586, "rewards/accuracies": 0.625, "rewards/chosen": 0.10931643843650818, "rewards/margins": 0.0778077095746994, "rewards/rejected": 0.031508732587099075, "step": 619 }, { "epoch": 0.33436699474180936, "grad_norm": 6.32940149307251, "learning_rate": 4.2102690739710975e-07, "logits/chosen": 0.5716466307640076, "logits/rejected": 0.5718950629234314, "logps/chosen": -276.3244934082031, "logps/rejected": -254.17970275878906, "loss": 0.6534, "rewards/accuracies": 0.75, "rewards/chosen": 0.1689077466726303, "rewards/margins": 0.0866469293832779, "rewards/rejected": 0.08226080238819122, "step": 620 }, { "epoch": 0.33490629634623165, "grad_norm": 7.33854341506958, "learning_rate": 4.2068316626839216e-07, "logits/chosen": 0.45073390007019043, "logits/rejected": 0.2598947584629059, "logps/chosen": -291.9530029296875, "logps/rejected": -291.38018798828125, "loss": 0.6898, "rewards/accuracies": 0.625, "rewards/chosen": 0.09892988204956055, "rewards/margins": 0.010870072990655899, "rewards/rejected": 0.08805980533361435, "step": 621 }, { "epoch": 0.33544559795065393, "grad_norm": 7.46140193939209, "learning_rate": 4.2033881966188734e-07, "logits/chosen": 0.5341659188270569, "logits/rejected": -0.8368080258369446, "logps/chosen": -251.19644165039062, "logps/rejected": -189.6043701171875, "loss": 0.6417, "rewards/accuracies": 0.875, "rewards/chosen": 0.1376544088125229, "rewards/margins": 0.10995922237634659, "rewards/rejected": 0.027695177122950554, "step": 622 }, { "epoch": 0.33598489955507616, "grad_norm": 6.346938133239746, "learning_rate": 4.1999386879912306e-07, "logits/chosen": 0.14561396837234497, "logits/rejected": 0.08612965047359467, "logps/chosen": -173.71417236328125, "logps/rejected": -192.78367614746094, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": 0.0889713317155838, "rewards/margins": -0.04745092988014221, "rewards/rejected": 0.13642224669456482, "step": 623 }, { "epoch": 0.33652420115949844, "grad_norm": 6.22803258895874, "learning_rate": 4.196483149037706e-07, "logits/chosen": -0.3541274070739746, "logits/rejected": -0.09838428348302841, "logps/chosen": -241.1786346435547, "logps/rejected": -198.81619262695312, "loss": 0.6482, "rewards/accuracies": 0.75, "rewards/chosen": 0.1603793203830719, "rewards/margins": 0.09791870415210724, "rewards/rejected": 0.06246061623096466, "step": 624 }, { "epoch": 0.3370635027639207, "grad_norm": 7.129199028015137, "learning_rate": 4.193021592016404e-07, "logits/chosen": 0.9335537552833557, "logits/rejected": 0.42540666460990906, "logps/chosen": -253.46746826171875, "logps/rejected": -206.14703369140625, "loss": 0.6628, "rewards/accuracies": 0.625, "rewards/chosen": 0.14885026216506958, "rewards/margins": 0.06503792107105255, "rewards/rejected": 0.08381232619285583, "step": 625 }, { "epoch": 0.337602804368343, "grad_norm": 6.781412124633789, "learning_rate": 4.1895540292067763e-07, "logits/chosen": 0.5029339790344238, "logits/rejected": -0.02584916353225708, "logps/chosen": -323.60406494140625, "logps/rejected": -254.4581298828125, "loss": 0.6287, "rewards/accuracies": 0.625, "rewards/chosen": 0.1829279065132141, "rewards/margins": 0.145043283700943, "rewards/rejected": 0.03788462281227112, "step": 626 }, { "epoch": 0.3381421059727653, "grad_norm": 7.107027053833008, "learning_rate": 4.1860804729095817e-07, "logits/chosen": -0.2831658124923706, "logits/rejected": -1.5945096015930176, "logps/chosen": -336.1997375488281, "logps/rejected": -235.7979278564453, "loss": 0.7098, "rewards/accuracies": 0.25, "rewards/chosen": 0.0805848091840744, "rewards/margins": -0.030807975679636, "rewards/rejected": 0.1113927885890007, "step": 627 }, { "epoch": 0.33868140757718757, "grad_norm": 6.101095676422119, "learning_rate": 4.182600935446839e-07, "logits/chosen": 0.25705793499946594, "logits/rejected": 0.23228739202022552, "logps/chosen": -213.77780151367188, "logps/rejected": -207.1191864013672, "loss": 0.6659, "rewards/accuracies": 0.75, "rewards/chosen": 0.1538858413696289, "rewards/margins": 0.05908384919166565, "rewards/rejected": 0.09480199962854385, "step": 628 }, { "epoch": 0.3392207091816098, "grad_norm": 5.054253578186035, "learning_rate": 4.1791154291617836e-07, "logits/chosen": 0.2992972433567047, "logits/rejected": -0.5158688426017761, "logps/chosen": -218.169189453125, "logps/rejected": -211.85467529296875, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": 0.15157242119312286, "rewards/margins": 0.057265181094408035, "rewards/rejected": 0.09430723637342453, "step": 629 }, { "epoch": 0.3397600107860321, "grad_norm": 7.292877197265625, "learning_rate": 4.175623966418827e-07, "logits/chosen": 1.1392712593078613, "logits/rejected": 0.3600119948387146, "logps/chosen": -252.7994842529297, "logps/rejected": -215.797119140625, "loss": 0.6847, "rewards/accuracies": 0.75, "rewards/chosen": 0.0702541321516037, "rewards/margins": 0.01966075599193573, "rewards/rejected": 0.05059337988495827, "step": 630 }, { "epoch": 0.34029931239045436, "grad_norm": 6.534811496734619, "learning_rate": 4.172126559603507e-07, "logits/chosen": -1.099869966506958, "logits/rejected": -0.7525999546051025, "logps/chosen": -169.20571899414062, "logps/rejected": -247.546630859375, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": 0.10687685012817383, "rewards/margins": 0.01058654673397541, "rewards/rejected": 0.09629030525684357, "step": 631 }, { "epoch": 0.34083861399487664, "grad_norm": 7.182568073272705, "learning_rate": 4.1686232211224504e-07, "logits/chosen": 0.3530278503894806, "logits/rejected": -0.033142536878585815, "logps/chosen": -239.90872192382812, "logps/rejected": -252.65391540527344, "loss": 0.7092, "rewards/accuracies": 0.25, "rewards/chosen": 0.14775362610816956, "rewards/margins": -0.026249030604958534, "rewards/rejected": 0.17400264739990234, "step": 632 }, { "epoch": 0.3413779155992989, "grad_norm": 6.8634514808654785, "learning_rate": 4.165113963403325e-07, "logits/chosen": -0.33843567967414856, "logits/rejected": 0.11740422248840332, "logps/chosen": -267.33551025390625, "logps/rejected": -257.41180419921875, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 0.13952654600143433, "rewards/margins": 0.0565711110830307, "rewards/rejected": 0.08295545727014542, "step": 633 }, { "epoch": 0.3419172172037212, "grad_norm": 6.839290618896484, "learning_rate": 4.1615987988947944e-07, "logits/chosen": 0.3397829532623291, "logits/rejected": -0.8635229468345642, "logps/chosen": -220.13388061523438, "logps/rejected": -165.63232421875, "loss": 0.6748, "rewards/accuracies": 0.75, "rewards/chosen": 0.15676994621753693, "rewards/margins": 0.043332576751708984, "rewards/rejected": 0.11343736946582794, "step": 634 }, { "epoch": 0.34245651880814343, "grad_norm": 8.402804374694824, "learning_rate": 4.158077740066479e-07, "logits/chosen": 0.008630931377410889, "logits/rejected": 0.1700807809829712, "logps/chosen": -225.49044799804688, "logps/rejected": -232.23802185058594, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": 0.14302854239940643, "rewards/margins": -0.0102018341422081, "rewards/rejected": 0.15323038399219513, "step": 635 }, { "epoch": 0.3429958204125657, "grad_norm": 7.876453399658203, "learning_rate": 4.154550799408906e-07, "logits/chosen": -1.0499725341796875, "logits/rejected": -0.012530863285064697, "logps/chosen": -207.5320587158203, "logps/rejected": -209.29135131835938, "loss": 0.7261, "rewards/accuracies": 0.5, "rewards/chosen": 0.028132393956184387, "rewards/margins": -0.05338578298687935, "rewards/rejected": 0.08151817321777344, "step": 636 }, { "epoch": 0.343535122016988, "grad_norm": 7.5076584815979, "learning_rate": 4.1510179894334696e-07, "logits/chosen": -0.34374916553497314, "logits/rejected": -0.024840116500854492, "logps/chosen": -246.308837890625, "logps/rejected": -301.9224853515625, "loss": 0.6957, "rewards/accuracies": 0.375, "rewards/chosen": 0.08273563534021378, "rewards/margins": -0.00281524658203125, "rewards/rejected": 0.08555088192224503, "step": 637 }, { "epoch": 0.3440744236214103, "grad_norm": 9.35781478881836, "learning_rate": 4.1474793226723825e-07, "logits/chosen": 0.7212384939193726, "logits/rejected": -0.4611467123031616, "logps/chosen": -369.5855407714844, "logps/rejected": -280.6455078125, "loss": 0.6851, "rewards/accuracies": 0.75, "rewards/chosen": 0.17801342904567719, "rewards/margins": 0.021371841430664062, "rewards/rejected": 0.15664157271385193, "step": 638 }, { "epoch": 0.34461372522583256, "grad_norm": 7.184645175933838, "learning_rate": 4.1439348116786363e-07, "logits/chosen": -0.17123720049858093, "logits/rejected": -0.8840218186378479, "logps/chosen": -231.88656616210938, "logps/rejected": -202.85946655273438, "loss": 0.6585, "rewards/accuracies": 0.75, "rewards/chosen": 0.16259297728538513, "rewards/margins": 0.07498884946107864, "rewards/rejected": 0.08760414272546768, "step": 639 }, { "epoch": 0.34515302683025484, "grad_norm": 7.102171421051025, "learning_rate": 4.140384469025954e-07, "logits/chosen": 0.3108949065208435, "logits/rejected": -0.30008235573768616, "logps/chosen": -218.15628051757812, "logps/rejected": -224.87692260742188, "loss": 0.6988, "rewards/accuracies": 0.5, "rewards/chosen": 0.1446901261806488, "rewards/margins": -0.005250167101621628, "rewards/rejected": 0.14994029700756073, "step": 640 }, { "epoch": 0.3456923284346771, "grad_norm": 7.370213031768799, "learning_rate": 4.1368283073087406e-07, "logits/chosen": 0.08263781666755676, "logits/rejected": -0.7920692563056946, "logps/chosen": -256.37738037109375, "logps/rejected": -202.60321044921875, "loss": 0.669, "rewards/accuracies": 0.5, "rewards/chosen": 0.16241951286792755, "rewards/margins": 0.06275521218776703, "rewards/rejected": 0.09966430068016052, "step": 641 }, { "epoch": 0.34623163003909935, "grad_norm": 7.773151397705078, "learning_rate": 4.133266339142051e-07, "logits/chosen": 0.8491486310958862, "logits/rejected": 0.806330680847168, "logps/chosen": -304.10546875, "logps/rejected": -273.81585693359375, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 0.061330318450927734, "rewards/margins": 0.013003254309296608, "rewards/rejected": 0.04832706227898598, "step": 642 }, { "epoch": 0.34677093164352163, "grad_norm": 6.661800384521484, "learning_rate": 4.129698577161534e-07, "logits/chosen": 0.9065155982971191, "logits/rejected": 0.07849207520484924, "logps/chosen": -195.296630859375, "logps/rejected": -180.19692993164062, "loss": 0.6978, "rewards/accuracies": 0.625, "rewards/chosen": 0.09760494530200958, "rewards/margins": -0.006766980513930321, "rewards/rejected": 0.10437192022800446, "step": 643 }, { "epoch": 0.3473102332479439, "grad_norm": 7.634369373321533, "learning_rate": 4.1261250340233913e-07, "logits/chosen": 0.7593842148780823, "logits/rejected": 0.049960434436798096, "logps/chosen": -342.1758728027344, "logps/rejected": -228.71627807617188, "loss": 0.6708, "rewards/accuracies": 0.5, "rewards/chosen": 0.1395397186279297, "rewards/margins": 0.04702873155474663, "rewards/rejected": 0.09251099079847336, "step": 644 }, { "epoch": 0.3478495348523662, "grad_norm": 8.004140853881836, "learning_rate": 4.122545722404331e-07, "logits/chosen": -0.18135559558868408, "logits/rejected": 0.57820725440979, "logps/chosen": -244.820068359375, "logps/rejected": -320.4314880371094, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": 0.09543609619140625, "rewards/margins": 0.024050328880548477, "rewards/rejected": 0.07138577103614807, "step": 645 }, { "epoch": 0.3483888364567885, "grad_norm": 6.736301422119141, "learning_rate": 4.118960655001529e-07, "logits/chosen": -0.21985584497451782, "logits/rejected": 0.5602186918258667, "logps/chosen": -198.5083770751953, "logps/rejected": -221.89608764648438, "loss": 0.6975, "rewards/accuracies": 0.5, "rewards/chosen": 0.11704178154468536, "rewards/margins": -0.005498120561242104, "rewards/rejected": 0.12253990024328232, "step": 646 }, { "epoch": 0.34892813806121076, "grad_norm": 7.411139488220215, "learning_rate": 4.1153698445325745e-07, "logits/chosen": 0.3506886065006256, "logits/rejected": -0.11802643537521362, "logps/chosen": -212.44378662109375, "logps/rejected": -240.92291259765625, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": 0.07603597640991211, "rewards/margins": -0.008935928344726562, "rewards/rejected": 0.08497190475463867, "step": 647 }, { "epoch": 0.349467439665633, "grad_norm": 8.932652473449707, "learning_rate": 4.1117733037354313e-07, "logits/chosen": -0.6465153694152832, "logits/rejected": -1.2412159442901611, "logps/chosen": -280.9736328125, "logps/rejected": -320.7649841308594, "loss": 0.6329, "rewards/accuracies": 0.75, "rewards/chosen": 0.18706494569778442, "rewards/margins": 0.13527803122997284, "rewards/rejected": 0.05178689956665039, "step": 648 }, { "epoch": 0.35000674127005527, "grad_norm": 6.432776927947998, "learning_rate": 4.108171045368391e-07, "logits/chosen": 0.6513592600822449, "logits/rejected": -0.3164811432361603, "logps/chosen": -215.05349731445312, "logps/rejected": -178.2601318359375, "loss": 0.6515, "rewards/accuracies": 0.875, "rewards/chosen": 0.16835995018482208, "rewards/margins": 0.08745966106653214, "rewards/rejected": 0.08090028166770935, "step": 649 }, { "epoch": 0.35054604287447755, "grad_norm": 8.619903564453125, "learning_rate": 4.1045630822100274e-07, "logits/chosen": 0.6399524211883545, "logits/rejected": -0.1071038693189621, "logps/chosen": -312.970458984375, "logps/rejected": -305.86566162109375, "loss": 0.7188, "rewards/accuracies": 0.375, "rewards/chosen": 0.10240097343921661, "rewards/margins": -0.04641089588403702, "rewards/rejected": 0.14881186187267303, "step": 650 }, { "epoch": 0.35108534447889983, "grad_norm": 7.564671993255615, "learning_rate": 4.1009494270591506e-07, "logits/chosen": -1.2592718601226807, "logits/rejected": -0.3329123556613922, "logps/chosen": -245.9640350341797, "logps/rejected": -223.0111083984375, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": 0.04169635847210884, "rewards/margins": -0.0150285754352808, "rewards/rejected": 0.05672493577003479, "step": 651 }, { "epoch": 0.3516246460833221, "grad_norm": 6.66050386428833, "learning_rate": 4.097330092734764e-07, "logits/chosen": 1.1099973917007446, "logits/rejected": 0.5086526870727539, "logps/chosen": -243.56893920898438, "logps/rejected": -243.64495849609375, "loss": 0.7047, "rewards/accuracies": 0.375, "rewards/chosen": 0.046172335743904114, "rewards/margins": -0.017931178212165833, "rewards/rejected": 0.06410351395606995, "step": 652 }, { "epoch": 0.3521639476877444, "grad_norm": 7.184839248657227, "learning_rate": 4.093705092076016e-07, "logits/chosen": -0.34670472145080566, "logits/rejected": -1.4159961938858032, "logps/chosen": -290.3460388183594, "logps/rejected": -198.84759521484375, "loss": 0.711, "rewards/accuracies": 0.375, "rewards/chosen": 0.10297785699367523, "rewards/margins": -0.02705850638449192, "rewards/rejected": 0.1300363540649414, "step": 653 }, { "epoch": 0.3527032492921666, "grad_norm": 7.8568596839904785, "learning_rate": 4.090074437942155e-07, "logits/chosen": -0.7856915593147278, "logits/rejected": 0.2949948012828827, "logps/chosen": -265.59368896484375, "logps/rejected": -308.6920166015625, "loss": 0.7223, "rewards/accuracies": 0.25, "rewards/chosen": 0.0907713919878006, "rewards/margins": -0.05488567799329758, "rewards/rejected": 0.14565707743167877, "step": 654 }, { "epoch": 0.3532425508965889, "grad_norm": 6.211113929748535, "learning_rate": 4.086438143212487e-07, "logits/chosen": 0.39479130506515503, "logits/rejected": 0.41658324003219604, "logps/chosen": -216.6243133544922, "logps/rejected": -229.06158447265625, "loss": 0.6786, "rewards/accuracies": 0.625, "rewards/chosen": 0.10563144832849503, "rewards/margins": 0.03265819698572159, "rewards/rejected": 0.07297325134277344, "step": 655 }, { "epoch": 0.3537818525010112, "grad_norm": 6.685275077819824, "learning_rate": 4.082796220786323e-07, "logits/chosen": -0.21247391402721405, "logits/rejected": -0.6198834180831909, "logps/chosen": -202.2766876220703, "logps/rejected": -219.36776733398438, "loss": 0.669, "rewards/accuracies": 0.625, "rewards/chosen": 0.175203338265419, "rewards/margins": 0.05601511150598526, "rewards/rejected": 0.11918821930885315, "step": 656 }, { "epoch": 0.35432115410543347, "grad_norm": 6.2634735107421875, "learning_rate": 4.0791486835829423e-07, "logits/chosen": 0.8415133357048035, "logits/rejected": 0.49255457520484924, "logps/chosen": -226.52706909179688, "logps/rejected": -174.03273010253906, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": 0.09195481240749359, "rewards/margins": -0.0069890012964606285, "rewards/rejected": 0.0989438146352768, "step": 657 }, { "epoch": 0.35486045570985575, "grad_norm": 8.995536804199219, "learning_rate": 4.0754955445415396e-07, "logits/chosen": 0.24255067110061646, "logits/rejected": -0.18611174821853638, "logps/chosen": -370.3765563964844, "logps/rejected": -296.1278076171875, "loss": 0.6642, "rewards/accuracies": 0.75, "rewards/chosen": 0.22723828256130219, "rewards/margins": 0.0696401596069336, "rewards/rejected": 0.1575981229543686, "step": 658 }, { "epoch": 0.35539975731427803, "grad_norm": 7.50978946685791, "learning_rate": 4.0718368166211807e-07, "logits/chosen": -0.3023432195186615, "logits/rejected": 0.5523338317871094, "logps/chosen": -214.98321533203125, "logps/rejected": -283.6015319824219, "loss": 0.7439, "rewards/accuracies": 0.125, "rewards/chosen": 0.11675272136926651, "rewards/margins": -0.09545394033193588, "rewards/rejected": 0.2122066617012024, "step": 659 }, { "epoch": 0.35593905891870026, "grad_norm": 6.862890720367432, "learning_rate": 4.068172512800759e-07, "logits/chosen": 0.3527814745903015, "logits/rejected": -0.40360236167907715, "logps/chosen": -209.77301025390625, "logps/rejected": -185.69845581054688, "loss": 0.7094, "rewards/accuracies": 0.375, "rewards/chosen": 0.05804329365491867, "rewards/margins": -0.02689380571246147, "rewards/rejected": 0.08493709564208984, "step": 660 }, { "epoch": 0.35647836052312254, "grad_norm": 6.443294048309326, "learning_rate": 4.064502646078947e-07, "logits/chosen": -0.7714328169822693, "logits/rejected": -0.7311117649078369, "logps/chosen": -206.8424530029297, "logps/rejected": -203.427001953125, "loss": 0.6899, "rewards/accuracies": 0.375, "rewards/chosen": 0.042174722999334335, "rewards/margins": 0.010302449576556683, "rewards/rejected": 0.03187227249145508, "step": 661 }, { "epoch": 0.3570176621275448, "grad_norm": 6.585171222686768, "learning_rate": 4.0608272294741495e-07, "logits/chosen": -0.029344886541366577, "logits/rejected": -0.8344898223876953, "logps/chosen": -218.83084106445312, "logps/rejected": -240.1602783203125, "loss": 0.6344, "rewards/accuracies": 0.875, "rewards/chosen": 0.14543123543262482, "rewards/margins": 0.12352123111486435, "rewards/rejected": 0.02191000059247017, "step": 662 }, { "epoch": 0.3575569637319671, "grad_norm": 7.667994976043701, "learning_rate": 4.0571462760244626e-07, "logits/chosen": 0.6178479790687561, "logits/rejected": -2.0470073223114014, "logps/chosen": -322.2120056152344, "logps/rejected": -204.81805419921875, "loss": 0.6306, "rewards/accuracies": 1.0, "rewards/chosen": 0.19023293256759644, "rewards/margins": 0.1325773298740387, "rewards/rejected": 0.05765562132000923, "step": 663 }, { "epoch": 0.3580962653363894, "grad_norm": 7.024248123168945, "learning_rate": 4.0534597987876183e-07, "logits/chosen": 0.12415990978479385, "logits/rejected": -0.6389439105987549, "logps/chosen": -174.64675903320312, "logps/rejected": -169.52989196777344, "loss": 0.7007, "rewards/accuracies": 0.5, "rewards/chosen": 0.05793027579784393, "rewards/margins": -0.006118301302194595, "rewards/rejected": 0.06404857337474823, "step": 664 }, { "epoch": 0.35863556694081167, "grad_norm": 6.618443489074707, "learning_rate": 4.049767810840949e-07, "logits/chosen": -0.6309881806373596, "logits/rejected": -1.0372744798660278, "logps/chosen": -204.93475341796875, "logps/rejected": -182.6759033203125, "loss": 0.7249, "rewards/accuracies": 0.375, "rewards/chosen": 0.06930617988109589, "rewards/margins": -0.05723858252167702, "rewards/rejected": 0.1265447735786438, "step": 665 }, { "epoch": 0.35917486854523395, "grad_norm": 8.261268615722656, "learning_rate": 4.046070325281332e-07, "logits/chosen": 0.24161463975906372, "logits/rejected": -1.3425389528274536, "logps/chosen": -283.8684997558594, "logps/rejected": -233.95541381835938, "loss": 0.6452, "rewards/accuracies": 0.75, "rewards/chosen": 0.14216738939285278, "rewards/margins": 0.10266037285327911, "rewards/rejected": 0.039507001638412476, "step": 666 }, { "epoch": 0.3597141701496562, "grad_norm": 7.101573467254639, "learning_rate": 4.0423673552251487e-07, "logits/chosen": 0.1631816327571869, "logits/rejected": -0.2881859242916107, "logps/chosen": -237.9154052734375, "logps/rejected": -218.6690673828125, "loss": 0.6625, "rewards/accuracies": 0.75, "rewards/chosen": 0.16845540702342987, "rewards/margins": 0.06744280457496643, "rewards/rejected": 0.10101261734962463, "step": 667 }, { "epoch": 0.36025347175407846, "grad_norm": 8.623574256896973, "learning_rate": 4.038658913808234e-07, "logits/chosen": 0.9888397455215454, "logits/rejected": 0.258731871843338, "logps/chosen": -274.4478454589844, "logps/rejected": -248.45831298828125, "loss": 0.7245, "rewards/accuracies": 0.375, "rewards/chosen": 0.03308162838220596, "rewards/margins": -0.052855975925922394, "rewards/rejected": 0.08593758940696716, "step": 668 }, { "epoch": 0.36079277335850074, "grad_norm": 7.558940410614014, "learning_rate": 4.034945014185835e-07, "logits/chosen": 0.1949400007724762, "logits/rejected": 0.09854066371917725, "logps/chosen": -202.1747589111328, "logps/rejected": -242.1273956298828, "loss": 0.6835, "rewards/accuracies": 0.625, "rewards/chosen": 0.11869301646947861, "rewards/margins": 0.02850346453487873, "rewards/rejected": 0.09018955379724503, "step": 669 }, { "epoch": 0.361332074962923, "grad_norm": 8.280952453613281, "learning_rate": 4.0312256695325577e-07, "logits/chosen": -0.9975400567054749, "logits/rejected": 0.790474534034729, "logps/chosen": -176.7533721923828, "logps/rejected": -270.40631103515625, "loss": 0.74, "rewards/accuracies": 0.125, "rewards/chosen": 0.10875502228736877, "rewards/margins": -0.08408480137586594, "rewards/rejected": 0.1928398162126541, "step": 670 }, { "epoch": 0.3618713765673453, "grad_norm": 7.328878402709961, "learning_rate": 4.027500893042325e-07, "logits/chosen": 0.007661640644073486, "logits/rejected": -0.5105205774307251, "logps/chosen": -354.36962890625, "logps/rejected": -289.9536437988281, "loss": 0.6858, "rewards/accuracies": 0.375, "rewards/chosen": 0.16526947915554047, "rewards/margins": 0.020781904458999634, "rewards/rejected": 0.14448757469654083, "step": 671 }, { "epoch": 0.3624106781717676, "grad_norm": 7.1913371086120605, "learning_rate": 4.0237706979283305e-07, "logits/chosen": 0.870129406452179, "logits/rejected": 0.8167535662651062, "logps/chosen": -276.8998107910156, "logps/rejected": -287.427001953125, "loss": 0.6577, "rewards/accuracies": 0.625, "rewards/chosen": 0.14738903939723969, "rewards/margins": 0.07493076473474503, "rewards/rejected": 0.07245826721191406, "step": 672 }, { "epoch": 0.3629499797761898, "grad_norm": 7.234364986419678, "learning_rate": 4.020035097422986e-07, "logits/chosen": 1.0668858289718628, "logits/rejected": -0.6641507148742676, "logps/chosen": -243.06985473632812, "logps/rejected": -206.16213989257812, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": 0.1500442624092102, "rewards/margins": 0.04823208227753639, "rewards/rejected": 0.10181216895580292, "step": 673 }, { "epoch": 0.3634892813806121, "grad_norm": 7.04037618637085, "learning_rate": 4.016294104777882e-07, "logits/chosen": 0.9134920835494995, "logits/rejected": -0.2017798125743866, "logps/chosen": -259.2301940917969, "logps/rejected": -230.19189453125, "loss": 0.7169, "rewards/accuracies": 0.25, "rewards/chosen": 0.06607504189014435, "rewards/margins": -0.04560118168592453, "rewards/rejected": 0.11167621612548828, "step": 674 }, { "epoch": 0.3640285829850344, "grad_norm": 6.429827690124512, "learning_rate": 4.0125477332637336e-07, "logits/chosen": -0.1120431125164032, "logits/rejected": 0.6029688119888306, "logps/chosen": -214.66668701171875, "logps/rejected": -307.35235595703125, "loss": 0.6837, "rewards/accuracies": 0.375, "rewards/chosen": 0.13695716857910156, "rewards/margins": 0.025208665058016777, "rewards/rejected": 0.11174850165843964, "step": 675 }, { "epoch": 0.36456788458945666, "grad_norm": 7.907284736633301, "learning_rate": 4.00879599617034e-07, "logits/chosen": -0.3900030255317688, "logits/rejected": -0.2107498198747635, "logps/chosen": -249.6759033203125, "logps/rejected": -300.11419677734375, "loss": 0.7356, "rewards/accuracies": 0.5, "rewards/chosen": 0.11637906730175018, "rewards/margins": -0.07343387603759766, "rewards/rejected": 0.18981294333934784, "step": 676 }, { "epoch": 0.36510718619387894, "grad_norm": 7.751900672912598, "learning_rate": 4.0050389068065317e-07, "logits/chosen": 0.009443119168281555, "logits/rejected": 0.14603683352470398, "logps/chosen": -201.86093139648438, "logps/rejected": -222.644287109375, "loss": 0.6714, "rewards/accuracies": 0.625, "rewards/chosen": 0.059552766382694244, "rewards/margins": 0.05304413288831711, "rewards/rejected": 0.006508634425699711, "step": 677 }, { "epoch": 0.3656464877983012, "grad_norm": 7.617511749267578, "learning_rate": 4.001276478500126e-07, "logits/chosen": 0.6382745504379272, "logits/rejected": -0.4406091272830963, "logps/chosen": -261.65081787109375, "logps/rejected": -300.12060546875, "loss": 0.6653, "rewards/accuracies": 0.625, "rewards/chosen": 0.15799303352832794, "rewards/margins": 0.06815916299819946, "rewards/rejected": 0.08983387798070908, "step": 678 }, { "epoch": 0.36618578940272345, "grad_norm": 7.203710556030273, "learning_rate": 3.997508724597881e-07, "logits/chosen": -0.06055241450667381, "logits/rejected": -0.6191487908363342, "logps/chosen": -150.90802001953125, "logps/rejected": -183.04812622070312, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.08195219933986664, "rewards/margins": 0.016319377347826958, "rewards/rejected": 0.06563282012939453, "step": 679 }, { "epoch": 0.36672509100714573, "grad_norm": 6.707647323608398, "learning_rate": 3.993735658465446e-07, "logits/chosen": 0.3589208722114563, "logits/rejected": 1.1415048837661743, "logps/chosen": -244.75985717773438, "logps/rejected": -322.16156005859375, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": 0.12477751076221466, "rewards/margins": 0.0028384216129779816, "rewards/rejected": 0.12193909287452698, "step": 680 }, { "epoch": 0.367264392611568, "grad_norm": 8.6582670211792, "learning_rate": 3.9899572934873133e-07, "logits/chosen": 0.6863880157470703, "logits/rejected": 0.29444780945777893, "logps/chosen": -259.4687805175781, "logps/rejected": -290.7474365234375, "loss": 0.7195, "rewards/accuracies": 0.375, "rewards/chosen": 0.08653393387794495, "rewards/margins": -0.04021359235048294, "rewards/rejected": 0.1267475187778473, "step": 681 }, { "epoch": 0.3678036942159903, "grad_norm": 6.574652671813965, "learning_rate": 3.9861736430667736e-07, "logits/chosen": 0.4236941337585449, "logits/rejected": 0.011192560195922852, "logps/chosen": -233.607421875, "logps/rejected": -168.5436553955078, "loss": 0.6697, "rewards/accuracies": 0.625, "rewards/chosen": 0.16579827666282654, "rewards/margins": 0.06152429059147835, "rewards/rejected": 0.10427398979663849, "step": 682 }, { "epoch": 0.3683429958204126, "grad_norm": 7.272289752960205, "learning_rate": 3.982384720625868e-07, "logits/chosen": -1.0195996761322021, "logits/rejected": -1.0946141481399536, "logps/chosen": -171.80206298828125, "logps/rejected": -193.14678955078125, "loss": 0.6796, "rewards/accuracies": 0.625, "rewards/chosen": 0.13542146980762482, "rewards/margins": 0.02897481992840767, "rewards/rejected": 0.10644664615392685, "step": 683 }, { "epoch": 0.36888229742483486, "grad_norm": 7.217228412628174, "learning_rate": 3.9785905396053377e-07, "logits/chosen": 0.07271790504455566, "logits/rejected": -0.8938460350036621, "logps/chosen": -266.11346435546875, "logps/rejected": -234.6513214111328, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": 0.12807922065258026, "rewards/margins": 0.016293715685606003, "rewards/rejected": 0.11178550869226456, "step": 684 }, { "epoch": 0.3694215990292571, "grad_norm": 7.186672687530518, "learning_rate": 3.9747911134645785e-07, "logits/chosen": 1.2268080711364746, "logits/rejected": -0.4098159372806549, "logps/chosen": -357.0971374511719, "logps/rejected": -291.4966735839844, "loss": 0.6473, "rewards/accuracies": 0.625, "rewards/chosen": 0.12099342048168182, "rewards/margins": 0.10345754027366638, "rewards/rejected": 0.017535876482725143, "step": 685 }, { "epoch": 0.36996090063367937, "grad_norm": 7.558313846588135, "learning_rate": 3.970986455681593e-07, "logits/chosen": 0.23617345094680786, "logits/rejected": -0.01953992247581482, "logps/chosen": -336.7087707519531, "logps/rejected": -246.1487579345703, "loss": 0.6322, "rewards/accuracies": 1.0, "rewards/chosen": 0.1750497817993164, "rewards/margins": 0.12911272048950195, "rewards/rejected": 0.04593706130981445, "step": 686 }, { "epoch": 0.37050020223810165, "grad_norm": 6.400732517242432, "learning_rate": 3.967176579752942e-07, "logits/chosen": 0.5170989036560059, "logits/rejected": -0.0947224497795105, "logps/chosen": -226.22805786132812, "logps/rejected": -226.72344970703125, "loss": 0.6842, "rewards/accuracies": 0.625, "rewards/chosen": 0.13570037484169006, "rewards/margins": 0.024048853665590286, "rewards/rejected": 0.11165151745080948, "step": 687 }, { "epoch": 0.37103950384252393, "grad_norm": 6.4523210525512695, "learning_rate": 3.963361499193698e-07, "logits/chosen": 0.5495195984840393, "logits/rejected": 0.1891181915998459, "logps/chosen": -223.951171875, "logps/rejected": -201.33399963378906, "loss": 0.6912, "rewards/accuracies": 0.375, "rewards/chosen": 0.12672872841358185, "rewards/margins": 0.010697836056351662, "rewards/rejected": 0.11603088676929474, "step": 688 }, { "epoch": 0.3715788054469462, "grad_norm": 7.7667975425720215, "learning_rate": 3.959541227537396e-07, "logits/chosen": 0.9119040369987488, "logits/rejected": 0.4465295076370239, "logps/chosen": -229.6144561767578, "logps/rejected": -235.04673767089844, "loss": 0.7165, "rewards/accuracies": 0.5, "rewards/chosen": 0.03242845460772514, "rewards/margins": -0.039949655532836914, "rewards/rejected": 0.07237810641527176, "step": 689 }, { "epoch": 0.3721181070513685, "grad_norm": 7.561617851257324, "learning_rate": 3.9557157783359836e-07, "logits/chosen": 0.20337587594985962, "logits/rejected": -0.17118863761425018, "logps/chosen": -249.52352905273438, "logps/rejected": -217.36444091796875, "loss": 0.6887, "rewards/accuracies": 0.625, "rewards/chosen": 0.14044532179832458, "rewards/margins": 0.0102348318323493, "rewards/rejected": 0.1302104890346527, "step": 690 }, { "epoch": 0.3726574086557908, "grad_norm": 7.37512731552124, "learning_rate": 3.951885165159778e-07, "logits/chosen": 0.2510984539985657, "logits/rejected": 0.33931660652160645, "logps/chosen": -312.619384765625, "logps/rejected": -272.2721252441406, "loss": 0.7261, "rewards/accuracies": 0.375, "rewards/chosen": 0.039868928492069244, "rewards/margins": -0.059189796447753906, "rewards/rejected": 0.09905871748924255, "step": 691 }, { "epoch": 0.373196710260213, "grad_norm": 8.37732982635498, "learning_rate": 3.9480494015974135e-07, "logits/chosen": 0.10785791277885437, "logits/rejected": -0.21172721683979034, "logps/chosen": -237.03726196289062, "logps/rejected": -248.7191619873047, "loss": 0.7064, "rewards/accuracies": 0.5, "rewards/chosen": 0.08470316231250763, "rewards/margins": -0.022981172427535057, "rewards/rejected": 0.10768432170152664, "step": 692 }, { "epoch": 0.3737360118646353, "grad_norm": 6.671052932739258, "learning_rate": 3.944208501255796e-07, "logits/chosen": 0.6437547206878662, "logits/rejected": -0.9527337551116943, "logps/chosen": -193.69915771484375, "logps/rejected": -175.61825561523438, "loss": 0.6907, "rewards/accuracies": 0.625, "rewards/chosen": 0.067377470433712, "rewards/margins": 0.014722058549523354, "rewards/rejected": 0.0526554137468338, "step": 693 }, { "epoch": 0.37427531346905757, "grad_norm": 7.434191703796387, "learning_rate": 3.940362477760052e-07, "logits/chosen": -0.5671972632408142, "logits/rejected": -0.2311156839132309, "logps/chosen": -258.3771057128906, "logps/rejected": -271.6859130859375, "loss": 0.7145, "rewards/accuracies": 0.375, "rewards/chosen": 0.12863272428512573, "rewards/margins": -0.03708191215991974, "rewards/rejected": 0.16571465134620667, "step": 694 }, { "epoch": 0.37481461507347985, "grad_norm": 8.588833808898926, "learning_rate": 3.9365113447534813e-07, "logits/chosen": -0.08262571692466736, "logits/rejected": 0.2894994914531708, "logps/chosen": -203.6293182373047, "logps/rejected": -278.1625671386719, "loss": 0.6988, "rewards/accuracies": 0.5, "rewards/chosen": 0.07075557857751846, "rewards/margins": -0.010118103586137295, "rewards/rejected": 0.08087368309497833, "step": 695 }, { "epoch": 0.37535391667790213, "grad_norm": 6.7656354904174805, "learning_rate": 3.9326551158975124e-07, "logits/chosen": 0.7476806044578552, "logits/rejected": 0.03284101560711861, "logps/chosen": -244.73582458496094, "logps/rejected": -248.45718383789062, "loss": 0.6716, "rewards/accuracies": 0.5, "rewards/chosen": 0.0980614647269249, "rewards/margins": 0.05057850107550621, "rewards/rejected": 0.047482967376708984, "step": 696 }, { "epoch": 0.3758932182823244, "grad_norm": 7.075323104858398, "learning_rate": 3.928793804871647e-07, "logits/chosen": 0.7798939943313599, "logits/rejected": -0.9884523153305054, "logps/chosen": -235.83792114257812, "logps/rejected": -200.18653869628906, "loss": 0.6529, "rewards/accuracies": 0.75, "rewards/chosen": 0.11655254662036896, "rewards/margins": 0.08589916676282883, "rewards/rejected": 0.03065338358283043, "step": 697 }, { "epoch": 0.37643251988674664, "grad_norm": 5.946983814239502, "learning_rate": 3.9249274253734164e-07, "logits/chosen": 0.7547985911369324, "logits/rejected": -0.41199925541877747, "logps/chosen": -263.99761962890625, "logps/rejected": -169.86282348632812, "loss": 0.6712, "rewards/accuracies": 0.75, "rewards/chosen": 0.06439085304737091, "rewards/margins": 0.04839525371789932, "rewards/rejected": 0.015995599329471588, "step": 698 }, { "epoch": 0.3769718214911689, "grad_norm": 6.240135192871094, "learning_rate": 3.921055991118334e-07, "logits/chosen": -0.5132712125778198, "logits/rejected": -0.1352815181016922, "logps/chosen": -206.41592407226562, "logps/rejected": -214.5946044921875, "loss": 0.6665, "rewards/accuracies": 0.875, "rewards/chosen": 0.10113940387964249, "rewards/margins": 0.05961356312036514, "rewards/rejected": 0.041525840759277344, "step": 699 }, { "epoch": 0.3775111230955912, "grad_norm": 7.53602409362793, "learning_rate": 3.917179515839839e-07, "logits/chosen": 0.9069461822509766, "logits/rejected": 0.9736020565032959, "logps/chosen": -304.2300109863281, "logps/rejected": -325.72735595703125, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": 0.1524990200996399, "rewards/margins": 0.021233849227428436, "rewards/rejected": 0.13126516342163086, "step": 700 }, { "epoch": 0.3780504247000135, "grad_norm": 6.589388847351074, "learning_rate": 3.913298013289259e-07, "logits/chosen": -0.3125966787338257, "logits/rejected": 0.6204515695571899, "logps/chosen": -197.3863525390625, "logps/rejected": -279.6521301269531, "loss": 0.7077, "rewards/accuracies": 0.5, "rewards/chosen": 0.0330604612827301, "rewards/margins": -0.022343821823596954, "rewards/rejected": 0.05540428310632706, "step": 701 }, { "epoch": 0.37858972630443577, "grad_norm": 6.650210380554199, "learning_rate": 3.9094114972357516e-07, "logits/chosen": -0.15430274605751038, "logits/rejected": -0.6214738488197327, "logps/chosen": -263.2239074707031, "logps/rejected": -195.0603790283203, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.0828678160905838, "rewards/margins": 0.0035748500376939774, "rewards/rejected": 0.07929296791553497, "step": 702 }, { "epoch": 0.37912902790885805, "grad_norm": 7.232953071594238, "learning_rate": 3.90551998146626e-07, "logits/chosen": 0.06451549381017685, "logits/rejected": -0.9827207326889038, "logps/chosen": -171.88209533691406, "logps/rejected": -147.6817169189453, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": 0.11252735555171967, "rewards/margins": 0.04861268773674965, "rewards/rejected": 0.06391467899084091, "step": 703 }, { "epoch": 0.3796683295132803, "grad_norm": 6.659940242767334, "learning_rate": 3.901623479785464e-07, "logits/chosen": -0.07589512318372726, "logits/rejected": -0.8480207920074463, "logps/chosen": -181.81716918945312, "logps/rejected": -214.81491088867188, "loss": 0.66, "rewards/accuracies": 0.75, "rewards/chosen": 0.14315977692604065, "rewards/margins": 0.06986570358276367, "rewards/rejected": 0.07329407334327698, "step": 704 }, { "epoch": 0.38020763111770256, "grad_norm": 6.419321537017822, "learning_rate": 3.8977220060157287e-07, "logits/chosen": 0.06457459926605225, "logits/rejected": 0.3948506712913513, "logps/chosen": -206.75656127929688, "logps/rejected": -272.6645812988281, "loss": 0.6793, "rewards/accuracies": 0.625, "rewards/chosen": 0.16901618242263794, "rewards/margins": 0.029410555958747864, "rewards/rejected": 0.13960561156272888, "step": 705 }, { "epoch": 0.38074693272212484, "grad_norm": 8.82973861694336, "learning_rate": 3.8938155739970595e-07, "logits/chosen": 0.2847813665866852, "logits/rejected": 0.7425129413604736, "logps/chosen": -173.60830688476562, "logps/rejected": -190.26828002929688, "loss": 0.7404, "rewards/accuracies": 0.5, "rewards/chosen": 0.07162046432495117, "rewards/margins": -0.07575025409460068, "rewards/rejected": 0.14737071096897125, "step": 706 }, { "epoch": 0.3812862343265471, "grad_norm": 8.014866828918457, "learning_rate": 3.8899041975870486e-07, "logits/chosen": 0.7863078117370605, "logits/rejected": -1.658959150314331, "logps/chosen": -596.62939453125, "logps/rejected": -231.1915283203125, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.09782715141773224, "rewards/margins": 0.01527271419763565, "rewards/rejected": 0.08255442976951599, "step": 707 }, { "epoch": 0.3818255359309694, "grad_norm": 7.359444618225098, "learning_rate": 3.885987890660827e-07, "logits/chosen": -0.09692955017089844, "logits/rejected": -0.9140403270721436, "logps/chosen": -295.2633361816406, "logps/rejected": -222.7205810546875, "loss": 0.7029, "rewards/accuracies": 0.625, "rewards/chosen": 0.08877410739660263, "rewards/margins": -0.016307830810546875, "rewards/rejected": 0.1050819456577301, "step": 708 }, { "epoch": 0.3823648375353917, "grad_norm": 6.272069931030273, "learning_rate": 3.8820666671110193e-07, "logits/chosen": 1.4919092655181885, "logits/rejected": 1.4982807636260986, "logps/chosen": -341.39569091796875, "logps/rejected": -300.71240234375, "loss": 0.6716, "rewards/accuracies": 0.75, "rewards/chosen": 0.1254146695137024, "rewards/margins": 0.047312259674072266, "rewards/rejected": 0.07810239493846893, "step": 709 }, { "epoch": 0.3829041391398139, "grad_norm": 39.557926177978516, "learning_rate": 3.878140540847689e-07, "logits/chosen": -0.8789240121841431, "logits/rejected": -0.7406452894210815, "logps/chosen": -150.26638793945312, "logps/rejected": -161.82925415039062, "loss": 0.6486, "rewards/accuracies": 0.5, "rewards/chosen": 0.06576652824878693, "rewards/margins": 0.09796801954507828, "rewards/rejected": -0.032201483845710754, "step": 710 }, { "epoch": 0.3834434407442362, "grad_norm": 8.975081443786621, "learning_rate": 3.8742095257982923e-07, "logits/chosen": -0.3225148320198059, "logits/rejected": -0.4718588590621948, "logps/chosen": -334.0635681152344, "logps/rejected": -300.63018798828125, "loss": 0.7045, "rewards/accuracies": 0.5, "rewards/chosen": 0.19645501673221588, "rewards/margins": -0.018102167174220085, "rewards/rejected": 0.21455718576908112, "step": 711 }, { "epoch": 0.3839827423486585, "grad_norm": 6.93854284286499, "learning_rate": 3.870273635907626e-07, "logits/chosen": 0.44961118698120117, "logits/rejected": 0.8882404565811157, "logps/chosen": -218.06594848632812, "logps/rejected": -249.45303344726562, "loss": 0.6866, "rewards/accuracies": 0.625, "rewards/chosen": 0.12198609858751297, "rewards/margins": 0.022020723670721054, "rewards/rejected": 0.09996539354324341, "step": 712 }, { "epoch": 0.38452204395308076, "grad_norm": 8.067686080932617, "learning_rate": 3.866332885137782e-07, "logits/chosen": -0.47181183099746704, "logits/rejected": 0.03403353691101074, "logps/chosen": -277.50518798828125, "logps/rejected": -312.3043212890625, "loss": 0.6739, "rewards/accuracies": 0.625, "rewards/chosen": 0.169298455119133, "rewards/margins": 0.050203703343868256, "rewards/rejected": 0.11909474432468414, "step": 713 }, { "epoch": 0.38506134555750304, "grad_norm": 7.193358898162842, "learning_rate": 3.862387287468094e-07, "logits/chosen": -0.5420240163803101, "logits/rejected": -0.9245604872703552, "logps/chosen": -254.23678588867188, "logps/rejected": -327.3460998535156, "loss": 0.6463, "rewards/accuracies": 0.875, "rewards/chosen": 0.14166049659252167, "rewards/margins": 0.09999656677246094, "rewards/rejected": 0.04166393354535103, "step": 714 }, { "epoch": 0.3856006471619253, "grad_norm": 7.227481842041016, "learning_rate": 3.858436856895091e-07, "logits/chosen": 0.03640760853886604, "logits/rejected": -1.3828628063201904, "logps/chosen": -252.2200927734375, "logps/rejected": -175.07171630859375, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 0.16628704965114594, "rewards/margins": 0.1384325921535492, "rewards/rejected": 0.027854442596435547, "step": 715 }, { "epoch": 0.3861399487663476, "grad_norm": 6.0440754890441895, "learning_rate": 3.854481607432445e-07, "logits/chosen": -0.3408215045928955, "logits/rejected": -0.4602469801902771, "logps/chosen": -192.29901123046875, "logps/rejected": -191.22047424316406, "loss": 0.6651, "rewards/accuracies": 0.625, "rewards/chosen": 0.1516161859035492, "rewards/margins": 0.0700373649597168, "rewards/rejected": 0.0815788209438324, "step": 716 }, { "epoch": 0.38667925037076983, "grad_norm": 7.067590713500977, "learning_rate": 3.850521553110923e-07, "logits/chosen": 0.17385771870613098, "logits/rejected": 0.026358231902122498, "logps/chosen": -226.64724731445312, "logps/rejected": -212.7227020263672, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": 0.1043970137834549, "rewards/margins": 0.023290161043405533, "rewards/rejected": 0.08110684901475906, "step": 717 }, { "epoch": 0.3872185519751921, "grad_norm": 8.104598999023438, "learning_rate": 3.846556707978337e-07, "logits/chosen": 0.05406050384044647, "logits/rejected": -1.0202980041503906, "logps/chosen": -329.7591247558594, "logps/rejected": -232.35610961914062, "loss": 0.7364, "rewards/accuracies": 0.375, "rewards/chosen": 0.07479687035083771, "rewards/margins": -0.08213386684656143, "rewards/rejected": 0.15693072974681854, "step": 718 }, { "epoch": 0.3877578535796144, "grad_norm": 7.86714506149292, "learning_rate": 3.8425870860994906e-07, "logits/chosen": 1.2109090089797974, "logits/rejected": 0.08965088427066803, "logps/chosen": -218.35475158691406, "logps/rejected": -167.62539672851562, "loss": 0.6125, "rewards/accuracies": 1.0, "rewards/chosen": 0.12959527969360352, "rewards/margins": 0.1712634116411209, "rewards/rejected": -0.041668131947517395, "step": 719 }, { "epoch": 0.3882971551840367, "grad_norm": 7.375299453735352, "learning_rate": 3.8386127015561377e-07, "logits/chosen": 0.6948750019073486, "logits/rejected": 0.21961119771003723, "logps/chosen": -216.26104736328125, "logps/rejected": -191.88900756835938, "loss": 0.6375, "rewards/accuracies": 0.875, "rewards/chosen": 0.16639119386672974, "rewards/margins": 0.11947250366210938, "rewards/rejected": 0.046918679028749466, "step": 720 }, { "epoch": 0.38883645678845896, "grad_norm": 7.1065497398376465, "learning_rate": 3.8346335684469235e-07, "logits/chosen": -0.19235339760780334, "logits/rejected": 0.4313853979110718, "logps/chosen": -285.1688232421875, "logps/rejected": -327.2862243652344, "loss": 0.7409, "rewards/accuracies": 0.25, "rewards/chosen": 0.062302395701408386, "rewards/margins": -0.09057779610157013, "rewards/rejected": 0.15288019180297852, "step": 721 }, { "epoch": 0.38937575839288124, "grad_norm": 7.007901668548584, "learning_rate": 3.830649700887338e-07, "logits/chosen": 0.06240057945251465, "logits/rejected": -1.914198875427246, "logps/chosen": -344.288330078125, "logps/rejected": -245.6734161376953, "loss": 0.7167, "rewards/accuracies": 0.375, "rewards/chosen": 0.03604402765631676, "rewards/margins": -0.0409298837184906, "rewards/rejected": 0.07697390764951706, "step": 722 }, { "epoch": 0.38991505999730347, "grad_norm": 8.985590934753418, "learning_rate": 3.8266611130096706e-07, "logits/chosen": 0.8310876488685608, "logits/rejected": 1.2460873126983643, "logps/chosen": -220.35467529296875, "logps/rejected": -258.6464538574219, "loss": 0.7404, "rewards/accuracies": 0.25, "rewards/chosen": 0.08942484855651855, "rewards/margins": -0.0891743153333664, "rewards/rejected": 0.17859916388988495, "step": 723 }, { "epoch": 0.39045436160172575, "grad_norm": 13.884251594543457, "learning_rate": 3.8226678189629476e-07, "logits/chosen": 0.4621732234954834, "logits/rejected": -1.127577304840088, "logps/chosen": -391.94873046875, "logps/rejected": -260.2015380859375, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.08251610398292542, "rewards/margins": 0.0055189114063978195, "rewards/rejected": 0.07699718326330185, "step": 724 }, { "epoch": 0.39099366320614803, "grad_norm": 7.473394393920898, "learning_rate": 3.8186698329128963e-07, "logits/chosen": 0.48130467534065247, "logits/rejected": -0.2818508446216583, "logps/chosen": -232.50376892089844, "logps/rejected": -219.56683349609375, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": 0.09736595302820206, "rewards/margins": 0.03405433148145676, "rewards/rejected": 0.0633116215467453, "step": 725 }, { "epoch": 0.3915329648105703, "grad_norm": 6.567690372467041, "learning_rate": 3.8146671690418865e-07, "logits/chosen": 0.243768572807312, "logits/rejected": 0.014787465333938599, "logps/chosen": -215.24951171875, "logps/rejected": -220.6099853515625, "loss": 0.6517, "rewards/accuracies": 0.75, "rewards/chosen": 0.15101775527000427, "rewards/margins": 0.08691825717687607, "rewards/rejected": 0.0640995055437088, "step": 726 }, { "epoch": 0.3920722664149926, "grad_norm": 6.328996658325195, "learning_rate": 3.8106598415488813e-07, "logits/chosen": -0.7500069737434387, "logits/rejected": -0.3795830011367798, "logps/chosen": -239.56732177734375, "logps/rejected": -258.6773986816406, "loss": 0.7264, "rewards/accuracies": 0.25, "rewards/chosen": 0.10890531539916992, "rewards/margins": -0.06153087317943573, "rewards/rejected": 0.17043620347976685, "step": 727 }, { "epoch": 0.3926115680194149, "grad_norm": 7.439212322235107, "learning_rate": 3.8066478646493894e-07, "logits/chosen": -0.1464882493019104, "logits/rejected": -0.6419050097465515, "logps/chosen": -320.19256591796875, "logps/rejected": -286.0921630859375, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": 0.12369786202907562, "rewards/margins": 0.05212421715259552, "rewards/rejected": 0.0715736374258995, "step": 728 }, { "epoch": 0.3931508696238371, "grad_norm": 6.690340042114258, "learning_rate": 3.802631252575409e-07, "logits/chosen": 0.27355706691741943, "logits/rejected": -0.597679853439331, "logps/chosen": -232.39295959472656, "logps/rejected": -260.3680725097656, "loss": 0.6651, "rewards/accuracies": 0.625, "rewards/chosen": 0.16933900117874146, "rewards/margins": 0.06174316257238388, "rewards/rejected": 0.10759583115577698, "step": 729 }, { "epoch": 0.3936901712282594, "grad_norm": 7.0188822746276855, "learning_rate": 3.7986100195753834e-07, "logits/chosen": -0.056621745228767395, "logits/rejected": -0.5733910202980042, "logps/chosen": -296.01068115234375, "logps/rejected": -286.499755859375, "loss": 0.6536, "rewards/accuracies": 0.625, "rewards/chosen": 0.15262871980667114, "rewards/margins": 0.08998365700244904, "rewards/rejected": 0.06264505535364151, "step": 730 }, { "epoch": 0.39422947283268167, "grad_norm": 7.379519939422607, "learning_rate": 3.7945841799141497e-07, "logits/chosen": -0.9056635499000549, "logits/rejected": 0.002716265618801117, "logps/chosen": -169.1240692138672, "logps/rejected": -200.73365783691406, "loss": 0.7254, "rewards/accuracies": 0.25, "rewards/chosen": 0.09409923851490021, "rewards/margins": -0.06104297935962677, "rewards/rejected": 0.15514221787452698, "step": 731 }, { "epoch": 0.39476877443710395, "grad_norm": 6.682973384857178, "learning_rate": 3.7905537478728844e-07, "logits/chosen": 0.6338790655136108, "logits/rejected": -0.23092761635780334, "logps/chosen": -254.2645263671875, "logps/rejected": -194.38180541992188, "loss": 0.7058, "rewards/accuracies": 0.25, "rewards/chosen": 0.18056392669677734, "rewards/margins": -0.018109126016497612, "rewards/rejected": 0.1986730545759201, "step": 732 }, { "epoch": 0.39530807604152624, "grad_norm": 7.555744647979736, "learning_rate": 3.786518737749054e-07, "logits/chosen": 0.6083473563194275, "logits/rejected": -0.6556715369224548, "logps/chosen": -257.0181579589844, "logps/rejected": -225.205810546875, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": 0.11004886776208878, "rewards/margins": 0.023859310895204544, "rewards/rejected": 0.08618955314159393, "step": 733 }, { "epoch": 0.3958473776459485, "grad_norm": 7.510571002960205, "learning_rate": 3.782479163856367e-07, "logits/chosen": 0.07478386163711548, "logits/rejected": 1.3462388515472412, "logps/chosen": -253.0230255126953, "logps/rejected": -403.6554260253906, "loss": 0.6596, "rewards/accuracies": 0.5, "rewards/chosen": 0.13026493787765503, "rewards/margins": 0.0753725990653038, "rewards/rejected": 0.05489235371351242, "step": 734 }, { "epoch": 0.39638667925037074, "grad_norm": 7.531533241271973, "learning_rate": 3.778435040524721e-07, "logits/chosen": 0.8080272674560547, "logits/rejected": -0.03953937068581581, "logps/chosen": -285.290771484375, "logps/rejected": -246.22769165039062, "loss": 0.6462, "rewards/accuracies": 0.75, "rewards/chosen": 0.20342731475830078, "rewards/margins": 0.10265149921178818, "rewards/rejected": 0.1007758229970932, "step": 735 }, { "epoch": 0.396925980854793, "grad_norm": 7.582393169403076, "learning_rate": 3.774386382100153e-07, "logits/chosen": -0.8957055807113647, "logits/rejected": -0.41381189227104187, "logps/chosen": -226.41690063476562, "logps/rejected": -242.75540161132812, "loss": 0.6931, "rewards/accuracies": 0.375, "rewards/chosen": 0.1433912217617035, "rewards/margins": 0.006628135219216347, "rewards/rejected": 0.1367630958557129, "step": 736 }, { "epoch": 0.3974652824592153, "grad_norm": 8.460551261901855, "learning_rate": 3.7703332029447853e-07, "logits/chosen": 0.21590596437454224, "logits/rejected": -1.3003783226013184, "logps/chosen": -181.1671600341797, "logps/rejected": -153.3270721435547, "loss": 0.6301, "rewards/accuracies": 0.5, "rewards/chosen": 0.1278090476989746, "rewards/margins": 0.1438625454902649, "rewards/rejected": -0.01605348475277424, "step": 737 }, { "epoch": 0.3980045840636376, "grad_norm": 5.903291702270508, "learning_rate": 3.766275517436779e-07, "logits/chosen": 0.9285943508148193, "logits/rejected": -0.06324826180934906, "logps/chosen": -188.27969360351562, "logps/rejected": -203.3505859375, "loss": 0.6775, "rewards/accuracies": 0.5, "rewards/chosen": 0.13433027267456055, "rewards/margins": 0.045339103788137436, "rewards/rejected": 0.08899116516113281, "step": 738 }, { "epoch": 0.39854388566805987, "grad_norm": 6.681094169616699, "learning_rate": 3.7622133399702796e-07, "logits/chosen": 1.0312312841415405, "logits/rejected": -0.01657341606914997, "logps/chosen": -337.9908447265625, "logps/rejected": -290.47076416015625, "loss": 0.6694, "rewards/accuracies": 0.75, "rewards/chosen": 0.13524208962917328, "rewards/margins": 0.0511101670563221, "rewards/rejected": 0.08413191139698029, "step": 739 }, { "epoch": 0.39908318727248215, "grad_norm": 9.527825355529785, "learning_rate": 3.758146684955368e-07, "logits/chosen": -0.47900280356407166, "logits/rejected": 0.47158002853393555, "logps/chosen": -328.6796569824219, "logps/rejected": -248.52664184570312, "loss": 0.7405, "rewards/accuracies": 0.375, "rewards/chosen": 0.10227061063051224, "rewards/margins": -0.08298540115356445, "rewards/rejected": 0.1852560043334961, "step": 740 }, { "epoch": 0.39962248887690444, "grad_norm": 6.84505558013916, "learning_rate": 3.75407556681801e-07, "logits/chosen": 1.319567322731018, "logits/rejected": 0.48839807510375977, "logps/chosen": -277.15826416015625, "logps/rejected": -250.6380615234375, "loss": 0.6941, "rewards/accuracies": 0.625, "rewards/chosen": 0.10953141003847122, "rewards/margins": 0.0022380854934453964, "rewards/rejected": 0.10729332268238068, "step": 741 }, { "epoch": 0.40016179048132666, "grad_norm": 7.413286209106445, "learning_rate": 3.75e-07, "logits/chosen": -0.11916853487491608, "logits/rejected": -1.2374738454818726, "logps/chosen": -234.5213165283203, "logps/rejected": -217.14935302734375, "loss": 0.6979, "rewards/accuracies": 0.5, "rewards/chosen": 0.08993473649024963, "rewards/margins": -0.0019314736127853394, "rewards/rejected": 0.09186621010303497, "step": 742 }, { "epoch": 0.40070109208574894, "grad_norm": 8.045169830322266, "learning_rate": 3.745919998958918e-07, "logits/chosen": 0.7425777912139893, "logits/rejected": -0.09428369998931885, "logps/chosen": -343.8515319824219, "logps/rejected": -244.69801330566406, "loss": 0.725, "rewards/accuracies": 0.375, "rewards/chosen": 0.06218442693352699, "rewards/margins": -0.05805530399084091, "rewards/rejected": 0.1202397346496582, "step": 743 }, { "epoch": 0.4012403936901712, "grad_norm": 7.24379301071167, "learning_rate": 3.7418355781680707e-07, "logits/chosen": 1.2689112424850464, "logits/rejected": 0.40461671352386475, "logps/chosen": -319.4320373535156, "logps/rejected": -251.34527587890625, "loss": 0.716, "rewards/accuracies": 0.625, "rewards/chosen": 0.12883520126342773, "rewards/margins": -0.03321171551942825, "rewards/rejected": 0.1620469093322754, "step": 744 }, { "epoch": 0.4017796952945935, "grad_norm": 8.785052299499512, "learning_rate": 3.7377467521164453e-07, "logits/chosen": -0.41553443670272827, "logits/rejected": -0.4593271017074585, "logps/chosen": -186.41412353515625, "logps/rejected": -225.08038330078125, "loss": 0.6935, "rewards/accuracies": 0.375, "rewards/chosen": 0.1237272247672081, "rewards/margins": 0.007733152247965336, "rewards/rejected": 0.11599406599998474, "step": 745 }, { "epoch": 0.4023189968990158, "grad_norm": 7.357199192047119, "learning_rate": 3.733653535308654e-07, "logits/chosen": 0.7877209782600403, "logits/rejected": -0.47482553124427795, "logps/chosen": -314.91485595703125, "logps/rejected": -189.84542846679688, "loss": 0.6622, "rewards/accuracies": 0.625, "rewards/chosen": 0.14236068725585938, "rewards/margins": 0.06668825447559357, "rewards/rejected": 0.0756724402308464, "step": 746 }, { "epoch": 0.4028582985034381, "grad_norm": 7.639172077178955, "learning_rate": 3.729555942264887e-07, "logits/chosen": 0.9487427473068237, "logits/rejected": -1.0042088031768799, "logps/chosen": -260.5947570800781, "logps/rejected": -153.21200561523438, "loss": 0.6401, "rewards/accuracies": 0.75, "rewards/chosen": 0.13817930221557617, "rewards/margins": 0.11162500083446503, "rewards/rejected": 0.026554299518465996, "step": 747 }, { "epoch": 0.4033976001078603, "grad_norm": 6.325625419616699, "learning_rate": 3.7254539875208576e-07, "logits/chosen": 0.13017144799232483, "logits/rejected": -0.5887124538421631, "logps/chosen": -254.8124542236328, "logps/rejected": -240.36041259765625, "loss": 0.6518, "rewards/accuracies": 0.875, "rewards/chosen": 0.2055189311504364, "rewards/margins": 0.08655872195959091, "rewards/rejected": 0.1189601868391037, "step": 748 }, { "epoch": 0.4039369017122826, "grad_norm": 6.116194725036621, "learning_rate": 3.721347685627751e-07, "logits/chosen": 0.17738397419452667, "logits/rejected": -0.9456039071083069, "logps/chosen": -354.0346984863281, "logps/rejected": -217.82516479492188, "loss": 0.6566, "rewards/accuracies": 0.625, "rewards/chosen": 0.1532835066318512, "rewards/margins": 0.07877492904663086, "rewards/rejected": 0.07450857758522034, "step": 749 }, { "epoch": 0.40447620331670486, "grad_norm": 9.507534980773926, "learning_rate": 3.7172370511521743e-07, "logits/chosen": 0.45388102531433105, "logits/rejected": -1.4279735088348389, "logps/chosen": -222.70571899414062, "logps/rejected": -172.6669921875, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.1549166738986969, "rewards/margins": 0.05006589740514755, "rewards/rejected": 0.10485076904296875, "step": 750 }, { "epoch": 0.40501550492112715, "grad_norm": 7.292059421539307, "learning_rate": 3.713122098676104e-07, "logits/chosen": 0.04940403997898102, "logits/rejected": -0.7714976072311401, "logps/chosen": -260.26641845703125, "logps/rejected": -229.62026977539062, "loss": 0.6693, "rewards/accuracies": 0.75, "rewards/chosen": 0.11998462677001953, "rewards/margins": 0.05887765809893608, "rewards/rejected": 0.06110696867108345, "step": 751 }, { "epoch": 0.4055548065255494, "grad_norm": 6.519262313842773, "learning_rate": 3.709002842796834e-07, "logits/chosen": 0.40987974405288696, "logits/rejected": -1.7920080423355103, "logps/chosen": -257.19598388671875, "logps/rejected": -187.15179443359375, "loss": 0.6615, "rewards/accuracies": 0.625, "rewards/chosen": 0.17061185836791992, "rewards/margins": 0.07204294204711914, "rewards/rejected": 0.09856892377138138, "step": 752 }, { "epoch": 0.4060941081299717, "grad_norm": 7.15476131439209, "learning_rate": 3.704879298126924e-07, "logits/chosen": 0.5893940925598145, "logits/rejected": -0.9838780164718628, "logps/chosen": -297.9543762207031, "logps/rejected": -205.67616271972656, "loss": 0.6584, "rewards/accuracies": 0.875, "rewards/chosen": 0.22744522988796234, "rewards/margins": 0.07658690959215164, "rewards/rejected": 0.1508583128452301, "step": 753 }, { "epoch": 0.40663340973439394, "grad_norm": 6.848525524139404, "learning_rate": 3.700751479294146e-07, "logits/chosen": -1.285628318786621, "logits/rejected": 0.19251269102096558, "logps/chosen": -193.38026428222656, "logps/rejected": -231.95547485351562, "loss": 0.7198, "rewards/accuracies": 0.375, "rewards/chosen": 0.10900039970874786, "rewards/margins": -0.043428461998701096, "rewards/rejected": 0.15242886543273926, "step": 754 }, { "epoch": 0.4071727113388162, "grad_norm": 7.721908092498779, "learning_rate": 3.696619400941437e-07, "logits/chosen": -0.3194442391395569, "logits/rejected": 1.0289143323898315, "logps/chosen": -171.14996337890625, "logps/rejected": -176.4877166748047, "loss": 0.7272, "rewards/accuracies": 0.5, "rewards/chosen": 0.14167289435863495, "rewards/margins": -0.042322926223278046, "rewards/rejected": 0.1839958131313324, "step": 755 }, { "epoch": 0.4077120129432385, "grad_norm": 7.512411117553711, "learning_rate": 3.692483077726842e-07, "logits/chosen": -0.016666531562805176, "logits/rejected": 0.14634916186332703, "logps/chosen": -310.487548828125, "logps/rejected": -341.16949462890625, "loss": 0.6555, "rewards/accuracies": 0.625, "rewards/chosen": 0.20039205253124237, "rewards/margins": 0.09220962971448898, "rewards/rejected": 0.10818243026733398, "step": 756 }, { "epoch": 0.4082513145476608, "grad_norm": 7.661859035491943, "learning_rate": 3.688342524323466e-07, "logits/chosen": 0.019185870885849, "logits/rejected": 0.5133339166641235, "logps/chosen": -250.08251953125, "logps/rejected": -311.06732177734375, "loss": 0.7052, "rewards/accuracies": 0.5, "rewards/chosen": 0.1211310401558876, "rewards/margins": -0.008065983653068542, "rewards/rejected": 0.12919703125953674, "step": 757 }, { "epoch": 0.40879061615208306, "grad_norm": 8.113073348999023, "learning_rate": 3.684197755419419e-07, "logits/chosen": -0.3495449721813202, "logits/rejected": -0.8607818484306335, "logps/chosen": -182.18734741210938, "logps/rejected": -203.3445281982422, "loss": 0.7248, "rewards/accuracies": 0.25, "rewards/chosen": 0.08882036805152893, "rewards/margins": -0.05444479361176491, "rewards/rejected": 0.14326515793800354, "step": 758 }, { "epoch": 0.40932991775650535, "grad_norm": 8.434864044189453, "learning_rate": 3.6800487857177633e-07, "logits/chosen": -0.22236141562461853, "logits/rejected": -0.7832068204879761, "logps/chosen": -291.4192199707031, "logps/rejected": -315.40618896484375, "loss": 0.6935, "rewards/accuracies": 0.625, "rewards/chosen": 0.18775710463523865, "rewards/margins": 0.021253107115626335, "rewards/rejected": 0.16650399565696716, "step": 759 }, { "epoch": 0.4098692193609276, "grad_norm": 7.408906936645508, "learning_rate": 3.6758956299364643e-07, "logits/chosen": 0.4608685374259949, "logits/rejected": -0.9187576770782471, "logps/chosen": -250.39854431152344, "logps/rejected": -195.5494842529297, "loss": 0.6666, "rewards/accuracies": 0.625, "rewards/chosen": 0.12468919903039932, "rewards/margins": 0.059453871101140976, "rewards/rejected": 0.06523533165454865, "step": 760 }, { "epoch": 0.41040852096534985, "grad_norm": 6.801870822906494, "learning_rate": 3.67173830280834e-07, "logits/chosen": 0.8233687877655029, "logits/rejected": 0.2594197392463684, "logps/chosen": -238.46078491210938, "logps/rejected": -197.8284912109375, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": 0.1717044860124588, "rewards/margins": 0.01621055230498314, "rewards/rejected": 0.15549392998218536, "step": 761 }, { "epoch": 0.41094782256977214, "grad_norm": 7.25845193862915, "learning_rate": 3.667576819081002e-07, "logits/chosen": 0.15429049730300903, "logits/rejected": 0.4641600549221039, "logps/chosen": -353.84521484375, "logps/rejected": -300.65643310546875, "loss": 0.6889, "rewards/accuracies": 0.375, "rewards/chosen": 0.09411773830652237, "rewards/margins": 0.013655569404363632, "rewards/rejected": 0.08046217262744904, "step": 762 }, { "epoch": 0.4114871241741944, "grad_norm": 7.176961421966553, "learning_rate": 3.6634111935168063e-07, "logits/chosen": -0.3469969630241394, "logits/rejected": 0.28042471408843994, "logps/chosen": -199.10171508789062, "logps/rejected": -219.275634765625, "loss": 0.7156, "rewards/accuracies": 0.25, "rewards/chosen": 0.11013136804103851, "rewards/margins": -0.0402190200984478, "rewards/rejected": 0.1503503918647766, "step": 763 }, { "epoch": 0.4120264257786167, "grad_norm": 8.800867080688477, "learning_rate": 3.659241440892806e-07, "logits/chosen": 0.2237386703491211, "logits/rejected": 0.7399559020996094, "logps/chosen": -282.44171142578125, "logps/rejected": -395.8780212402344, "loss": 0.6963, "rewards/accuracies": 0.625, "rewards/chosen": 0.09168730676174164, "rewards/margins": 0.0037966743111610413, "rewards/rejected": 0.087890625, "step": 764 }, { "epoch": 0.412565727383039, "grad_norm": 7.048729419708252, "learning_rate": 3.65506757600069e-07, "logits/chosen": 0.16627147793769836, "logits/rejected": -0.685332179069519, "logps/chosen": -313.11376953125, "logps/rejected": -230.62892150878906, "loss": 0.6563, "rewards/accuracies": 0.625, "rewards/chosen": 0.227691650390625, "rewards/margins": 0.08740653097629547, "rewards/rejected": 0.14028511941432953, "step": 765 }, { "epoch": 0.41310502898746126, "grad_norm": 7.099205493927002, "learning_rate": 3.650889613646737e-07, "logits/chosen": 0.3669476807117462, "logits/rejected": 0.0005875453352928162, "logps/chosen": -234.54483032226562, "logps/rejected": -307.43585205078125, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 0.14229698479175568, "rewards/margins": 0.021768569946289062, "rewards/rejected": 0.12052841484546661, "step": 766 }, { "epoch": 0.4136443305918835, "grad_norm": 6.7020263671875, "learning_rate": 3.646707568651761e-07, "logits/chosen": -0.13127845525741577, "logits/rejected": -0.53232342004776, "logps/chosen": -309.75958251953125, "logps/rejected": -217.16455078125, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": 0.08703594654798508, "rewards/margins": 0.009737968444824219, "rewards/rejected": 0.07729797810316086, "step": 767 }, { "epoch": 0.4141836321963058, "grad_norm": 7.394976615905762, "learning_rate": 3.6425214558510574e-07, "logits/chosen": 1.2697641849517822, "logits/rejected": -0.45461785793304443, "logps/chosen": -301.9517822265625, "logps/rejected": -213.53262329101562, "loss": 0.623, "rewards/accuracies": 0.75, "rewards/chosen": 0.24425411224365234, "rewards/margins": 0.15653924643993378, "rewards/rejected": 0.08771486580371857, "step": 768 }, { "epoch": 0.41472293380072806, "grad_norm": 8.621826171875, "learning_rate": 3.6383312900943544e-07, "logits/chosen": -0.23876357078552246, "logits/rejected": 0.026867344975471497, "logps/chosen": -304.96856689453125, "logps/rejected": -279.3773498535156, "loss": 0.6971, "rewards/accuracies": 0.625, "rewards/chosen": 0.12820987403392792, "rewards/margins": -0.002599716652184725, "rewards/rejected": 0.13080959022045135, "step": 769 }, { "epoch": 0.41526223540515034, "grad_norm": 7.91318941116333, "learning_rate": 3.6341370862457536e-07, "logits/chosen": 0.4112922251224518, "logits/rejected": -0.015687093138694763, "logps/chosen": -220.2643280029297, "logps/rejected": -245.8761444091797, "loss": 0.7228, "rewards/accuracies": 0.25, "rewards/chosen": 0.13964635133743286, "rewards/margins": -0.05107440799474716, "rewards/rejected": 0.19072073698043823, "step": 770 }, { "epoch": 0.4158015370095726, "grad_norm": 6.070452690124512, "learning_rate": 3.6299388591836853e-07, "logits/chosen": 0.36332398653030396, "logits/rejected": 0.2578658163547516, "logps/chosen": -219.35586547851562, "logps/rejected": -220.26788330078125, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.13362999260425568, "rewards/margins": 0.02505359798669815, "rewards/rejected": 0.10857639461755753, "step": 771 }, { "epoch": 0.4163408386139949, "grad_norm": 6.387154579162598, "learning_rate": 3.6257366238008485e-07, "logits/chosen": 0.45052894949913025, "logits/rejected": -0.5951903462409973, "logps/chosen": -268.8022155761719, "logps/rejected": -246.37416076660156, "loss": 0.6471, "rewards/accuracies": 0.75, "rewards/chosen": 0.1738039255142212, "rewards/margins": 0.10313597321510315, "rewards/rejected": 0.07066793739795685, "step": 772 }, { "epoch": 0.41688014021841713, "grad_norm": 6.878170967102051, "learning_rate": 3.621530395004163e-07, "logits/chosen": -0.555835485458374, "logits/rejected": -0.22375041246414185, "logps/chosen": -175.76974487304688, "logps/rejected": -161.07618713378906, "loss": 0.7016, "rewards/accuracies": 0.5, "rewards/chosen": 0.08465542644262314, "rewards/margins": -0.013134147971868515, "rewards/rejected": 0.09778957813978195, "step": 773 }, { "epoch": 0.4174194418228394, "grad_norm": 6.956783771514893, "learning_rate": 3.617320187714713e-07, "logits/chosen": -0.022893108427524567, "logits/rejected": -0.5085261464118958, "logps/chosen": -218.34152221679688, "logps/rejected": -195.85321044921875, "loss": 0.6615, "rewards/accuracies": 0.875, "rewards/chosen": 0.1750367283821106, "rewards/margins": 0.07123240828514099, "rewards/rejected": 0.10380430519580841, "step": 774 }, { "epoch": 0.4179587434272617, "grad_norm": 9.30107593536377, "learning_rate": 3.6131060168676993e-07, "logits/chosen": -0.06832996755838394, "logits/rejected": -0.3280823826789856, "logps/chosen": -192.6497802734375, "logps/rejected": -274.9858093261719, "loss": 0.6733, "rewards/accuracies": 0.625, "rewards/chosen": 0.18501487374305725, "rewards/margins": 0.053107306361198425, "rewards/rejected": 0.13190756738185883, "step": 775 }, { "epoch": 0.418498045031684, "grad_norm": 6.881377220153809, "learning_rate": 3.6088878974123795e-07, "logits/chosen": -1.2553513050079346, "logits/rejected": -1.0359200239181519, "logps/chosen": -151.22560119628906, "logps/rejected": -170.11708068847656, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": 0.09424643963575363, "rewards/margins": 0.0028985501267015934, "rewards/rejected": 0.09134788811206818, "step": 776 }, { "epoch": 0.41903734663610626, "grad_norm": 7.280738830566406, "learning_rate": 3.604665844312019e-07, "logits/chosen": 0.1489703208208084, "logits/rejected": -0.055786292999982834, "logps/chosen": -245.51910400390625, "logps/rejected": -261.73687744140625, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.1475086212158203, "rewards/margins": 0.014864921569824219, "rewards/rejected": 0.1326436996459961, "step": 777 }, { "epoch": 0.41957664824052854, "grad_norm": 7.588500022888184, "learning_rate": 3.60043987254384e-07, "logits/chosen": 0.6074700951576233, "logits/rejected": 0.5602543354034424, "logps/chosen": -272.80914306640625, "logps/rejected": -280.3509826660156, "loss": 0.6988, "rewards/accuracies": 0.625, "rewards/chosen": 0.06714525818824768, "rewards/margins": -0.0028643570840358734, "rewards/rejected": 0.07000961154699326, "step": 778 }, { "epoch": 0.42011594984495076, "grad_norm": 6.207444190979004, "learning_rate": 3.5962099970989626e-07, "logits/chosen": 0.31915390491485596, "logits/rejected": -0.3422451913356781, "logps/chosen": -257.14593505859375, "logps/rejected": -181.9139404296875, "loss": 0.6704, "rewards/accuracies": 0.5, "rewards/chosen": 0.14094582200050354, "rewards/margins": 0.05012941733002663, "rewards/rejected": 0.09081640094518661, "step": 779 }, { "epoch": 0.42065525144937305, "grad_norm": 8.76144790649414, "learning_rate": 3.591976232982355e-07, "logits/chosen": 0.051818132400512695, "logits/rejected": 0.3980032801628113, "logps/chosen": -272.7035827636719, "logps/rejected": -420.2403564453125, "loss": 0.7457, "rewards/accuracies": 0.375, "rewards/chosen": 0.07806139439344406, "rewards/margins": -0.08809661865234375, "rewards/rejected": 0.1661580204963684, "step": 780 }, { "epoch": 0.42119455305379533, "grad_norm": 7.696545600891113, "learning_rate": 3.587738595212782e-07, "logits/chosen": -0.8131085634231567, "logits/rejected": -0.7823805809020996, "logps/chosen": -272.1556091308594, "logps/rejected": -278.07086181640625, "loss": 0.6683, "rewards/accuracies": 0.625, "rewards/chosen": 0.11681217700242996, "rewards/margins": 0.0722990557551384, "rewards/rejected": 0.04451312869787216, "step": 781 }, { "epoch": 0.4217338546582176, "grad_norm": 7.58675479888916, "learning_rate": 3.583497098822748e-07, "logits/chosen": 0.4561305642127991, "logits/rejected": 0.8648815155029297, "logps/chosen": -277.71856689453125, "logps/rejected": -273.2974853515625, "loss": 0.7452, "rewards/accuracies": 0.25, "rewards/chosen": 0.1349756270647049, "rewards/margins": -0.09750346839427948, "rewards/rejected": 0.23247909545898438, "step": 782 }, { "epoch": 0.4222731562626399, "grad_norm": 7.0915656089782715, "learning_rate": 3.5792517588584463e-07, "logits/chosen": 0.8298697471618652, "logits/rejected": -0.9337893724441528, "logps/chosen": -309.0354919433594, "logps/rejected": -221.1575927734375, "loss": 0.6503, "rewards/accuracies": 0.75, "rewards/chosen": 0.13318434357643127, "rewards/margins": 0.09003296494483948, "rewards/rejected": 0.0431513786315918, "step": 783 }, { "epoch": 0.4228124578670622, "grad_norm": 6.674697399139404, "learning_rate": 3.575002590379705e-07, "logits/chosen": 0.3839167356491089, "logits/rejected": -0.68036949634552, "logps/chosen": -231.96363830566406, "logps/rejected": -237.5269775390625, "loss": 0.6448, "rewards/accuracies": 0.875, "rewards/chosen": 0.11570778489112854, "rewards/margins": 0.1015392318367958, "rewards/rejected": 0.014168549329042435, "step": 784 }, { "epoch": 0.4233517594714844, "grad_norm": 7.361809730529785, "learning_rate": 3.5707496084599316e-07, "logits/chosen": 1.9135664701461792, "logits/rejected": 0.2805786728858948, "logps/chosen": -230.7689971923828, "logps/rejected": -201.48513793945312, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": 0.13519763946533203, "rewards/margins": 0.04722003638744354, "rewards/rejected": 0.08797760307788849, "step": 785 }, { "epoch": 0.4238910610759067, "grad_norm": 8.401148796081543, "learning_rate": 3.5664928281860625e-07, "logits/chosen": 0.9828423261642456, "logits/rejected": -0.55497145652771, "logps/chosen": -297.0108337402344, "logps/rejected": -298.04315185546875, "loss": 0.6897, "rewards/accuracies": 0.5, "rewards/chosen": 0.13096199929714203, "rewards/margins": 0.013143353164196014, "rewards/rejected": 0.11781864613294601, "step": 786 }, { "epoch": 0.42443036268032897, "grad_norm": 6.940704822540283, "learning_rate": 3.562232264658509e-07, "logits/chosen": 0.21322807669639587, "logits/rejected": -0.2418319284915924, "logps/chosen": -255.18858337402344, "logps/rejected": -257.40350341796875, "loss": 0.6434, "rewards/accuracies": 0.875, "rewards/chosen": 0.21598245203495026, "rewards/margins": 0.1050073578953743, "rewards/rejected": 0.11097507178783417, "step": 787 }, { "epoch": 0.42496966428475125, "grad_norm": 6.167534351348877, "learning_rate": 3.557967932991102e-07, "logits/chosen": 0.12966258823871613, "logits/rejected": -0.16928884387016296, "logps/chosen": -270.630859375, "logps/rejected": -270.3060302734375, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.1510225236415863, "rewards/margins": 0.002612020820379257, "rewards/rejected": 0.14841051399707794, "step": 788 }, { "epoch": 0.42550896588917353, "grad_norm": 7.243565559387207, "learning_rate": 3.5536998483110416e-07, "logits/chosen": -0.4700443744659424, "logits/rejected": -0.9608567953109741, "logps/chosen": -224.97930908203125, "logps/rejected": -228.15859985351562, "loss": 0.647, "rewards/accuracies": 0.875, "rewards/chosen": 0.1723681390285492, "rewards/margins": 0.10283757001161575, "rewards/rejected": 0.06953058391809464, "step": 789 }, { "epoch": 0.4260482674935958, "grad_norm": 7.60852575302124, "learning_rate": 3.549428025758836e-07, "logits/chosen": 0.5991213321685791, "logits/rejected": 0.4929075241088867, "logps/chosen": -264.2192077636719, "logps/rejected": -249.96902465820312, "loss": 0.6967, "rewards/accuracies": 0.5, "rewards/chosen": 0.1851721853017807, "rewards/margins": 0.006516552530229092, "rewards/rejected": 0.17865562438964844, "step": 790 }, { "epoch": 0.4265875690980181, "grad_norm": 8.163580894470215, "learning_rate": 3.545152480488259e-07, "logits/chosen": -0.20556192100048065, "logits/rejected": -0.8349020481109619, "logps/chosen": -257.43896484375, "logps/rejected": -253.42477416992188, "loss": 0.6432, "rewards/accuracies": 0.875, "rewards/chosen": 0.21206188201904297, "rewards/margins": 0.10542936623096466, "rewards/rejected": 0.1066325232386589, "step": 791 }, { "epoch": 0.4271268707024403, "grad_norm": 6.408261299133301, "learning_rate": 3.5408732276662876e-07, "logits/chosen": -0.1556905210018158, "logits/rejected": -0.7635886669158936, "logps/chosen": -267.91815185546875, "logps/rejected": -169.0664520263672, "loss": 0.6605, "rewards/accuracies": 0.625, "rewards/chosen": 0.12619848549365997, "rewards/margins": 0.07477550208568573, "rewards/rejected": 0.051422975957393646, "step": 792 }, { "epoch": 0.4276661723068626, "grad_norm": 8.201440811157227, "learning_rate": 3.5365902824730506e-07, "logits/chosen": 0.9858843088150024, "logits/rejected": -0.5516830086708069, "logps/chosen": -281.3367919921875, "logps/rejected": -198.31129455566406, "loss": 0.6091, "rewards/accuracies": 0.875, "rewards/chosen": 0.19002781808376312, "rewards/margins": 0.18093128502368927, "rewards/rejected": 0.009096529334783554, "step": 793 }, { "epoch": 0.4282054739112849, "grad_norm": 6.578029155731201, "learning_rate": 3.532303660101775e-07, "logits/chosen": 0.21865683794021606, "logits/rejected": 0.6002670526504517, "logps/chosen": -237.01577758789062, "logps/rejected": -255.81480407714844, "loss": 0.659, "rewards/accuracies": 0.625, "rewards/chosen": 0.1602146178483963, "rewards/margins": 0.07784004509449005, "rewards/rejected": 0.08237458020448685, "step": 794 }, { "epoch": 0.42874477551570717, "grad_norm": 7.1674299240112305, "learning_rate": 3.5280133757587336e-07, "logits/chosen": 0.427878201007843, "logits/rejected": -0.018427208065986633, "logps/chosen": -211.3483123779297, "logps/rejected": -199.89569091796875, "loss": 0.669, "rewards/accuracies": 0.75, "rewards/chosen": 0.1928855925798416, "rewards/margins": 0.05068359524011612, "rewards/rejected": 0.1422020047903061, "step": 795 }, { "epoch": 0.42928407712012945, "grad_norm": 6.315996170043945, "learning_rate": 3.523719444663188e-07, "logits/chosen": 0.4390186369419098, "logits/rejected": 0.19071102142333984, "logps/chosen": -239.9767303466797, "logps/rejected": -238.86624145507812, "loss": 0.6303, "rewards/accuracies": 0.875, "rewards/chosen": 0.16571331024169922, "rewards/margins": 0.13947811722755432, "rewards/rejected": 0.026235198602080345, "step": 796 }, { "epoch": 0.42982337872455173, "grad_norm": 7.09416389465332, "learning_rate": 3.519421882047338e-07, "logits/chosen": -0.4518160820007324, "logits/rejected": -1.1117470264434814, "logps/chosen": -188.04119873046875, "logps/rejected": -156.4473876953125, "loss": 0.6697, "rewards/accuracies": 0.75, "rewards/chosen": 0.09238843619823456, "rewards/margins": 0.05178647115826607, "rewards/rejected": 0.04060196876525879, "step": 797 }, { "epoch": 0.43036268032897396, "grad_norm": 8.867244720458984, "learning_rate": 3.5151207031562633e-07, "logits/chosen": 0.34340739250183105, "logits/rejected": -1.125366449356079, "logps/chosen": -303.7674560546875, "logps/rejected": -261.2860107421875, "loss": 0.724, "rewards/accuracies": 0.5, "rewards/chosen": 0.06239137798547745, "rewards/margins": -0.038651324808597565, "rewards/rejected": 0.10104270279407501, "step": 798 }, { "epoch": 0.43090198193339624, "grad_norm": 7.037902355194092, "learning_rate": 3.5108159232478737e-07, "logits/chosen": 0.8354260921478271, "logits/rejected": 0.5034649968147278, "logps/chosen": -219.18121337890625, "logps/rejected": -178.37635803222656, "loss": 0.7047, "rewards/accuracies": 0.625, "rewards/chosen": 0.15149880945682526, "rewards/margins": -0.013895511627197266, "rewards/rejected": 0.16539430618286133, "step": 799 }, { "epoch": 0.4314412835378185, "grad_norm": 7.196881294250488, "learning_rate": 3.506507557592853e-07, "logits/chosen": 0.05764308571815491, "logits/rejected": -0.517903208732605, "logps/chosen": -218.48435974121094, "logps/rejected": -251.290283203125, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": 0.13348622620105743, "rewards/margins": 0.04933934286236763, "rewards/rejected": 0.0841468796133995, "step": 800 }, { "epoch": 0.4319805851422408, "grad_norm": 10.982433319091797, "learning_rate": 3.502195621474604e-07, "logits/chosen": -0.30723413825035095, "logits/rejected": 0.38968712091445923, "logps/chosen": -123.38273620605469, "logps/rejected": -190.34873962402344, "loss": 0.741, "rewards/accuracies": 0.25, "rewards/chosen": 0.02244105562567711, "rewards/margins": -0.09082073718309402, "rewards/rejected": 0.11326178908348083, "step": 801 }, { "epoch": 0.4325198867466631, "grad_norm": 7.427231311798096, "learning_rate": 3.497880130189197e-07, "logits/chosen": 0.5716754198074341, "logits/rejected": -0.30527710914611816, "logps/chosen": -280.0397033691406, "logps/rejected": -216.585205078125, "loss": 0.6441, "rewards/accuracies": 0.625, "rewards/chosen": 0.1524617224931717, "rewards/margins": 0.11279487609863281, "rewards/rejected": 0.03966683894395828, "step": 802 }, { "epoch": 0.43305918835108537, "grad_norm": 6.83045768737793, "learning_rate": 3.493561099045312e-07, "logits/chosen": 0.48005741834640503, "logits/rejected": 0.6593004465103149, "logps/chosen": -197.84869384765625, "logps/rejected": -229.63461303710938, "loss": 0.678, "rewards/accuracies": 0.5, "rewards/chosen": 0.1287505179643631, "rewards/margins": 0.03899747505784035, "rewards/rejected": 0.08975305408239365, "step": 803 }, { "epoch": 0.4335984899555076, "grad_norm": 8.370401382446289, "learning_rate": 3.489238543364187e-07, "logits/chosen": -0.10893526673316956, "logits/rejected": -1.683495283126831, "logps/chosen": -285.9869384765625, "logps/rejected": -245.81008911132812, "loss": 0.64, "rewards/accuracies": 0.75, "rewards/chosen": 0.15914136171340942, "rewards/margins": 0.12174730002880096, "rewards/rejected": 0.037394046783447266, "step": 804 }, { "epoch": 0.4341377915599299, "grad_norm": 6.211988925933838, "learning_rate": 3.4849124784795636e-07, "logits/chosen": -0.030931800603866577, "logits/rejected": -0.5699371695518494, "logps/chosen": -232.4140625, "logps/rejected": -269.80224609375, "loss": 0.63, "rewards/accuracies": 1.0, "rewards/chosen": 0.16036280989646912, "rewards/margins": 0.13166570663452148, "rewards/rejected": 0.02869711071252823, "step": 805 }, { "epoch": 0.43467709316435216, "grad_norm": 7.473235130310059, "learning_rate": 3.4805829197376305e-07, "logits/chosen": -0.21241380274295807, "logits/rejected": -0.18890675902366638, "logps/chosen": -226.89202880859375, "logps/rejected": -228.33314514160156, "loss": 0.7235, "rewards/accuracies": 0.5, "rewards/chosen": 0.14275923371315002, "rewards/margins": -0.04781866818666458, "rewards/rejected": 0.190577894449234, "step": 806 }, { "epoch": 0.43521639476877444, "grad_norm": 6.204257011413574, "learning_rate": 3.4762498824969724e-07, "logits/chosen": 0.659818708896637, "logits/rejected": -0.10519585013389587, "logps/chosen": -258.9788818359375, "logps/rejected": -194.73922729492188, "loss": 0.7094, "rewards/accuracies": 0.5, "rewards/chosen": 0.10836353152990341, "rewards/margins": -0.02426052838563919, "rewards/rejected": 0.1326240599155426, "step": 807 }, { "epoch": 0.4357556963731967, "grad_norm": 7.209741115570068, "learning_rate": 3.47191338212851e-07, "logits/chosen": -0.3289354145526886, "logits/rejected": -0.8826521635055542, "logps/chosen": -243.28257751464844, "logps/rejected": -272.3832702636719, "loss": 0.6736, "rewards/accuracies": 0.75, "rewards/chosen": 0.12418022006750107, "rewards/margins": 0.04969730228185654, "rewards/rejected": 0.07448291778564453, "step": 808 }, { "epoch": 0.436294997977619, "grad_norm": 7.7021918296813965, "learning_rate": 3.467573434015454e-07, "logits/chosen": 0.07942277193069458, "logits/rejected": 0.1804562509059906, "logps/chosen": -225.339599609375, "logps/rejected": -294.1304931640625, "loss": 0.7254, "rewards/accuracies": 0.375, "rewards/chosen": 0.14471597969532013, "rewards/margins": -0.04959401488304138, "rewards/rejected": 0.1943099945783615, "step": 809 }, { "epoch": 0.43683429958204123, "grad_norm": 6.440825939178467, "learning_rate": 3.463230053553241e-07, "logits/chosen": 0.599102258682251, "logits/rejected": -0.2805129289627075, "logps/chosen": -267.7939453125, "logps/rejected": -264.0869140625, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": 0.12296285480260849, "rewards/margins": -0.01474313996732235, "rewards/rejected": 0.137705996632576, "step": 810 }, { "epoch": 0.4373736011864635, "grad_norm": 6.895138740539551, "learning_rate": 3.458883256149485e-07, "logits/chosen": -1.3813567161560059, "logits/rejected": 0.2244376391172409, "logps/chosen": -158.99685668945312, "logps/rejected": -208.40548706054688, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": 0.11574907600879669, "rewards/margins": 0.06071114540100098, "rewards/rejected": 0.05503792688250542, "step": 811 }, { "epoch": 0.4379129027908858, "grad_norm": 7.655596733093262, "learning_rate": 3.454533057223923e-07, "logits/chosen": -1.1262449026107788, "logits/rejected": -0.09094354510307312, "logps/chosen": -151.7474365234375, "logps/rejected": -202.24530029296875, "loss": 0.689, "rewards/accuracies": 0.5, "rewards/chosen": 0.1384524405002594, "rewards/margins": 0.012470529414713383, "rewards/rejected": 0.1259819120168686, "step": 812 }, { "epoch": 0.4384522043953081, "grad_norm": 7.159083843231201, "learning_rate": 3.4501794722083554e-07, "logits/chosen": 0.30826854705810547, "logits/rejected": 0.07478424906730652, "logps/chosen": -274.06927490234375, "logps/rejected": -190.62274169921875, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": 0.11247320473194122, "rewards/margins": -0.011199761182069778, "rewards/rejected": 0.1236729621887207, "step": 813 }, { "epoch": 0.43899150599973036, "grad_norm": 7.807650089263916, "learning_rate": 3.4458225165465974e-07, "logits/chosen": -0.9280798435211182, "logits/rejected": -0.5051683187484741, "logps/chosen": -222.0675811767578, "logps/rejected": -250.00375366210938, "loss": 0.7165, "rewards/accuracies": 0.375, "rewards/chosen": 0.13693104684352875, "rewards/margins": -0.032962799072265625, "rewards/rejected": 0.16989383101463318, "step": 814 }, { "epoch": 0.43953080760415264, "grad_norm": 6.911833763122559, "learning_rate": 3.4414622056944185e-07, "logits/chosen": 0.3316459357738495, "logits/rejected": -0.8515408635139465, "logps/chosen": -250.47775268554688, "logps/rejected": -185.99000549316406, "loss": 0.6749, "rewards/accuracies": 0.75, "rewards/chosen": 0.090929314494133, "rewards/margins": 0.043079569935798645, "rewards/rejected": 0.04784975200891495, "step": 815 }, { "epoch": 0.4400701092085749, "grad_norm": 6.896199703216553, "learning_rate": 3.4370985551194926e-07, "logits/chosen": 1.4833146333694458, "logits/rejected": -0.04457986354827881, "logps/chosen": -245.41281127929688, "logps/rejected": -230.98707580566406, "loss": 0.6514, "rewards/accuracies": 0.75, "rewards/chosen": 0.16334152221679688, "rewards/margins": 0.09191256761550903, "rewards/rejected": 0.07142896205186844, "step": 816 }, { "epoch": 0.44060941081299715, "grad_norm": 5.6806511878967285, "learning_rate": 3.432731580301338e-07, "logits/chosen": 0.1917436420917511, "logits/rejected": 0.011618640273809433, "logps/chosen": -158.4386749267578, "logps/rejected": -159.8053436279297, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": 0.1980375200510025, "rewards/margins": 0.08784833550453186, "rewards/rejected": 0.11018919944763184, "step": 817 }, { "epoch": 0.44114871241741943, "grad_norm": 8.428309440612793, "learning_rate": 3.4283612967312687e-07, "logits/chosen": -0.03359782695770264, "logits/rejected": -0.8016406893730164, "logps/chosen": -267.4262390136719, "logps/rejected": -246.89268493652344, "loss": 0.6647, "rewards/accuracies": 0.75, "rewards/chosen": 0.1824422925710678, "rewards/margins": 0.07179412990808487, "rewards/rejected": 0.11064815521240234, "step": 818 }, { "epoch": 0.4416880140218417, "grad_norm": 6.437076091766357, "learning_rate": 3.423987719912334e-07, "logits/chosen": 1.6841295957565308, "logits/rejected": 0.515087366104126, "logps/chosen": -325.54876708984375, "logps/rejected": -209.4253387451172, "loss": 0.6379, "rewards/accuracies": 0.75, "rewards/chosen": 0.25425320863723755, "rewards/margins": 0.12162351608276367, "rewards/rejected": 0.13262967765331268, "step": 819 }, { "epoch": 0.442227315626264, "grad_norm": 7.63007926940918, "learning_rate": 3.419610865359266e-07, "logits/chosen": 0.0003452301025390625, "logits/rejected": -0.3282938599586487, "logps/chosen": -200.67568969726562, "logps/rejected": -271.9146728515625, "loss": 0.6382, "rewards/accuracies": 0.75, "rewards/chosen": 0.18124255537986755, "rewards/margins": 0.1222471222281456, "rewards/rejected": 0.05899544060230255, "step": 820 }, { "epoch": 0.4427666172306863, "grad_norm": 7.145211696624756, "learning_rate": 3.415230748598424e-07, "logits/chosen": -0.2123877853155136, "logits/rejected": -1.3161386251449585, "logps/chosen": -240.24435424804688, "logps/rejected": -161.9334716796875, "loss": 0.6434, "rewards/accuracies": 0.625, "rewards/chosen": 0.21674834191799164, "rewards/margins": 0.1173238754272461, "rewards/rejected": 0.09942444413900375, "step": 821 }, { "epoch": 0.44330591883510856, "grad_norm": 7.904824256896973, "learning_rate": 3.410847385167741e-07, "logits/chosen": -0.4248572587966919, "logits/rejected": 0.06801196932792664, "logps/chosen": -159.8046875, "logps/rejected": -175.48751831054688, "loss": 0.6687, "rewards/accuracies": 0.75, "rewards/chosen": 0.06810101866722107, "rewards/margins": 0.05675850063562393, "rewards/rejected": 0.011342523619532585, "step": 822 }, { "epoch": 0.4438452204395308, "grad_norm": 7.654214859008789, "learning_rate": 3.406460790616664e-07, "logits/chosen": 0.631363570690155, "logits/rejected": -0.43350750207901, "logps/chosen": -215.93118286132812, "logps/rejected": -202.6182098388672, "loss": 0.7534, "rewards/accuracies": 0.25, "rewards/chosen": 0.05318431928753853, "rewards/margins": -0.10586757957935333, "rewards/rejected": 0.15905189514160156, "step": 823 }, { "epoch": 0.44438452204395307, "grad_norm": 6.800428867340088, "learning_rate": 3.402070980506106e-07, "logits/chosen": 0.17939773201942444, "logits/rejected": 0.7279377579689026, "logps/chosen": -211.92449951171875, "logps/rejected": -291.093505859375, "loss": 0.6776, "rewards/accuracies": 0.5, "rewards/chosen": 0.1659923493862152, "rewards/margins": 0.03709697723388672, "rewards/rejected": 0.12889538705348969, "step": 824 }, { "epoch": 0.44492382364837535, "grad_norm": 9.032998085021973, "learning_rate": 3.3976779704083835e-07, "logits/chosen": -0.4390459656715393, "logits/rejected": -0.09485200047492981, "logps/chosen": -245.29275512695312, "logps/rejected": -260.9823303222656, "loss": 0.6227, "rewards/accuracies": 0.75, "rewards/chosen": 0.17614039778709412, "rewards/margins": 0.1557733565568924, "rewards/rejected": 0.020367050543427467, "step": 825 }, { "epoch": 0.44546312525279763, "grad_norm": 7.683836936950684, "learning_rate": 3.3932817759071666e-07, "logits/chosen": -0.2666785717010498, "logits/rejected": -0.9381544589996338, "logps/chosen": -254.29147338867188, "logps/rejected": -191.8270263671875, "loss": 0.6472, "rewards/accuracies": 0.875, "rewards/chosen": 0.2701072692871094, "rewards/margins": 0.09594135731458664, "rewards/rejected": 0.17416591942310333, "step": 826 }, { "epoch": 0.4460024268572199, "grad_norm": 7.572053909301758, "learning_rate": 3.3888824125974213e-07, "logits/chosen": 0.07458442449569702, "logits/rejected": -0.34182289242744446, "logps/chosen": -274.69091796875, "logps/rejected": -218.6166534423828, "loss": 0.6642, "rewards/accuracies": 0.75, "rewards/chosen": 0.15365514159202576, "rewards/margins": 0.06682242453098297, "rewards/rejected": 0.08683271706104279, "step": 827 }, { "epoch": 0.4465417284616422, "grad_norm": 6.402702331542969, "learning_rate": 3.384479896085353e-07, "logits/chosen": 0.9007664918899536, "logits/rejected": 0.2839197814464569, "logps/chosen": -195.0093536376953, "logps/rejected": -210.32647705078125, "loss": 0.6453, "rewards/accuracies": 0.875, "rewards/chosen": 0.17960225045681, "rewards/margins": 0.10559426248073578, "rewards/rejected": 0.07400798797607422, "step": 828 }, { "epoch": 0.4470810300660644, "grad_norm": 8.414557456970215, "learning_rate": 3.3800742419883545e-07, "logits/chosen": -1.036014437675476, "logits/rejected": -0.4841776490211487, "logps/chosen": -169.1973876953125, "logps/rejected": -202.149169921875, "loss": 0.7027, "rewards/accuracies": 0.5, "rewards/chosen": 0.033339884132146835, "rewards/margins": -0.001909349113702774, "rewards/rejected": 0.03524922952055931, "step": 829 }, { "epoch": 0.4476203316704867, "grad_norm": 8.517156600952148, "learning_rate": 3.375665465934948e-07, "logits/chosen": 0.5337153077125549, "logits/rejected": -0.8645511865615845, "logps/chosen": -294.42254638671875, "logps/rejected": -252.7401885986328, "loss": 0.6341, "rewards/accuracies": 0.625, "rewards/chosen": 0.31263411045074463, "rewards/margins": 0.13307762145996094, "rewards/rejected": 0.1795564591884613, "step": 830 }, { "epoch": 0.448159633274909, "grad_norm": 8.671710968017578, "learning_rate": 3.3712535835647323e-07, "logits/chosen": 0.3776247203350067, "logits/rejected": 0.7622449994087219, "logps/chosen": -258.9962158203125, "logps/rejected": -252.0510711669922, "loss": 0.7197, "rewards/accuracies": 0.5, "rewards/chosen": 0.15279369056224823, "rewards/margins": -0.04196929559111595, "rewards/rejected": 0.19476298987865448, "step": 831 }, { "epoch": 0.44869893487933127, "grad_norm": 7.289159774780273, "learning_rate": 3.366838610528322e-07, "logits/chosen": 0.13937756419181824, "logits/rejected": -1.2270162105560303, "logps/chosen": -224.12493896484375, "logps/rejected": -175.72113037109375, "loss": 0.6622, "rewards/accuracies": 0.625, "rewards/chosen": 0.1239444762468338, "rewards/margins": 0.07278866320848465, "rewards/rejected": 0.051155805587768555, "step": 832 }, { "epoch": 0.44923823648375355, "grad_norm": 6.511632442474365, "learning_rate": 3.362420562487298e-07, "logits/chosen": 0.634651243686676, "logits/rejected": -1.0293046236038208, "logps/chosen": -206.71990966796875, "logps/rejected": -151.57147216796875, "loss": 0.6356, "rewards/accuracies": 0.75, "rewards/chosen": 0.1784602701663971, "rewards/margins": 0.1343650221824646, "rewards/rejected": 0.044095225632190704, "step": 833 }, { "epoch": 0.44977753808817583, "grad_norm": 7.214990139007568, "learning_rate": 3.3579994551141476e-07, "logits/chosen": -0.6741156578063965, "logits/rejected": -0.5083538293838501, "logps/chosen": -193.578125, "logps/rejected": -196.13925170898438, "loss": 0.6623, "rewards/accuracies": 0.625, "rewards/chosen": 0.18255110085010529, "rewards/margins": 0.07869081199169159, "rewards/rejected": 0.1038602888584137, "step": 834 }, { "epoch": 0.45031683969259806, "grad_norm": 7.082584381103516, "learning_rate": 3.353575304092215e-07, "logits/chosen": 0.6105650067329407, "logits/rejected": 0.716999351978302, "logps/chosen": -285.3293151855469, "logps/rejected": -296.62042236328125, "loss": 0.6725, "rewards/accuracies": 0.5, "rewards/chosen": 0.17105990648269653, "rewards/margins": 0.04918164759874344, "rewards/rejected": 0.1218782514333725, "step": 835 }, { "epoch": 0.45085614129702034, "grad_norm": 7.200114727020264, "learning_rate": 3.349148125115635e-07, "logits/chosen": 0.9519221782684326, "logits/rejected": -0.23506756126880646, "logps/chosen": -222.18557739257812, "logps/rejected": -217.17799377441406, "loss": 0.6402, "rewards/accuracies": 0.75, "rewards/chosen": 0.2573316693305969, "rewards/margins": 0.11893025040626526, "rewards/rejected": 0.13840141892433167, "step": 836 }, { "epoch": 0.4513954429014426, "grad_norm": 6.876852035522461, "learning_rate": 3.3447179338892883e-07, "logits/chosen": 0.3224862217903137, "logits/rejected": -0.3542498052120209, "logps/chosen": -200.84573364257812, "logps/rejected": -164.94229125976562, "loss": 0.6354, "rewards/accuracies": 0.875, "rewards/chosen": 0.1887475997209549, "rewards/margins": 0.12525779008865356, "rewards/rejected": 0.06348981708288193, "step": 837 }, { "epoch": 0.4519347445058649, "grad_norm": 7.101534366607666, "learning_rate": 3.34028474612874e-07, "logits/chosen": -0.50386643409729, "logits/rejected": -0.7874704599380493, "logps/chosen": -227.93080139160156, "logps/rejected": -325.9467468261719, "loss": 0.6678, "rewards/accuracies": 0.625, "rewards/chosen": 0.20499305427074432, "rewards/margins": 0.057570844888687134, "rewards/rejected": 0.14742222428321838, "step": 838 }, { "epoch": 0.4524740461102872, "grad_norm": 10.918599128723145, "learning_rate": 3.335848577560184e-07, "logits/chosen": 1.8337842226028442, "logits/rejected": 1.0158476829528809, "logps/chosen": -384.9781494140625, "logps/rejected": -276.9891357421875, "loss": 0.6681, "rewards/accuracies": 0.625, "rewards/chosen": 0.18179893493652344, "rewards/margins": 0.05549917370080948, "rewards/rejected": 0.12629975378513336, "step": 839 }, { "epoch": 0.45301334771470947, "grad_norm": 7.134556293487549, "learning_rate": 3.33140944392039e-07, "logits/chosen": -0.0804498940706253, "logits/rejected": -0.21385399997234344, "logps/chosen": -248.30963134765625, "logps/rejected": -209.98538208007812, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.17621250450611115, "rewards/margins": 0.019902512431144714, "rewards/rejected": 0.15630999207496643, "step": 840 }, { "epoch": 0.45355264931913175, "grad_norm": 6.4481072425842285, "learning_rate": 3.326967360956645e-07, "logits/chosen": -0.08579501509666443, "logits/rejected": 0.2139672338962555, "logps/chosen": -238.89035034179688, "logps/rejected": -294.15692138671875, "loss": 0.6353, "rewards/accuracies": 0.75, "rewards/chosen": 0.17330622673034668, "rewards/margins": 0.13477039337158203, "rewards/rejected": 0.03853583708405495, "step": 841 }, { "epoch": 0.454091950923554, "grad_norm": 8.625262260437012, "learning_rate": 3.3225223444266977e-07, "logits/chosen": 0.017900638282299042, "logits/rejected": -0.6647141575813293, "logps/chosen": -248.07827758789062, "logps/rejected": -208.751220703125, "loss": 0.6455, "rewards/accuracies": 0.875, "rewards/chosen": 0.2189488410949707, "rewards/margins": 0.10274119675159454, "rewards/rejected": 0.11620765179395676, "step": 842 }, { "epoch": 0.45463125252797626, "grad_norm": 6.58638858795166, "learning_rate": 3.318074410098704e-07, "logits/chosen": 0.377674400806427, "logits/rejected": 0.21528178453445435, "logps/chosen": -231.63218688964844, "logps/rejected": -234.31362915039062, "loss": 0.667, "rewards/accuracies": 0.625, "rewards/chosen": 0.1885223388671875, "rewards/margins": 0.05792408064007759, "rewards/rejected": 0.1305982619524002, "step": 843 }, { "epoch": 0.45517055413239854, "grad_norm": 8.071711540222168, "learning_rate": 3.3136235737511706e-07, "logits/chosen": 0.40744903683662415, "logits/rejected": 1.142212152481079, "logps/chosen": -288.94500732421875, "logps/rejected": -479.81341552734375, "loss": 0.6871, "rewards/accuracies": 0.625, "rewards/chosen": 0.1847081333398819, "rewards/margins": 0.01828070357441902, "rewards/rejected": 0.16642743349075317, "step": 844 }, { "epoch": 0.4557098557368208, "grad_norm": 7.240639686584473, "learning_rate": 3.3091698511729e-07, "logits/chosen": -0.05416986346244812, "logits/rejected": -0.3304019868373871, "logps/chosen": -328.678955078125, "logps/rejected": -350.79888916015625, "loss": 0.6566, "rewards/accuracies": 0.625, "rewards/chosen": 0.19650879502296448, "rewards/margins": 0.08833151310682297, "rewards/rejected": 0.10817728191614151, "step": 845 }, { "epoch": 0.4562491573412431, "grad_norm": 7.616847038269043, "learning_rate": 3.3047132581629297e-07, "logits/chosen": -0.9259423017501831, "logits/rejected": -1.1871182918548584, "logps/chosen": -302.2892761230469, "logps/rejected": -334.8741149902344, "loss": 0.7137, "rewards/accuracies": 0.5, "rewards/chosen": 0.15675126016139984, "rewards/margins": -0.03438425809144974, "rewards/rejected": 0.1911354959011078, "step": 846 }, { "epoch": 0.4567884589456654, "grad_norm": 6.805056571960449, "learning_rate": 3.3002538105304816e-07, "logits/chosen": 0.7010163068771362, "logits/rejected": 0.8250281810760498, "logps/chosen": -190.47161865234375, "logps/rejected": -197.50546264648438, "loss": 0.7126, "rewards/accuracies": 0.5, "rewards/chosen": 0.11139744520187378, "rewards/margins": -0.028967950493097305, "rewards/rejected": 0.14036542177200317, "step": 847 }, { "epoch": 0.4573277605500876, "grad_norm": 6.984966278076172, "learning_rate": 3.295791524094905e-07, "logits/chosen": -0.33989202976226807, "logits/rejected": -0.25522419810295105, "logps/chosen": -217.80308532714844, "logps/rejected": -195.29197692871094, "loss": 0.7276, "rewards/accuracies": 0.25, "rewards/chosen": 0.163700670003891, "rewards/margins": -0.06544427573680878, "rewards/rejected": 0.22914496064186096, "step": 848 }, { "epoch": 0.4578670621545099, "grad_norm": 6.637675762176514, "learning_rate": 3.2913264146856193e-07, "logits/chosen": 0.04211172088980675, "logits/rejected": -0.18376250565052032, "logps/chosen": -233.82925415039062, "logps/rejected": -293.0185852050781, "loss": 0.7015, "rewards/accuracies": 0.375, "rewards/chosen": 0.15195685625076294, "rewards/margins": -0.012288668192923069, "rewards/rejected": 0.16424551606178284, "step": 849 }, { "epoch": 0.4584063637589322, "grad_norm": 7.4223246574401855, "learning_rate": 3.2868584981420565e-07, "logits/chosen": 0.605965256690979, "logits/rejected": -0.07926419377326965, "logps/chosen": -345.9329528808594, "logps/rejected": -314.2640380859375, "loss": 0.6744, "rewards/accuracies": 0.375, "rewards/chosen": 0.17922095954418182, "rewards/margins": 0.04162149876356125, "rewards/rejected": 0.13759946823120117, "step": 850 }, { "epoch": 0.45894566536335446, "grad_norm": 8.053653717041016, "learning_rate": 3.2823877903136076e-07, "logits/chosen": -0.5287497043609619, "logits/rejected": -0.3441305458545685, "logps/chosen": -295.77850341796875, "logps/rejected": -356.105712890625, "loss": 0.7034, "rewards/accuracies": 0.5, "rewards/chosen": 0.18884220719337463, "rewards/margins": -0.006403736770153046, "rewards/rejected": 0.19524593651294708, "step": 851 }, { "epoch": 0.45948496696777674, "grad_norm": 8.236499786376953, "learning_rate": 3.2779143070595654e-07, "logits/chosen": -0.3799043893814087, "logits/rejected": -0.6359858512878418, "logps/chosen": -190.39923095703125, "logps/rejected": -202.69473266601562, "loss": 0.67, "rewards/accuracies": 0.75, "rewards/chosen": 0.22237035632133484, "rewards/margins": 0.05943737551569939, "rewards/rejected": 0.16293296217918396, "step": 852 }, { "epoch": 0.460024268572199, "grad_norm": 6.288727760314941, "learning_rate": 3.273438064249069e-07, "logits/chosen": 0.597598671913147, "logits/rejected": 0.34642863273620605, "logps/chosen": -216.6488037109375, "logps/rejected": -192.31875610351562, "loss": 0.673, "rewards/accuracies": 0.625, "rewards/chosen": 0.18918456137180328, "rewards/margins": 0.04483185335993767, "rewards/rejected": 0.1443527191877365, "step": 853 }, { "epoch": 0.46056357017662125, "grad_norm": 8.749762535095215, "learning_rate": 3.268959077761044e-07, "logits/chosen": 1.1612300872802734, "logits/rejected": -0.7603844404220581, "logps/chosen": -246.46224975585938, "logps/rejected": -180.56370544433594, "loss": 0.6019, "rewards/accuracies": 1.0, "rewards/chosen": 0.225703626871109, "rewards/margins": 0.19906282424926758, "rewards/rejected": 0.026640798896551132, "step": 854 }, { "epoch": 0.46110287178104353, "grad_norm": 7.9200358390808105, "learning_rate": 3.2644773634841505e-07, "logits/chosen": 0.7263745069503784, "logits/rejected": 1.5991132259368896, "logps/chosen": -161.1514129638672, "logps/rejected": -234.985595703125, "loss": 0.7187, "rewards/accuracies": 0.5, "rewards/chosen": 0.061463356018066406, "rewards/margins": -0.03901835158467293, "rewards/rejected": 0.10048170387744904, "step": 855 }, { "epoch": 0.4616421733854658, "grad_norm": 8.496932983398438, "learning_rate": 3.259992937316727e-07, "logits/chosen": 0.2282615453004837, "logits/rejected": 1.3253785371780396, "logps/chosen": -251.0919189453125, "logps/rejected": -330.82501220703125, "loss": 0.6803, "rewards/accuracies": 0.5, "rewards/chosen": 0.23457878828048706, "rewards/margins": 0.04180632159113884, "rewards/rejected": 0.1927724927663803, "step": 856 }, { "epoch": 0.4621814749898881, "grad_norm": 8.323101043701172, "learning_rate": 3.255505815166729e-07, "logits/chosen": 0.36995238065719604, "logits/rejected": -0.24643340706825256, "logps/chosen": -259.0662841796875, "logps/rejected": -178.77297973632812, "loss": 0.6397, "rewards/accuracies": 0.75, "rewards/chosen": 0.26287707686424255, "rewards/margins": 0.12070827931165695, "rewards/rejected": 0.1421688050031662, "step": 857 }, { "epoch": 0.4627207765943104, "grad_norm": 7.300182819366455, "learning_rate": 3.2510160129516775e-07, "logits/chosen": 0.7567973136901855, "logits/rejected": 0.4954546093940735, "logps/chosen": -268.80096435546875, "logps/rejected": -255.08143615722656, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": 0.2402307540178299, "rewards/margins": 0.07856178283691406, "rewards/rejected": 0.16166897118091583, "step": 858 }, { "epoch": 0.46326007819873266, "grad_norm": 6.589439868927002, "learning_rate": 3.246523546598599e-07, "logits/chosen": -0.5337706804275513, "logits/rejected": -1.9693031311035156, "logps/chosen": -224.5887451171875, "logps/rejected": -182.023681640625, "loss": 0.6676, "rewards/accuracies": 0.75, "rewards/chosen": 0.15814581513404846, "rewards/margins": 0.05621213838458061, "rewards/rejected": 0.10193367302417755, "step": 859 }, { "epoch": 0.4637993798031549, "grad_norm": 7.376667022705078, "learning_rate": 3.2420284320439736e-07, "logits/chosen": 0.13334394991397858, "logits/rejected": -0.4815720319747925, "logps/chosen": -226.76730346679688, "logps/rejected": -235.7096710205078, "loss": 0.6833, "rewards/accuracies": 0.375, "rewards/chosen": 0.17120666801929474, "rewards/margins": 0.02777709625661373, "rewards/rejected": 0.14342956244945526, "step": 860 }, { "epoch": 0.46433868140757717, "grad_norm": 6.620248317718506, "learning_rate": 3.2375306852336724e-07, "logits/chosen": -0.7654955387115479, "logits/rejected": -0.7292631268501282, "logps/chosen": -227.82484436035156, "logps/rejected": -217.31280517578125, "loss": 0.6579, "rewards/accuracies": 0.75, "rewards/chosen": 0.17562350630760193, "rewards/margins": 0.07732658088207245, "rewards/rejected": 0.09829693287611008, "step": 861 }, { "epoch": 0.46487798301199945, "grad_norm": 7.061690330505371, "learning_rate": 3.2330303221229073e-07, "logits/chosen": 0.47176414728164673, "logits/rejected": -0.6624717712402344, "logps/chosen": -259.52581787109375, "logps/rejected": -248.74844360351562, "loss": 0.6584, "rewards/accuracies": 0.75, "rewards/chosen": 0.22304287552833557, "rewards/margins": 0.07643204182386398, "rewards/rejected": 0.146610826253891, "step": 862 }, { "epoch": 0.46541728461642173, "grad_norm": 6.853357315063477, "learning_rate": 3.2285273586761667e-07, "logits/chosen": -0.6115254163742065, "logits/rejected": -1.0462466478347778, "logps/chosen": -239.2843017578125, "logps/rejected": -200.64495849609375, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.12875600159168243, "rewards/margins": 0.03806610032916069, "rewards/rejected": 0.09068989753723145, "step": 863 }, { "epoch": 0.465956586220844, "grad_norm": 7.039694309234619, "learning_rate": 3.224021810867168e-07, "logits/chosen": -0.15630072355270386, "logits/rejected": -0.33615291118621826, "logps/chosen": -313.06817626953125, "logps/rejected": -299.3121643066406, "loss": 0.6753, "rewards/accuracies": 0.75, "rewards/chosen": 0.19359149038791656, "rewards/margins": 0.038919635117053986, "rewards/rejected": 0.15467186272144318, "step": 864 }, { "epoch": 0.4664958878252663, "grad_norm": 7.701940059661865, "learning_rate": 3.219513694678795e-07, "logits/chosen": 1.0345121622085571, "logits/rejected": -1.0471718311309814, "logps/chosen": -348.0606689453125, "logps/rejected": -257.3437805175781, "loss": 0.6628, "rewards/accuracies": 0.5, "rewards/chosen": 0.1705915480852127, "rewards/margins": 0.07671957463026047, "rewards/rejected": 0.09387196600437164, "step": 865 }, { "epoch": 0.4670351894296886, "grad_norm": 6.622201919555664, "learning_rate": 3.2150030261030407e-07, "logits/chosen": 0.41603103280067444, "logits/rejected": -0.08122193813323975, "logps/chosen": -261.39178466796875, "logps/rejected": -279.758056640625, "loss": 0.6668, "rewards/accuracies": 0.5, "rewards/chosen": 0.16078144311904907, "rewards/margins": 0.05944476276636124, "rewards/rejected": 0.10133667290210724, "step": 866 }, { "epoch": 0.4675744910341108, "grad_norm": 6.583987712860107, "learning_rate": 3.2104898211409544e-07, "logits/chosen": 0.4488541781902313, "logits/rejected": 0.3143254518508911, "logps/chosen": -256.20050048828125, "logps/rejected": -255.86817932128906, "loss": 0.676, "rewards/accuracies": 0.75, "rewards/chosen": 0.11530962586402893, "rewards/margins": 0.04315586015582085, "rewards/rejected": 0.07215376198291779, "step": 867 }, { "epoch": 0.4681137926385331, "grad_norm": 5.528810977935791, "learning_rate": 3.2059740958025814e-07, "logits/chosen": 0.22824275493621826, "logits/rejected": 0.11628293991088867, "logps/chosen": -174.49795532226562, "logps/rejected": -182.15757751464844, "loss": 0.6592, "rewards/accuracies": 0.5, "rewards/chosen": 0.1764841079711914, "rewards/margins": 0.07579269260168076, "rewards/rejected": 0.10069141536951065, "step": 868 }, { "epoch": 0.46865309424295537, "grad_norm": 6.277967929840088, "learning_rate": 3.201455866106909e-07, "logits/chosen": 0.21576917171478271, "logits/rejected": -0.2757137417793274, "logps/chosen": -206.93020629882812, "logps/rejected": -192.04425048828125, "loss": 0.6646, "rewards/accuracies": 0.625, "rewards/chosen": 0.15605315566062927, "rewards/margins": 0.06265288591384888, "rewards/rejected": 0.09340028464794159, "step": 869 }, { "epoch": 0.46919239584737765, "grad_norm": 7.144298076629639, "learning_rate": 3.196935148081807e-07, "logits/chosen": 0.1506403088569641, "logits/rejected": -0.17190396785736084, "logps/chosen": -225.91156005859375, "logps/rejected": -158.5275421142578, "loss": 0.6591, "rewards/accuracies": 0.75, "rewards/chosen": 0.2109566628932953, "rewards/margins": 0.07311734557151794, "rewards/rejected": 0.13783931732177734, "step": 870 }, { "epoch": 0.46973169745179993, "grad_norm": 8.614044189453125, "learning_rate": 3.192411957763976e-07, "logits/chosen": 0.8601639270782471, "logits/rejected": 0.873691976070404, "logps/chosen": -173.29006958007812, "logps/rejected": -207.4072265625, "loss": 0.6644, "rewards/accuracies": 0.5, "rewards/chosen": 0.14567060768604279, "rewards/margins": 0.07099704444408417, "rewards/rejected": 0.07467355579137802, "step": 871 }, { "epoch": 0.4702709990562222, "grad_norm": 6.905462741851807, "learning_rate": 3.18788631119888e-07, "logits/chosen": -0.2916131019592285, "logits/rejected": -0.3497648239135742, "logps/chosen": -409.574951171875, "logps/rejected": -357.1993713378906, "loss": 0.6596, "rewards/accuracies": 0.75, "rewards/chosen": 0.2531720995903015, "rewards/margins": 0.07810249924659729, "rewards/rejected": 0.1750696301460266, "step": 872 }, { "epoch": 0.47081030066064444, "grad_norm": 8.09859561920166, "learning_rate": 3.183358224440703e-07, "logits/chosen": -0.4454405903816223, "logits/rejected": -0.4554668068885803, "logps/chosen": -190.40817260742188, "logps/rejected": -248.06724548339844, "loss": 0.6965, "rewards/accuracies": 0.625, "rewards/chosen": 0.12735843658447266, "rewards/margins": 0.005603216588497162, "rewards/rejected": 0.12175522744655609, "step": 873 }, { "epoch": 0.4713496022650667, "grad_norm": 6.360964775085449, "learning_rate": 3.1788277135522807e-07, "logits/chosen": 1.064213514328003, "logits/rejected": 0.9279011487960815, "logps/chosen": -349.5693359375, "logps/rejected": -349.0695495605469, "loss": 0.651, "rewards/accuracies": 0.625, "rewards/chosen": 0.19093437492847443, "rewards/margins": 0.09161662310361862, "rewards/rejected": 0.09931774437427521, "step": 874 }, { "epoch": 0.471888903869489, "grad_norm": 9.267683982849121, "learning_rate": 3.1742947946050515e-07, "logits/chosen": -0.5411428809165955, "logits/rejected": -0.1911538988351822, "logps/chosen": -282.6304626464844, "logps/rejected": -231.20132446289062, "loss": 0.7111, "rewards/accuracies": 0.5, "rewards/chosen": 0.0862785279750824, "rewards/margins": -0.02529444918036461, "rewards/rejected": 0.1115729883313179, "step": 875 }, { "epoch": 0.4724282054739113, "grad_norm": 7.275974273681641, "learning_rate": 3.169759483678992e-07, "logits/chosen": 0.8102379441261292, "logits/rejected": -0.022911950945854187, "logps/chosen": -267.8459167480469, "logps/rejected": -294.5773010253906, "loss": 0.7212, "rewards/accuracies": 0.625, "rewards/chosen": 0.14925536513328552, "rewards/margins": -0.03489571064710617, "rewards/rejected": 0.1841510832309723, "step": 876 }, { "epoch": 0.47296750707833357, "grad_norm": 7.008012771606445, "learning_rate": 3.165221796862569e-07, "logits/chosen": 0.8200975656509399, "logits/rejected": -0.3712586760520935, "logps/chosen": -228.60182189941406, "logps/rejected": -183.0015411376953, "loss": 0.6347, "rewards/accuracies": 0.625, "rewards/chosen": 0.2005319744348526, "rewards/margins": 0.1273690164089203, "rewards/rejected": 0.07316293567419052, "step": 877 }, { "epoch": 0.47350680868275585, "grad_norm": 7.889534950256348, "learning_rate": 3.1606817502526736e-07, "logits/chosen": 0.09812727570533752, "logits/rejected": 0.18480396270751953, "logps/chosen": -245.30923461914062, "logps/rejected": -285.92999267578125, "loss": 0.7321, "rewards/accuracies": 0.25, "rewards/chosen": 0.19936007261276245, "rewards/margins": -0.06411361694335938, "rewards/rejected": 0.2634737193584442, "step": 878 }, { "epoch": 0.4740461102871781, "grad_norm": 7.585082530975342, "learning_rate": 3.156139359954569e-07, "logits/chosen": 0.0012699365615844727, "logits/rejected": -0.9791393876075745, "logps/chosen": -175.13003540039062, "logps/rejected": -161.63710021972656, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": 0.14300690591335297, "rewards/margins": 0.022081177681684494, "rewards/rejected": 0.12092570960521698, "step": 879 }, { "epoch": 0.47458541189160036, "grad_norm": 7.688393592834473, "learning_rate": 3.151594642081834e-07, "logits/chosen": 1.3934719562530518, "logits/rejected": 0.32628029584884644, "logps/chosen": -308.3377990722656, "logps/rejected": -265.8641357421875, "loss": 0.6284, "rewards/accuracies": 0.875, "rewards/chosen": 0.29110604524612427, "rewards/margins": 0.1381978988647461, "rewards/rejected": 0.15290813148021698, "step": 880 }, { "epoch": 0.47512471349602264, "grad_norm": 7.195122241973877, "learning_rate": 3.147047612756302e-07, "logits/chosen": 0.10059723258018494, "logits/rejected": -0.1680571734905243, "logps/chosen": -251.35948181152344, "logps/rejected": -301.705322265625, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": 0.16960115730762482, "rewards/margins": 0.04326653853058815, "rewards/rejected": 0.12633462250232697, "step": 881 }, { "epoch": 0.4756640151004449, "grad_norm": 7.311324119567871, "learning_rate": 3.1424982881080065e-07, "logits/chosen": -0.3359007239341736, "logits/rejected": -0.9197202920913696, "logps/chosen": -193.9486083984375, "logps/rejected": -189.5323486328125, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.1886705905199051, "rewards/margins": 0.07047580927610397, "rewards/rejected": 0.11819477379322052, "step": 882 }, { "epoch": 0.4762033167048672, "grad_norm": 7.415416240692139, "learning_rate": 3.1379466842751256e-07, "logits/chosen": -0.2730938196182251, "logits/rejected": 0.2323896586894989, "logps/chosen": -198.4519500732422, "logps/rejected": -232.77679443359375, "loss": 0.6959, "rewards/accuracies": 0.375, "rewards/chosen": 0.10966377705335617, "rewards/margins": 0.0036295000463724136, "rewards/rejected": 0.1060342788696289, "step": 883 }, { "epoch": 0.4767426183092895, "grad_norm": 6.830358028411865, "learning_rate": 3.133392817403919e-07, "logits/chosen": -0.11690309643745422, "logits/rejected": 0.32603827118873596, "logps/chosen": -198.80282592773438, "logps/rejected": -229.43678283691406, "loss": 0.6802, "rewards/accuracies": 0.625, "rewards/chosen": 0.17170554399490356, "rewards/margins": 0.036303140223026276, "rewards/rejected": 0.1354023814201355, "step": 884 }, { "epoch": 0.4772819199137117, "grad_norm": 7.597024917602539, "learning_rate": 3.1288367036486755e-07, "logits/chosen": -0.26084601879119873, "logits/rejected": 0.4442320168018341, "logps/chosen": -238.56246948242188, "logps/rejected": -276.1705627441406, "loss": 0.7023, "rewards/accuracies": 0.625, "rewards/chosen": 0.17582854628562927, "rewards/margins": -0.013201899826526642, "rewards/rejected": 0.1890304535627365, "step": 885 }, { "epoch": 0.477821221518134, "grad_norm": 8.465553283691406, "learning_rate": 3.1242783591716567e-07, "logits/chosen": -0.7130929827690125, "logits/rejected": -0.37828296422958374, "logps/chosen": -222.6154327392578, "logps/rejected": -217.5286102294922, "loss": 0.5907, "rewards/accuracies": 0.875, "rewards/chosen": 0.28524351119995117, "rewards/margins": 0.222116619348526, "rewards/rejected": 0.06312690675258636, "step": 886 }, { "epoch": 0.4783605231225563, "grad_norm": 7.183916091918945, "learning_rate": 3.1197178001430343e-07, "logits/chosen": 0.23710186779499054, "logits/rejected": -0.5446954369544983, "logps/chosen": -220.81610107421875, "logps/rejected": -232.26199340820312, "loss": 0.6675, "rewards/accuracies": 0.625, "rewards/chosen": 0.19044914841651917, "rewards/margins": 0.06036768853664398, "rewards/rejected": 0.13008145987987518, "step": 887 }, { "epoch": 0.47889982472697856, "grad_norm": 6.844493389129639, "learning_rate": 3.115155042740838e-07, "logits/chosen": 0.50167316198349, "logits/rejected": 0.2981233596801758, "logps/chosen": -292.5979919433594, "logps/rejected": -294.1312255859375, "loss": 0.6786, "rewards/accuracies": 0.5, "rewards/chosen": 0.22663497924804688, "rewards/margins": 0.034253112971782684, "rewards/rejected": 0.1923818588256836, "step": 888 }, { "epoch": 0.47943912633140084, "grad_norm": 7.185238838195801, "learning_rate": 3.110590103150894e-07, "logits/chosen": -0.038041144609451294, "logits/rejected": -0.9834379553794861, "logps/chosen": -230.75946044921875, "logps/rejected": -218.6208953857422, "loss": 0.6364, "rewards/accuracies": 0.75, "rewards/chosen": 0.16939659416675568, "rewards/margins": 0.12319040298461914, "rewards/rejected": 0.046206191182136536, "step": 889 }, { "epoch": 0.4799784279358231, "grad_norm": 7.179860591888428, "learning_rate": 3.106022997566771e-07, "logits/chosen": 0.15127474069595337, "logits/rejected": -0.1563432216644287, "logps/chosen": -297.3650817871094, "logps/rejected": -282.0970764160156, "loss": 0.7416, "rewards/accuracies": 0.25, "rewards/chosen": 0.11605873703956604, "rewards/margins": -0.09081459790468216, "rewards/rejected": 0.2068733274936676, "step": 890 }, { "epoch": 0.4805177295402454, "grad_norm": 6.997697353363037, "learning_rate": 3.101453742189722e-07, "logits/chosen": 0.2310725897550583, "logits/rejected": -0.3931082487106323, "logps/chosen": -273.5390930175781, "logps/rejected": -178.54513549804688, "loss": 0.7272, "rewards/accuracies": 0.25, "rewards/chosen": 0.07463540881872177, "rewards/margins": -0.06300698220729828, "rewards/rejected": 0.13764238357543945, "step": 891 }, { "epoch": 0.48105703114466764, "grad_norm": 6.839488506317139, "learning_rate": 3.096882353228624e-07, "logits/chosen": 0.4810059070587158, "logits/rejected": -1.0613294839859009, "logps/chosen": -243.768798828125, "logps/rejected": -194.4390869140625, "loss": 0.6119, "rewards/accuracies": 0.75, "rewards/chosen": 0.2681194245815277, "rewards/margins": 0.18688488006591797, "rewards/rejected": 0.08123455196619034, "step": 892 }, { "epoch": 0.4815963327490899, "grad_norm": 7.872755527496338, "learning_rate": 3.0923088468999246e-07, "logits/chosen": 0.02519351989030838, "logits/rejected": -0.1562420129776001, "logps/chosen": -269.33453369140625, "logps/rejected": -211.61375427246094, "loss": 0.7117, "rewards/accuracies": 0.5, "rewards/chosen": 0.18448010087013245, "rewards/margins": -0.030185889452695847, "rewards/rejected": 0.2146659940481186, "step": 893 }, { "epoch": 0.4821356343535122, "grad_norm": 5.855284214019775, "learning_rate": 3.08773323942758e-07, "logits/chosen": -1.0793596506118774, "logits/rejected": -0.5859922170639038, "logps/chosen": -235.0098419189453, "logps/rejected": -250.94158935546875, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": 0.21887607872486115, "rewards/margins": 0.09078120440244675, "rewards/rejected": 0.1280948668718338, "step": 894 }, { "epoch": 0.4826749359579345, "grad_norm": 6.811209201812744, "learning_rate": 3.0831555470430034e-07, "logits/chosen": 0.7880208492279053, "logits/rejected": -0.8538607954978943, "logps/chosen": -282.03253173828125, "logps/rejected": -160.04183959960938, "loss": 0.6322, "rewards/accuracies": 0.75, "rewards/chosen": 0.2449936866760254, "rewards/margins": 0.15789957344532013, "rewards/rejected": 0.08709412068128586, "step": 895 }, { "epoch": 0.48321423756235676, "grad_norm": 8.641796112060547, "learning_rate": 3.078575785985002e-07, "logits/chosen": -0.4042390286922455, "logits/rejected": -0.608069658279419, "logps/chosen": -325.2460021972656, "logps/rejected": -324.3793029785156, "loss": 0.7206, "rewards/accuracies": 0.625, "rewards/chosen": 0.16175690293312073, "rewards/margins": -0.039628900587558746, "rewards/rejected": 0.20138578116893768, "step": 896 }, { "epoch": 0.48375353916677905, "grad_norm": 7.277120113372803, "learning_rate": 3.07399397249972e-07, "logits/chosen": -0.24727021157741547, "logits/rejected": 0.8578449487686157, "logps/chosen": -247.8604736328125, "logps/rejected": -336.361572265625, "loss": 0.6984, "rewards/accuracies": 0.75, "rewards/chosen": 0.14214029908180237, "rewards/margins": -0.0025279056280851364, "rewards/rejected": 0.14466819167137146, "step": 897 }, { "epoch": 0.48429284077120127, "grad_norm": 7.428501605987549, "learning_rate": 3.069410122840585e-07, "logits/chosen": -0.3570909798145294, "logits/rejected": 0.1636829376220703, "logps/chosen": -196.65579223632812, "logps/rejected": -269.6961669921875, "loss": 0.7723, "rewards/accuracies": 0.125, "rewards/chosen": 0.08537350594997406, "rewards/margins": -0.14837531745433807, "rewards/rejected": 0.23374882340431213, "step": 898 }, { "epoch": 0.48483214237562355, "grad_norm": 6.6067423820495605, "learning_rate": 3.064824253268247e-07, "logits/chosen": 0.47892647981643677, "logits/rejected": -0.44319671392440796, "logps/chosen": -320.55340576171875, "logps/rejected": -270.1787414550781, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": 0.22142505645751953, "rewards/margins": 0.09021635353565216, "rewards/rejected": 0.13120871782302856, "step": 899 }, { "epoch": 0.48537144398004584, "grad_norm": 8.524017333984375, "learning_rate": 3.060236380050519e-07, "logits/chosen": -0.2100522518157959, "logits/rejected": -0.12483110278844833, "logps/chosen": -300.4180908203125, "logps/rejected": -416.6201477050781, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": 0.15500126779079437, "rewards/margins": 0.11084622889757156, "rewards/rejected": 0.044155023992061615, "step": 900 }, { "epoch": 0.48537144398004584, "eval_logits/chosen": 1.3612416982650757, "eval_logits/rejected": 1.0876402854919434, "eval_logps/chosen": -249.66029357910156, "eval_logps/rejected": -234.5421905517578, "eval_loss": 0.6754201650619507, "eval_rewards/accuracies": 0.6037266850471497, "eval_rewards/chosen": 0.1791199892759323, "eval_rewards/margins": 0.047082461416721344, "eval_rewards/rejected": 0.13203752040863037, "eval_runtime": 836.6859, "eval_samples_per_second": 1.924, "eval_steps_per_second": 0.962, "step": 900 }, { "epoch": 0.4859107455844681, "grad_norm": 6.68065071105957, "learning_rate": 3.0556465194623266e-07, "logits/chosen": 0.11702238023281097, "logits/rejected": -1.156465768814087, "logps/chosen": -221.058349609375, "logps/rejected": -220.78695678710938, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": 0.22358685731887817, "rewards/margins": 0.1216920018196106, "rewards/rejected": 0.10189486294984818, "step": 901 }, { "epoch": 0.4864500471888904, "grad_norm": 7.737091541290283, "learning_rate": 3.05105468778564e-07, "logits/chosen": 1.106839656829834, "logits/rejected": 0.22868436574935913, "logps/chosen": -255.87130737304688, "logps/rejected": -222.88499450683594, "loss": 0.6948, "rewards/accuracies": 0.5, "rewards/chosen": 0.1683318167924881, "rewards/margins": 0.016241170465946198, "rewards/rejected": 0.1520906537771225, "step": 902 }, { "epoch": 0.4869893487933127, "grad_norm": 7.607453346252441, "learning_rate": 3.0464609013094237e-07, "logits/chosen": 0.5156528949737549, "logits/rejected": -0.39717456698417664, "logps/chosen": -292.2793273925781, "logps/rejected": -278.9642333984375, "loss": 0.7008, "rewards/accuracies": 0.625, "rewards/chosen": 0.1475902646780014, "rewards/margins": -0.0013977047055959702, "rewards/rejected": 0.14898796379566193, "step": 903 }, { "epoch": 0.4875286503977349, "grad_norm": 7.042440414428711, "learning_rate": 3.041865176329579e-07, "logits/chosen": -1.0175062417984009, "logits/rejected": 0.45479193329811096, "logps/chosen": -259.468505859375, "logps/rejected": -250.42886352539062, "loss": 0.7177, "rewards/accuracies": 0.375, "rewards/chosen": 0.1283067762851715, "rewards/margins": -0.04033508151769638, "rewards/rejected": 0.1686418652534485, "step": 904 }, { "epoch": 0.4880679520021572, "grad_norm": 8.450037002563477, "learning_rate": 3.0372675291488797e-07, "logits/chosen": 0.2502627372741699, "logits/rejected": -0.31089332699775696, "logps/chosen": -214.38070678710938, "logps/rejected": -167.19078063964844, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": 0.19357557594776154, "rewards/margins": 0.06227445602416992, "rewards/rejected": 0.1313011348247528, "step": 905 }, { "epoch": 0.4886072536065795, "grad_norm": 7.01882791519165, "learning_rate": 3.0326679760769226e-07, "logits/chosen": -0.8842638731002808, "logits/rejected": -1.8700240850448608, "logps/chosen": -297.49859619140625, "logps/rejected": -244.62283325195312, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": 0.09466691315174103, "rewards/margins": 0.07071718573570251, "rewards/rejected": 0.023949718102812767, "step": 906 }, { "epoch": 0.48914655521100175, "grad_norm": 9.3782377243042, "learning_rate": 3.02806653343006e-07, "logits/chosen": 0.15786248445510864, "logits/rejected": 0.5848703384399414, "logps/chosen": -205.05929565429688, "logps/rejected": -243.96771240234375, "loss": 0.7444, "rewards/accuracies": 0.375, "rewards/chosen": 0.08199214935302734, "rewards/margins": -0.09134139865636826, "rewards/rejected": 0.1733335554599762, "step": 907 }, { "epoch": 0.48968585681542404, "grad_norm": 7.141688346862793, "learning_rate": 3.023463217531353e-07, "logits/chosen": 0.21959832310676575, "logits/rejected": -0.22506992518901825, "logps/chosen": -241.32266235351562, "logps/rejected": -347.10723876953125, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": 0.13828010857105255, "rewards/margins": 0.03269615024328232, "rewards/rejected": 0.10558395832777023, "step": 908 }, { "epoch": 0.4902251584198463, "grad_norm": 7.8292646408081055, "learning_rate": 3.018858044710505e-07, "logits/chosen": 0.40552443265914917, "logits/rejected": 0.21433785557746887, "logps/chosen": -278.9302673339844, "logps/rejected": -234.16622924804688, "loss": 0.6975, "rewards/accuracies": 0.25, "rewards/chosen": 0.1613023728132248, "rewards/margins": 0.0031494610011577606, "rewards/rejected": 0.15815292298793793, "step": 909 }, { "epoch": 0.49076446002426855, "grad_norm": 8.030399322509766, "learning_rate": 3.0142510313038054e-07, "logits/chosen": 0.24745705723762512, "logits/rejected": -0.8345316052436829, "logps/chosen": -285.5075988769531, "logps/rejected": -247.8426513671875, "loss": 0.6586, "rewards/accuracies": 0.75, "rewards/chosen": 0.2144155651330948, "rewards/margins": 0.07893218845129013, "rewards/rejected": 0.13548336923122406, "step": 910 }, { "epoch": 0.4913037616286908, "grad_norm": 8.507048606872559, "learning_rate": 3.009642193654076e-07, "logits/chosen": 0.1550287902355194, "logits/rejected": 0.2675952613353729, "logps/chosen": -318.57318115234375, "logps/rejected": -285.2996520996094, "loss": 0.7759, "rewards/accuracies": 0.25, "rewards/chosen": 0.11578015983104706, "rewards/margins": -0.1405450850725174, "rewards/rejected": 0.25632524490356445, "step": 911 }, { "epoch": 0.4918430632331131, "grad_norm": 6.164507865905762, "learning_rate": 3.005031548110607e-07, "logits/chosen": 0.4780696630477905, "logits/rejected": 0.07449427247047424, "logps/chosen": -189.04522705078125, "logps/rejected": -167.83798217773438, "loss": 0.6921, "rewards/accuracies": 0.25, "rewards/chosen": 0.1384286880493164, "rewards/margins": 0.013880634680390358, "rewards/rejected": 0.1245480552315712, "step": 912 }, { "epoch": 0.4923823648375354, "grad_norm": 8.345555305480957, "learning_rate": 3.000419111029104e-07, "logits/chosen": -0.3154158294200897, "logits/rejected": -1.0847373008728027, "logps/chosen": -280.5724182128906, "logps/rejected": -200.42471313476562, "loss": 0.7143, "rewards/accuracies": 0.375, "rewards/chosen": 0.11941356211900711, "rewards/margins": -0.03019714169204235, "rewards/rejected": 0.1496107131242752, "step": 913 }, { "epoch": 0.4929216664419577, "grad_norm": 6.766451358795166, "learning_rate": 2.995804898771626e-07, "logits/chosen": 0.20228098332881927, "logits/rejected": 0.2845621705055237, "logps/chosen": -268.45025634765625, "logps/rejected": -282.7024230957031, "loss": 0.6945, "rewards/accuracies": 0.625, "rewards/chosen": 0.16124114394187927, "rewards/margins": 0.005947922356426716, "rewards/rejected": 0.15529322624206543, "step": 914 }, { "epoch": 0.49346096804637996, "grad_norm": 6.910723686218262, "learning_rate": 2.991188927706531e-07, "logits/chosen": 1.0538883209228516, "logits/rejected": 1.427545428276062, "logps/chosen": -186.052734375, "logps/rejected": -244.9948272705078, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": 0.1301673948764801, "rewards/margins": -0.015030095353722572, "rewards/rejected": 0.14519749581813812, "step": 915 }, { "epoch": 0.49400026965080224, "grad_norm": 7.166679382324219, "learning_rate": 2.986571214208414e-07, "logits/chosen": 0.6864697337150574, "logits/rejected": -0.10578548908233643, "logps/chosen": -236.64939880371094, "logps/rejected": -228.8186798095703, "loss": 0.661, "rewards/accuracies": 0.75, "rewards/chosen": 0.21772442758083344, "rewards/margins": 0.07210731506347656, "rewards/rejected": 0.14561709761619568, "step": 916 }, { "epoch": 0.49453957125522446, "grad_norm": 7.103065013885498, "learning_rate": 2.981951774658055e-07, "logits/chosen": 0.06381799280643463, "logits/rejected": -0.35661038756370544, "logps/chosen": -195.20510864257812, "logps/rejected": -159.92034912109375, "loss": 0.7242, "rewards/accuracies": 0.25, "rewards/chosen": 0.10269652307033539, "rewards/margins": -0.058028124272823334, "rewards/rejected": 0.16072463989257812, "step": 917 }, { "epoch": 0.49507887285964675, "grad_norm": 6.42849588394165, "learning_rate": 2.9773306254423513e-07, "logits/chosen": 1.426372766494751, "logits/rejected": -0.39682015776634216, "logps/chosen": -210.25961303710938, "logps/rejected": -151.77163696289062, "loss": 0.6551, "rewards/accuracies": 0.5, "rewards/chosen": 0.15764495730400085, "rewards/margins": 0.09113645553588867, "rewards/rejected": 0.06650848686695099, "step": 918 }, { "epoch": 0.49561817446406903, "grad_norm": 8.951469421386719, "learning_rate": 2.97270778295427e-07, "logits/chosen": -0.8544371128082275, "logits/rejected": -0.4096011221408844, "logps/chosen": -286.13922119140625, "logps/rejected": -318.6224670410156, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": 0.1975693702697754, "rewards/margins": 0.006072140298783779, "rewards/rejected": 0.19149723649024963, "step": 919 }, { "epoch": 0.4961574760684913, "grad_norm": 7.763193130493164, "learning_rate": 2.968083263592782e-07, "logits/chosen": -0.5117099285125732, "logits/rejected": -0.2294175922870636, "logps/chosen": -251.19784545898438, "logps/rejected": -263.6228332519531, "loss": 0.6704, "rewards/accuracies": 0.5, "rewards/chosen": 0.19372519850730896, "rewards/margins": 0.0629815086722374, "rewards/rejected": 0.13074369728565216, "step": 920 }, { "epoch": 0.4966967776729136, "grad_norm": 7.523340225219727, "learning_rate": 2.9634570837628086e-07, "logits/chosen": 0.12324616312980652, "logits/rejected": 0.5812612175941467, "logps/chosen": -239.15447998046875, "logps/rejected": -246.1467742919922, "loss": 0.6797, "rewards/accuracies": 0.5, "rewards/chosen": 0.140559583902359, "rewards/margins": 0.036095425486564636, "rewards/rejected": 0.10446415096521378, "step": 921 }, { "epoch": 0.4972360792773359, "grad_norm": 7.796231269836426, "learning_rate": 2.9588292598751595e-07, "logits/chosen": -0.44988352060317993, "logits/rejected": -0.692794919013977, "logps/chosen": -224.61672973632812, "logps/rejected": -294.8524475097656, "loss": 0.6357, "rewards/accuracies": 0.625, "rewards/chosen": 0.210673987865448, "rewards/margins": 0.12713061273097992, "rewards/rejected": 0.08354339003562927, "step": 922 }, { "epoch": 0.4977753808817581, "grad_norm": 6.7119140625, "learning_rate": 2.954199808346479e-07, "logits/chosen": 0.48281070590019226, "logits/rejected": -0.7807561159133911, "logps/chosen": -399.927490234375, "logps/rejected": -284.7840576171875, "loss": 0.6237, "rewards/accuracies": 0.75, "rewards/chosen": 0.22094173729419708, "rewards/margins": 0.1515255868434906, "rewards/rejected": 0.06941614300012589, "step": 923 }, { "epoch": 0.4983146824861804, "grad_norm": 8.5165376663208, "learning_rate": 2.9495687455991814e-07, "logits/chosen": -0.06697964668273926, "logits/rejected": -0.04793471097946167, "logps/chosen": -216.21490478515625, "logps/rejected": -194.0443878173828, "loss": 0.7157, "rewards/accuracies": 0.375, "rewards/chosen": 0.1715869903564453, "rewards/margins": -0.04130954295396805, "rewards/rejected": 0.21289654076099396, "step": 924 }, { "epoch": 0.49885398409060266, "grad_norm": 7.334231376647949, "learning_rate": 2.9449360880614005e-07, "logits/chosen": -0.9644678831100464, "logits/rejected": -0.6928349733352661, "logps/chosen": -185.19677734375, "logps/rejected": -245.6640625, "loss": 0.7346, "rewards/accuracies": 0.375, "rewards/chosen": 0.0549163818359375, "rewards/margins": -0.07053737342357635, "rewards/rejected": 0.12545377016067505, "step": 925 }, { "epoch": 0.49939328569502495, "grad_norm": 7.3229241371154785, "learning_rate": 2.9403018521669255e-07, "logits/chosen": -0.20159286260604858, "logits/rejected": -0.07460612058639526, "logps/chosen": -215.90542602539062, "logps/rejected": -230.82183837890625, "loss": 0.7195, "rewards/accuracies": 0.375, "rewards/chosen": 0.1764822006225586, "rewards/margins": -0.04539213329553604, "rewards/rejected": 0.22187434136867523, "step": 926 }, { "epoch": 0.49993258729944723, "grad_norm": 6.805231094360352, "learning_rate": 2.935666054355146e-07, "logits/chosen": -0.5085896849632263, "logits/rejected": 0.538925290107727, "logps/chosen": -188.88702392578125, "logps/rejected": -258.06640625, "loss": 0.7108, "rewards/accuracies": 0.375, "rewards/chosen": 0.09296007454395294, "rewards/margins": -0.02116222307085991, "rewards/rejected": 0.11412229388952255, "step": 927 }, { "epoch": 0.5004718889038695, "grad_norm": 7.923383712768555, "learning_rate": 2.9310287110709893e-07, "logits/chosen": 0.38499945402145386, "logits/rejected": -0.38578125834465027, "logps/chosen": -270.52020263671875, "logps/rejected": -215.2902069091797, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": 0.087680384516716, "rewards/margins": 0.02693219482898712, "rewards/rejected": 0.06074819713830948, "step": 928 }, { "epoch": 0.5010111905082918, "grad_norm": 7.069624900817871, "learning_rate": 2.926389838764868e-07, "logits/chosen": 0.881564199924469, "logits/rejected": 0.905655026435852, "logps/chosen": -302.6661071777344, "logps/rejected": -290.7531433105469, "loss": 0.7157, "rewards/accuracies": 0.5, "rewards/chosen": 0.11910992115736008, "rewards/margins": -0.03822841867804527, "rewards/rejected": 0.15733833611011505, "step": 929 }, { "epoch": 0.5015504921127141, "grad_norm": 8.704763412475586, "learning_rate": 2.921749453892618e-07, "logits/chosen": -0.25600582361221313, "logits/rejected": -1.2727164030075073, "logps/chosen": -377.31768798828125, "logps/rejected": -202.6580352783203, "loss": 0.6311, "rewards/accuracies": 0.5, "rewards/chosen": 0.22803935408592224, "rewards/margins": 0.1392708718776703, "rewards/rejected": 0.08876848220825195, "step": 930 }, { "epoch": 0.5020897937171364, "grad_norm": 7.046964645385742, "learning_rate": 2.917107572915441e-07, "logits/chosen": -1.0161399841308594, "logits/rejected": -0.5871947407722473, "logps/chosen": -324.77484130859375, "logps/rejected": -252.86129760742188, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": 0.1960139274597168, "rewards/margins": 0.05349549651145935, "rewards/rejected": 0.14251843094825745, "step": 931 }, { "epoch": 0.5026290953215585, "grad_norm": 8.098550796508789, "learning_rate": 2.912464212299845e-07, "logits/chosen": -1.5865418910980225, "logits/rejected": 0.41365501284599304, "logps/chosen": -241.79306030273438, "logps/rejected": -507.8553771972656, "loss": 0.6611, "rewards/accuracies": 0.5, "rewards/chosen": 0.18561868369579315, "rewards/margins": 0.07226095348596573, "rewards/rejected": 0.11335774511098862, "step": 932 }, { "epoch": 0.5031683969259808, "grad_norm": 7.622375965118408, "learning_rate": 2.907819388517587e-07, "logits/chosen": 0.3881247639656067, "logits/rejected": 0.6128380298614502, "logps/chosen": -196.57183837890625, "logps/rejected": -193.35955810546875, "loss": 0.6957, "rewards/accuracies": 0.375, "rewards/chosen": 0.15270358324050903, "rewards/margins": 0.000989439431577921, "rewards/rejected": 0.15171414613723755, "step": 933 }, { "epoch": 0.5037076985304031, "grad_norm": 7.411153316497803, "learning_rate": 2.9031731180456156e-07, "logits/chosen": 0.9898163080215454, "logits/rejected": -0.31655603647232056, "logps/chosen": -283.8743591308594, "logps/rejected": -213.58856201171875, "loss": 0.698, "rewards/accuracies": 0.375, "rewards/chosen": 0.1380753517150879, "rewards/margins": 0.019786827266216278, "rewards/rejected": 0.11828851699829102, "step": 934 }, { "epoch": 0.5042470001348254, "grad_norm": 7.582033157348633, "learning_rate": 2.89852541736601e-07, "logits/chosen": 1.098244547843933, "logits/rejected": 0.6628659963607788, "logps/chosen": -247.66688537597656, "logps/rejected": -295.93115234375, "loss": 0.616, "rewards/accuracies": 0.875, "rewards/chosen": 0.2539879083633423, "rewards/margins": 0.17531967163085938, "rewards/rejected": 0.07866821438074112, "step": 935 }, { "epoch": 0.5047863017392477, "grad_norm": 8.22445297241211, "learning_rate": 2.8938763029659246e-07, "logits/chosen": -0.26927340030670166, "logits/rejected": -0.08254766464233398, "logps/chosen": -313.6552734375, "logps/rejected": -278.7645263671875, "loss": 0.6612, "rewards/accuracies": 0.5, "rewards/chosen": 0.18137283623218536, "rewards/margins": 0.07502603530883789, "rewards/rejected": 0.10634680092334747, "step": 936 }, { "epoch": 0.5053256033436699, "grad_norm": 5.608268737792969, "learning_rate": 2.889225791337526e-07, "logits/chosen": 0.11171817779541016, "logits/rejected": -0.6164085865020752, "logps/chosen": -228.38526916503906, "logps/rejected": -196.4921112060547, "loss": 0.6973, "rewards/accuracies": 0.625, "rewards/chosen": 0.20130985975265503, "rewards/margins": -0.0003399886190891266, "rewards/rejected": 0.20164987444877625, "step": 937 }, { "epoch": 0.5058649049480922, "grad_norm": 8.00656509399414, "learning_rate": 2.884573898977941e-07, "logits/chosen": 0.8970887660980225, "logits/rejected": 0.6025723218917847, "logps/chosen": -339.7855529785156, "logps/rejected": -290.3210754394531, "loss": 0.6377, "rewards/accuracies": 0.875, "rewards/chosen": 0.18737412989139557, "rewards/margins": 0.11494751274585724, "rewards/rejected": 0.07242660224437714, "step": 938 }, { "epoch": 0.5064042065525145, "grad_norm": 7.151945114135742, "learning_rate": 2.8799206423891893e-07, "logits/chosen": 0.9800761938095093, "logits/rejected": 0.08051910996437073, "logps/chosen": -251.0663604736328, "logps/rejected": -230.90538024902344, "loss": 0.6621, "rewards/accuracies": 0.75, "rewards/chosen": 0.16176852583885193, "rewards/margins": 0.066503144800663, "rewards/rejected": 0.09526538848876953, "step": 939 }, { "epoch": 0.5069435081569368, "grad_norm": 7.553274631500244, "learning_rate": 2.875266038078136e-07, "logits/chosen": -0.31011754274368286, "logits/rejected": 1.1894547939300537, "logps/chosen": -274.15313720703125, "logps/rejected": -349.8634033203125, "loss": 0.7043, "rewards/accuracies": 0.625, "rewards/chosen": 0.22861328721046448, "rewards/margins": -0.0139299426227808, "rewards/rejected": 0.24254323542118073, "step": 940 }, { "epoch": 0.5074828097613591, "grad_norm": 6.673113822937012, "learning_rate": 2.870610102556423e-07, "logits/chosen": 0.2515423893928528, "logits/rejected": -0.2765371799468994, "logps/chosen": -255.1439666748047, "logps/rejected": -197.57528686523438, "loss": 0.6141, "rewards/accuracies": 0.75, "rewards/chosen": 0.25172194838523865, "rewards/margins": 0.17164887487888336, "rewards/rejected": 0.08007307350635529, "step": 941 }, { "epoch": 0.5080221113657813, "grad_norm": 7.114014625549316, "learning_rate": 2.8659528523404164e-07, "logits/chosen": 0.2709251344203949, "logits/rejected": 0.055838629603385925, "logps/chosen": -270.4700622558594, "logps/rejected": -262.4530944824219, "loss": 0.6864, "rewards/accuracies": 0.375, "rewards/chosen": 0.17911455035209656, "rewards/margins": 0.0229308120906353, "rewards/rejected": 0.15618371963500977, "step": 942 }, { "epoch": 0.5085614129702036, "grad_norm": 6.956024646759033, "learning_rate": 2.861294303951144e-07, "logits/chosen": -0.7974296808242798, "logits/rejected": -0.31015464663505554, "logps/chosen": -184.94776916503906, "logps/rejected": -217.84707641601562, "loss": 0.695, "rewards/accuracies": 0.5, "rewards/chosen": 0.19277648627758026, "rewards/margins": 0.005806825123727322, "rewards/rejected": 0.18696966767311096, "step": 943 }, { "epoch": 0.5091007145746259, "grad_norm": 6.60166597366333, "learning_rate": 2.856634473914242e-07, "logits/chosen": -0.14497584104537964, "logits/rejected": 0.15210258960723877, "logps/chosen": -238.3763885498047, "logps/rejected": -276.13629150390625, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": 0.22361239790916443, "rewards/margins": 0.007402423769235611, "rewards/rejected": 0.21620997786521912, "step": 944 }, { "epoch": 0.5096400161790481, "grad_norm": 8.623492240905762, "learning_rate": 2.8519733787598884e-07, "logits/chosen": 0.26228705048561096, "logits/rejected": -0.9396153688430786, "logps/chosen": -324.184814453125, "logps/rejected": -327.7364501953125, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": 0.23811884224414825, "rewards/margins": 0.11858692765235901, "rewards/rejected": 0.11953191459178925, "step": 945 }, { "epoch": 0.5101793177834704, "grad_norm": 7.113579273223877, "learning_rate": 2.847311035022753e-07, "logits/chosen": 0.666952908039093, "logits/rejected": -0.17009374499320984, "logps/chosen": -280.64849853515625, "logps/rejected": -267.05224609375, "loss": 0.6602, "rewards/accuracies": 0.5, "rewards/chosen": 0.20900079607963562, "rewards/margins": 0.08390836417675018, "rewards/rejected": 0.12509241700172424, "step": 946 }, { "epoch": 0.5107186193878926, "grad_norm": 8.11842155456543, "learning_rate": 2.842647459241934e-07, "logits/chosen": 0.2627861499786377, "logits/rejected": -1.3009049892425537, "logps/chosen": -264.8606262207031, "logps/rejected": -244.99671936035156, "loss": 0.7294, "rewards/accuracies": 0.5, "rewards/chosen": 0.17187471687793732, "rewards/margins": -0.05479670315980911, "rewards/rejected": 0.22667141258716583, "step": 947 }, { "epoch": 0.5112579209923149, "grad_norm": 6.73313045501709, "learning_rate": 2.8379826679609e-07, "logits/chosen": 0.7985554933547974, "logits/rejected": -0.18598201870918274, "logps/chosen": -281.1534423828125, "logps/rejected": -260.8255920410156, "loss": 0.6449, "rewards/accuracies": 0.625, "rewards/chosen": 0.1896222084760666, "rewards/margins": 0.11001225560903549, "rewards/rejected": 0.07960997521877289, "step": 948 }, { "epoch": 0.5117972225967372, "grad_norm": 8.496440887451172, "learning_rate": 2.8333166777274297e-07, "logits/chosen": -0.15961520373821259, "logits/rejected": -1.1655092239379883, "logps/chosen": -354.21588134765625, "logps/rejected": -185.61761474609375, "loss": 0.7314, "rewards/accuracies": 0.375, "rewards/chosen": 0.07606430351734161, "rewards/margins": -0.06831636279821396, "rewards/rejected": 0.14438065886497498, "step": 949 }, { "epoch": 0.5123365242011595, "grad_norm": 6.438569068908691, "learning_rate": 2.828649505093558e-07, "logits/chosen": -0.09408558905124664, "logits/rejected": -0.9915146827697754, "logps/chosen": -332.4750671386719, "logps/rejected": -279.8874206542969, "loss": 0.6828, "rewards/accuracies": 0.625, "rewards/chosen": 0.24077224731445312, "rewards/margins": 0.03529311716556549, "rewards/rejected": 0.20547914505004883, "step": 950 }, { "epoch": 0.5128758258055818, "grad_norm": 8.343050003051758, "learning_rate": 2.82398116661551e-07, "logits/chosen": 0.3343714475631714, "logits/rejected": 0.13581883907318115, "logps/chosen": -258.0262451171875, "logps/rejected": -214.34445190429688, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": 0.1591653972864151, "rewards/margins": 0.02188616245985031, "rewards/rejected": 0.1372792273759842, "step": 951 }, { "epoch": 0.5134151274100041, "grad_norm": 7.228579044342041, "learning_rate": 2.819311678853652e-07, "logits/chosen": 1.0359166860580444, "logits/rejected": -0.7901358604431152, "logps/chosen": -317.72479248046875, "logps/rejected": -259.3423156738281, "loss": 0.6796, "rewards/accuracies": 0.625, "rewards/chosen": 0.1528266966342926, "rewards/margins": 0.046307556331157684, "rewards/rejected": 0.10651912540197372, "step": 952 }, { "epoch": 0.5139544290144263, "grad_norm": 7.206027984619141, "learning_rate": 2.8146410583724225e-07, "logits/chosen": 0.6120848655700684, "logits/rejected": 0.6158080101013184, "logps/chosen": -270.776611328125, "logps/rejected": -237.6479949951172, "loss": 0.7213, "rewards/accuracies": 0.625, "rewards/chosen": 0.17424145340919495, "rewards/margins": -0.04271106794476509, "rewards/rejected": 0.21695251762866974, "step": 953 }, { "epoch": 0.5144937306188486, "grad_norm": 7.7878217697143555, "learning_rate": 2.8099693217402803e-07, "logits/chosen": 1.7828352451324463, "logits/rejected": -0.027004539966583252, "logps/chosen": -251.65234375, "logps/rejected": -214.08737182617188, "loss": 0.6938, "rewards/accuracies": 0.375, "rewards/chosen": 0.17936477065086365, "rewards/margins": 0.02440958470106125, "rewards/rejected": 0.1549552083015442, "step": 954 }, { "epoch": 0.5150330322232709, "grad_norm": 7.488417625427246, "learning_rate": 2.8052964855296437e-07, "logits/chosen": 0.37417054176330566, "logits/rejected": -0.6369287967681885, "logps/chosen": -330.43560791015625, "logps/rejected": -394.48779296875, "loss": 0.6775, "rewards/accuracies": 0.75, "rewards/chosen": 0.25557366013526917, "rewards/margins": 0.03784865885972977, "rewards/rejected": 0.2177249938249588, "step": 955 }, { "epoch": 0.5155723338276932, "grad_norm": 7.233161449432373, "learning_rate": 2.8006225663168304e-07, "logits/chosen": 0.1783939152956009, "logits/rejected": -1.132265567779541, "logps/chosen": -262.4233703613281, "logps/rejected": -187.9440155029297, "loss": 0.7242, "rewards/accuracies": 0.5, "rewards/chosen": 0.10599803924560547, "rewards/margins": -0.05780067294836044, "rewards/rejected": 0.1637987196445465, "step": 956 }, { "epoch": 0.5161116354321154, "grad_norm": 9.019381523132324, "learning_rate": 2.7959475806820027e-07, "logits/chosen": 0.5284242630004883, "logits/rejected": 1.0558334589004517, "logps/chosen": -422.11309814453125, "logps/rejected": -679.0325927734375, "loss": 0.6945, "rewards/accuracies": 0.5, "rewards/chosen": 0.1576610654592514, "rewards/margins": 0.005997657775878906, "rewards/rejected": 0.1516634076833725, "step": 957 }, { "epoch": 0.5166509370365376, "grad_norm": 7.077085494995117, "learning_rate": 2.791271545209101e-07, "logits/chosen": 0.27675074338912964, "logits/rejected": -0.989376425743103, "logps/chosen": -291.8957824707031, "logps/rejected": -232.5260772705078, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": 0.20493203401565552, "rewards/margins": 0.06841403245925903, "rewards/rejected": 0.13651800155639648, "step": 958 }, { "epoch": 0.5171902386409599, "grad_norm": 7.517457962036133, "learning_rate": 2.786594476485795e-07, "logits/chosen": 1.2647907733917236, "logits/rejected": 0.7585868835449219, "logps/chosen": -422.4537048339844, "logps/rejected": -382.8726501464844, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": 0.2590980529785156, "rewards/margins": 0.10498429089784622, "rewards/rejected": 0.15411376953125, "step": 959 }, { "epoch": 0.5177295402453822, "grad_norm": 7.101311683654785, "learning_rate": 2.781916391103417e-07, "logits/chosen": 0.45538198947906494, "logits/rejected": 1.1291203498840332, "logps/chosen": -251.55142211914062, "logps/rejected": -258.5719299316406, "loss": 0.7241, "rewards/accuracies": 0.375, "rewards/chosen": 0.1550544798374176, "rewards/margins": -0.05755767598748207, "rewards/rejected": 0.21261216700077057, "step": 960 }, { "epoch": 0.5182688418498045, "grad_norm": 6.608306407928467, "learning_rate": 2.777237305656906e-07, "logits/chosen": 0.4056742489337921, "logits/rejected": -0.4266083538532257, "logps/chosen": -247.8039093017578, "logps/rejected": -230.57496643066406, "loss": 0.6535, "rewards/accuracies": 0.5, "rewards/chosen": 0.22687970101833344, "rewards/margins": 0.0886894166469574, "rewards/rejected": 0.13819026947021484, "step": 961 }, { "epoch": 0.5188081434542268, "grad_norm": 6.767419338226318, "learning_rate": 2.7725572367447495e-07, "logits/chosen": 0.2864014208316803, "logits/rejected": -0.5652923583984375, "logps/chosen": -324.849853515625, "logps/rejected": -297.8727722167969, "loss": 0.6713, "rewards/accuracies": 0.625, "rewards/chosen": 0.12563486397266388, "rewards/margins": 0.04866380989551544, "rewards/rejected": 0.07697105407714844, "step": 962 }, { "epoch": 0.519347445058649, "grad_norm": 8.037771224975586, "learning_rate": 2.767876200968923e-07, "logits/chosen": 0.2148050218820572, "logits/rejected": -0.7730710506439209, "logps/chosen": -324.49102783203125, "logps/rejected": -225.68508911132812, "loss": 0.6372, "rewards/accuracies": 0.625, "rewards/chosen": 0.2109127938747406, "rewards/margins": 0.12644100189208984, "rewards/rejected": 0.08447179943323135, "step": 963 }, { "epoch": 0.5198867466630713, "grad_norm": 8.366497039794922, "learning_rate": 2.763194214934831e-07, "logits/chosen": 0.7875294089317322, "logits/rejected": -0.8335232734680176, "logps/chosen": -317.666015625, "logps/rejected": -251.77081298828125, "loss": 0.5748, "rewards/accuracies": 1.0, "rewards/chosen": 0.3710539937019348, "rewards/margins": 0.2618369162082672, "rewards/rejected": 0.1092170774936676, "step": 964 }, { "epoch": 0.5204260482674936, "grad_norm": 6.662132263183594, "learning_rate": 2.75851129525125e-07, "logits/chosen": -0.5412616729736328, "logits/rejected": -0.18412986397743225, "logps/chosen": -236.15042114257812, "logps/rejected": -234.38070678710938, "loss": 0.6455, "rewards/accuracies": 0.75, "rewards/chosen": 0.23632384836673737, "rewards/margins": 0.10303859412670135, "rewards/rejected": 0.13328523933887482, "step": 965 }, { "epoch": 0.5209653498719159, "grad_norm": 6.005458831787109, "learning_rate": 2.75382745853027e-07, "logits/chosen": 0.6863296031951904, "logits/rejected": -0.3110277056694031, "logps/chosen": -241.0189208984375, "logps/rejected": -224.9281005859375, "loss": 0.6116, "rewards/accuracies": 0.75, "rewards/chosen": 0.2787625193595886, "rewards/margins": 0.1766965389251709, "rewards/rejected": 0.10206599533557892, "step": 966 }, { "epoch": 0.5215046514763382, "grad_norm": 8.032205581665039, "learning_rate": 2.749142721387232e-07, "logits/chosen": -0.30617743730545044, "logits/rejected": -0.7133037447929382, "logps/chosen": -290.621826171875, "logps/rejected": -364.2094421386719, "loss": 0.6187, "rewards/accuracies": 0.75, "rewards/chosen": 0.18043634295463562, "rewards/margins": 0.16304510831832886, "rewards/rejected": 0.017391204833984375, "step": 967 }, { "epoch": 0.5220439530807605, "grad_norm": 7.464094638824463, "learning_rate": 2.7444571004406697e-07, "logits/chosen": -0.04428783059120178, "logits/rejected": -0.8359389305114746, "logps/chosen": -306.7667236328125, "logps/rejected": -255.1727752685547, "loss": 0.7321, "rewards/accuracies": 0.375, "rewards/chosen": 0.17485447227954865, "rewards/margins": -0.0661800354719162, "rewards/rejected": 0.24103452265262604, "step": 968 }, { "epoch": 0.5225832546851827, "grad_norm": 7.618457794189453, "learning_rate": 2.739770612312256e-07, "logits/chosen": -0.06510046124458313, "logits/rejected": -0.33590835332870483, "logps/chosen": -202.09756469726562, "logps/rejected": -179.6125946044922, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.13248109817504883, "rewards/margins": 0.008173234760761261, "rewards/rejected": 0.12430787086486816, "step": 969 }, { "epoch": 0.5231225562896049, "grad_norm": 7.864625930786133, "learning_rate": 2.7350832736267376e-07, "logits/chosen": 0.14336737990379333, "logits/rejected": -0.7208901047706604, "logps/chosen": -323.3473205566406, "logps/rejected": -288.05267333984375, "loss": 0.607, "rewards/accuracies": 0.875, "rewards/chosen": 0.27010926604270935, "rewards/margins": 0.19488295912742615, "rewards/rejected": 0.0752263069152832, "step": 970 }, { "epoch": 0.5236618578940272, "grad_norm": 6.138752460479736, "learning_rate": 2.730395101011878e-07, "logits/chosen": 0.8557940721511841, "logits/rejected": -0.6484788656234741, "logps/chosen": -228.87063598632812, "logps/rejected": -197.21124267578125, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": 0.19431258738040924, "rewards/margins": 0.11250734329223633, "rewards/rejected": 0.08180522918701172, "step": 971 }, { "epoch": 0.5242011594984495, "grad_norm": 9.20334243774414, "learning_rate": 2.7257061110984e-07, "logits/chosen": 0.4946444034576416, "logits/rejected": 0.08856119960546494, "logps/chosen": -264.93682861328125, "logps/rejected": -249.10748291015625, "loss": 0.6984, "rewards/accuracies": 0.625, "rewards/chosen": 0.17304247617721558, "rewards/margins": -0.0003124270588159561, "rewards/rejected": 0.17335492372512817, "step": 972 }, { "epoch": 0.5247404611028718, "grad_norm": 7.392411708831787, "learning_rate": 2.721016320519927e-07, "logits/chosen": 0.33108067512512207, "logits/rejected": -0.6500067114830017, "logps/chosen": -248.3700714111328, "logps/rejected": -209.68756103515625, "loss": 0.636, "rewards/accuracies": 0.75, "rewards/chosen": 0.17370805144309998, "rewards/margins": 0.1315387785434723, "rewards/rejected": 0.04216928407549858, "step": 973 }, { "epoch": 0.525279762707294, "grad_norm": 7.586643218994141, "learning_rate": 2.716325745912918e-07, "logits/chosen": -0.5922907590866089, "logits/rejected": 0.2539729177951813, "logps/chosen": -235.19058227539062, "logps/rejected": -322.65521240234375, "loss": 0.7306, "rewards/accuracies": 0.25, "rewards/chosen": 0.13939037919044495, "rewards/margins": -0.0689508467912674, "rewards/rejected": 0.20834121108055115, "step": 974 }, { "epoch": 0.5258190643117163, "grad_norm": 8.363532066345215, "learning_rate": 2.711634403916619e-07, "logits/chosen": -0.33981984853744507, "logits/rejected": -1.487951636314392, "logps/chosen": -218.9769744873047, "logps/rejected": -172.3780517578125, "loss": 0.6528, "rewards/accuracies": 0.625, "rewards/chosen": 0.16681480407714844, "rewards/margins": 0.09645976126194, "rewards/rejected": 0.07035503536462784, "step": 975 }, { "epoch": 0.5263583659161386, "grad_norm": 7.111702919006348, "learning_rate": 2.7069423111729944e-07, "logits/chosen": 1.1149940490722656, "logits/rejected": 0.7477717399597168, "logps/chosen": -292.3304443359375, "logps/rejected": -322.28106689453125, "loss": 0.6705, "rewards/accuracies": 0.625, "rewards/chosen": 0.1430853009223938, "rewards/margins": 0.054793547838926315, "rewards/rejected": 0.08829174190759659, "step": 976 }, { "epoch": 0.5268976675205609, "grad_norm": 7.351909637451172, "learning_rate": 2.7022494843266723e-07, "logits/chosen": 0.150810107588768, "logits/rejected": -0.14126929640769958, "logps/chosen": -282.67083740234375, "logps/rejected": -206.04335021972656, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": 0.21686458587646484, "rewards/margins": 0.018909404054284096, "rewards/rejected": 0.1979551762342453, "step": 977 }, { "epoch": 0.5274369691249832, "grad_norm": 6.7640910148620605, "learning_rate": 2.697555940024887e-07, "logits/chosen": 0.2701672315597534, "logits/rejected": -0.43507111072540283, "logps/chosen": -332.06451416015625, "logps/rejected": -325.2165832519531, "loss": 0.6329, "rewards/accuracies": 0.625, "rewards/chosen": 0.26354390382766724, "rewards/margins": 0.13629856705665588, "rewards/rejected": 0.12724533677101135, "step": 978 }, { "epoch": 0.5279762707294055, "grad_norm": 7.050397872924805, "learning_rate": 2.6928616949174167e-07, "logits/chosen": -0.11255106329917908, "logits/rejected": 0.33591440320014954, "logps/chosen": -272.1903381347656, "logps/rejected": -310.3337097167969, "loss": 0.6289, "rewards/accuracies": 0.75, "rewards/chosen": 0.23008672893047333, "rewards/margins": 0.1378321647644043, "rewards/rejected": 0.09225454181432724, "step": 979 }, { "epoch": 0.5285155723338277, "grad_norm": 7.329227924346924, "learning_rate": 2.6881667656565226e-07, "logits/chosen": -0.05141481012105942, "logits/rejected": -0.7989603281021118, "logps/chosen": -284.52398681640625, "logps/rejected": -295.0751953125, "loss": 0.65, "rewards/accuracies": 0.875, "rewards/chosen": 0.1855921745300293, "rewards/margins": 0.09463587403297424, "rewards/rejected": 0.09095630794763565, "step": 980 }, { "epoch": 0.52905487393825, "grad_norm": 7.907735824584961, "learning_rate": 2.6834711688968986e-07, "logits/chosen": -0.7316145300865173, "logits/rejected": -1.2624707221984863, "logps/chosen": -274.5174865722656, "logps/rejected": -256.006103515625, "loss": 0.6814, "rewards/accuracies": 0.5, "rewards/chosen": 0.22083568572998047, "rewards/margins": 0.03227291256189346, "rewards/rejected": 0.1885627806186676, "step": 981 }, { "epoch": 0.5295941755426722, "grad_norm": 7.128477573394775, "learning_rate": 2.678774921295602e-07, "logits/chosen": 0.5521366596221924, "logits/rejected": -0.2341500222682953, "logps/chosen": -279.09521484375, "logps/rejected": -224.9398193359375, "loss": 0.6455, "rewards/accuracies": 0.75, "rewards/chosen": 0.2831032872200012, "rewards/margins": 0.11202040314674377, "rewards/rejected": 0.17108288407325745, "step": 982 }, { "epoch": 0.5301334771470945, "grad_norm": 8.60667610168457, "learning_rate": 2.6740780395120006e-07, "logits/chosen": -0.4340725541114807, "logits/rejected": -0.39410993456840515, "logps/chosen": -269.84228515625, "logps/rejected": -271.83441162109375, "loss": 0.7305, "rewards/accuracies": 0.5, "rewards/chosen": 0.21728239953517914, "rewards/margins": -0.05679083243012428, "rewards/rejected": 0.27407321333885193, "step": 983 }, { "epoch": 0.5306727787515167, "grad_norm": 6.749332427978516, "learning_rate": 2.6693805402077117e-07, "logits/chosen": 0.0598023384809494, "logits/rejected": -0.44136762619018555, "logps/chosen": -167.25466918945312, "logps/rejected": -163.46725463867188, "loss": 0.6608, "rewards/accuracies": 0.5, "rewards/chosen": 0.14204230904579163, "rewards/margins": 0.08399453014135361, "rewards/rejected": 0.05804777145385742, "step": 984 }, { "epoch": 0.531212080355939, "grad_norm": 8.474515914916992, "learning_rate": 2.6646824400465435e-07, "logits/chosen": 1.0245015621185303, "logits/rejected": -0.039338380098342896, "logps/chosen": -446.2529602050781, "logps/rejected": -404.5025634765625, "loss": 0.6622, "rewards/accuracies": 0.5, "rewards/chosen": 0.27831190824508667, "rewards/margins": 0.07770414650440216, "rewards/rejected": 0.2006077766418457, "step": 985 }, { "epoch": 0.5317513819603613, "grad_norm": 6.755720615386963, "learning_rate": 2.659983755694435e-07, "logits/chosen": 0.05814182758331299, "logits/rejected": 0.5872963666915894, "logps/chosen": -186.9912872314453, "logps/rejected": -237.06201171875, "loss": 0.7006, "rewards/accuracies": 0.5, "rewards/chosen": 0.23455210030078888, "rewards/margins": -0.01041821762919426, "rewards/rejected": 0.24497033655643463, "step": 986 }, { "epoch": 0.5322906835647836, "grad_norm": 7.771574020385742, "learning_rate": 2.655284503819397e-07, "logits/chosen": 1.3979812860488892, "logits/rejected": 0.4938295781612396, "logps/chosen": -425.3463439941406, "logps/rejected": -331.4800109863281, "loss": 0.6759, "rewards/accuracies": 0.75, "rewards/chosen": 0.28192996978759766, "rewards/margins": 0.03649330139160156, "rewards/rejected": 0.2454366832971573, "step": 987 }, { "epoch": 0.5328299851692059, "grad_norm": 7.308907508850098, "learning_rate": 2.6505847010914573e-07, "logits/chosen": 0.31313833594322205, "logits/rejected": -1.107191562652588, "logps/chosen": -268.37127685546875, "logps/rejected": -160.79248046875, "loss": 0.6096, "rewards/accuracies": 0.875, "rewards/chosen": 0.26978397369384766, "rewards/margins": 0.18398115038871765, "rewards/rejected": 0.0858028382062912, "step": 988 }, { "epoch": 0.5333692867736282, "grad_norm": 7.177689552307129, "learning_rate": 2.645884364182591e-07, "logits/chosen": 0.7051485180854797, "logits/rejected": -0.3130554258823395, "logps/chosen": -213.47824096679688, "logps/rejected": -168.4855194091797, "loss": 0.7066, "rewards/accuracies": 0.5, "rewards/chosen": 0.09060497581958771, "rewards/margins": 0.0029388442635536194, "rewards/rejected": 0.08766613155603409, "step": 989 }, { "epoch": 0.5339085883780504, "grad_norm": 7.696238994598389, "learning_rate": 2.641183509766675e-07, "logits/chosen": -0.1057223528623581, "logits/rejected": -1.5787856578826904, "logps/chosen": -282.66278076171875, "logps/rejected": -236.74632263183594, "loss": 0.6812, "rewards/accuracies": 0.625, "rewards/chosen": 0.2240627259016037, "rewards/margins": 0.041181568056344986, "rewards/rejected": 0.182881161570549, "step": 990 }, { "epoch": 0.5344478899824727, "grad_norm": 7.114490509033203, "learning_rate": 2.636482154519417e-07, "logits/chosen": 1.2306721210479736, "logits/rejected": -0.39103570580482483, "logps/chosen": -263.23553466796875, "logps/rejected": -219.74623107910156, "loss": 0.6228, "rewards/accuracies": 0.75, "rewards/chosen": 0.3181873559951782, "rewards/margins": 0.16040988266468048, "rewards/rejected": 0.15777745842933655, "step": 991 }, { "epoch": 0.534987191586895, "grad_norm": 8.425633430480957, "learning_rate": 2.6317803151183047e-07, "logits/chosen": 0.7593473196029663, "logits/rejected": 0.1965465545654297, "logps/chosen": -319.9184265136719, "logps/rejected": -261.853759765625, "loss": 0.7099, "rewards/accuracies": 0.5, "rewards/chosen": 0.13951054215431213, "rewards/margins": -0.02452382817864418, "rewards/rejected": 0.16403436660766602, "step": 992 }, { "epoch": 0.5355264931913173, "grad_norm": 7.719659805297852, "learning_rate": 2.627078008242541e-07, "logits/chosen": -0.48100972175598145, "logits/rejected": -0.2077818512916565, "logps/chosen": -236.22850036621094, "logps/rejected": -271.41064453125, "loss": 0.7282, "rewards/accuracies": 0.375, "rewards/chosen": 0.1037009209394455, "rewards/margins": -0.06126909703016281, "rewards/rejected": 0.1649700254201889, "step": 993 }, { "epoch": 0.5360657947957396, "grad_norm": 6.673591136932373, "learning_rate": 2.622375250572988e-07, "logits/chosen": 0.05215415731072426, "logits/rejected": -0.8153755068778992, "logps/chosen": -240.18362426757812, "logps/rejected": -163.35397338867188, "loss": 0.6223, "rewards/accuracies": 0.75, "rewards/chosen": 0.3068764805793762, "rewards/margins": 0.15510493516921997, "rewards/rejected": 0.15177154541015625, "step": 994 }, { "epoch": 0.5366050964001617, "grad_norm": 7.6394243240356445, "learning_rate": 2.6176720587921074e-07, "logits/chosen": 0.3411431610584259, "logits/rejected": -0.0001176595687866211, "logps/chosen": -190.41648864746094, "logps/rejected": -253.9099578857422, "loss": 0.7153, "rewards/accuracies": 0.375, "rewards/chosen": 0.198648601770401, "rewards/margins": -0.02737337350845337, "rewards/rejected": 0.22602197527885437, "step": 995 }, { "epoch": 0.537144398004584, "grad_norm": 6.689168453216553, "learning_rate": 2.612968449583901e-07, "logits/chosen": 0.5046786069869995, "logits/rejected": -0.5352974534034729, "logps/chosen": -231.09471130371094, "logps/rejected": -184.86572265625, "loss": 0.7257, "rewards/accuracies": 0.375, "rewards/chosen": 0.15924425423145294, "rewards/margins": -0.05469521880149841, "rewards/rejected": 0.21393948793411255, "step": 996 }, { "epoch": 0.5376836996090063, "grad_norm": 7.530440330505371, "learning_rate": 2.60826443963385e-07, "logits/chosen": 0.4852014183998108, "logits/rejected": -0.22153788805007935, "logps/chosen": -221.03204345703125, "logps/rejected": -209.60983276367188, "loss": 0.6198, "rewards/accuracies": 0.875, "rewards/chosen": 0.277273565530777, "rewards/margins": 0.15934991836547852, "rewards/rejected": 0.11792363971471786, "step": 997 }, { "epoch": 0.5382230012134286, "grad_norm": 8.190646171569824, "learning_rate": 2.603560045628857e-07, "logits/chosen": -0.11166591942310333, "logits/rejected": 0.12716570496559143, "logps/chosen": -290.7252197265625, "logps/rejected": -312.25634765625, "loss": 0.7211, "rewards/accuracies": 0.5, "rewards/chosen": 0.1398032158613205, "rewards/margins": -0.038367174565792084, "rewards/rejected": 0.17817039787769318, "step": 998 }, { "epoch": 0.5387623028178509, "grad_norm": 8.790964126586914, "learning_rate": 2.59885528425719e-07, "logits/chosen": -0.8406335711479187, "logits/rejected": -0.30131492018699646, "logps/chosen": -397.8966369628906, "logps/rejected": -415.610595703125, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": 0.2136787474155426, "rewards/margins": 0.04400653764605522, "rewards/rejected": 0.16967220604419708, "step": 999 }, { "epoch": 0.5393016044222732, "grad_norm": 7.805335521697998, "learning_rate": 2.594150172208416e-07, "logits/chosen": 0.2616594135761261, "logits/rejected": -0.7140202522277832, "logps/chosen": -384.89471435546875, "logps/rejected": -262.3598937988281, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": 0.16640710830688477, "rewards/margins": 0.06679697334766388, "rewards/rejected": 0.09961013495922089, "step": 1000 }, { "epoch": 0.5398409060266954, "grad_norm": 7.2480292320251465, "learning_rate": 2.589444726173351e-07, "logits/chosen": 0.5627864599227905, "logits/rejected": -0.2423264980316162, "logps/chosen": -239.08062744140625, "logps/rejected": -187.81695556640625, "loss": 0.6266, "rewards/accuracies": 0.75, "rewards/chosen": 0.3214559555053711, "rewards/margins": 0.1640338897705078, "rewards/rejected": 0.15742206573486328, "step": 1001 }, { "epoch": 0.5403802076311177, "grad_norm": 7.993189334869385, "learning_rate": 2.5847389628439905e-07, "logits/chosen": -1.100498914718628, "logits/rejected": -0.16764166951179504, "logps/chosen": -187.57315063476562, "logps/rejected": -215.1111297607422, "loss": 0.763, "rewards/accuracies": 0.25, "rewards/chosen": 0.19086198508739471, "rewards/margins": -0.13227909803390503, "rewards/rejected": 0.32314109802246094, "step": 1002 }, { "epoch": 0.54091950923554, "grad_norm": 6.416968822479248, "learning_rate": 2.580032898913458e-07, "logits/chosen": -0.06114792823791504, "logits/rejected": -1.5348234176635742, "logps/chosen": -255.16732788085938, "logps/rejected": -251.37936401367188, "loss": 0.6322, "rewards/accuracies": 0.75, "rewards/chosen": 0.18668824434280396, "rewards/margins": 0.13693419098854065, "rewards/rejected": 0.049754053354263306, "step": 1003 }, { "epoch": 0.5414588108399623, "grad_norm": 7.141849040985107, "learning_rate": 2.5753265510759447e-07, "logits/chosen": -0.18728597462177277, "logits/rejected": -0.39136290550231934, "logps/chosen": -231.8093719482422, "logps/rejected": -263.9236755371094, "loss": 0.6682, "rewards/accuracies": 0.75, "rewards/chosen": 0.16606350243091583, "rewards/margins": 0.058347512036561966, "rewards/rejected": 0.10771599411964417, "step": 1004 }, { "epoch": 0.5419981124443846, "grad_norm": 7.268177032470703, "learning_rate": 2.5706199360266466e-07, "logits/chosen": 0.38632673025131226, "logits/rejected": 1.347379207611084, "logps/chosen": -254.56785583496094, "logps/rejected": -305.4244384765625, "loss": 0.7231, "rewards/accuracies": 0.5, "rewards/chosen": 0.13833113014698029, "rewards/margins": -0.05196961760520935, "rewards/rejected": 0.19030074775218964, "step": 1005 }, { "epoch": 0.5425374140488068, "grad_norm": 7.326416015625, "learning_rate": 2.565913070461709e-07, "logits/chosen": 0.801520586013794, "logits/rejected": -0.5725429058074951, "logps/chosen": -283.4859619140625, "logps/rejected": -260.02789306640625, "loss": 0.66, "rewards/accuracies": 0.625, "rewards/chosen": 0.2308342009782791, "rewards/margins": 0.07470446079969406, "rewards/rejected": 0.15612974762916565, "step": 1006 }, { "epoch": 0.543076715653229, "grad_norm": 7.905848503112793, "learning_rate": 2.5612059710781643e-07, "logits/chosen": 0.37551724910736084, "logits/rejected": -0.18006737530231476, "logps/chosen": -215.09353637695312, "logps/rejected": -170.02938842773438, "loss": 0.6068, "rewards/accuracies": 0.875, "rewards/chosen": 0.3184448182582855, "rewards/margins": 0.1939212828874588, "rewards/rejected": 0.12452355027198792, "step": 1007 }, { "epoch": 0.5436160172576513, "grad_norm": 6.851794242858887, "learning_rate": 2.5564986545738766e-07, "logits/chosen": -0.33028578758239746, "logits/rejected": -0.3856905400753021, "logps/chosen": -296.3638916015625, "logps/rejected": -297.4784851074219, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.1578565537929535, "rewards/margins": 0.018886948004364967, "rewards/rejected": 0.13896961510181427, "step": 1008 }, { "epoch": 0.5441553188620736, "grad_norm": 7.926867961883545, "learning_rate": 2.551791137647479e-07, "logits/chosen": -0.2558002471923828, "logits/rejected": -0.842138946056366, "logps/chosen": -190.36679077148438, "logps/rejected": -192.5867462158203, "loss": 0.7322, "rewards/accuracies": 0.625, "rewards/chosen": 0.1562190055847168, "rewards/margins": -0.058617398142814636, "rewards/rejected": 0.21483641862869263, "step": 1009 }, { "epoch": 0.5446946204664959, "grad_norm": 7.442685127258301, "learning_rate": 2.547083436998316e-07, "logits/chosen": 0.36958062648773193, "logits/rejected": -0.17642265558242798, "logps/chosen": -425.2739562988281, "logps/rejected": -349.5115661621094, "loss": 0.6478, "rewards/accuracies": 0.75, "rewards/chosen": 0.20799008011817932, "rewards/margins": 0.10144615173339844, "rewards/rejected": 0.10654392093420029, "step": 1010 }, { "epoch": 0.5452339220709181, "grad_norm": 8.389775276184082, "learning_rate": 2.5423755693263813e-07, "logits/chosen": 0.5686419606208801, "logits/rejected": -0.33294934034347534, "logps/chosen": -288.10137939453125, "logps/rejected": -277.14056396484375, "loss": 0.6692, "rewards/accuracies": 0.625, "rewards/chosen": 0.2812814712524414, "rewards/margins": 0.057456400245428085, "rewards/rejected": 0.22382506728172302, "step": 1011 }, { "epoch": 0.5457732236753404, "grad_norm": 9.450719833374023, "learning_rate": 2.537667551332266e-07, "logits/chosen": 0.3221825361251831, "logits/rejected": 1.1854209899902344, "logps/chosen": -207.65463256835938, "logps/rejected": -403.40509033203125, "loss": 0.6823, "rewards/accuracies": 0.625, "rewards/chosen": 0.17578811943531036, "rewards/margins": 0.032790184020996094, "rewards/rejected": 0.14299793541431427, "step": 1012 }, { "epoch": 0.5463125252797627, "grad_norm": 9.381396293640137, "learning_rate": 2.5329593997170896e-07, "logits/chosen": 0.6952894330024719, "logits/rejected": 0.5792037844657898, "logps/chosen": -327.9839172363281, "logps/rejected": -332.0537414550781, "loss": 0.7522, "rewards/accuracies": 0.5, "rewards/chosen": 0.17472630739212036, "rewards/margins": -0.10666065663099289, "rewards/rejected": 0.28138697147369385, "step": 1013 }, { "epoch": 0.546851826884185, "grad_norm": 7.811958312988281, "learning_rate": 2.5282511311824493e-07, "logits/chosen": 0.2970806062221527, "logits/rejected": 0.3635658621788025, "logps/chosen": -338.4962463378906, "logps/rejected": -347.73455810546875, "loss": 0.6949, "rewards/accuracies": 0.625, "rewards/chosen": 0.19420796632766724, "rewards/margins": 0.005716225132346153, "rewards/rejected": 0.18849173188209534, "step": 1014 }, { "epoch": 0.5473911284886073, "grad_norm": 7.5426859855651855, "learning_rate": 2.5235427624303546e-07, "logits/chosen": 0.7798069715499878, "logits/rejected": -0.3146505355834961, "logps/chosen": -270.272216796875, "logps/rejected": -182.05934143066406, "loss": 0.7298, "rewards/accuracies": 0.375, "rewards/chosen": 0.12673187255859375, "rewards/margins": -0.05445938557386398, "rewards/rejected": 0.18119126558303833, "step": 1015 }, { "epoch": 0.5479304300930296, "grad_norm": 6.881659030914307, "learning_rate": 2.5188343101631714e-07, "logits/chosen": 0.3285524249076843, "logits/rejected": -1.0220260620117188, "logps/chosen": -259.250732421875, "logps/rejected": -145.506103515625, "loss": 0.611, "rewards/accuracies": 0.875, "rewards/chosen": 0.2328791618347168, "rewards/margins": 0.18215981125831604, "rewards/rejected": 0.050719358026981354, "step": 1016 }, { "epoch": 0.5484697316974518, "grad_norm": 7.307126045227051, "learning_rate": 2.514125791083563e-07, "logits/chosen": -0.13408492505550385, "logits/rejected": -1.2595624923706055, "logps/chosen": -297.53082275390625, "logps/rejected": -256.41448974609375, "loss": 0.6472, "rewards/accuracies": 0.75, "rewards/chosen": 0.19541263580322266, "rewards/margins": 0.10080622881650925, "rewards/rejected": 0.09460639953613281, "step": 1017 }, { "epoch": 0.5490090333018741, "grad_norm": 8.359871864318848, "learning_rate": 2.509417221894427e-07, "logits/chosen": -0.7608452439308167, "logits/rejected": -0.5376445055007935, "logps/chosen": -272.9393310546875, "logps/rejected": -232.57290649414062, "loss": 0.7584, "rewards/accuracies": 0.125, "rewards/chosen": 0.12920036911964417, "rewards/margins": -0.12176857143640518, "rewards/rejected": 0.25096893310546875, "step": 1018 }, { "epoch": 0.5495483349062964, "grad_norm": 7.065249443054199, "learning_rate": 2.504708619298841e-07, "logits/chosen": -0.1978461742401123, "logits/rejected": -1.4300744533538818, "logps/chosen": -208.97694396972656, "logps/rejected": -179.66729736328125, "loss": 0.6672, "rewards/accuracies": 0.625, "rewards/chosen": 0.26829826831817627, "rewards/margins": 0.06620922684669495, "rewards/rejected": 0.20208902657032013, "step": 1019 }, { "epoch": 0.5500876365107186, "grad_norm": 8.186070442199707, "learning_rate": 2.5e-07, "logits/chosen": 0.06032499670982361, "logits/rejected": -1.2200968265533447, "logps/chosen": -252.891845703125, "logps/rejected": -202.54705810546875, "loss": 0.7142, "rewards/accuracies": 0.625, "rewards/chosen": 0.1395013928413391, "rewards/margins": -0.027167990803718567, "rewards/rejected": 0.16666936874389648, "step": 1020 }, { "epoch": 0.5506269381151409, "grad_norm": 7.384394645690918, "learning_rate": 2.4952913807011594e-07, "logits/chosen": -0.13565808534622192, "logits/rejected": 0.015671536326408386, "logps/chosen": -233.46791076660156, "logps/rejected": -239.81695556640625, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": 0.21266594529151917, "rewards/margins": 0.05396394804120064, "rewards/rejected": 0.15870200097560883, "step": 1021 }, { "epoch": 0.5511662397195631, "grad_norm": 7.244595527648926, "learning_rate": 2.490582778105573e-07, "logits/chosen": 0.8676422238349915, "logits/rejected": 0.6055478453636169, "logps/chosen": -189.1466064453125, "logps/rejected": -226.3778839111328, "loss": 0.6989, "rewards/accuracies": 0.375, "rewards/chosen": 0.17330867052078247, "rewards/margins": -0.002797124907374382, "rewards/rejected": 0.176105797290802, "step": 1022 }, { "epoch": 0.5517055413239854, "grad_norm": 5.521399974822998, "learning_rate": 2.4858742089164373e-07, "logits/chosen": 0.6265859007835388, "logits/rejected": 0.03403466194868088, "logps/chosen": -211.10687255859375, "logps/rejected": -216.7269744873047, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.15433311462402344, "rewards/margins": 0.026257041841745377, "rewards/rejected": 0.12807607650756836, "step": 1023 }, { "epoch": 0.5522448429284077, "grad_norm": 7.075554847717285, "learning_rate": 2.4811656898368283e-07, "logits/chosen": -0.5585403442382812, "logits/rejected": -0.051473915576934814, "logps/chosen": -237.26605224609375, "logps/rejected": -274.7012939453125, "loss": 0.7131, "rewards/accuracies": 0.5, "rewards/chosen": 0.23703479766845703, "rewards/margins": -0.03435191512107849, "rewards/rejected": 0.2713867425918579, "step": 1024 }, { "epoch": 0.55278414453283, "grad_norm": 6.768413066864014, "learning_rate": 2.4764572375696457e-07, "logits/chosen": 0.12496265769004822, "logits/rejected": 0.06670695543289185, "logps/chosen": -286.11285400390625, "logps/rejected": -287.9031982421875, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": 0.12846851348876953, "rewards/margins": -0.021235084161162376, "rewards/rejected": 0.14970360696315765, "step": 1025 }, { "epoch": 0.5533234461372523, "grad_norm": 6.336050033569336, "learning_rate": 2.471748868817551e-07, "logits/chosen": -0.21340534090995789, "logits/rejected": -0.823559582233429, "logps/chosen": -242.35000610351562, "logps/rejected": -215.6680145263672, "loss": 0.701, "rewards/accuracies": 0.5, "rewards/chosen": 0.18451720476150513, "rewards/margins": 0.003668408840894699, "rewards/rejected": 0.18084879219532013, "step": 1026 }, { "epoch": 0.5538627477416745, "grad_norm": 8.850415229797363, "learning_rate": 2.467040600282911e-07, "logits/chosen": 0.360693097114563, "logits/rejected": -0.7677645683288574, "logps/chosen": -270.69085693359375, "logps/rejected": -256.7684020996094, "loss": 0.6166, "rewards/accuracies": 0.75, "rewards/chosen": 0.2579706311225891, "rewards/margins": 0.16807126998901367, "rewards/rejected": 0.08989934623241425, "step": 1027 }, { "epoch": 0.5544020493460968, "grad_norm": 6.9022417068481445, "learning_rate": 2.462332448667735e-07, "logits/chosen": 0.2268196940422058, "logits/rejected": -0.7405245900154114, "logps/chosen": -269.7364196777344, "logps/rejected": -198.40975952148438, "loss": 0.6438, "rewards/accuracies": 0.625, "rewards/chosen": 0.2819681167602539, "rewards/margins": 0.12462020665407181, "rewards/rejected": 0.1573479175567627, "step": 1028 }, { "epoch": 0.5549413509505191, "grad_norm": 7.782247066497803, "learning_rate": 2.457624430673619e-07, "logits/chosen": 0.3959428668022156, "logits/rejected": 0.6999610066413879, "logps/chosen": -246.8707275390625, "logps/rejected": -313.8858337402344, "loss": 0.7627, "rewards/accuracies": 0.375, "rewards/chosen": 0.14552746713161469, "rewards/margins": -0.1251930296421051, "rewards/rejected": 0.2707204818725586, "step": 1029 }, { "epoch": 0.5554806525549414, "grad_norm": 6.996789932250977, "learning_rate": 2.4529165630016854e-07, "logits/chosen": 0.8582363724708557, "logits/rejected": -1.278580904006958, "logps/chosen": -223.37330627441406, "logps/rejected": -176.83285522460938, "loss": 0.6329, "rewards/accuracies": 0.875, "rewards/chosen": 0.25319671630859375, "rewards/margins": 0.12917813658714294, "rewards/rejected": 0.12401857227087021, "step": 1030 }, { "epoch": 0.5560199541593637, "grad_norm": 7.648048400878906, "learning_rate": 2.4482088623525215e-07, "logits/chosen": 0.5815491676330566, "logits/rejected": -0.8148797154426575, "logps/chosen": -228.15084838867188, "logps/rejected": -186.35256958007812, "loss": 0.6425, "rewards/accuracies": 0.75, "rewards/chosen": 0.24753990769386292, "rewards/margins": 0.1182304322719574, "rewards/rejected": 0.12930947542190552, "step": 1031 }, { "epoch": 0.5565592557637858, "grad_norm": 7.40647029876709, "learning_rate": 2.4435013454261243e-07, "logits/chosen": 0.3734763562679291, "logits/rejected": -1.1340135335922241, "logps/chosen": -275.7060546875, "logps/rejected": -201.46533203125, "loss": 0.6271, "rewards/accuracies": 0.75, "rewards/chosen": 0.25679606199264526, "rewards/margins": 0.15929508209228516, "rewards/rejected": 0.0975009948015213, "step": 1032 }, { "epoch": 0.5570985573682081, "grad_norm": 8.461837768554688, "learning_rate": 2.4387940289218365e-07, "logits/chosen": 0.0055145323276519775, "logits/rejected": -0.5420266389846802, "logps/chosen": -198.94754028320312, "logps/rejected": -299.90350341796875, "loss": 0.6339, "rewards/accuracies": 0.75, "rewards/chosen": 0.2665039002895355, "rewards/margins": 0.13729915022850037, "rewards/rejected": 0.12920476496219635, "step": 1033 }, { "epoch": 0.5576378589726304, "grad_norm": 7.607484817504883, "learning_rate": 2.434086929538292e-07, "logits/chosen": -0.37277430295944214, "logits/rejected": -2.0515735149383545, "logps/chosen": -276.52032470703125, "logps/rejected": -201.2256622314453, "loss": 0.6528, "rewards/accuracies": 0.75, "rewards/chosen": 0.15175294876098633, "rewards/margins": 0.09500517696142197, "rewards/rejected": 0.05674777552485466, "step": 1034 }, { "epoch": 0.5581771605770527, "grad_norm": 7.212462425231934, "learning_rate": 2.429380063973353e-07, "logits/chosen": -1.2271052598953247, "logits/rejected": -0.2884814739227295, "logps/chosen": -175.04592895507812, "logps/rejected": -282.3916015625, "loss": 0.7008, "rewards/accuracies": 0.5, "rewards/chosen": 0.21415625512599945, "rewards/margins": -1.9550323486328125e-05, "rewards/rejected": 0.21417579054832458, "step": 1035 }, { "epoch": 0.558716462181475, "grad_norm": 6.954819202423096, "learning_rate": 2.424673448924055e-07, "logits/chosen": 0.36523622274398804, "logits/rejected": -0.22233593463897705, "logps/chosen": -258.094482421875, "logps/rejected": -229.14590454101562, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": 0.11971783638000488, "rewards/margins": 0.014757208526134491, "rewards/rejected": 0.10496063530445099, "step": 1036 }, { "epoch": 0.5592557637858973, "grad_norm": 6.550832271575928, "learning_rate": 2.4199671010865416e-07, "logits/chosen": 0.21199940145015717, "logits/rejected": -1.1697947978973389, "logps/chosen": -240.45309448242188, "logps/rejected": -163.8335723876953, "loss": 0.6602, "rewards/accuracies": 0.625, "rewards/chosen": 0.15804949402809143, "rewards/margins": 0.07063551247119904, "rewards/rejected": 0.0874139815568924, "step": 1037 }, { "epoch": 0.5597950653903195, "grad_norm": 8.120331764221191, "learning_rate": 2.4152610371560093e-07, "logits/chosen": -0.28869906067848206, "logits/rejected": -0.8243862390518188, "logps/chosen": -233.13539123535156, "logps/rejected": -222.67449951171875, "loss": 0.6279, "rewards/accuracies": 0.75, "rewards/chosen": 0.27606284618377686, "rewards/margins": 0.15251341462135315, "rewards/rejected": 0.1235494613647461, "step": 1038 }, { "epoch": 0.5603343669947418, "grad_norm": 7.046420574188232, "learning_rate": 2.410555273826649e-07, "logits/chosen": -0.20810502767562866, "logits/rejected": -0.08415642380714417, "logps/chosen": -323.30645751953125, "logps/rejected": -287.4557800292969, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 0.15357179939746857, "rewards/margins": 0.035074714571237564, "rewards/rejected": 0.1184970811009407, "step": 1039 }, { "epoch": 0.5608736685991641, "grad_norm": 8.176450729370117, "learning_rate": 2.405849827791583e-07, "logits/chosen": -0.17849743366241455, "logits/rejected": -0.5371389389038086, "logps/chosen": -270.5467529296875, "logps/rejected": -209.47067260742188, "loss": 0.7228, "rewards/accuracies": 0.625, "rewards/chosen": 0.17937131226062775, "rewards/margins": -0.030224792659282684, "rewards/rejected": 0.20959609746932983, "step": 1040 }, { "epoch": 0.5614129702035864, "grad_norm": 8.028613090515137, "learning_rate": 2.40114471574281e-07, "logits/chosen": 0.6187763810157776, "logits/rejected": 0.5175942182540894, "logps/chosen": -377.2901306152344, "logps/rejected": -297.9815368652344, "loss": 0.7106, "rewards/accuracies": 0.625, "rewards/chosen": 0.11244802176952362, "rewards/margins": -0.013528350740671158, "rewards/rejected": 0.12597636878490448, "step": 1041 }, { "epoch": 0.5619522718080087, "grad_norm": 8.007352828979492, "learning_rate": 2.3964399543711424e-07, "logits/chosen": -0.6741634607315063, "logits/rejected": -0.34744858741760254, "logps/chosen": -219.53878784179688, "logps/rejected": -220.16448974609375, "loss": 0.7568, "rewards/accuracies": 0.25, "rewards/chosen": 0.17618677020072937, "rewards/margins": -0.11248359084129333, "rewards/rejected": 0.2886703610420227, "step": 1042 }, { "epoch": 0.562491573412431, "grad_norm": 7.107645511627197, "learning_rate": 2.3917355603661504e-07, "logits/chosen": 0.24713200330734253, "logits/rejected": -0.7361822724342346, "logps/chosen": -206.6236572265625, "logps/rejected": -183.18910217285156, "loss": 0.6256, "rewards/accuracies": 0.875, "rewards/chosen": 0.22225074470043182, "rewards/margins": 0.1453026831150055, "rewards/rejected": 0.07694806903600693, "step": 1043 }, { "epoch": 0.5630308750168532, "grad_norm": 6.247239589691162, "learning_rate": 2.387031550416099e-07, "logits/chosen": 0.3909822702407837, "logits/rejected": -1.1274869441986084, "logps/chosen": -227.0758819580078, "logps/rejected": -152.7442169189453, "loss": 0.6605, "rewards/accuracies": 0.75, "rewards/chosen": 0.1974886953830719, "rewards/margins": 0.07816142588853836, "rewards/rejected": 0.11932725459337234, "step": 1044 }, { "epoch": 0.5635701766212754, "grad_norm": 7.013668537139893, "learning_rate": 2.3823279412078923e-07, "logits/chosen": 1.1111377477645874, "logits/rejected": 0.1839517205953598, "logps/chosen": -310.3165283203125, "logps/rejected": -221.7006378173828, "loss": 0.6671, "rewards/accuracies": 0.5, "rewards/chosen": 0.19784964621067047, "rewards/margins": 0.06476841866970062, "rewards/rejected": 0.13308124244213104, "step": 1045 }, { "epoch": 0.5641094782256977, "grad_norm": 7.949416160583496, "learning_rate": 2.377624749427012e-07, "logits/chosen": 0.810839831829071, "logits/rejected": -1.2428231239318848, "logps/chosen": -365.9403381347656, "logps/rejected": -206.62353515625, "loss": 0.6611, "rewards/accuracies": 0.625, "rewards/chosen": 0.26056480407714844, "rewards/margins": 0.07477320730686188, "rewards/rejected": 0.18579159677028656, "step": 1046 }, { "epoch": 0.56464877983012, "grad_norm": 7.359198570251465, "learning_rate": 2.3729219917574593e-07, "logits/chosen": -0.312883198261261, "logits/rejected": 0.6688587665557861, "logps/chosen": -252.61212158203125, "logps/rejected": -270.8703918457031, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": 0.27524682879447937, "rewards/margins": 0.052526671439409256, "rewards/rejected": 0.22272014617919922, "step": 1047 }, { "epoch": 0.5651880814345422, "grad_norm": 8.209139823913574, "learning_rate": 2.3682196848816954e-07, "logits/chosen": 0.30915096402168274, "logits/rejected": -1.0130642652511597, "logps/chosen": -246.68304443359375, "logps/rejected": -223.9238739013672, "loss": 0.6941, "rewards/accuracies": 0.625, "rewards/chosen": 0.1907692849636078, "rewards/margins": 0.004566483199596405, "rewards/rejected": 0.18620282411575317, "step": 1048 }, { "epoch": 0.5657273830389645, "grad_norm": 6.854612350463867, "learning_rate": 2.3635178454805833e-07, "logits/chosen": -0.39856255054473877, "logits/rejected": -0.26482152938842773, "logps/chosen": -238.44509887695312, "logps/rejected": -203.98648071289062, "loss": 0.6531, "rewards/accuracies": 0.75, "rewards/chosen": 0.29657527804374695, "rewards/margins": 0.08616810292005539, "rewards/rejected": 0.21040716767311096, "step": 1049 }, { "epoch": 0.5662666846433868, "grad_norm": 7.386318683624268, "learning_rate": 2.3588164902333255e-07, "logits/chosen": 0.23379354178905487, "logits/rejected": 0.03994379937648773, "logps/chosen": -203.92385864257812, "logps/rejected": -244.3355712890625, "loss": 0.6962, "rewards/accuracies": 0.5, "rewards/chosen": 0.19633889198303223, "rewards/margins": 0.014272548258304596, "rewards/rejected": 0.18206635117530823, "step": 1050 }, { "epoch": 0.5668059862478091, "grad_norm": 8.748286247253418, "learning_rate": 2.354115635817409e-07, "logits/chosen": 0.009095638990402222, "logits/rejected": 0.15501898527145386, "logps/chosen": -283.2883605957031, "logps/rejected": -275.3495788574219, "loss": 0.7458, "rewards/accuracies": 0.375, "rewards/chosen": 0.11948767304420471, "rewards/margins": -0.09809627383947372, "rewards/rejected": 0.21758393943309784, "step": 1051 }, { "epoch": 0.5673452878522314, "grad_norm": 8.419224739074707, "learning_rate": 2.349415298908543e-07, "logits/chosen": 0.6376427412033081, "logits/rejected": -1.018251657485962, "logps/chosen": -313.86199951171875, "logps/rejected": -311.19549560546875, "loss": 0.7117, "rewards/accuracies": 0.5, "rewards/chosen": 0.10070686042308807, "rewards/margins": -0.010940738022327423, "rewards/rejected": 0.1116476058959961, "step": 1052 }, { "epoch": 0.5678845894566537, "grad_norm": 7.243666648864746, "learning_rate": 2.3447154961806026e-07, "logits/chosen": 0.22445547580718994, "logits/rejected": -1.3029483556747437, "logps/chosen": -311.94915771484375, "logps/rejected": -201.630126953125, "loss": 0.6445, "rewards/accuracies": 0.875, "rewards/chosen": 0.2286238670349121, "rewards/margins": 0.10480699688196182, "rewards/rejected": 0.12381687760353088, "step": 1053 }, { "epoch": 0.5684238910610759, "grad_norm": 6.942996025085449, "learning_rate": 2.3400162443055655e-07, "logits/chosen": 0.32529738545417786, "logits/rejected": 1.591228723526001, "logps/chosen": -166.89596557617188, "logps/rejected": -197.43679809570312, "loss": 0.7547, "rewards/accuracies": 0.25, "rewards/chosen": 0.06387586891651154, "rewards/margins": -0.11272935569286346, "rewards/rejected": 0.176605224609375, "step": 1054 }, { "epoch": 0.5689631926654982, "grad_norm": 7.951472759246826, "learning_rate": 2.335317559953457e-07, "logits/chosen": -0.23224791884422302, "logits/rejected": -0.2710723876953125, "logps/chosen": -218.40359497070312, "logps/rejected": -248.64752197265625, "loss": 0.6866, "rewards/accuracies": 0.5, "rewards/chosen": 0.2284325659275055, "rewards/margins": 0.03979721665382385, "rewards/rejected": 0.18863534927368164, "step": 1055 }, { "epoch": 0.5695024942699205, "grad_norm": 6.097642421722412, "learning_rate": 2.3306194597922884e-07, "logits/chosen": 0.054644227027893066, "logits/rejected": -0.6810814142227173, "logps/chosen": -246.68479919433594, "logps/rejected": -247.83447265625, "loss": 0.6496, "rewards/accuracies": 0.625, "rewards/chosen": 0.16733956336975098, "rewards/margins": 0.10114879906177521, "rewards/rejected": 0.06619076430797577, "step": 1056 }, { "epoch": 0.5700417958743427, "grad_norm": 7.728034019470215, "learning_rate": 2.325921960488e-07, "logits/chosen": -0.34903544187545776, "logits/rejected": -0.5606830716133118, "logps/chosen": -270.1268310546875, "logps/rejected": -242.2505340576172, "loss": 0.6601, "rewards/accuracies": 0.5, "rewards/chosen": 0.19184762239456177, "rewards/margins": 0.08002042770385742, "rewards/rejected": 0.11182717978954315, "step": 1057 }, { "epoch": 0.570581097478765, "grad_norm": 7.308550834655762, "learning_rate": 2.321225078704399e-07, "logits/chosen": 1.5210479497909546, "logits/rejected": 0.7447509765625, "logps/chosen": -301.8213195800781, "logps/rejected": -233.84048461914062, "loss": 0.6373, "rewards/accuracies": 0.75, "rewards/chosen": 0.25560837984085083, "rewards/margins": 0.12276582419872284, "rewards/rejected": 0.1328425407409668, "step": 1058 }, { "epoch": 0.5711203990831872, "grad_norm": 8.851030349731445, "learning_rate": 2.3165288311031023e-07, "logits/chosen": -0.11570185422897339, "logits/rejected": 0.5645214319229126, "logps/chosen": -241.71957397460938, "logps/rejected": -249.83642578125, "loss": 0.759, "rewards/accuracies": 0.25, "rewards/chosen": 0.018266677856445312, "rewards/margins": -0.1140674501657486, "rewards/rejected": 0.1323341429233551, "step": 1059 }, { "epoch": 0.5716597006876095, "grad_norm": 7.715731143951416, "learning_rate": 2.3118332343434777e-07, "logits/chosen": -0.755709707736969, "logits/rejected": -1.4184136390686035, "logps/chosen": -210.2996063232422, "logps/rejected": -231.8477325439453, "loss": 0.6083, "rewards/accuracies": 0.75, "rewards/chosen": 0.22448933124542236, "rewards/margins": 0.18971052765846252, "rewards/rejected": 0.03477878123521805, "step": 1060 }, { "epoch": 0.5721990022920318, "grad_norm": 7.458320617675781, "learning_rate": 2.3071383050825844e-07, "logits/chosen": -0.6113002300262451, "logits/rejected": -0.7283151745796204, "logps/chosen": -312.72808837890625, "logps/rejected": -229.7906494140625, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": 0.16841527819633484, "rewards/margins": 0.013286205008625984, "rewards/rejected": 0.1551290601491928, "step": 1061 }, { "epoch": 0.5727383038964541, "grad_norm": 6.5291924476623535, "learning_rate": 2.302444059975113e-07, "logits/chosen": 0.6417379379272461, "logits/rejected": -0.10734602808952332, "logps/chosen": -242.78915405273438, "logps/rejected": -212.17738342285156, "loss": 0.6665, "rewards/accuracies": 0.75, "rewards/chosen": 0.13555145263671875, "rewards/margins": 0.06051063910126686, "rewards/rejected": 0.07504081726074219, "step": 1062 }, { "epoch": 0.5732776055008764, "grad_norm": 8.326437950134277, "learning_rate": 2.297750515673328e-07, "logits/chosen": -0.373218297958374, "logits/rejected": -1.3788490295410156, "logps/chosen": -231.3839874267578, "logps/rejected": -176.6309814453125, "loss": 0.6713, "rewards/accuracies": 0.625, "rewards/chosen": 0.205590158700943, "rewards/margins": 0.056204892694950104, "rewards/rejected": 0.1493852734565735, "step": 1063 }, { "epoch": 0.5738169071052986, "grad_norm": 9.546561241149902, "learning_rate": 2.2930576888270064e-07, "logits/chosen": -0.7189356088638306, "logits/rejected": 0.035396382212638855, "logps/chosen": -190.4963836669922, "logps/rejected": -256.3672790527344, "loss": 0.8071, "rewards/accuracies": 0.25, "rewards/chosen": 0.07849683612585068, "rewards/margins": -0.1948838233947754, "rewards/rejected": 0.27338066697120667, "step": 1064 }, { "epoch": 0.5743562087097209, "grad_norm": 8.331809997558594, "learning_rate": 2.288365596083381e-07, "logits/chosen": 1.2116016149520874, "logits/rejected": 0.8164399862289429, "logps/chosen": -246.0426025390625, "logps/rejected": -332.95001220703125, "loss": 0.702, "rewards/accuracies": 0.5, "rewards/chosen": 0.24307212233543396, "rewards/margins": -0.0016390793025493622, "rewards/rejected": 0.24471122026443481, "step": 1065 }, { "epoch": 0.5748955103141432, "grad_norm": 9.769998550415039, "learning_rate": 2.2836742540870814e-07, "logits/chosen": -1.0370007753372192, "logits/rejected": -0.832961916923523, "logps/chosen": -198.20095825195312, "logps/rejected": -217.1308135986328, "loss": 0.7255, "rewards/accuracies": 0.375, "rewards/chosen": 0.08038687705993652, "rewards/margins": -0.03276258334517479, "rewards/rejected": 0.11314946413040161, "step": 1066 }, { "epoch": 0.5754348119185655, "grad_norm": 6.537318706512451, "learning_rate": 2.2789836794800732e-07, "logits/chosen": 0.6944802403450012, "logits/rejected": 0.3950127363204956, "logps/chosen": -261.06829833984375, "logps/rejected": -204.4121856689453, "loss": 0.6757, "rewards/accuracies": 0.5, "rewards/chosen": 0.16971206665039062, "rewards/margins": 0.04543857276439667, "rewards/rejected": 0.12427349388599396, "step": 1067 }, { "epoch": 0.5759741135229878, "grad_norm": 8.542394638061523, "learning_rate": 2.274293888901599e-07, "logits/chosen": 0.42882537841796875, "logits/rejected": -0.7237097024917603, "logps/chosen": -279.8091735839844, "logps/rejected": -260.8995361328125, "loss": 0.6124, "rewards/accuracies": 0.625, "rewards/chosen": 0.23195992410182953, "rewards/margins": 0.18321417272090912, "rewards/rejected": 0.04874572902917862, "step": 1068 }, { "epoch": 0.5765134151274101, "grad_norm": 7.879732608795166, "learning_rate": 2.2696048989881214e-07, "logits/chosen": 0.43975716829299927, "logits/rejected": -0.4831950068473816, "logps/chosen": -214.7781982421875, "logps/rejected": -156.1822509765625, "loss": 0.7, "rewards/accuracies": 0.375, "rewards/chosen": 0.18262425065040588, "rewards/margins": 0.0034110024571418762, "rewards/rejected": 0.1792132556438446, "step": 1069 }, { "epoch": 0.5770527167318322, "grad_norm": 7.699409008026123, "learning_rate": 2.2649167263732624e-07, "logits/chosen": 0.5120293498039246, "logits/rejected": -0.2343294322490692, "logps/chosen": -319.9969177246094, "logps/rejected": -288.64288330078125, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": 0.12543517351150513, "rewards/margins": 0.08736544102430344, "rewards/rejected": 0.038069725036621094, "step": 1070 }, { "epoch": 0.5775920183362545, "grad_norm": 8.81035041809082, "learning_rate": 2.2602293876877438e-07, "logits/chosen": -0.668907642364502, "logits/rejected": 0.07927174866199493, "logps/chosen": -331.3077087402344, "logps/rejected": -372.71533203125, "loss": 0.7622, "rewards/accuracies": 0.25, "rewards/chosen": 0.15826283395290375, "rewards/margins": -0.12619668245315552, "rewards/rejected": 0.28445950150489807, "step": 1071 }, { "epoch": 0.5781313199406768, "grad_norm": 7.523488521575928, "learning_rate": 2.25554289955933e-07, "logits/chosen": 0.37019118666648865, "logits/rejected": 0.3156392574310303, "logps/chosen": -239.37965393066406, "logps/rejected": -271.2366943359375, "loss": 0.729, "rewards/accuracies": 0.5, "rewards/chosen": 0.09731645882129669, "rewards/margins": -0.057853616774082184, "rewards/rejected": 0.15517006814479828, "step": 1072 }, { "epoch": 0.5786706215450991, "grad_norm": 7.599282264709473, "learning_rate": 2.2508572786127684e-07, "logits/chosen": 1.468280553817749, "logits/rejected": 0.5029834508895874, "logps/chosen": -326.154541015625, "logps/rejected": -262.8006591796875, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": 0.16644173860549927, "rewards/margins": 0.010291007347404957, "rewards/rejected": 0.1561507284641266, "step": 1073 }, { "epoch": 0.5792099231495214, "grad_norm": 7.174117088317871, "learning_rate": 2.24617254146973e-07, "logits/chosen": -0.10179433226585388, "logits/rejected": -0.9797888994216919, "logps/chosen": -239.01300048828125, "logps/rejected": -165.7371368408203, "loss": 0.6757, "rewards/accuracies": 0.375, "rewards/chosen": 0.19859857857227325, "rewards/margins": 0.05449523776769638, "rewards/rejected": 0.14410333335399628, "step": 1074 }, { "epoch": 0.5797492247539436, "grad_norm": 6.712502479553223, "learning_rate": 2.2414887047487498e-07, "logits/chosen": 0.012605957686901093, "logits/rejected": -0.7153850793838501, "logps/chosen": -244.40921020507812, "logps/rejected": -199.6781005859375, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": 0.2048398107290268, "rewards/margins": 0.07328367978334427, "rewards/rejected": 0.13155613839626312, "step": 1075 }, { "epoch": 0.5802885263583659, "grad_norm": 6.869988918304443, "learning_rate": 2.2368057850651695e-07, "logits/chosen": -0.7243853211402893, "logits/rejected": -1.2466745376586914, "logps/chosen": -228.0820770263672, "logps/rejected": -236.04574584960938, "loss": 0.6264, "rewards/accuracies": 0.625, "rewards/chosen": 0.2441617101430893, "rewards/margins": 0.1441785842180252, "rewards/rejected": 0.09998311847448349, "step": 1076 }, { "epoch": 0.5808278279627882, "grad_norm": 9.957380294799805, "learning_rate": 2.2321237990310777e-07, "logits/chosen": -0.08334098756313324, "logits/rejected": -0.7685688138008118, "logps/chosen": -296.392333984375, "logps/rejected": -309.3286437988281, "loss": 0.754, "rewards/accuracies": 0.25, "rewards/chosen": 0.1201997697353363, "rewards/margins": -0.09763164818286896, "rewards/rejected": 0.21783143281936646, "step": 1077 }, { "epoch": 0.5813671295672105, "grad_norm": 7.34542989730835, "learning_rate": 2.2274427632552503e-07, "logits/chosen": 0.7749833464622498, "logits/rejected": -0.5160343050956726, "logps/chosen": -192.7832489013672, "logps/rejected": -131.50088500976562, "loss": 0.6492, "rewards/accuracies": 0.625, "rewards/chosen": 0.1919151246547699, "rewards/margins": 0.109612375497818, "rewards/rejected": 0.0823027640581131, "step": 1078 }, { "epoch": 0.5819064311716328, "grad_norm": 7.5333428382873535, "learning_rate": 2.222762694343094e-07, "logits/chosen": 0.6580086350440979, "logits/rejected": -0.9754127264022827, "logps/chosen": -315.5328369140625, "logps/rejected": -248.54345703125, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": 0.3272378146648407, "rewards/margins": 0.31024160981178284, "rewards/rejected": 0.016996193677186966, "step": 1079 }, { "epoch": 0.582445732776055, "grad_norm": 8.444687843322754, "learning_rate": 2.218083608896583e-07, "logits/chosen": -0.7146320939064026, "logits/rejected": 0.4062756896018982, "logps/chosen": -207.8656768798828, "logps/rejected": -256.8157653808594, "loss": 0.7152, "rewards/accuracies": 0.25, "rewards/chosen": 0.14013996720314026, "rewards/margins": -0.03800173103809357, "rewards/rejected": 0.17814169824123383, "step": 1080 }, { "epoch": 0.5829850343804773, "grad_norm": 7.514667987823486, "learning_rate": 2.2134055235142046e-07, "logits/chosen": -0.9117540121078491, "logits/rejected": -0.10029272735118866, "logps/chosen": -208.5215606689453, "logps/rejected": -211.54055786132812, "loss": 0.669, "rewards/accuracies": 0.625, "rewards/chosen": 0.1438688337802887, "rewards/margins": 0.056571438908576965, "rewards/rejected": 0.08729739487171173, "step": 1081 }, { "epoch": 0.5835243359848995, "grad_norm": 6.599380016326904, "learning_rate": 2.2087284547908987e-07, "logits/chosen": -0.6465436220169067, "logits/rejected": 0.16063997149467468, "logps/chosen": -204.4278106689453, "logps/rejected": -207.8551483154297, "loss": 0.7051, "rewards/accuracies": 0.375, "rewards/chosen": 0.23153647780418396, "rewards/margins": -0.014637044630944729, "rewards/rejected": 0.24617351591587067, "step": 1082 }, { "epoch": 0.5840636375893218, "grad_norm": 8.10989761352539, "learning_rate": 2.2040524193179976e-07, "logits/chosen": -0.3498600125312805, "logits/rejected": 0.10933709144592285, "logps/chosen": -341.3009033203125, "logps/rejected": -409.53314208984375, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.19380035996437073, "rewards/margins": 0.03076476976275444, "rewards/rejected": 0.1630355715751648, "step": 1083 }, { "epoch": 0.5846029391937441, "grad_norm": 6.001870155334473, "learning_rate": 2.1993774336831694e-07, "logits/chosen": 0.10282149910926819, "logits/rejected": 0.0006160745397210121, "logps/chosen": -215.415283203125, "logps/rejected": -216.20443725585938, "loss": 0.6583, "rewards/accuracies": 0.625, "rewards/chosen": 0.2246440052986145, "rewards/margins": 0.07446661591529846, "rewards/rejected": 0.15017737448215485, "step": 1084 }, { "epoch": 0.5851422407981663, "grad_norm": 6.499743938446045, "learning_rate": 2.1947035144703566e-07, "logits/chosen": 0.24229052662849426, "logits/rejected": -1.8594353199005127, "logps/chosen": -198.6099853515625, "logps/rejected": -140.6573486328125, "loss": 0.6379, "rewards/accuracies": 0.875, "rewards/chosen": 0.22601714730262756, "rewards/margins": 0.11843538284301758, "rewards/rejected": 0.10758176445960999, "step": 1085 }, { "epoch": 0.5856815424025886, "grad_norm": 7.21077823638916, "learning_rate": 2.19003067825972e-07, "logits/chosen": 0.7555131912231445, "logits/rejected": -0.6199585199356079, "logps/chosen": -319.9627990722656, "logps/rejected": -204.50730895996094, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": 0.300650030374527, "rewards/margins": 0.1827755868434906, "rewards/rejected": 0.11787443608045578, "step": 1086 }, { "epoch": 0.5862208440070109, "grad_norm": 7.999979019165039, "learning_rate": 2.185358941627578e-07, "logits/chosen": 2.1662845611572266, "logits/rejected": 0.2837226986885071, "logps/chosen": -410.2547607421875, "logps/rejected": -298.3476257324219, "loss": 0.6297, "rewards/accuracies": 0.75, "rewards/chosen": 0.2954162657260895, "rewards/margins": 0.14913922548294067, "rewards/rejected": 0.1462770402431488, "step": 1087 }, { "epoch": 0.5867601456114332, "grad_norm": 6.071428298950195, "learning_rate": 2.180688321146349e-07, "logits/chosen": 0.9836400747299194, "logits/rejected": -0.30248963832855225, "logps/chosen": -212.68316650390625, "logps/rejected": -175.4856719970703, "loss": 0.6174, "rewards/accuracies": 0.875, "rewards/chosen": 0.3040101230144501, "rewards/margins": 0.17003917694091797, "rewards/rejected": 0.1339709311723709, "step": 1088 }, { "epoch": 0.5872994472158555, "grad_norm": 6.959547519683838, "learning_rate": 2.1760188333844903e-07, "logits/chosen": 0.30810821056365967, "logits/rejected": 0.11498227715492249, "logps/chosen": -226.4201202392578, "logps/rejected": -290.12066650390625, "loss": 0.6653, "rewards/accuracies": 0.625, "rewards/chosen": 0.19378508627414703, "rewards/margins": 0.062267206609249115, "rewards/rejected": 0.13151788711547852, "step": 1089 }, { "epoch": 0.5878387488202778, "grad_norm": 7.317283630371094, "learning_rate": 2.1713504949064432e-07, "logits/chosen": 0.37895601987838745, "logits/rejected": -0.09369447827339172, "logps/chosen": -229.84185791015625, "logps/rejected": -244.267822265625, "loss": 0.7041, "rewards/accuracies": 0.375, "rewards/chosen": 0.23803481459617615, "rewards/margins": -0.01607971079647541, "rewards/rejected": 0.2541145384311676, "step": 1090 }, { "epoch": 0.5883780504247, "grad_norm": 7.672473907470703, "learning_rate": 2.1666833222725707e-07, "logits/chosen": -0.21723905205726624, "logits/rejected": -0.33987268805503845, "logps/chosen": -270.830322265625, "logps/rejected": -211.67474365234375, "loss": 0.6657, "rewards/accuracies": 0.375, "rewards/chosen": 0.19594678282737732, "rewards/margins": 0.07212018966674805, "rewards/rejected": 0.12382659316062927, "step": 1091 }, { "epoch": 0.5889173520291223, "grad_norm": 6.954753875732422, "learning_rate": 2.1620173320391006e-07, "logits/chosen": 0.7415423393249512, "logits/rejected": 1.2523876428604126, "logps/chosen": -221.32431030273438, "logps/rejected": -274.0748291015625, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": 0.278323769569397, "rewards/margins": 0.06962623447179794, "rewards/rejected": 0.20869751274585724, "step": 1092 }, { "epoch": 0.5894566536335446, "grad_norm": 6.382783889770508, "learning_rate": 2.1573525407580666e-07, "logits/chosen": 1.2294079065322876, "logits/rejected": -0.7507011294364929, "logps/chosen": -311.17364501953125, "logps/rejected": -185.62193298339844, "loss": 0.6487, "rewards/accuracies": 0.5, "rewards/chosen": 0.1475471556186676, "rewards/margins": 0.10571964830160141, "rewards/rejected": 0.041827492415905, "step": 1093 }, { "epoch": 0.5899959552379669, "grad_norm": 6.845088005065918, "learning_rate": 2.1526889649772475e-07, "logits/chosen": 0.9559447765350342, "logits/rejected": -0.3358859419822693, "logps/chosen": -272.20550537109375, "logps/rejected": -226.19969177246094, "loss": 0.7016, "rewards/accuracies": 0.5, "rewards/chosen": 0.1248815581202507, "rewards/margins": 0.0017569530755281448, "rewards/rejected": 0.12312459945678711, "step": 1094 }, { "epoch": 0.5905352568423891, "grad_norm": 9.37622356414795, "learning_rate": 2.1480266212401114e-07, "logits/chosen": 0.4117570221424103, "logits/rejected": -0.31726711988449097, "logps/chosen": -159.6101837158203, "logps/rejected": -173.26638793945312, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": 0.1336628943681717, "rewards/margins": -0.006693839095532894, "rewards/rejected": 0.14035673439502716, "step": 1095 }, { "epoch": 0.5910745584468113, "grad_norm": 7.76414155960083, "learning_rate": 2.1433655260857587e-07, "logits/chosen": -0.24299216270446777, "logits/rejected": -1.2427765130996704, "logps/chosen": -221.9036865234375, "logps/rejected": -205.25839233398438, "loss": 0.7264, "rewards/accuracies": 0.375, "rewards/chosen": 0.17667880654335022, "rewards/margins": -0.04736844077706337, "rewards/rejected": 0.2240472435951233, "step": 1096 }, { "epoch": 0.5916138600512336, "grad_norm": 5.491674900054932, "learning_rate": 2.1387056960488552e-07, "logits/chosen": 0.8178631067276001, "logits/rejected": 0.4102928936481476, "logps/chosen": -198.00808715820312, "logps/rejected": -182.6726531982422, "loss": 0.6539, "rewards/accuracies": 0.625, "rewards/chosen": 0.22005663812160492, "rewards/margins": 0.0844547301530838, "rewards/rejected": 0.13560189306735992, "step": 1097 }, { "epoch": 0.5921531616556559, "grad_norm": 7.166924476623535, "learning_rate": 2.134047147659583e-07, "logits/chosen": 0.6456588506698608, "logits/rejected": -0.21395863592624664, "logps/chosen": -213.0227813720703, "logps/rejected": -244.9644775390625, "loss": 0.6829, "rewards/accuracies": 0.75, "rewards/chosen": 0.20886152982711792, "rewards/margins": 0.02461213432252407, "rewards/rejected": 0.1842494010925293, "step": 1098 }, { "epoch": 0.5926924632600782, "grad_norm": 6.994472026824951, "learning_rate": 2.1293898974435758e-07, "logits/chosen": -1.1881340742111206, "logits/rejected": -0.8363721370697021, "logps/chosen": -285.56805419921875, "logps/rejected": -325.34576416015625, "loss": 0.6502, "rewards/accuracies": 0.75, "rewards/chosen": 0.22983330488204956, "rewards/margins": 0.09644804149866104, "rewards/rejected": 0.13338527083396912, "step": 1099 }, { "epoch": 0.5932317648645005, "grad_norm": 8.276287078857422, "learning_rate": 2.1247339619218638e-07, "logits/chosen": -0.7460021376609802, "logits/rejected": -0.06140223145484924, "logps/chosen": -254.14895629882812, "logps/rejected": -368.9841003417969, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.2000637948513031, "rewards/margins": 0.01595631241798401, "rewards/rejected": 0.18410749733448029, "step": 1100 }, { "epoch": 0.5937710664689227, "grad_norm": 7.229936599731445, "learning_rate": 2.1200793576108102e-07, "logits/chosen": 0.7093383073806763, "logits/rejected": -0.05074017867445946, "logps/chosen": -300.6617431640625, "logps/rejected": -224.64663696289062, "loss": 0.658, "rewards/accuracies": 0.5, "rewards/chosen": 0.20487460494041443, "rewards/margins": 0.09035071730613708, "rewards/rejected": 0.11452388763427734, "step": 1101 }, { "epoch": 0.594310368073345, "grad_norm": 7.030712127685547, "learning_rate": 2.1154261010220598e-07, "logits/chosen": -1.0065137147903442, "logits/rejected": -1.8249390125274658, "logps/chosen": -333.8544921875, "logps/rejected": -186.25384521484375, "loss": 0.7172, "rewards/accuracies": 0.5, "rewards/chosen": 0.0775851309299469, "rewards/margins": -0.03349713608622551, "rewards/rejected": 0.11108227074146271, "step": 1102 }, { "epoch": 0.5948496696777673, "grad_norm": 8.718339920043945, "learning_rate": 2.1107742086624735e-07, "logits/chosen": 0.3096918761730194, "logits/rejected": -1.9635009765625, "logps/chosen": -564.6305541992188, "logps/rejected": -246.9453887939453, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": 0.22774848341941833, "rewards/margins": 0.02862415462732315, "rewards/rejected": 0.19912435114383698, "step": 1103 }, { "epoch": 0.5953889712821896, "grad_norm": 6.31668758392334, "learning_rate": 2.1061236970340755e-07, "logits/chosen": 0.6031879186630249, "logits/rejected": 0.37461960315704346, "logps/chosen": -215.20611572265625, "logps/rejected": -164.3341827392578, "loss": 0.7107, "rewards/accuracies": 0.375, "rewards/chosen": 0.10798425227403641, "rewards/margins": -0.02912149578332901, "rewards/rejected": 0.13710574805736542, "step": 1104 }, { "epoch": 0.5959282728866119, "grad_norm": 8.40976619720459, "learning_rate": 2.10147458263399e-07, "logits/chosen": -0.41118818521499634, "logits/rejected": -0.5494125485420227, "logps/chosen": -254.72457885742188, "logps/rejected": -206.8704376220703, "loss": 0.6567, "rewards/accuracies": 0.5, "rewards/chosen": 0.18879395723342896, "rewards/margins": 0.08583307266235352, "rewards/rejected": 0.10296087712049484, "step": 1105 }, { "epoch": 0.5964675744910342, "grad_norm": 8.623665809631348, "learning_rate": 2.0968268819543847e-07, "logits/chosen": -0.6670534610748291, "logits/rejected": -0.06309720873832703, "logps/chosen": -189.087646484375, "logps/rejected": -187.28280639648438, "loss": 0.7672, "rewards/accuracies": 0.25, "rewards/chosen": 0.08460903912782669, "rewards/margins": -0.1259150505065918, "rewards/rejected": 0.2105240821838379, "step": 1106 }, { "epoch": 0.5970068760954563, "grad_norm": 7.627223491668701, "learning_rate": 2.092180611482413e-07, "logits/chosen": 1.023220181465149, "logits/rejected": 1.0809712409973145, "logps/chosen": -283.0975036621094, "logps/rejected": -231.88734436035156, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.24336948990821838, "rewards/margins": 0.032073404639959335, "rewards/rejected": 0.21129608154296875, "step": 1107 }, { "epoch": 0.5975461776998786, "grad_norm": 7.81983757019043, "learning_rate": 2.0875357877001555e-07, "logits/chosen": 0.31436488032341003, "logits/rejected": 0.40126433968544006, "logps/chosen": -235.1768341064453, "logps/rejected": -210.404052734375, "loss": 0.6843, "rewards/accuracies": 0.5, "rewards/chosen": 0.20273761451244354, "rewards/margins": 0.03355160355567932, "rewards/rejected": 0.16918602585792542, "step": 1108 }, { "epoch": 0.5980854793043009, "grad_norm": 7.349115371704102, "learning_rate": 2.082892427084559e-07, "logits/chosen": -0.2120429277420044, "logits/rejected": -0.5040079951286316, "logps/chosen": -187.48121643066406, "logps/rejected": -200.21466064453125, "loss": 0.7274, "rewards/accuracies": 0.25, "rewards/chosen": 0.12815523147583008, "rewards/margins": -0.05744219571352005, "rewards/rejected": 0.18559743463993073, "step": 1109 }, { "epoch": 0.5986247809087232, "grad_norm": 6.619981288909912, "learning_rate": 2.078250546107382e-07, "logits/chosen": 0.36974141001701355, "logits/rejected": -0.9879566431045532, "logps/chosen": -283.89080810546875, "logps/rejected": -186.82693481445312, "loss": 0.6243, "rewards/accuracies": 0.625, "rewards/chosen": 0.33286675810813904, "rewards/margins": 0.16345404088497162, "rewards/rejected": 0.16941270232200623, "step": 1110 }, { "epoch": 0.5991640825131455, "grad_norm": 6.8594865798950195, "learning_rate": 2.073610161235132e-07, "logits/chosen": 0.9410103559494019, "logits/rejected": 0.3300493359565735, "logps/chosen": -277.46783447265625, "logps/rejected": -286.6422119140625, "loss": 0.6337, "rewards/accuracies": 0.625, "rewards/chosen": 0.2667003571987152, "rewards/margins": 0.1532895863056183, "rewards/rejected": 0.11341076344251633, "step": 1111 }, { "epoch": 0.5997033841175677, "grad_norm": 6.4999308586120605, "learning_rate": 2.0689712889290112e-07, "logits/chosen": -0.47889241576194763, "logits/rejected": -0.4139832556247711, "logps/chosen": -163.10740661621094, "logps/rejected": -244.82958984375, "loss": 0.6813, "rewards/accuracies": 0.375, "rewards/chosen": 0.16394129395484924, "rewards/margins": 0.04015817865729332, "rewards/rejected": 0.12378311157226562, "step": 1112 }, { "epoch": 0.60024268572199, "grad_norm": 6.367954730987549, "learning_rate": 2.0643339456448545e-07, "logits/chosen": 0.4200483560562134, "logits/rejected": 0.3671035170555115, "logps/chosen": -199.5194091796875, "logps/rejected": -181.58290100097656, "loss": 0.6449, "rewards/accuracies": 0.5, "rewards/chosen": 0.23960790038108826, "rewards/margins": 0.1067531630396843, "rewards/rejected": 0.13285475969314575, "step": 1113 }, { "epoch": 0.6007819873264123, "grad_norm": 7.473036766052246, "learning_rate": 2.0596981478330745e-07, "logits/chosen": 1.404210090637207, "logits/rejected": 0.3649275600910187, "logps/chosen": -234.46170043945312, "logps/rejected": -228.74066162109375, "loss": 0.6511, "rewards/accuracies": 0.75, "rewards/chosen": 0.2819506824016571, "rewards/margins": 0.09205514192581177, "rewards/rejected": 0.18989554047584534, "step": 1114 }, { "epoch": 0.6013212889308346, "grad_norm": 7.361476898193359, "learning_rate": 2.0550639119385996e-07, "logits/chosen": -0.23572811484336853, "logits/rejected": -0.6644473075866699, "logps/chosen": -201.95553588867188, "logps/rejected": -251.1069793701172, "loss": 0.6368, "rewards/accuracies": 0.75, "rewards/chosen": 0.20029029250144958, "rewards/margins": 0.138045072555542, "rewards/rejected": 0.06224522739648819, "step": 1115 }, { "epoch": 0.6018605905352569, "grad_norm": 6.925312519073486, "learning_rate": 2.050431254400819e-07, "logits/chosen": -0.04230052977800369, "logits/rejected": -0.6916738748550415, "logps/chosen": -205.56710815429688, "logps/rejected": -159.73428344726562, "loss": 0.671, "rewards/accuracies": 0.625, "rewards/chosen": 0.1739896833896637, "rewards/margins": 0.05231046676635742, "rewards/rejected": 0.12167920917272568, "step": 1116 }, { "epoch": 0.6023998921396791, "grad_norm": 7.178651809692383, "learning_rate": 2.0458001916535217e-07, "logits/chosen": 0.1909475326538086, "logits/rejected": 0.1494908630847931, "logps/chosen": -204.3213653564453, "logps/rejected": -220.21310424804688, "loss": 0.6683, "rewards/accuracies": 0.625, "rewards/chosen": 0.2507708668708801, "rewards/margins": 0.05515871196985245, "rewards/rejected": 0.19561214745044708, "step": 1117 }, { "epoch": 0.6029391937441014, "grad_norm": 7.76811408996582, "learning_rate": 2.0411707401248403e-07, "logits/chosen": 0.7638496160507202, "logits/rejected": -0.10817325115203857, "logps/chosen": -225.0511474609375, "logps/rejected": -161.6140594482422, "loss": 0.5736, "rewards/accuracies": 0.875, "rewards/chosen": 0.2618732452392578, "rewards/margins": 0.2650078237056732, "rewards/rejected": -0.003134585916996002, "step": 1118 }, { "epoch": 0.6034784953485237, "grad_norm": 7.597289085388184, "learning_rate": 2.036542916237192e-07, "logits/chosen": 1.4679884910583496, "logits/rejected": 0.5777730941772461, "logps/chosen": -294.26708984375, "logps/rejected": -261.2345886230469, "loss": 0.6802, "rewards/accuracies": 0.75, "rewards/chosen": 0.1768062561750412, "rewards/margins": 0.031836412847042084, "rewards/rejected": 0.1449698507785797, "step": 1119 }, { "epoch": 0.6040177969529459, "grad_norm": 7.069028854370117, "learning_rate": 2.031916736407218e-07, "logits/chosen": 1.0322850942611694, "logits/rejected": 0.3779284954071045, "logps/chosen": -278.24652099609375, "logps/rejected": -308.5556945800781, "loss": 0.6976, "rewards/accuracies": 0.625, "rewards/chosen": 0.16491326689720154, "rewards/margins": 0.007042223587632179, "rewards/rejected": 0.1578710675239563, "step": 1120 }, { "epoch": 0.6045570985573682, "grad_norm": 7.6950907707214355, "learning_rate": 2.0272922170457304e-07, "logits/chosen": -0.744780421257019, "logits/rejected": -1.4793317317962646, "logps/chosen": -166.657470703125, "logps/rejected": -153.603271484375, "loss": 0.6498, "rewards/accuracies": 0.625, "rewards/chosen": 0.23319005966186523, "rewards/margins": 0.10169095546007156, "rewards/rejected": 0.13149911165237427, "step": 1121 }, { "epoch": 0.6050964001617904, "grad_norm": 8.440925598144531, "learning_rate": 2.022669374557649e-07, "logits/chosen": 0.18636402487754822, "logits/rejected": -0.9785841703414917, "logps/chosen": -387.5360107421875, "logps/rejected": -266.2207946777344, "loss": 0.6078, "rewards/accuracies": 1.0, "rewards/chosen": 0.32500436902046204, "rewards/margins": 0.1865902990102768, "rewards/rejected": 0.13841411471366882, "step": 1122 }, { "epoch": 0.6056357017662127, "grad_norm": 7.176341533660889, "learning_rate": 2.0180482253419463e-07, "logits/chosen": 0.41324734687805176, "logits/rejected": 0.3026791214942932, "logps/chosen": -248.85540771484375, "logps/rejected": -223.35098266601562, "loss": 0.6701, "rewards/accuracies": 0.875, "rewards/chosen": 0.19880180060863495, "rewards/margins": 0.05482501536607742, "rewards/rejected": 0.14397677779197693, "step": 1123 }, { "epoch": 0.606175003370635, "grad_norm": 7.957698345184326, "learning_rate": 2.013428785791586e-07, "logits/chosen": 0.02881874144077301, "logits/rejected": 0.08112601935863495, "logps/chosen": -190.27035522460938, "logps/rejected": -206.84417724609375, "loss": 0.6966, "rewards/accuracies": 0.625, "rewards/chosen": 0.21074500679969788, "rewards/margins": 0.014774609357118607, "rewards/rejected": 0.19597040116786957, "step": 1124 }, { "epoch": 0.6067143049750573, "grad_norm": 8.835807800292969, "learning_rate": 2.0088110722934688e-07, "logits/chosen": 0.44173067808151245, "logits/rejected": -1.0460717678070068, "logps/chosen": -234.57565307617188, "logps/rejected": -236.1998748779297, "loss": 0.6103, "rewards/accuracies": 0.875, "rewards/chosen": 0.22121019661426544, "rewards/margins": 0.18565011024475098, "rewards/rejected": 0.03556007891893387, "step": 1125 }, { "epoch": 0.6072536065794796, "grad_norm": 7.793407917022705, "learning_rate": 2.004195101228374e-07, "logits/chosen": -0.3238900899887085, "logits/rejected": -0.23866069316864014, "logps/chosen": -222.6190185546875, "logps/rejected": -328.4522705078125, "loss": 0.7288, "rewards/accuracies": 0.375, "rewards/chosen": 0.07698316872119904, "rewards/margins": -0.06449384987354279, "rewards/rejected": 0.14147701859474182, "step": 1126 }, { "epoch": 0.6077929081839019, "grad_norm": 7.458797931671143, "learning_rate": 1.999580888970896e-07, "logits/chosen": 0.6380518674850464, "logits/rejected": -0.2735372483730316, "logps/chosen": -224.29046630859375, "logps/rejected": -216.77874755859375, "loss": 0.6875, "rewards/accuracies": 0.75, "rewards/chosen": 0.13138408958911896, "rewards/margins": 0.015282725915312767, "rewards/rejected": 0.11610135436058044, "step": 1127 }, { "epoch": 0.6083322097883241, "grad_norm": 8.200947761535645, "learning_rate": 1.9949684518893925e-07, "logits/chosen": 0.3666197657585144, "logits/rejected": 1.1442521810531616, "logps/chosen": -251.32907104492188, "logps/rejected": -291.5677490234375, "loss": 0.6615, "rewards/accuracies": 0.5, "rewards/chosen": 0.31709253787994385, "rewards/margins": 0.06921596825122833, "rewards/rejected": 0.24787655472755432, "step": 1128 }, { "epoch": 0.6088715113927464, "grad_norm": 6.782431602478027, "learning_rate": 1.9903578063459238e-07, "logits/chosen": 0.6007586121559143, "logits/rejected": 0.10801849514245987, "logps/chosen": -246.14443969726562, "logps/rejected": -306.1820983886719, "loss": 0.6464, "rewards/accuracies": 0.75, "rewards/chosen": 0.23691701889038086, "rewards/margins": 0.12330065667629242, "rewards/rejected": 0.11361636966466904, "step": 1129 }, { "epoch": 0.6094108129971687, "grad_norm": 7.26017427444458, "learning_rate": 1.9857489686961936e-07, "logits/chosen": 0.0336880087852478, "logits/rejected": -1.0722569227218628, "logps/chosen": -209.68954467773438, "logps/rejected": -180.70912170410156, "loss": 0.6302, "rewards/accuracies": 0.75, "rewards/chosen": 0.23428648710250854, "rewards/margins": 0.14307206869125366, "rewards/rejected": 0.09121441841125488, "step": 1130 }, { "epoch": 0.609950114601591, "grad_norm": 7.308837413787842, "learning_rate": 1.981141955289495e-07, "logits/chosen": 0.8049414157867432, "logits/rejected": -0.6660414934158325, "logps/chosen": -290.7850036621094, "logps/rejected": -196.23410034179688, "loss": 0.5831, "rewards/accuracies": 0.875, "rewards/chosen": 0.3038857579231262, "rewards/margins": 0.24825319647789001, "rewards/rejected": 0.05563254654407501, "step": 1131 }, { "epoch": 0.6104894162060132, "grad_norm": 8.360848426818848, "learning_rate": 1.9765367824686466e-07, "logits/chosen": -0.017419010400772095, "logits/rejected": -0.45985352993011475, "logps/chosen": -313.52020263671875, "logps/rejected": -330.3147277832031, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.2681441307067871, "rewards/margins": 0.03816384822130203, "rewards/rejected": 0.22998028993606567, "step": 1132 }, { "epoch": 0.6110287178104354, "grad_norm": 6.754363059997559, "learning_rate": 1.9719334665699395e-07, "logits/chosen": -0.1718330979347229, "logits/rejected": -1.396498203277588, "logps/chosen": -246.63845825195312, "logps/rejected": -207.9908447265625, "loss": 0.524, "rewards/accuracies": 1.0, "rewards/chosen": 0.36235544085502625, "rewards/margins": 0.38781529664993286, "rewards/rejected": -0.025459859520196915, "step": 1133 }, { "epoch": 0.6115680194148577, "grad_norm": 6.195006847381592, "learning_rate": 1.967332023923078e-07, "logits/chosen": 0.5014116168022156, "logits/rejected": 0.36591798067092896, "logps/chosen": -230.2371063232422, "logps/rejected": -196.98678588867188, "loss": 0.6538, "rewards/accuracies": 0.5, "rewards/chosen": 0.30521631240844727, "rewards/margins": 0.09539622813463211, "rewards/rejected": 0.20982009172439575, "step": 1134 }, { "epoch": 0.61210732101928, "grad_norm": 6.8439154624938965, "learning_rate": 1.96273247085112e-07, "logits/chosen": 0.5342034697532654, "logits/rejected": -0.4536167085170746, "logps/chosen": -275.3851623535156, "logps/rejected": -258.8686218261719, "loss": 0.6846, "rewards/accuracies": 0.625, "rewards/chosen": 0.2932305335998535, "rewards/margins": 0.029986288398504257, "rewards/rejected": 0.26324427127838135, "step": 1135 }, { "epoch": 0.6126466226237023, "grad_norm": 7.320465087890625, "learning_rate": 1.9581348236704212e-07, "logits/chosen": 0.9538546204566956, "logits/rejected": 0.26271817088127136, "logps/chosen": -208.914306640625, "logps/rejected": -172.64231872558594, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": 0.2679959237575531, "rewards/margins": 0.12013835459947586, "rewards/rejected": 0.14785757660865784, "step": 1136 }, { "epoch": 0.6131859242281246, "grad_norm": 7.533596515655518, "learning_rate": 1.953539098690576e-07, "logits/chosen": 1.0505880117416382, "logits/rejected": 0.6484179496765137, "logps/chosen": -261.496826171875, "logps/rejected": -297.26519775390625, "loss": 0.6843, "rewards/accuracies": 0.625, "rewards/chosen": 0.1557752639055252, "rewards/margins": 0.026452254503965378, "rewards/rejected": 0.12932300567626953, "step": 1137 }, { "epoch": 0.6137252258325469, "grad_norm": 7.13694953918457, "learning_rate": 1.9489453122143603e-07, "logits/chosen": 0.8116291761398315, "logits/rejected": -0.8284245133399963, "logps/chosen": -259.16741943359375, "logps/rejected": -172.55996704101562, "loss": 0.6146, "rewards/accuracies": 0.875, "rewards/chosen": 0.3060327470302582, "rewards/margins": 0.1691143959760666, "rewards/rejected": 0.13691836595535278, "step": 1138 }, { "epoch": 0.6142645274369691, "grad_norm": 8.955361366271973, "learning_rate": 1.9443534805376735e-07, "logits/chosen": -0.4227495491504669, "logits/rejected": -0.6501495838165283, "logps/chosen": -248.41326904296875, "logps/rejected": -241.51507568359375, "loss": 0.7055, "rewards/accuracies": 0.375, "rewards/chosen": 0.20741339027881622, "rewards/margins": -0.006138898432254791, "rewards/rejected": 0.21355228126049042, "step": 1139 }, { "epoch": 0.6148038290413914, "grad_norm": 7.412377834320068, "learning_rate": 1.9397636199494806e-07, "logits/chosen": -0.24463093280792236, "logits/rejected": 0.05907824635505676, "logps/chosen": -246.31655883789062, "logps/rejected": -312.3772888183594, "loss": 0.7036, "rewards/accuracies": 0.375, "rewards/chosen": 0.15195828676223755, "rewards/margins": -0.017293259501457214, "rewards/rejected": 0.16925154626369476, "step": 1140 }, { "epoch": 0.6153431306458137, "grad_norm": 7.617393493652344, "learning_rate": 1.9351757467317535e-07, "logits/chosen": 0.46443286538124084, "logits/rejected": -0.6913691759109497, "logps/chosen": -244.05966186523438, "logps/rejected": -174.49703979492188, "loss": 0.6418, "rewards/accuracies": 0.5, "rewards/chosen": 0.2793215811252594, "rewards/margins": 0.1158314198255539, "rewards/rejected": 0.1634901463985443, "step": 1141 }, { "epoch": 0.615882432250236, "grad_norm": 8.018550872802734, "learning_rate": 1.9305898771594148e-07, "logits/chosen": -0.22948512434959412, "logits/rejected": -0.6760820150375366, "logps/chosen": -262.951416015625, "logps/rejected": -233.80621337890625, "loss": 0.6784, "rewards/accuracies": 0.625, "rewards/chosen": 0.24436253309249878, "rewards/margins": 0.03698377311229706, "rewards/rejected": 0.20737877488136292, "step": 1142 }, { "epoch": 0.6164217338546583, "grad_norm": 6.107267379760742, "learning_rate": 1.9260060275002799e-07, "logits/chosen": 0.005804136395454407, "logits/rejected": -0.07425171136856079, "logps/chosen": -174.83615112304688, "logps/rejected": -221.99380493164062, "loss": 0.7005, "rewards/accuracies": 0.5, "rewards/chosen": 0.15905094146728516, "rewards/margins": -0.008934587240219116, "rewards/rejected": 0.16798552870750427, "step": 1143 }, { "epoch": 0.6169610354590805, "grad_norm": 7.729963779449463, "learning_rate": 1.9214242140149983e-07, "logits/chosen": 0.36681655049324036, "logits/rejected": -0.48535969853401184, "logps/chosen": -227.74472045898438, "logps/rejected": -200.97109985351562, "loss": 0.6502, "rewards/accuracies": 0.625, "rewards/chosen": 0.27964115142822266, "rewards/margins": 0.10017186403274536, "rewards/rejected": 0.1794693022966385, "step": 1144 }, { "epoch": 0.6175003370635027, "grad_norm": 8.031439781188965, "learning_rate": 1.9168444529569966e-07, "logits/chosen": 0.6299063563346863, "logits/rejected": 0.4383161664009094, "logps/chosen": -372.1654052734375, "logps/rejected": -311.6965637207031, "loss": 0.7146, "rewards/accuracies": 0.5, "rewards/chosen": 0.24004268646240234, "rewards/margins": -0.0296136736869812, "rewards/rejected": 0.26965638995170593, "step": 1145 }, { "epoch": 0.618039638667925, "grad_norm": 7.418875694274902, "learning_rate": 1.91226676057242e-07, "logits/chosen": -0.6997906565666199, "logits/rejected": -0.47037872672080994, "logps/chosen": -257.3505554199219, "logps/rejected": -222.84243774414062, "loss": 0.6978, "rewards/accuracies": 0.5, "rewards/chosen": 0.2624339163303375, "rewards/margins": 0.001905348151922226, "rewards/rejected": 0.260528564453125, "step": 1146 }, { "epoch": 0.6185789402723473, "grad_norm": 6.757796287536621, "learning_rate": 1.907691153100076e-07, "logits/chosen": 0.5603993535041809, "logits/rejected": -0.250858873128891, "logps/chosen": -239.5217742919922, "logps/rejected": -218.88694763183594, "loss": 0.6696, "rewards/accuracies": 0.5, "rewards/chosen": 0.2434822916984558, "rewards/margins": 0.053227417171001434, "rewards/rejected": 0.19025488197803497, "step": 1147 }, { "epoch": 0.6191182418767696, "grad_norm": 6.496389865875244, "learning_rate": 1.903117646771376e-07, "logits/chosen": -0.5308582186698914, "logits/rejected": -1.3100394010543823, "logps/chosen": -230.70440673828125, "logps/rejected": -222.41671752929688, "loss": 0.6607, "rewards/accuracies": 0.75, "rewards/chosen": 0.2539334297180176, "rewards/margins": 0.0752679854631424, "rewards/rejected": 0.17866545915603638, "step": 1148 }, { "epoch": 0.6196575434811918, "grad_norm": 7.58078670501709, "learning_rate": 1.8985462578102784e-07, "logits/chosen": 0.2620345652103424, "logits/rejected": 0.27799445390701294, "logps/chosen": -211.55264282226562, "logps/rejected": -255.18795776367188, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": 0.25990965962409973, "rewards/margins": 0.12080613523721695, "rewards/rejected": 0.139103502035141, "step": 1149 }, { "epoch": 0.6201968450856141, "grad_norm": 7.166930675506592, "learning_rate": 1.893977002433229e-07, "logits/chosen": 0.10153110325336456, "logits/rejected": -0.4107010066509247, "logps/chosen": -236.03298950195312, "logps/rejected": -242.7576446533203, "loss": 0.6783, "rewards/accuracies": 0.75, "rewards/chosen": 0.2416427582502365, "rewards/margins": 0.05806141346693039, "rewards/rejected": 0.18358135223388672, "step": 1150 }, { "epoch": 0.6207361466900364, "grad_norm": 7.3650712966918945, "learning_rate": 1.8894098968491067e-07, "logits/chosen": 0.20379310846328735, "logits/rejected": -0.05229055881500244, "logps/chosen": -294.8105163574219, "logps/rejected": -246.7111053466797, "loss": 0.6741, "rewards/accuracies": 0.75, "rewards/chosen": 0.2781337797641754, "rewards/margins": 0.04709911346435547, "rewards/rejected": 0.23103466629981995, "step": 1151 }, { "epoch": 0.6212754482944587, "grad_norm": 7.425689220428467, "learning_rate": 1.8848449572591627e-07, "logits/chosen": -0.3860335350036621, "logits/rejected": -0.7086509466171265, "logps/chosen": -178.1561737060547, "logps/rejected": -147.7825927734375, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": 0.08062276989221573, "rewards/margins": 0.08571644127368927, "rewards/rejected": -0.005093667656183243, "step": 1152 }, { "epoch": 0.621814749898881, "grad_norm": 8.682735443115234, "learning_rate": 1.880282199856966e-07, "logits/chosen": 0.7472316026687622, "logits/rejected": -0.9560364484786987, "logps/chosen": -351.6298828125, "logps/rejected": -245.7984161376953, "loss": 0.6156, "rewards/accuracies": 0.75, "rewards/chosen": 0.28383857011795044, "rewards/margins": 0.17274218797683716, "rewards/rejected": 0.11109638214111328, "step": 1153 }, { "epoch": 0.6223540515033033, "grad_norm": 7.930273532867432, "learning_rate": 1.875721640828344e-07, "logits/chosen": 0.7308890223503113, "logits/rejected": -0.6159887313842773, "logps/chosen": -302.97802734375, "logps/rejected": -200.94131469726562, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": 0.24005794525146484, "rewards/margins": 0.030814550817012787, "rewards/rejected": 0.20924340188503265, "step": 1154 }, { "epoch": 0.6228933531077255, "grad_norm": 7.882028579711914, "learning_rate": 1.8711632963513235e-07, "logits/chosen": 0.3075273633003235, "logits/rejected": -0.5401034355163574, "logps/chosen": -290.6556701660156, "logps/rejected": -378.50189208984375, "loss": 0.6605, "rewards/accuracies": 0.375, "rewards/chosen": 0.2296573519706726, "rewards/margins": 0.08454389870166779, "rewards/rejected": 0.14511346817016602, "step": 1155 }, { "epoch": 0.6234326547121478, "grad_norm": 8.46735668182373, "learning_rate": 1.8666071825960807e-07, "logits/chosen": -0.5704637765884399, "logits/rejected": 0.4215833246707916, "logps/chosen": -213.59341430664062, "logps/rejected": -229.6951904296875, "loss": 0.7325, "rewards/accuracies": 0.375, "rewards/chosen": 0.1905716061592102, "rewards/margins": -0.06888771057128906, "rewards/rejected": 0.25945931673049927, "step": 1156 }, { "epoch": 0.62397195631657, "grad_norm": 6.432672500610352, "learning_rate": 1.8620533157248744e-07, "logits/chosen": -0.9418289065361023, "logits/rejected": -0.5990827679634094, "logps/chosen": -172.26495361328125, "logps/rejected": -193.11041259765625, "loss": 0.6936, "rewards/accuracies": 0.625, "rewards/chosen": 0.23719856142997742, "rewards/margins": 0.02376556023955345, "rewards/rejected": 0.21343299746513367, "step": 1157 }, { "epoch": 0.6245112579209923, "grad_norm": 8.37438678741455, "learning_rate": 1.8575017118919928e-07, "logits/chosen": -0.5601138472557068, "logits/rejected": -1.220430850982666, "logps/chosen": -296.25811767578125, "logps/rejected": -282.49432373046875, "loss": 0.7005, "rewards/accuracies": 0.625, "rewards/chosen": 0.2883869409561157, "rewards/margins": 0.008807465434074402, "rewards/rejected": 0.2795794606208801, "step": 1158 }, { "epoch": 0.6250505595254146, "grad_norm": 7.665247917175293, "learning_rate": 1.8529523872436977e-07, "logits/chosen": 0.4490460157394409, "logits/rejected": -0.07280510663986206, "logps/chosen": -244.39678955078125, "logps/rejected": -247.23428344726562, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": 0.2271472066640854, "rewards/margins": 0.11969384551048279, "rewards/rejected": 0.1074533462524414, "step": 1159 }, { "epoch": 0.6255898611298368, "grad_norm": 7.276493072509766, "learning_rate": 1.8484053579181658e-07, "logits/chosen": -1.0534909963607788, "logits/rejected": 0.02234492637217045, "logps/chosen": -206.37890625, "logps/rejected": -274.36602783203125, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": 0.20182161033153534, "rewards/margins": 0.009370137006044388, "rewards/rejected": 0.19245147705078125, "step": 1160 }, { "epoch": 0.6261291627342591, "grad_norm": 8.555850982666016, "learning_rate": 1.843860640045431e-07, "logits/chosen": 0.8707390427589417, "logits/rejected": 0.16777867078781128, "logps/chosen": -307.90008544921875, "logps/rejected": -249.39418029785156, "loss": 0.6521, "rewards/accuracies": 0.625, "rewards/chosen": 0.19859200716018677, "rewards/margins": 0.0938694030046463, "rewards/rejected": 0.10472259670495987, "step": 1161 }, { "epoch": 0.6266684643386814, "grad_norm": 7.20280122756958, "learning_rate": 1.839318249747327e-07, "logits/chosen": -0.16597804427146912, "logits/rejected": 0.38041508197784424, "logps/chosen": -160.17913818359375, "logps/rejected": -194.00889587402344, "loss": 0.6633, "rewards/accuracies": 0.625, "rewards/chosen": 0.24642905592918396, "rewards/margins": 0.07254457473754883, "rewards/rejected": 0.17388449609279633, "step": 1162 }, { "epoch": 0.6272077659431037, "grad_norm": 8.21740436553955, "learning_rate": 1.8347782031374315e-07, "logits/chosen": -0.514940083026886, "logits/rejected": -0.11791981756687164, "logps/chosen": -160.24490356445312, "logps/rejected": -184.51666259765625, "loss": 0.8213, "rewards/accuracies": 0.125, "rewards/chosen": 0.10043339431285858, "rewards/margins": -0.233601912856102, "rewards/rejected": 0.33403530716896057, "step": 1163 }, { "epoch": 0.627747067547526, "grad_norm": 7.577322483062744, "learning_rate": 1.8302405163210078e-07, "logits/chosen": -0.4853138327598572, "logits/rejected": -0.8670142889022827, "logps/chosen": -224.70584106445312, "logps/rejected": -210.80963134765625, "loss": 0.6611, "rewards/accuracies": 0.5, "rewards/chosen": 0.2121056616306305, "rewards/margins": 0.0734746903181076, "rewards/rejected": 0.1386309564113617, "step": 1164 }, { "epoch": 0.6282863691519482, "grad_norm": 6.774302005767822, "learning_rate": 1.825705205394949e-07, "logits/chosen": -0.9532215595245361, "logits/rejected": 0.35633620619773865, "logps/chosen": -236.22030639648438, "logps/rejected": -273.98944091796875, "loss": 0.6486, "rewards/accuracies": 0.5, "rewards/chosen": 0.23015080392360687, "rewards/margins": 0.10845452547073364, "rewards/rejected": 0.12169627845287323, "step": 1165 }, { "epoch": 0.6288256707563705, "grad_norm": 7.883232593536377, "learning_rate": 1.8211722864477193e-07, "logits/chosen": 0.9393333792686462, "logits/rejected": 0.47642627358436584, "logps/chosen": -196.26295471191406, "logps/rejected": -186.57958984375, "loss": 0.6817, "rewards/accuracies": 0.5, "rewards/chosen": 0.18258944153785706, "rewards/margins": 0.03596458584070206, "rewards/rejected": 0.1466248482465744, "step": 1166 }, { "epoch": 0.6293649723607928, "grad_norm": 6.092465400695801, "learning_rate": 1.8166417755592973e-07, "logits/chosen": 0.9033714532852173, "logits/rejected": 0.6139848232269287, "logps/chosen": -226.735595703125, "logps/rejected": -244.45274353027344, "loss": 0.6891, "rewards/accuracies": 0.375, "rewards/chosen": 0.3423532545566559, "rewards/margins": 0.01999472826719284, "rewards/rejected": 0.32235854864120483, "step": 1167 }, { "epoch": 0.6299042739652151, "grad_norm": 7.046959400177002, "learning_rate": 1.8121136888011197e-07, "logits/chosen": 1.4804067611694336, "logits/rejected": -0.5473349690437317, "logps/chosen": -263.405029296875, "logps/rejected": -186.02590942382812, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": 0.2706846296787262, "rewards/margins": 0.11179789900779724, "rewards/rejected": 0.15888671576976776, "step": 1168 }, { "epoch": 0.6304435755696374, "grad_norm": 11.733664512634277, "learning_rate": 1.8075880422360242e-07, "logits/chosen": -0.4362124800682068, "logits/rejected": 0.18849366903305054, "logps/chosen": -233.6059112548828, "logps/rejected": -238.49868774414062, "loss": 0.7082, "rewards/accuracies": 0.375, "rewards/chosen": 0.1678587943315506, "rewards/margins": -0.025295928120613098, "rewards/rejected": 0.1931547224521637, "step": 1169 }, { "epoch": 0.6309828771740595, "grad_norm": 8.00326919555664, "learning_rate": 1.8030648519181923e-07, "logits/chosen": -0.020949169993400574, "logits/rejected": -0.34498918056488037, "logps/chosen": -194.9423828125, "logps/rejected": -222.23915100097656, "loss": 0.676, "rewards/accuracies": 0.75, "rewards/chosen": 0.15572614967823029, "rewards/margins": 0.04772138595581055, "rewards/rejected": 0.10800476372241974, "step": 1170 }, { "epoch": 0.6315221787784818, "grad_norm": 6.210102081298828, "learning_rate": 1.7985441338930913e-07, "logits/chosen": 1.406675100326538, "logits/rejected": 0.07842929661273956, "logps/chosen": -184.90975952148438, "logps/rejected": -168.97320556640625, "loss": 0.6427, "rewards/accuracies": 0.75, "rewards/chosen": 0.17311066389083862, "rewards/margins": 0.11156788468360901, "rewards/rejected": 0.06154279783368111, "step": 1171 }, { "epoch": 0.6320614803829041, "grad_norm": 6.773646354675293, "learning_rate": 1.7940259041974186e-07, "logits/chosen": 0.5967315435409546, "logits/rejected": -0.9134011268615723, "logps/chosen": -172.16177368164062, "logps/rejected": -198.17715454101562, "loss": 0.6548, "rewards/accuracies": 0.625, "rewards/chosen": 0.2539263963699341, "rewards/margins": 0.09112949669361115, "rewards/rejected": 0.16279688477516174, "step": 1172 }, { "epoch": 0.6326007819873264, "grad_norm": 8.321001052856445, "learning_rate": 1.789510178859046e-07, "logits/chosen": 0.31582313776016235, "logits/rejected": -0.08202079683542252, "logps/chosen": -225.57684326171875, "logps/rejected": -240.81588745117188, "loss": 0.7501, "rewards/accuracies": 0.375, "rewards/chosen": 0.1693107634782791, "rewards/margins": -0.10048122704029083, "rewards/rejected": 0.26979202032089233, "step": 1173 }, { "epoch": 0.6331400835917487, "grad_norm": 8.003087997436523, "learning_rate": 1.7849969738969588e-07, "logits/chosen": 0.2907410264015198, "logits/rejected": 0.23548723757266998, "logps/chosen": -265.8203125, "logps/rejected": -255.93466186523438, "loss": 0.701, "rewards/accuracies": 0.5, "rewards/chosen": 0.18193894624710083, "rewards/margins": -0.012404154054820538, "rewards/rejected": 0.19434309005737305, "step": 1174 }, { "epoch": 0.633679385196171, "grad_norm": 6.992754936218262, "learning_rate": 1.7804863053212054e-07, "logits/chosen": 0.22345639765262604, "logits/rejected": 0.2990436553955078, "logps/chosen": -262.2685241699219, "logps/rejected": -285.3060302734375, "loss": 0.614, "rewards/accuracies": 0.75, "rewards/chosen": 0.26681604981422424, "rewards/margins": 0.17774219810962677, "rewards/rejected": 0.08907384425401688, "step": 1175 }, { "epoch": 0.6342186868005932, "grad_norm": 7.702865123748779, "learning_rate": 1.7759781891328318e-07, "logits/chosen": 0.7692366242408752, "logits/rejected": -0.016564682126045227, "logps/chosen": -226.77684020996094, "logps/rejected": -237.56690979003906, "loss": 0.6964, "rewards/accuracies": 0.375, "rewards/chosen": 0.32155361771583557, "rewards/margins": 0.002043437212705612, "rewards/rejected": 0.31951016187667847, "step": 1176 }, { "epoch": 0.6347579884050155, "grad_norm": 7.16938591003418, "learning_rate": 1.7714726413238336e-07, "logits/chosen": 0.3151836395263672, "logits/rejected": -1.0129977464675903, "logps/chosen": -185.43099975585938, "logps/rejected": -166.35498046875, "loss": 0.6364, "rewards/accuracies": 0.625, "rewards/chosen": 0.2167411893606186, "rewards/margins": 0.12410622090101242, "rewards/rejected": 0.09263496100902557, "step": 1177 }, { "epoch": 0.6352972900094378, "grad_norm": 6.628982067108154, "learning_rate": 1.7669696778770938e-07, "logits/chosen": 0.4819696545600891, "logits/rejected": 0.36127954721450806, "logps/chosen": -214.35057067871094, "logps/rejected": -267.2200927734375, "loss": 0.6679, "rewards/accuracies": 0.5, "rewards/chosen": 0.16437625885009766, "rewards/margins": 0.05542588606476784, "rewards/rejected": 0.10895037651062012, "step": 1178 }, { "epoch": 0.6358365916138601, "grad_norm": 7.591000556945801, "learning_rate": 1.762469314766328e-07, "logits/chosen": 0.009154997766017914, "logits/rejected": -0.5846494436264038, "logps/chosen": -255.24356079101562, "logps/rejected": -280.1429138183594, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": 0.321654349565506, "rewards/margins": 0.1568600833415985, "rewards/rejected": 0.16479425132274628, "step": 1179 }, { "epoch": 0.6363758932182824, "grad_norm": 6.910820484161377, "learning_rate": 1.757971567956027e-07, "logits/chosen": 0.5902112126350403, "logits/rejected": -0.40497887134552, "logps/chosen": -207.41030883789062, "logps/rejected": -225.05479431152344, "loss": 0.6335, "rewards/accuracies": 0.875, "rewards/chosen": 0.24714604020118713, "rewards/margins": 0.1300654411315918, "rewards/rejected": 0.11708059161901474, "step": 1180 }, { "epoch": 0.6369151948227046, "grad_norm": 9.018316268920898, "learning_rate": 1.7534764534014013e-07, "logits/chosen": -0.4677201509475708, "logits/rejected": 0.07178756594657898, "logps/chosen": -227.21205139160156, "logps/rejected": -329.8903503417969, "loss": 0.706, "rewards/accuracies": 0.5, "rewards/chosen": 0.285970002412796, "rewards/margins": -0.018691927194595337, "rewards/rejected": 0.30466192960739136, "step": 1181 }, { "epoch": 0.6374544964271268, "grad_norm": 8.276886940002441, "learning_rate": 1.7489839870483234e-07, "logits/chosen": 0.6645281314849854, "logits/rejected": -0.8268055319786072, "logps/chosen": -271.120361328125, "logps/rejected": -176.72552490234375, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": 0.2186364233493805, "rewards/margins": 0.09829229861497879, "rewards/rejected": 0.1203441247344017, "step": 1182 }, { "epoch": 0.6379937980315491, "grad_norm": 6.783848285675049, "learning_rate": 1.7444941848332715e-07, "logits/chosen": -0.08521327376365662, "logits/rejected": -0.3995366096496582, "logps/chosen": -257.1790771484375, "logps/rejected": -214.82557678222656, "loss": 0.6945, "rewards/accuracies": 0.625, "rewards/chosen": 0.143584206700325, "rewards/margins": 0.0012064464390277863, "rewards/rejected": 0.14237776398658752, "step": 1183 }, { "epoch": 0.6385330996359714, "grad_norm": 7.670401096343994, "learning_rate": 1.740007062683273e-07, "logits/chosen": 0.8127233982086182, "logits/rejected": -1.5412542819976807, "logps/chosen": -264.8668212890625, "logps/rejected": -158.280029296875, "loss": 0.5711, "rewards/accuracies": 0.75, "rewards/chosen": 0.273093044757843, "rewards/margins": 0.2739124298095703, "rewards/rejected": -0.0008193943649530411, "step": 1184 }, { "epoch": 0.6390724012403937, "grad_norm": 7.658565044403076, "learning_rate": 1.7355226365158488e-07, "logits/chosen": -0.3768589496612549, "logits/rejected": -0.9727158546447754, "logps/chosen": -286.08367919921875, "logps/rejected": -302.5654296875, "loss": 0.6275, "rewards/accuracies": 0.875, "rewards/chosen": 0.29694825410842896, "rewards/margins": 0.14490434527397156, "rewards/rejected": 0.1520439088344574, "step": 1185 }, { "epoch": 0.6396117028448159, "grad_norm": 8.019857406616211, "learning_rate": 1.731040922238956e-07, "logits/chosen": -0.2866225838661194, "logits/rejected": -0.731299877166748, "logps/chosen": -240.5928192138672, "logps/rejected": -296.44683837890625, "loss": 0.6657, "rewards/accuracies": 0.5, "rewards/chosen": 0.2539919912815094, "rewards/margins": 0.0700409784913063, "rewards/rejected": 0.1839510053396225, "step": 1186 }, { "epoch": 0.6401510044492382, "grad_norm": 7.739345550537109, "learning_rate": 1.7265619357509314e-07, "logits/chosen": 0.5243948698043823, "logits/rejected": 0.5972951054573059, "logps/chosen": -220.24380493164062, "logps/rejected": -229.5064239501953, "loss": 0.6768, "rewards/accuracies": 0.625, "rewards/chosen": 0.23661470413208008, "rewards/margins": 0.049577999860048294, "rewards/rejected": 0.1870366930961609, "step": 1187 }, { "epoch": 0.6406903060536605, "grad_norm": 8.105902671813965, "learning_rate": 1.722085692940434e-07, "logits/chosen": 0.7479959726333618, "logits/rejected": -0.6971269845962524, "logps/chosen": -267.0198059082031, "logps/rejected": -274.1193542480469, "loss": 0.6462, "rewards/accuracies": 0.75, "rewards/chosen": 0.28940415382385254, "rewards/margins": 0.12646737694740295, "rewards/rejected": 0.16293677687644958, "step": 1188 }, { "epoch": 0.6412296076580828, "grad_norm": 7.523765563964844, "learning_rate": 1.717612209686392e-07, "logits/chosen": 0.441253662109375, "logits/rejected": -0.9998236894607544, "logps/chosen": -333.5115661621094, "logps/rejected": -273.5065002441406, "loss": 0.6165, "rewards/accuracies": 0.75, "rewards/chosen": 0.32118168473243713, "rewards/margins": 0.17132148146629333, "rewards/rejected": 0.1498602032661438, "step": 1189 }, { "epoch": 0.6417689092625051, "grad_norm": 6.795889377593994, "learning_rate": 1.7131415018579428e-07, "logits/chosen": 0.9832770824432373, "logits/rejected": 0.03846028447151184, "logps/chosen": -249.246826171875, "logps/rejected": -204.27667236328125, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": 0.2498067021369934, "rewards/margins": 0.04557028412818909, "rewards/rejected": 0.20423641800880432, "step": 1190 }, { "epoch": 0.6423082108669274, "grad_norm": 8.451519012451172, "learning_rate": 1.7086735853143802e-07, "logits/chosen": 0.308464914560318, "logits/rejected": 0.02298460341989994, "logps/chosen": -199.439697265625, "logps/rejected": -220.95767211914062, "loss": 0.7762, "rewards/accuracies": 0.375, "rewards/chosen": 0.10286235809326172, "rewards/margins": -0.15326356887817383, "rewards/rejected": 0.25612592697143555, "step": 1191 }, { "epoch": 0.6428475124713496, "grad_norm": 7.963018894195557, "learning_rate": 1.7042084759050946e-07, "logits/chosen": -0.15688546001911163, "logits/rejected": 0.12044945359230042, "logps/chosen": -204.9170684814453, "logps/rejected": -207.4677276611328, "loss": 0.766, "rewards/accuracies": 0.125, "rewards/chosen": 0.1366005539894104, "rewards/margins": -0.1306135654449463, "rewards/rejected": 0.2672141194343567, "step": 1192 }, { "epoch": 0.6433868140757719, "grad_norm": 6.12981653213501, "learning_rate": 1.6997461894695181e-07, "logits/chosen": 0.3487786054611206, "logits/rejected": 0.22986161708831787, "logps/chosen": -199.44973754882812, "logps/rejected": -196.54774475097656, "loss": 0.6974, "rewards/accuracies": 0.625, "rewards/chosen": 0.18999643623828888, "rewards/margins": 0.0010580085217952728, "rewards/rejected": 0.1889384388923645, "step": 1193 }, { "epoch": 0.6439261156801942, "grad_norm": 7.222141742706299, "learning_rate": 1.6952867418370706e-07, "logits/chosen": 1.3104383945465088, "logits/rejected": 0.5658509135246277, "logps/chosen": -182.25657653808594, "logps/rejected": -160.0230712890625, "loss": 0.6944, "rewards/accuracies": 0.625, "rewards/chosen": 0.18514500558376312, "rewards/margins": 0.018619442358613014, "rewards/rejected": 0.16652554273605347, "step": 1194 }, { "epoch": 0.6444654172846164, "grad_norm": 8.846531867980957, "learning_rate": 1.6908301488270998e-07, "logits/chosen": 1.6503506898880005, "logits/rejected": 0.14829997718334198, "logps/chosen": -273.49371337890625, "logps/rejected": -247.4159393310547, "loss": 0.6512, "rewards/accuracies": 0.75, "rewards/chosen": 0.38472041487693787, "rewards/margins": 0.09989787638187408, "rewards/rejected": 0.28482258319854736, "step": 1195 }, { "epoch": 0.6450047188890387, "grad_norm": 8.990506172180176, "learning_rate": 1.686376426248829e-07, "logits/chosen": -0.3383890390396118, "logits/rejected": -1.5846750736236572, "logps/chosen": -258.1368103027344, "logps/rejected": -231.9687957763672, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": 0.21508283913135529, "rewards/margins": 0.04843369126319885, "rewards/rejected": 0.16664916276931763, "step": 1196 }, { "epoch": 0.6455440204934609, "grad_norm": 10.623568534851074, "learning_rate": 1.681925589901296e-07, "logits/chosen": 0.5443375110626221, "logits/rejected": -0.5604634284973145, "logps/chosen": -228.53054809570312, "logps/rejected": -188.6937255859375, "loss": 0.7093, "rewards/accuracies": 0.5, "rewards/chosen": 0.07884159684181213, "rewards/margins": -0.024642756208777428, "rewards/rejected": 0.10348434746265411, "step": 1197 }, { "epoch": 0.6460833220978832, "grad_norm": 6.591259002685547, "learning_rate": 1.6774776555733028e-07, "logits/chosen": 0.3771324157714844, "logits/rejected": -1.6438080072402954, "logps/chosen": -299.3226318359375, "logps/rejected": -192.78196716308594, "loss": 0.6612, "rewards/accuracies": 0.625, "rewards/chosen": 0.23297223448753357, "rewards/margins": 0.07911081612110138, "rewards/rejected": 0.1538614183664322, "step": 1198 }, { "epoch": 0.6466226237023055, "grad_norm": 7.754515171051025, "learning_rate": 1.6730326390433552e-07, "logits/chosen": 0.722075879573822, "logits/rejected": -0.05511082708835602, "logps/chosen": -226.2930908203125, "logps/rejected": -183.70994567871094, "loss": 0.6753, "rewards/accuracies": 0.5, "rewards/chosen": 0.2417423278093338, "rewards/margins": 0.05010538175702095, "rewards/rejected": 0.19163693487644196, "step": 1199 }, { "epoch": 0.6471619253067278, "grad_norm": 8.236790657043457, "learning_rate": 1.6685905560796098e-07, "logits/chosen": -0.4815419316291809, "logits/rejected": -0.48215168714523315, "logps/chosen": -238.57586669921875, "logps/rejected": -337.91192626953125, "loss": 0.6029, "rewards/accuracies": 0.875, "rewards/chosen": 0.3035772442817688, "rewards/margins": 0.20606079697608948, "rewards/rejected": 0.09751643985509872, "step": 1200 }, { "epoch": 0.6471619253067278, "eval_logits/chosen": 1.364848017692566, "eval_logits/rejected": 1.0919498205184937, "eval_logps/chosen": -249.18165588378906, "eval_logps/rejected": -234.18333435058594, "eval_loss": 0.671845555305481, "eval_rewards/accuracies": 0.6080745458602905, "eval_rewards/chosen": 0.22698259353637695, "eval_rewards/margins": 0.05906076356768608, "eval_rewards/rejected": 0.16792185604572296, "eval_runtime": 836.6377, "eval_samples_per_second": 1.924, "eval_steps_per_second": 0.962, "step": 1200 }, { "epoch": 0.6477012269111501, "grad_norm": 7.56683874130249, "learning_rate": 1.6641514224398163e-07, "logits/chosen": -0.49273788928985596, "logits/rejected": 0.5314897298812866, "logps/chosen": -217.83226013183594, "logps/rejected": -248.3484344482422, "loss": 0.6934, "rewards/accuracies": 0.375, "rewards/chosen": 0.24194346368312836, "rewards/margins": 0.01719846948981285, "rewards/rejected": 0.224744975566864, "step": 1201 }, { "epoch": 0.6482405285155723, "grad_norm": 6.483404159545898, "learning_rate": 1.6597152538712606e-07, "logits/chosen": 0.8200241923332214, "logits/rejected": -0.6519283056259155, "logps/chosen": -317.65252685546875, "logps/rejected": -242.96878051757812, "loss": 0.6205, "rewards/accuracies": 0.625, "rewards/chosen": 0.27121105790138245, "rewards/margins": 0.17102956771850586, "rewards/rejected": 0.10018149018287659, "step": 1202 }, { "epoch": 0.6487798301199946, "grad_norm": 7.679973602294922, "learning_rate": 1.6552820661107117e-07, "logits/chosen": 0.28616106510162354, "logits/rejected": -1.168559193611145, "logps/chosen": -251.21852111816406, "logps/rejected": -177.544189453125, "loss": 0.5987, "rewards/accuracies": 0.875, "rewards/chosen": 0.20727692544460297, "rewards/margins": 0.20977813005447388, "rewards/rejected": -0.002501200884580612, "step": 1203 }, { "epoch": 0.6493191317244169, "grad_norm": 6.833235740661621, "learning_rate": 1.650851874884365e-07, "logits/chosen": 0.044530127197504044, "logits/rejected": -0.9585328102111816, "logps/chosen": -214.8104705810547, "logps/rejected": -167.74993896484375, "loss": 0.6076, "rewards/accuracies": 0.875, "rewards/chosen": 0.2585294842720032, "rewards/margins": 0.1892833709716797, "rewards/rejected": 0.06924609839916229, "step": 1204 }, { "epoch": 0.6498584333288392, "grad_norm": 7.455731391906738, "learning_rate": 1.6464246959077851e-07, "logits/chosen": -0.5942604541778564, "logits/rejected": 0.14883831143379211, "logps/chosen": -150.35269165039062, "logps/rejected": -239.7926483154297, "loss": 0.7073, "rewards/accuracies": 0.5, "rewards/chosen": 0.13866671919822693, "rewards/margins": -0.0031235739588737488, "rewards/rejected": 0.14179030060768127, "step": 1205 }, { "epoch": 0.6503977349332615, "grad_norm": 7.877978801727295, "learning_rate": 1.642000544885852e-07, "logits/chosen": 1.2402695417404175, "logits/rejected": -0.6136485934257507, "logps/chosen": -253.53353881835938, "logps/rejected": -188.25906372070312, "loss": 0.6259, "rewards/accuracies": 0.75, "rewards/chosen": 0.3263116776943207, "rewards/margins": 0.15120716392993927, "rewards/rejected": 0.1751045286655426, "step": 1206 }, { "epoch": 0.6509370365376836, "grad_norm": 7.779075622558594, "learning_rate": 1.6375794375127027e-07, "logits/chosen": 0.8625923991203308, "logits/rejected": 0.40223509073257446, "logps/chosen": -304.33233642578125, "logps/rejected": -272.1868896484375, "loss": 0.644, "rewards/accuracies": 0.625, "rewards/chosen": 0.25446170568466187, "rewards/margins": 0.11524096131324768, "rewards/rejected": 0.1392207145690918, "step": 1207 }, { "epoch": 0.6514763381421059, "grad_norm": 7.392035484313965, "learning_rate": 1.6331613894716783e-07, "logits/chosen": 0.37228137254714966, "logits/rejected": -0.8682358860969543, "logps/chosen": -215.02557373046875, "logps/rejected": -155.62551879882812, "loss": 0.6458, "rewards/accuracies": 0.5, "rewards/chosen": 0.2623698115348816, "rewards/margins": 0.10893267393112183, "rewards/rejected": 0.15343713760375977, "step": 1208 }, { "epoch": 0.6520156397465282, "grad_norm": 8.144752502441406, "learning_rate": 1.6287464164352683e-07, "logits/chosen": 0.01469886302947998, "logits/rejected": 0.3308033347129822, "logps/chosen": -197.79202270507812, "logps/rejected": -218.6881103515625, "loss": 0.6949, "rewards/accuracies": 0.25, "rewards/chosen": 0.1799289733171463, "rewards/margins": 0.005606077611446381, "rewards/rejected": 0.17432290315628052, "step": 1209 }, { "epoch": 0.6525549413509505, "grad_norm": 8.472554206848145, "learning_rate": 1.624334534065052e-07, "logits/chosen": 0.534291684627533, "logits/rejected": -1.074465036392212, "logps/chosen": -254.57708740234375, "logps/rejected": -192.79722595214844, "loss": 0.6604, "rewards/accuracies": 0.75, "rewards/chosen": 0.25756964087486267, "rewards/margins": 0.07802262157201767, "rewards/rejected": 0.1795470267534256, "step": 1210 }, { "epoch": 0.6530942429553728, "grad_norm": 6.006984233856201, "learning_rate": 1.619925758011646e-07, "logits/chosen": 0.003641285002231598, "logits/rejected": -0.2943687438964844, "logps/chosen": -195.4427490234375, "logps/rejected": -248.04766845703125, "loss": 0.6434, "rewards/accuracies": 0.75, "rewards/chosen": 0.32778680324554443, "rewards/margins": 0.11303525418043137, "rewards/rejected": 0.21475154161453247, "step": 1211 }, { "epoch": 0.653633544559795, "grad_norm": 6.993709087371826, "learning_rate": 1.6155201039146477e-07, "logits/chosen": 0.8929477334022522, "logits/rejected": -0.3138788044452667, "logps/chosen": -280.4254455566406, "logps/rejected": -197.85855102539062, "loss": 0.676, "rewards/accuracies": 0.5, "rewards/chosen": 0.1869582235813141, "rewards/margins": 0.04846916347742081, "rewards/rejected": 0.13848905265331268, "step": 1212 }, { "epoch": 0.6541728461642173, "grad_norm": 7.409475326538086, "learning_rate": 1.6111175874025795e-07, "logits/chosen": -0.429921954870224, "logits/rejected": -0.5167303085327148, "logps/chosen": -217.16073608398438, "logps/rejected": -232.1488037109375, "loss": 0.7319, "rewards/accuracies": 0.375, "rewards/chosen": 0.14598684012889862, "rewards/margins": -0.05784616619348526, "rewards/rejected": 0.20383301377296448, "step": 1213 }, { "epoch": 0.6547121477686396, "grad_norm": 7.135781288146973, "learning_rate": 1.606718224092833e-07, "logits/chosen": 0.9341559410095215, "logits/rejected": -0.43944448232650757, "logps/chosen": -294.229248046875, "logps/rejected": -188.56402587890625, "loss": 0.6119, "rewards/accuracies": 1.0, "rewards/chosen": 0.3149968087673187, "rewards/margins": 0.17606544494628906, "rewards/rejected": 0.13893136382102966, "step": 1214 }, { "epoch": 0.6552514493730619, "grad_norm": 7.2331318855285645, "learning_rate": 1.602322029591616e-07, "logits/chosen": 0.4516412317752838, "logits/rejected": -0.6847960352897644, "logps/chosen": -301.3878479003906, "logps/rejected": -247.11163330078125, "loss": 0.6579, "rewards/accuracies": 0.75, "rewards/chosen": 0.21828442811965942, "rewards/margins": 0.08602513372898102, "rewards/rejected": 0.1322592794895172, "step": 1215 }, { "epoch": 0.6557907509774842, "grad_norm": 8.223609924316406, "learning_rate": 1.5979290194938935e-07, "logits/chosen": 0.6730554103851318, "logits/rejected": -0.8663997054100037, "logps/chosen": -320.7861022949219, "logps/rejected": -282.78155517578125, "loss": 0.641, "rewards/accuracies": 0.75, "rewards/chosen": 0.24606361985206604, "rewards/margins": 0.11974058300256729, "rewards/rejected": 0.12632302939891815, "step": 1216 }, { "epoch": 0.6563300525819065, "grad_norm": 6.891792297363281, "learning_rate": 1.5935392093833356e-07, "logits/chosen": 0.8005256652832031, "logits/rejected": 1.237368106842041, "logps/chosen": -255.4743194580078, "logps/rejected": -248.83103942871094, "loss": 0.6661, "rewards/accuracies": 0.375, "rewards/chosen": 0.2522619366645813, "rewards/margins": 0.0669003501534462, "rewards/rejected": 0.1853615790605545, "step": 1217 }, { "epoch": 0.6568693541863287, "grad_norm": 7.5281291007995605, "learning_rate": 1.5891526148322593e-07, "logits/chosen": 0.6743701100349426, "logits/rejected": -0.5812419652938843, "logps/chosen": -296.09991455078125, "logps/rejected": -206.58978271484375, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": 0.28607702255249023, "rewards/margins": 0.008136846125125885, "rewards/rejected": 0.27794018387794495, "step": 1218 }, { "epoch": 0.657408655790751, "grad_norm": 7.4411702156066895, "learning_rate": 1.5847692514015753e-07, "logits/chosen": 0.2206445038318634, "logits/rejected": -0.5326657295227051, "logps/chosen": -253.77716064453125, "logps/rejected": -244.21424865722656, "loss": 0.6555, "rewards/accuracies": 0.75, "rewards/chosen": 0.21016272902488708, "rewards/margins": 0.08229026943445206, "rewards/rejected": 0.12787246704101562, "step": 1219 }, { "epoch": 0.6579479573951732, "grad_norm": 7.924025535583496, "learning_rate": 1.580389134640734e-07, "logits/chosen": -0.2916180193424225, "logits/rejected": -1.1466096639633179, "logps/chosen": -176.33407592773438, "logps/rejected": -150.75852966308594, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.1001138687133789, "rewards/margins": 0.009590629488229752, "rewards/rejected": 0.09052324295043945, "step": 1220 }, { "epoch": 0.6584872589995955, "grad_norm": 7.6184563636779785, "learning_rate": 1.5760122800876657e-07, "logits/chosen": 0.17421060800552368, "logits/rejected": -0.27868127822875977, "logps/chosen": -235.13800048828125, "logps/rejected": -212.99794006347656, "loss": 0.6985, "rewards/accuracies": 0.5, "rewards/chosen": 0.23239804804325104, "rewards/margins": -0.006538292393088341, "rewards/rejected": 0.23893633484840393, "step": 1221 }, { "epoch": 0.6590265606040178, "grad_norm": 7.755519390106201, "learning_rate": 1.571638703268731e-07, "logits/chosen": 0.8623460531234741, "logits/rejected": -1.0216418504714966, "logps/chosen": -278.5013732910156, "logps/rejected": -236.791748046875, "loss": 0.5415, "rewards/accuracies": 0.875, "rewards/chosen": 0.37181416153907776, "rewards/margins": 0.3475058674812317, "rewards/rejected": 0.024308297783136368, "step": 1222 }, { "epoch": 0.65956586220844, "grad_norm": 6.8399529457092285, "learning_rate": 1.5672684196986617e-07, "logits/chosen": -1.0495383739471436, "logits/rejected": -0.7380110025405884, "logps/chosen": -222.55128479003906, "logps/rejected": -235.45785522460938, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": 0.15731392800807953, "rewards/margins": -0.0014011375606060028, "rewards/rejected": 0.15871506929397583, "step": 1223 }, { "epoch": 0.6601051638128623, "grad_norm": 7.712538719177246, "learning_rate": 1.5629014448805077e-07, "logits/chosen": 0.6820339560508728, "logits/rejected": -0.6355302333831787, "logps/chosen": -286.2527160644531, "logps/rejected": -231.12896728515625, "loss": 0.6804, "rewards/accuracies": 0.625, "rewards/chosen": 0.2118942141532898, "rewards/margins": 0.0315919928252697, "rewards/rejected": 0.180302232503891, "step": 1224 }, { "epoch": 0.6606444654172846, "grad_norm": 6.54957389831543, "learning_rate": 1.558537794305581e-07, "logits/chosen": 1.6572496891021729, "logits/rejected": 0.054714545607566833, "logps/chosen": -247.88720703125, "logps/rejected": -235.2344207763672, "loss": 0.5661, "rewards/accuracies": 0.875, "rewards/chosen": 0.36334705352783203, "rewards/margins": 0.279940128326416, "rewards/rejected": 0.08340692520141602, "step": 1225 }, { "epoch": 0.6611837670217069, "grad_norm": 8.185343742370605, "learning_rate": 1.554177483453402e-07, "logits/chosen": -0.03621339797973633, "logits/rejected": 0.7751149535179138, "logps/chosen": -225.64068603515625, "logps/rejected": -228.68907165527344, "loss": 0.7266, "rewards/accuracies": 0.25, "rewards/chosen": 0.19245511293411255, "rewards/margins": -0.03959804028272629, "rewards/rejected": 0.23205313086509705, "step": 1226 }, { "epoch": 0.6617230686261292, "grad_norm": 9.192206382751465, "learning_rate": 1.5498205277916444e-07, "logits/chosen": 0.5535020232200623, "logits/rejected": 0.03805292397737503, "logps/chosen": -370.1622314453125, "logps/rejected": -271.77545166015625, "loss": 0.7506, "rewards/accuracies": 0.375, "rewards/chosen": 0.19929638504981995, "rewards/margins": -0.09400558471679688, "rewards/rejected": 0.2933019697666168, "step": 1227 }, { "epoch": 0.6622623702305515, "grad_norm": 7.284595012664795, "learning_rate": 1.5454669427760773e-07, "logits/chosen": 1.0399051904678345, "logits/rejected": -0.8224941492080688, "logps/chosen": -267.9743347167969, "logps/rejected": -244.63229370117188, "loss": 0.5639, "rewards/accuracies": 1.0, "rewards/chosen": 0.3376615643501282, "rewards/margins": 0.2850966453552246, "rewards/rejected": 0.05256490409374237, "step": 1228 }, { "epoch": 0.6628016718349737, "grad_norm": 7.88355016708374, "learning_rate": 1.541116743850515e-07, "logits/chosen": -0.6048586368560791, "logits/rejected": 0.7403733134269714, "logps/chosen": -225.37863159179688, "logps/rejected": -331.86328125, "loss": 0.6822, "rewards/accuracies": 0.5, "rewards/chosen": 0.15310965478420258, "rewards/margins": 0.026744559407234192, "rewards/rejected": 0.12636509537696838, "step": 1229 }, { "epoch": 0.663340973439396, "grad_norm": 7.516520977020264, "learning_rate": 1.5367699464467594e-07, "logits/chosen": 0.461116224527359, "logits/rejected": 0.5792497396469116, "logps/chosen": -243.39730834960938, "logps/rejected": -269.7962951660156, "loss": 0.7076, "rewards/accuracies": 0.375, "rewards/chosen": 0.27167215943336487, "rewards/margins": -0.022620104253292084, "rewards/rejected": 0.29429224133491516, "step": 1230 }, { "epoch": 0.6638802750438183, "grad_norm": 7.6668596267700195, "learning_rate": 1.5324265659845465e-07, "logits/chosen": -0.3139875829219818, "logits/rejected": 0.24164578318595886, "logps/chosen": -237.20614624023438, "logps/rejected": -253.70443725585938, "loss": 0.7223, "rewards/accuracies": 0.625, "rewards/chosen": 0.2508293390274048, "rewards/margins": -0.038202472031116486, "rewards/rejected": 0.2890318036079407, "step": 1231 }, { "epoch": 0.6644195766482405, "grad_norm": 7.219963073730469, "learning_rate": 1.5280866178714895e-07, "logits/chosen": -0.05753029137849808, "logits/rejected": -0.07947739213705063, "logps/chosen": -215.97665405273438, "logps/rejected": -274.6862487792969, "loss": 0.6836, "rewards/accuracies": 0.5, "rewards/chosen": 0.28485748171806335, "rewards/margins": 0.030265137553215027, "rewards/rejected": 0.25459232926368713, "step": 1232 }, { "epoch": 0.6649588782526628, "grad_norm": 7.4355573654174805, "learning_rate": 1.523750117503028e-07, "logits/chosen": -0.30770590901374817, "logits/rejected": -0.7242357730865479, "logps/chosen": -203.1287841796875, "logps/rejected": -199.56045532226562, "loss": 0.7471, "rewards/accuracies": 0.25, "rewards/chosen": 0.19474220275878906, "rewards/margins": -0.09863681346178055, "rewards/rejected": 0.293379008769989, "step": 1233 }, { "epoch": 0.665498179857085, "grad_norm": 6.309285640716553, "learning_rate": 1.519417080262369e-07, "logits/chosen": 0.9478244185447693, "logits/rejected": 0.08081550896167755, "logps/chosen": -253.6610107421875, "logps/rejected": -203.0294189453125, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": 0.2951698303222656, "rewards/margins": 0.21079635620117188, "rewards/rejected": 0.08437347412109375, "step": 1234 }, { "epoch": 0.6660374814615073, "grad_norm": 6.665345668792725, "learning_rate": 1.5150875215204362e-07, "logits/chosen": 0.6841993927955627, "logits/rejected": -1.122794270515442, "logps/chosen": -276.5787353515625, "logps/rejected": -168.30575561523438, "loss": 0.6854, "rewards/accuracies": 0.375, "rewards/chosen": 0.21600496768951416, "rewards/margins": 0.03561602532863617, "rewards/rejected": 0.1803889274597168, "step": 1235 }, { "epoch": 0.6665767830659296, "grad_norm": 7.827778339385986, "learning_rate": 1.5107614566358135e-07, "logits/chosen": 0.3912423849105835, "logits/rejected": -0.06708940863609314, "logps/chosen": -251.54254150390625, "logps/rejected": -286.6777038574219, "loss": 0.7176, "rewards/accuracies": 0.5, "rewards/chosen": 0.1619168370962143, "rewards/margins": -0.023778431117534637, "rewards/rejected": 0.18569527566432953, "step": 1236 }, { "epoch": 0.6671160846703519, "grad_norm": 8.598621368408203, "learning_rate": 1.5064389009546885e-07, "logits/chosen": -0.3362853229045868, "logits/rejected": -0.6339892148971558, "logps/chosen": -312.170166015625, "logps/rejected": -346.39691162109375, "loss": 0.6613, "rewards/accuracies": 0.625, "rewards/chosen": 0.2217707633972168, "rewards/margins": 0.08296146988868713, "rewards/rejected": 0.13880929350852966, "step": 1237 }, { "epoch": 0.6676553862747742, "grad_norm": 6.014604091644287, "learning_rate": 1.5021198698108036e-07, "logits/chosen": -0.09993019700050354, "logits/rejected": -0.7211665511131287, "logps/chosen": -180.126953125, "logps/rejected": -147.6605224609375, "loss": 0.6507, "rewards/accuracies": 0.625, "rewards/chosen": 0.27776041626930237, "rewards/margins": 0.09642467647790909, "rewards/rejected": 0.18133574724197388, "step": 1238 }, { "epoch": 0.6681946878791964, "grad_norm": 8.343913078308105, "learning_rate": 1.4978043785253962e-07, "logits/chosen": -0.5235894322395325, "logits/rejected": 0.6614292860031128, "logps/chosen": -203.69412231445312, "logps/rejected": -207.5852813720703, "loss": 0.7056, "rewards/accuracies": 0.375, "rewards/chosen": 0.18479128181934357, "rewards/margins": 0.007383629679679871, "rewards/rejected": 0.1774076521396637, "step": 1239 }, { "epoch": 0.6687339894836187, "grad_norm": 9.376834869384766, "learning_rate": 1.4934924424071475e-07, "logits/chosen": 0.5603383779525757, "logits/rejected": -1.092315435409546, "logps/chosen": -562.5904541015625, "logps/rejected": -234.02279663085938, "loss": 0.6605, "rewards/accuracies": 0.5, "rewards/chosen": 0.26521435379981995, "rewards/margins": 0.08625879883766174, "rewards/rejected": 0.1789555549621582, "step": 1240 }, { "epoch": 0.669273291088041, "grad_norm": 6.453458786010742, "learning_rate": 1.489184076752127e-07, "logits/chosen": -0.6999473571777344, "logits/rejected": -0.24409449100494385, "logps/chosen": -201.4540557861328, "logps/rejected": -211.1918487548828, "loss": 0.7332, "rewards/accuracies": 0.25, "rewards/chosen": 0.15055018663406372, "rewards/margins": -0.07406158745288849, "rewards/rejected": 0.22461175918579102, "step": 1241 }, { "epoch": 0.6698125926924633, "grad_norm": 8.040095329284668, "learning_rate": 1.4848792968437375e-07, "logits/chosen": 1.1052284240722656, "logits/rejected": 0.09111440181732178, "logps/chosen": -241.27342224121094, "logps/rejected": -209.677490234375, "loss": 0.6741, "rewards/accuracies": 0.375, "rewards/chosen": 0.18172425031661987, "rewards/margins": 0.0704987496137619, "rewards/rejected": 0.11122550815343857, "step": 1242 }, { "epoch": 0.6703518942968856, "grad_norm": 7.419899940490723, "learning_rate": 1.4805781179526625e-07, "logits/chosen": 0.09382365643978119, "logits/rejected": -0.41368621587753296, "logps/chosen": -203.7572021484375, "logps/rejected": -233.9864959716797, "loss": 0.588, "rewards/accuracies": 0.875, "rewards/chosen": 0.2711428701877594, "rewards/margins": 0.23480091989040375, "rewards/rejected": 0.03634195029735565, "step": 1243 }, { "epoch": 0.6708911959013079, "grad_norm": 8.153804779052734, "learning_rate": 1.4762805553368112e-07, "logits/chosen": -0.06361496448516846, "logits/rejected": -0.040611252188682556, "logps/chosen": -337.97698974609375, "logps/rejected": -385.38336181640625, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": 0.19773483276367188, "rewards/margins": -0.012871641665697098, "rewards/rejected": 0.21060648560523987, "step": 1244 }, { "epoch": 0.67143049750573, "grad_norm": 8.169336318969727, "learning_rate": 1.471986624241266e-07, "logits/chosen": 0.46942323446273804, "logits/rejected": -0.38239169120788574, "logps/chosen": -228.83840942382812, "logps/rejected": -191.9873046875, "loss": 0.6871, "rewards/accuracies": 0.625, "rewards/chosen": 0.18924680352210999, "rewards/margins": 0.02213425561785698, "rewards/rejected": 0.1671125590801239, "step": 1245 }, { "epoch": 0.6719697991101523, "grad_norm": 6.707968711853027, "learning_rate": 1.4676963398982245e-07, "logits/chosen": 0.598325252532959, "logits/rejected": -0.3855697512626648, "logps/chosen": -246.37107849121094, "logps/rejected": -246.9251708984375, "loss": 0.6505, "rewards/accuracies": 0.625, "rewards/chosen": 0.22461414337158203, "rewards/margins": 0.09558545053005219, "rewards/rejected": 0.12902870774269104, "step": 1246 }, { "epoch": 0.6725091007145746, "grad_norm": 7.668159484863281, "learning_rate": 1.4634097175269492e-07, "logits/chosen": -0.36062905192375183, "logits/rejected": -0.9783060550689697, "logps/chosen": -309.57470703125, "logps/rejected": -288.2925720214844, "loss": 0.6717, "rewards/accuracies": 0.75, "rewards/chosen": 0.27229538559913635, "rewards/margins": 0.0568205900490284, "rewards/rejected": 0.21547479927539825, "step": 1247 }, { "epoch": 0.6730484023189969, "grad_norm": 6.85590124130249, "learning_rate": 1.459126772333712e-07, "logits/chosen": 0.3958408236503601, "logits/rejected": -1.4027445316314697, "logps/chosen": -246.73794555664062, "logps/rejected": -162.05270385742188, "loss": 0.6038, "rewards/accuracies": 0.625, "rewards/chosen": 0.30878883600234985, "rewards/margins": 0.2152036726474762, "rewards/rejected": 0.09358515590429306, "step": 1248 }, { "epoch": 0.6735877039234192, "grad_norm": 6.238822937011719, "learning_rate": 1.45484751951174e-07, "logits/chosen": 1.3432199954986572, "logits/rejected": -0.20455381274223328, "logps/chosen": -320.34490966796875, "logps/rejected": -224.74191284179688, "loss": 0.6617, "rewards/accuracies": 0.625, "rewards/chosen": 0.23032961785793304, "rewards/margins": 0.07342158257961273, "rewards/rejected": 0.1569080352783203, "step": 1249 }, { "epoch": 0.6741270055278414, "grad_norm": 6.678515434265137, "learning_rate": 1.4505719742411644e-07, "logits/chosen": 0.08737978339195251, "logits/rejected": -0.13253295421600342, "logps/chosen": -246.92881774902344, "logps/rejected": -309.17706298828125, "loss": 0.6814, "rewards/accuracies": 0.75, "rewards/chosen": 0.2909116744995117, "rewards/margins": 0.03352584317326546, "rewards/rejected": 0.25738584995269775, "step": 1250 }, { "epoch": 0.6746663071322637, "grad_norm": 7.340824604034424, "learning_rate": 1.4463001516889595e-07, "logits/chosen": 0.3081947863101959, "logits/rejected": -0.5160962343215942, "logps/chosen": -254.4907989501953, "logps/rejected": -208.44442749023438, "loss": 0.6422, "rewards/accuracies": 0.5, "rewards/chosen": 0.2360832244157791, "rewards/margins": 0.12473249435424805, "rewards/rejected": 0.11135073006153107, "step": 1251 }, { "epoch": 0.675205608736686, "grad_norm": 7.705441951751709, "learning_rate": 1.4420320670088976e-07, "logits/chosen": 0.8012768030166626, "logits/rejected": 0.3744209408760071, "logps/chosen": -322.08489990234375, "logps/rejected": -292.1954650878906, "loss": 0.6568, "rewards/accuracies": 0.5, "rewards/chosen": 0.25569021701812744, "rewards/margins": 0.09752941876649857, "rewards/rejected": 0.15816079080104828, "step": 1252 }, { "epoch": 0.6757449103411083, "grad_norm": 8.16635799407959, "learning_rate": 1.4377677353414912e-07, "logits/chosen": 0.10735377669334412, "logits/rejected": 0.8247337937355042, "logps/chosen": -232.40817260742188, "logps/rejected": -234.82199096679688, "loss": 0.7249, "rewards/accuracies": 0.25, "rewards/chosen": 0.12269897758960724, "rewards/margins": -0.045392896980047226, "rewards/rejected": 0.16809187829494476, "step": 1253 }, { "epoch": 0.6762842119455306, "grad_norm": 6.788815498352051, "learning_rate": 1.4335071718139378e-07, "logits/chosen": -0.07919126749038696, "logits/rejected": 0.1273474395275116, "logps/chosen": -203.95782470703125, "logps/rejected": -258.2447509765625, "loss": 0.6943, "rewards/accuracies": 0.75, "rewards/chosen": 0.17648983001708984, "rewards/margins": 0.023029711097478867, "rewards/rejected": 0.15346013009548187, "step": 1254 }, { "epoch": 0.6768235135499528, "grad_norm": 6.69948148727417, "learning_rate": 1.429250391540069e-07, "logits/chosen": -1.0209406614303589, "logits/rejected": -0.8863370418548584, "logps/chosen": -196.3880615234375, "logps/rejected": -229.56959533691406, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": 0.2867262065410614, "rewards/margins": 0.059537697583436966, "rewards/rejected": 0.22718849778175354, "step": 1255 }, { "epoch": 0.6773628151543751, "grad_norm": 8.083917617797852, "learning_rate": 1.424997409620295e-07, "logits/chosen": 0.575040876865387, "logits/rejected": -0.7762348651885986, "logps/chosen": -299.766845703125, "logps/rejected": -166.85728454589844, "loss": 0.5971, "rewards/accuracies": 0.75, "rewards/chosen": 0.24429646134376526, "rewards/margins": 0.21329447627067566, "rewards/rejected": 0.031001998111605644, "step": 1256 }, { "epoch": 0.6779021167587974, "grad_norm": 6.450006008148193, "learning_rate": 1.420748241141553e-07, "logits/chosen": -0.5932601690292358, "logits/rejected": 0.018429899588227272, "logps/chosen": -141.77572631835938, "logps/rejected": -164.86666870117188, "loss": 0.6742, "rewards/accuracies": 0.75, "rewards/chosen": 0.12392835319042206, "rewards/margins": 0.0779639184474945, "rewards/rejected": 0.04596443474292755, "step": 1257 }, { "epoch": 0.6784414183632196, "grad_norm": 6.462782859802246, "learning_rate": 1.416502901177251e-07, "logits/chosen": -0.9382518529891968, "logits/rejected": -0.3536880910396576, "logps/chosen": -144.25222778320312, "logps/rejected": -207.0819854736328, "loss": 0.666, "rewards/accuracies": 0.5, "rewards/chosen": 0.208701029419899, "rewards/margins": 0.08510242402553558, "rewards/rejected": 0.1235986277461052, "step": 1258 }, { "epoch": 0.6789807199676419, "grad_norm": 7.177799701690674, "learning_rate": 1.4122614047872182e-07, "logits/chosen": 1.2651113271713257, "logits/rejected": 1.3278130292892456, "logps/chosen": -331.9840393066406, "logps/rejected": -365.3519287109375, "loss": 0.6362, "rewards/accuracies": 0.625, "rewards/chosen": 0.36230698227882385, "rewards/margins": 0.12639695405960083, "rewards/rejected": 0.23591002821922302, "step": 1259 }, { "epoch": 0.6795200215720641, "grad_norm": 6.884531021118164, "learning_rate": 1.4080237670176453e-07, "logits/chosen": 0.7435094714164734, "logits/rejected": 0.44869810342788696, "logps/chosen": -242.33462524414062, "logps/rejected": -293.9195556640625, "loss": 0.6548, "rewards/accuracies": 0.375, "rewards/chosen": 0.30589351058006287, "rewards/margins": 0.10074815899133682, "rewards/rejected": 0.20514535903930664, "step": 1260 }, { "epoch": 0.6800593231764864, "grad_norm": 6.474519729614258, "learning_rate": 1.403790002901038e-07, "logits/chosen": 0.6746587157249451, "logits/rejected": -0.3591618835926056, "logps/chosen": -197.8883514404297, "logps/rejected": -167.305419921875, "loss": 0.6069, "rewards/accuracies": 0.625, "rewards/chosen": 0.3073802888393402, "rewards/margins": 0.19599077105522156, "rewards/rejected": 0.11138953268527985, "step": 1261 }, { "epoch": 0.6805986247809087, "grad_norm": 6.526207447052002, "learning_rate": 1.3995601274561603e-07, "logits/chosen": 0.5120327472686768, "logits/rejected": 0.4911027252674103, "logps/chosen": -222.47109985351562, "logps/rejected": -234.91278076171875, "loss": 0.6481, "rewards/accuracies": 0.75, "rewards/chosen": 0.2659499943256378, "rewards/margins": 0.11036914587020874, "rewards/rejected": 0.15558084845542908, "step": 1262 }, { "epoch": 0.681137926385331, "grad_norm": 7.432262420654297, "learning_rate": 1.395334155687981e-07, "logits/chosen": 0.11954109370708466, "logits/rejected": 0.20297259092330933, "logps/chosen": -162.59078979492188, "logps/rejected": -219.73814392089844, "loss": 0.6524, "rewards/accuracies": 0.75, "rewards/chosen": 0.23965749144554138, "rewards/margins": 0.08953170478343964, "rewards/rejected": 0.15012578666210175, "step": 1263 }, { "epoch": 0.6816772279897533, "grad_norm": 7.248544692993164, "learning_rate": 1.391112102587621e-07, "logits/chosen": -0.8709344267845154, "logits/rejected": -0.1632504016160965, "logps/chosen": -225.4640655517578, "logps/rejected": -303.9102783203125, "loss": 0.7059, "rewards/accuracies": 0.375, "rewards/chosen": 0.13685759902000427, "rewards/margins": -0.01935557834804058, "rewards/rejected": 0.1562131941318512, "step": 1264 }, { "epoch": 0.6822165295941756, "grad_norm": 7.7291083335876465, "learning_rate": 1.3868939831323008e-07, "logits/chosen": 0.09854342043399811, "logits/rejected": -0.4267958700656891, "logps/chosen": -248.85726928710938, "logps/rejected": -295.3260498046875, "loss": 0.6889, "rewards/accuracies": 0.625, "rewards/chosen": 0.21973934769630432, "rewards/margins": 0.020201295614242554, "rewards/rejected": 0.19953805208206177, "step": 1265 }, { "epoch": 0.6827558311985978, "grad_norm": 8.416672706604004, "learning_rate": 1.3826798122852868e-07, "logits/chosen": 0.37349826097488403, "logits/rejected": -0.1967148631811142, "logps/chosen": -233.62322998046875, "logps/rejected": -219.53016662597656, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": 0.09755650162696838, "rewards/margins": 0.008212089538574219, "rewards/rejected": 0.08934441208839417, "step": 1266 }, { "epoch": 0.6832951328030201, "grad_norm": 7.3631134033203125, "learning_rate": 1.3784696049958376e-07, "logits/chosen": 0.0028783914167433977, "logits/rejected": 0.5404222011566162, "logps/chosen": -356.20562744140625, "logps/rejected": -327.873779296875, "loss": 0.7275, "rewards/accuracies": 0.25, "rewards/chosen": 0.22016219794750214, "rewards/margins": -0.06415939331054688, "rewards/rejected": 0.2843216061592102, "step": 1267 }, { "epoch": 0.6838344344074424, "grad_norm": 6.822511672973633, "learning_rate": 1.3742633761991518e-07, "logits/chosen": 0.7690489888191223, "logits/rejected": -0.11682248115539551, "logps/chosen": -298.0306091308594, "logps/rejected": -233.7655029296875, "loss": 0.6794, "rewards/accuracies": 0.625, "rewards/chosen": 0.27462226152420044, "rewards/margins": 0.04288434982299805, "rewards/rejected": 0.2317379117012024, "step": 1268 }, { "epoch": 0.6843737360118647, "grad_norm": 7.187079429626465, "learning_rate": 1.3700611408163158e-07, "logits/chosen": -0.18528559803962708, "logits/rejected": -0.8040751218795776, "logps/chosen": -180.1195068359375, "logps/rejected": -183.0467987060547, "loss": 0.6999, "rewards/accuracies": 0.625, "rewards/chosen": 0.1741238683462143, "rewards/margins": 0.01005583256483078, "rewards/rejected": 0.1640680432319641, "step": 1269 }, { "epoch": 0.6849130376162869, "grad_norm": 6.5881476402282715, "learning_rate": 1.365862913754247e-07, "logits/chosen": 0.7048249244689941, "logits/rejected": 0.029138892889022827, "logps/chosen": -289.9688720703125, "logps/rejected": -197.99301147460938, "loss": 0.5813, "rewards/accuracies": 1.0, "rewards/chosen": 0.3416808247566223, "rewards/margins": 0.24753093719482422, "rewards/rejected": 0.0941498726606369, "step": 1270 }, { "epoch": 0.6854523392207091, "grad_norm": 8.49373722076416, "learning_rate": 1.3616687099056465e-07, "logits/chosen": 1.0587506294250488, "logits/rejected": 0.2522916793823242, "logps/chosen": -320.4011535644531, "logps/rejected": -251.97218322753906, "loss": 0.7065, "rewards/accuracies": 0.375, "rewards/chosen": 0.1250602751970291, "rewards/margins": -0.020649628713726997, "rewards/rejected": 0.14570990204811096, "step": 1271 }, { "epoch": 0.6859916408251314, "grad_norm": 9.046189308166504, "learning_rate": 1.357478544148943e-07, "logits/chosen": 0.12609511613845825, "logits/rejected": -0.4346988797187805, "logps/chosen": -195.08468627929688, "logps/rejected": -255.539794921875, "loss": 0.7143, "rewards/accuracies": 0.5, "rewards/chosen": 0.20285721123218536, "rewards/margins": -0.024548381567001343, "rewards/rejected": 0.2274055927991867, "step": 1272 }, { "epoch": 0.6865309424295537, "grad_norm": 6.694724082946777, "learning_rate": 1.3532924313482397e-07, "logits/chosen": -0.11732780933380127, "logits/rejected": 0.07897733151912689, "logps/chosen": -218.62713623046875, "logps/rejected": -227.0671844482422, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.25514090061187744, "rewards/margins": 0.03735694661736488, "rewards/rejected": 0.21778392791748047, "step": 1273 }, { "epoch": 0.687070244033976, "grad_norm": 7.322577953338623, "learning_rate": 1.3491103863532624e-07, "logits/chosen": -0.05890282988548279, "logits/rejected": 0.07353562861680984, "logps/chosen": -214.9175262451172, "logps/rejected": -260.85565185546875, "loss": 0.632, "rewards/accuracies": 0.75, "rewards/chosen": 0.35421693325042725, "rewards/margins": 0.13834771513938904, "rewards/rejected": 0.2158692330121994, "step": 1274 }, { "epoch": 0.6876095456383983, "grad_norm": 6.970195770263672, "learning_rate": 1.3449324239993094e-07, "logits/chosen": -0.231620654463768, "logits/rejected": -0.0059891194105148315, "logps/chosen": -204.23777770996094, "logps/rejected": -235.9385986328125, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": 0.2637186050415039, "rewards/margins": 0.11034941673278809, "rewards/rejected": 0.15336918830871582, "step": 1275 }, { "epoch": 0.6881488472428205, "grad_norm": 7.099493503570557, "learning_rate": 1.3407585591071944e-07, "logits/chosen": 0.7063131332397461, "logits/rejected": -0.23780547082424164, "logps/chosen": -194.33648681640625, "logps/rejected": -138.68284606933594, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": 0.276235967874527, "rewards/margins": 0.0733526349067688, "rewards/rejected": 0.20288334786891937, "step": 1276 }, { "epoch": 0.6886881488472428, "grad_norm": 7.027258396148682, "learning_rate": 1.3365888064831934e-07, "logits/chosen": -0.4421578049659729, "logits/rejected": -1.4583054780960083, "logps/chosen": -236.68348693847656, "logps/rejected": -201.78953552246094, "loss": 0.6473, "rewards/accuracies": 0.75, "rewards/chosen": 0.23952236771583557, "rewards/margins": 0.11025038361549377, "rewards/rejected": 0.1292719841003418, "step": 1277 }, { "epoch": 0.6892274504516651, "grad_norm": 8.455306053161621, "learning_rate": 1.3324231809189983e-07, "logits/chosen": 0.7702938914299011, "logits/rejected": -0.4914100468158722, "logps/chosen": -291.6300354003906, "logps/rejected": -188.8789520263672, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": 0.331326961517334, "rewards/margins": 0.18050850927829742, "rewards/rejected": 0.15081843733787537, "step": 1278 }, { "epoch": 0.6897667520560874, "grad_norm": 6.7834062576293945, "learning_rate": 1.32826169719166e-07, "logits/chosen": 0.55631422996521, "logits/rejected": -0.5516250133514404, "logps/chosen": -253.25808715820312, "logps/rejected": -223.51089477539062, "loss": 0.6563, "rewards/accuracies": 0.625, "rewards/chosen": 0.18734979629516602, "rewards/margins": 0.08548718690872192, "rewards/rejected": 0.10186261683702469, "step": 1279 }, { "epoch": 0.6903060536605097, "grad_norm": 8.660261154174805, "learning_rate": 1.3241043700635352e-07, "logits/chosen": -0.42684516310691833, "logits/rejected": -1.3795840740203857, "logps/chosen": -266.69580078125, "logps/rejected": -187.88729858398438, "loss": 0.6502, "rewards/accuracies": 0.75, "rewards/chosen": 0.27364444732666016, "rewards/margins": 0.09596872329711914, "rewards/rejected": 0.17767572402954102, "step": 1280 }, { "epoch": 0.690845355264932, "grad_norm": 6.890921592712402, "learning_rate": 1.3199512142822373e-07, "logits/chosen": 0.28414463996887207, "logits/rejected": -0.17640358209609985, "logps/chosen": -231.58029174804688, "logps/rejected": -256.28411865234375, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": 0.26159733533859253, "rewards/margins": 0.09600849449634552, "rewards/rejected": 0.1655888557434082, "step": 1281 }, { "epoch": 0.6913846568693542, "grad_norm": 6.242371559143066, "learning_rate": 1.3158022445805814e-07, "logits/chosen": -0.38047128915786743, "logits/rejected": -0.5838962197303772, "logps/chosen": -209.04429626464844, "logps/rejected": -215.353515625, "loss": 0.706, "rewards/accuracies": 0.375, "rewards/chosen": 0.1662854254245758, "rewards/margins": -0.015882108360528946, "rewards/rejected": 0.18216753005981445, "step": 1282 }, { "epoch": 0.6919239584737764, "grad_norm": 7.345821380615234, "learning_rate": 1.3116574756765335e-07, "logits/chosen": -0.1761796474456787, "logits/rejected": -0.3158765435218811, "logps/chosen": -298.8111572265625, "logps/rejected": -206.97560119628906, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.18225345015525818, "rewards/margins": 0.07805797457695007, "rewards/rejected": 0.1041954904794693, "step": 1283 }, { "epoch": 0.6924632600781987, "grad_norm": 7.614097595214844, "learning_rate": 1.3075169222731572e-07, "logits/chosen": 0.0761682391166687, "logits/rejected": -0.4328846037387848, "logps/chosen": -221.75466918945312, "logps/rejected": -163.10032653808594, "loss": 0.6575, "rewards/accuracies": 0.5, "rewards/chosen": 0.2201080322265625, "rewards/margins": 0.09375958144664764, "rewards/rejected": 0.12634845077991486, "step": 1284 }, { "epoch": 0.693002561682621, "grad_norm": 7.404427528381348, "learning_rate": 1.3033805990585636e-07, "logits/chosen": 0.8932483792304993, "logits/rejected": -0.6299241185188293, "logps/chosen": -327.4954833984375, "logps/rejected": -241.5652618408203, "loss": 0.6936, "rewards/accuracies": 0.375, "rewards/chosen": 0.26200103759765625, "rewards/margins": 0.006514546927064657, "rewards/rejected": 0.25548648834228516, "step": 1285 }, { "epoch": 0.6935418632870433, "grad_norm": 6.690150737762451, "learning_rate": 1.2992485207058547e-07, "logits/chosen": -0.637227475643158, "logits/rejected": -0.29150107502937317, "logps/chosen": -184.45433044433594, "logps/rejected": -222.63259887695312, "loss": 0.7453, "rewards/accuracies": 0.375, "rewards/chosen": 0.16076917946338654, "rewards/margins": -0.09099331498146057, "rewards/rejected": 0.2517625093460083, "step": 1286 }, { "epoch": 0.6940811648914655, "grad_norm": 6.755788326263428, "learning_rate": 1.295120701873077e-07, "logits/chosen": -0.17186704277992249, "logits/rejected": -0.40162765979766846, "logps/chosen": -285.6639709472656, "logps/rejected": -237.36602783203125, "loss": 0.6447, "rewards/accuracies": 0.75, "rewards/chosen": 0.23502835631370544, "rewards/margins": 0.10602713376283646, "rewards/rejected": 0.12900124490261078, "step": 1287 }, { "epoch": 0.6946204664958878, "grad_norm": 8.93998908996582, "learning_rate": 1.2909971572031662e-07, "logits/chosen": 0.6031147837638855, "logits/rejected": 0.3039599359035492, "logps/chosen": -291.8426818847656, "logps/rejected": -296.44183349609375, "loss": 0.7118, "rewards/accuracies": 0.375, "rewards/chosen": 0.26953715085983276, "rewards/margins": 0.000812724232673645, "rewards/rejected": 0.2687244415283203, "step": 1288 }, { "epoch": 0.6951597681003101, "grad_norm": 6.890156269073486, "learning_rate": 1.2868779013238956e-07, "logits/chosen": -0.018807735294103622, "logits/rejected": 0.4886031448841095, "logps/chosen": -214.7432861328125, "logps/rejected": -221.91781616210938, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": 0.22882328927516937, "rewards/margins": 0.011970050632953644, "rewards/rejected": 0.21685324609279633, "step": 1289 }, { "epoch": 0.6956990697047324, "grad_norm": 6.795149326324463, "learning_rate": 1.2827629488478254e-07, "logits/chosen": 0.05728612840175629, "logits/rejected": -0.040174491703510284, "logps/chosen": -265.2661437988281, "logps/rejected": -274.3443603515625, "loss": 0.7008, "rewards/accuracies": 0.5, "rewards/chosen": 0.24480238556861877, "rewards/margins": 0.0026514027267694473, "rewards/rejected": 0.24215096235275269, "step": 1290 }, { "epoch": 0.6962383713091547, "grad_norm": 6.702830791473389, "learning_rate": 1.2786523143722489e-07, "logits/chosen": -0.08597220480442047, "logits/rejected": 0.03492039442062378, "logps/chosen": -238.29351806640625, "logps/rejected": -345.44281005859375, "loss": 0.6656, "rewards/accuracies": 0.625, "rewards/chosen": 0.3088080585002899, "rewards/margins": 0.06891412287950516, "rewards/rejected": 0.23989391326904297, "step": 1291 }, { "epoch": 0.696777672913577, "grad_norm": 7.842306137084961, "learning_rate": 1.2745460124791424e-07, "logits/chosen": -0.2912622094154358, "logits/rejected": -0.5910025238990784, "logps/chosen": -165.97840881347656, "logps/rejected": -171.20933532714844, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.20715123414993286, "rewards/margins": 0.0351746566593647, "rewards/rejected": 0.17197656631469727, "step": 1292 }, { "epoch": 0.6973169745179992, "grad_norm": 8.409358978271484, "learning_rate": 1.2704440577351128e-07, "logits/chosen": -0.07610177993774414, "logits/rejected": -0.8753018379211426, "logps/chosen": -199.035888671875, "logps/rejected": -173.458984375, "loss": 0.7147, "rewards/accuracies": 0.5, "rewards/chosen": 0.1419481337070465, "rewards/margins": -0.028867626562714577, "rewards/rejected": 0.17081575095653534, "step": 1293 }, { "epoch": 0.6978562761224215, "grad_norm": 7.002345561981201, "learning_rate": 1.2663464646913458e-07, "logits/chosen": -0.15077616274356842, "logits/rejected": 0.1700269877910614, "logps/chosen": -161.2017059326172, "logps/rejected": -208.12973022460938, "loss": 0.6419, "rewards/accuracies": 0.625, "rewards/chosen": 0.2956591844558716, "rewards/margins": 0.12276734411716461, "rewards/rejected": 0.17289181053638458, "step": 1294 }, { "epoch": 0.6983955777268437, "grad_norm": 7.644437313079834, "learning_rate": 1.2622532478835558e-07, "logits/chosen": -0.1300152838230133, "logits/rejected": -0.5559167861938477, "logps/chosen": -219.6480712890625, "logps/rejected": -254.19879150390625, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": 0.22058621048927307, "rewards/margins": 0.005778169259428978, "rewards/rejected": 0.21480804681777954, "step": 1295 }, { "epoch": 0.698934879331266, "grad_norm": 6.5813422203063965, "learning_rate": 1.2581644218319299e-07, "logits/chosen": 0.9903100728988647, "logits/rejected": 0.7011757493019104, "logps/chosen": -216.21334838867188, "logps/rejected": -231.94281005859375, "loss": 0.669, "rewards/accuracies": 0.75, "rewards/chosen": 0.2186688482761383, "rewards/margins": 0.05774002522230148, "rewards/rejected": 0.16092883050441742, "step": 1296 }, { "epoch": 0.6994741809356883, "grad_norm": 6.713339805603027, "learning_rate": 1.2540800010410827e-07, "logits/chosen": -0.7832212448120117, "logits/rejected": 0.0003629028797149658, "logps/chosen": -215.3087921142578, "logps/rejected": -304.91705322265625, "loss": 0.6817, "rewards/accuracies": 0.75, "rewards/chosen": 0.22424641251564026, "rewards/margins": 0.030919605866074562, "rewards/rejected": 0.19332680106163025, "step": 1297 }, { "epoch": 0.7000134825401105, "grad_norm": 8.952996253967285, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.35691991448402405, "logits/rejected": -0.961686372756958, "logps/chosen": -252.28367614746094, "logps/rejected": -242.3607940673828, "loss": 0.7221, "rewards/accuracies": 0.625, "rewards/chosen": 0.19568881392478943, "rewards/margins": -0.0313662588596344, "rewards/rejected": 0.22705508768558502, "step": 1298 }, { "epoch": 0.7005527841445328, "grad_norm": 7.8447651863098145, "learning_rate": 1.245924433181991e-07, "logits/chosen": 0.7542281746864319, "logits/rejected": -0.40391677618026733, "logps/chosen": -365.743896484375, "logps/rejected": -223.60061645507812, "loss": 0.66, "rewards/accuracies": 0.5, "rewards/chosen": 0.28441351652145386, "rewards/margins": 0.0769307091832161, "rewards/rejected": 0.20748281478881836, "step": 1299 }, { "epoch": 0.7010920857489551, "grad_norm": 7.334555625915527, "learning_rate": 1.2418533150446324e-07, "logits/chosen": 0.5458379983901978, "logits/rejected": -0.4998508393764496, "logps/chosen": -285.6225891113281, "logps/rejected": -269.1523132324219, "loss": 0.6359, "rewards/accuracies": 0.625, "rewards/chosen": 0.18593770265579224, "rewards/margins": 0.12639951705932617, "rewards/rejected": 0.05953817814588547, "step": 1300 }, { "epoch": 0.7016313873533774, "grad_norm": 5.35244083404541, "learning_rate": 1.237786660029721e-07, "logits/chosen": 1.1585639715194702, "logits/rejected": 0.337704062461853, "logps/chosen": -198.56951904296875, "logps/rejected": -160.6029815673828, "loss": 0.6427, "rewards/accuracies": 0.625, "rewards/chosen": 0.3088703155517578, "rewards/margins": 0.11470318585634232, "rewards/rejected": 0.1941671371459961, "step": 1301 }, { "epoch": 0.7021706889577997, "grad_norm": 7.399692058563232, "learning_rate": 1.2337244825632216e-07, "logits/chosen": 1.0314297676086426, "logits/rejected": 0.19565951824188232, "logps/chosen": -265.7538146972656, "logps/rejected": -271.99127197265625, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": 0.3604789972305298, "rewards/margins": 0.0930454432964325, "rewards/rejected": 0.2674335539340973, "step": 1302 }, { "epoch": 0.7027099905622219, "grad_norm": 7.872140407562256, "learning_rate": 1.2296667970552147e-07, "logits/chosen": -0.3996660113334656, "logits/rejected": -0.5482621788978577, "logps/chosen": -315.7457275390625, "logps/rejected": -333.78289794921875, "loss": 0.6885, "rewards/accuracies": 0.375, "rewards/chosen": 0.2323123961687088, "rewards/margins": 0.033696651458740234, "rewards/rejected": 0.19861574470996857, "step": 1303 }, { "epoch": 0.7032492921666442, "grad_norm": 8.191426277160645, "learning_rate": 1.2256136178998466e-07, "logits/chosen": -0.4993674159049988, "logits/rejected": -0.9557494521141052, "logps/chosen": -321.95111083984375, "logps/rejected": -290.0621643066406, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": 0.24258680641651154, "rewards/margins": 0.025509260594844818, "rewards/rejected": 0.21707755327224731, "step": 1304 }, { "epoch": 0.7037885937710665, "grad_norm": 6.372839450836182, "learning_rate": 1.2215649594752782e-07, "logits/chosen": 0.12216556072235107, "logits/rejected": 0.6027907729148865, "logps/chosen": -261.35308837890625, "logps/rejected": -253.63890075683594, "loss": 0.6957, "rewards/accuracies": 0.5, "rewards/chosen": 0.19049720466136932, "rewards/margins": 0.0023574773222208023, "rewards/rejected": 0.18813973665237427, "step": 1305 }, { "epoch": 0.7043278953754888, "grad_norm": 7.482355117797852, "learning_rate": 1.2175208361436327e-07, "logits/chosen": -0.25235122442245483, "logits/rejected": 0.27826040983200073, "logps/chosen": -222.9798583984375, "logps/rejected": -285.339111328125, "loss": 0.7188, "rewards/accuracies": 0.375, "rewards/chosen": 0.16516925394535065, "rewards/margins": -0.03545704483985901, "rewards/rejected": 0.20062626898288727, "step": 1306 }, { "epoch": 0.7048671969799111, "grad_norm": 8.11060619354248, "learning_rate": 1.2134812622509457e-07, "logits/chosen": -0.5713010430335999, "logits/rejected": -0.031229734420776367, "logps/chosen": -311.3581237792969, "logps/rejected": -258.5328369140625, "loss": 0.7018, "rewards/accuracies": 0.625, "rewards/chosen": 0.23027287423610687, "rewards/margins": 0.006193060427904129, "rewards/rejected": 0.22407980263233185, "step": 1307 }, { "epoch": 0.7054064985843332, "grad_norm": 6.888844013214111, "learning_rate": 1.2094462521271154e-07, "logits/chosen": -0.2663514316082001, "logits/rejected": -0.7821130156517029, "logps/chosen": -351.6632995605469, "logps/rejected": -282.87286376953125, "loss": 0.6443, "rewards/accuracies": 0.625, "rewards/chosen": 0.2843637466430664, "rewards/margins": 0.12229748070240021, "rewards/rejected": 0.1620662659406662, "step": 1308 }, { "epoch": 0.7059458001887555, "grad_norm": 7.716382026672363, "learning_rate": 1.2054158200858493e-07, "logits/chosen": 0.47305500507354736, "logits/rejected": -0.6075085401535034, "logps/chosen": -403.75775146484375, "logps/rejected": -225.3384246826172, "loss": 0.6297, "rewards/accuracies": 0.75, "rewards/chosen": 0.21766397356987, "rewards/margins": 0.1442440152168274, "rewards/rejected": 0.073419950902462, "step": 1309 }, { "epoch": 0.7064851017931778, "grad_norm": 8.934891700744629, "learning_rate": 1.2013899804246158e-07, "logits/chosen": -0.6971591711044312, "logits/rejected": 0.15931686758995056, "logps/chosen": -277.4867858886719, "logps/rejected": -296.63323974609375, "loss": 0.781, "rewards/accuracies": 0.5, "rewards/chosen": 0.1702527105808258, "rewards/margins": -0.14303970336914062, "rewards/rejected": 0.31329241394996643, "step": 1310 }, { "epoch": 0.7070244033976001, "grad_norm": 7.870758056640625, "learning_rate": 1.197368747424592e-07, "logits/chosen": -0.3961830139160156, "logits/rejected": -1.4697608947753906, "logps/chosen": -246.74545288085938, "logps/rejected": -237.84120178222656, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": 0.1621171534061432, "rewards/margins": 0.08106017112731934, "rewards/rejected": 0.08105698227882385, "step": 1311 }, { "epoch": 0.7075637050020224, "grad_norm": 7.546592712402344, "learning_rate": 1.1933521353506117e-07, "logits/chosen": -0.4435584247112274, "logits/rejected": -1.1121609210968018, "logps/chosen": -282.9216613769531, "logps/rejected": -267.4111022949219, "loss": 0.7242, "rewards/accuracies": 0.625, "rewards/chosen": 0.20095835626125336, "rewards/margins": -0.056509215384721756, "rewards/rejected": 0.2574675679206848, "step": 1312 }, { "epoch": 0.7081030066064447, "grad_norm": 6.6944756507873535, "learning_rate": 1.1893401584511184e-07, "logits/chosen": 0.48260313272476196, "logits/rejected": -0.7711428999900818, "logps/chosen": -265.24847412109375, "logps/rejected": -211.40383911132812, "loss": 0.6279, "rewards/accuracies": 0.625, "rewards/chosen": 0.34160223603248596, "rewards/margins": 0.14301423728466034, "rewards/rejected": 0.19858799874782562, "step": 1313 }, { "epoch": 0.7086423082108669, "grad_norm": 7.603794097900391, "learning_rate": 1.1853328309581137e-07, "logits/chosen": -1.3934663534164429, "logits/rejected": 0.01741771399974823, "logps/chosen": -276.756103515625, "logps/rejected": -286.73455810546875, "loss": 0.7084, "rewards/accuracies": 0.375, "rewards/chosen": 0.1878371238708496, "rewards/margins": -0.019322199746966362, "rewards/rejected": 0.20715932548046112, "step": 1314 }, { "epoch": 0.7091816098152892, "grad_norm": 7.363443851470947, "learning_rate": 1.1813301670871037e-07, "logits/chosen": -0.12185829877853394, "logits/rejected": 0.3293604850769043, "logps/chosen": -228.42086791992188, "logps/rejected": -297.1985168457031, "loss": 0.7213, "rewards/accuracies": 0.5, "rewards/chosen": 0.2795454263687134, "rewards/margins": -0.04503849893808365, "rewards/rejected": 0.3245839476585388, "step": 1315 }, { "epoch": 0.7097209114197115, "grad_norm": 7.036599636077881, "learning_rate": 1.1773321810370526e-07, "logits/chosen": -0.030068978667259216, "logits/rejected": 0.14389830827713013, "logps/chosen": -282.70062255859375, "logps/rejected": -296.54266357421875, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.2836105525493622, "rewards/margins": 0.029827781021595, "rewards/rejected": 0.2537827491760254, "step": 1316 }, { "epoch": 0.7102602130241338, "grad_norm": 7.266541957855225, "learning_rate": 1.1733388869903299e-07, "logits/chosen": 0.3199959695339203, "logits/rejected": 0.38477152585983276, "logps/chosen": -263.6343688964844, "logps/rejected": -270.32415771484375, "loss": 0.6723, "rewards/accuracies": 0.625, "rewards/chosen": 0.2565380334854126, "rewards/margins": 0.05923004075884819, "rewards/rejected": 0.19730797410011292, "step": 1317 }, { "epoch": 0.7107995146285561, "grad_norm": 7.902159690856934, "learning_rate": 1.1693502991126608e-07, "logits/chosen": -0.5090177059173584, "logits/rejected": 1.3485705852508545, "logps/chosen": -227.45199584960938, "logps/rejected": -408.4299621582031, "loss": 0.6631, "rewards/accuracies": 0.625, "rewards/chosen": 0.33052128553390503, "rewards/margins": 0.06521998345851898, "rewards/rejected": 0.26530134677886963, "step": 1318 }, { "epoch": 0.7113388162329783, "grad_norm": 9.18831729888916, "learning_rate": 1.1653664315530762e-07, "logits/chosen": -0.3379935920238495, "logits/rejected": -0.1844358742237091, "logps/chosen": -347.68084716796875, "logps/rejected": -239.5684051513672, "loss": 0.7602, "rewards/accuracies": 0.25, "rewards/chosen": 0.14616647362709045, "rewards/margins": -0.10885539650917053, "rewards/rejected": 0.255021870136261, "step": 1319 }, { "epoch": 0.7118781178374005, "grad_norm": 7.832462310791016, "learning_rate": 1.1613872984438628e-07, "logits/chosen": 0.6823294758796692, "logits/rejected": 0.2785966098308563, "logps/chosen": -310.7174987792969, "logps/rejected": -196.46200561523438, "loss": 0.6409, "rewards/accuracies": 0.75, "rewards/chosen": 0.2898569107055664, "rewards/margins": 0.11414223164319992, "rewards/rejected": 0.17571468651294708, "step": 1320 }, { "epoch": 0.7124174194418228, "grad_norm": 6.489288806915283, "learning_rate": 1.1574129139005096e-07, "logits/chosen": 0.8294523358345032, "logits/rejected": -0.3486727178096771, "logps/chosen": -258.1258544921875, "logps/rejected": -164.60232543945312, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": 0.3157920837402344, "rewards/margins": 0.11814269423484802, "rewards/rejected": 0.19764938950538635, "step": 1321 }, { "epoch": 0.7129567210462451, "grad_norm": 6.375120162963867, "learning_rate": 1.1534432920216642e-07, "logits/chosen": 0.5108115077018738, "logits/rejected": -1.2421977519989014, "logps/chosen": -225.51702880859375, "logps/rejected": -188.09164428710938, "loss": 0.6179, "rewards/accuracies": 0.75, "rewards/chosen": 0.33836764097213745, "rewards/margins": 0.17208726704120636, "rewards/rejected": 0.1662803739309311, "step": 1322 }, { "epoch": 0.7134960226506674, "grad_norm": 5.699094772338867, "learning_rate": 1.1494784468890769e-07, "logits/chosen": 0.6259499788284302, "logits/rejected": -0.6311880946159363, "logps/chosen": -225.04058837890625, "logps/rejected": -210.48814392089844, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": 0.261439710855484, "rewards/margins": 0.059733204543590546, "rewards/rejected": 0.20170651376247406, "step": 1323 }, { "epoch": 0.7140353242550896, "grad_norm": 6.828174591064453, "learning_rate": 1.1455183925675549e-07, "logits/chosen": 0.44013476371765137, "logits/rejected": -0.6134443879127502, "logps/chosen": -233.25631713867188, "logps/rejected": -205.5028839111328, "loss": 0.6784, "rewards/accuracies": 0.375, "rewards/chosen": 0.2753642201423645, "rewards/margins": 0.0462852381169796, "rewards/rejected": 0.2290789633989334, "step": 1324 }, { "epoch": 0.7145746258595119, "grad_norm": 6.899879455566406, "learning_rate": 1.1415631431049092e-07, "logits/chosen": 0.02734912931919098, "logits/rejected": -0.8192562460899353, "logps/chosen": -256.6775207519531, "logps/rejected": -261.5732116699219, "loss": 0.5709, "rewards/accuracies": 0.875, "rewards/chosen": 0.3527347445487976, "rewards/margins": 0.27714118361473083, "rewards/rejected": 0.07559357583522797, "step": 1325 }, { "epoch": 0.7151139274639342, "grad_norm": 7.298576831817627, "learning_rate": 1.1376127125319063e-07, "logits/chosen": -0.1879877746105194, "logits/rejected": -0.7829351425170898, "logps/chosen": -236.83242797851562, "logps/rejected": -238.951416015625, "loss": 0.6286, "rewards/accuracies": 0.625, "rewards/chosen": 0.23269644379615784, "rewards/margins": 0.1559978425502777, "rewards/rejected": 0.07669859379529953, "step": 1326 }, { "epoch": 0.7156532290683565, "grad_norm": 7.156325817108154, "learning_rate": 1.1336671148622184e-07, "logits/chosen": 0.9202764630317688, "logits/rejected": -0.14648792147636414, "logps/chosen": -264.1231994628906, "logps/rejected": -206.59988403320312, "loss": 0.7064, "rewards/accuracies": 0.375, "rewards/chosen": 0.2306446135044098, "rewards/margins": -0.008311750367283821, "rewards/rejected": 0.23895636200904846, "step": 1327 }, { "epoch": 0.7161925306727788, "grad_norm": 6.712767601013184, "learning_rate": 1.1297263640923744e-07, "logits/chosen": -0.012781769037246704, "logits/rejected": -0.3840906620025635, "logps/chosen": -278.3707580566406, "logps/rejected": -196.70094299316406, "loss": 0.7089, "rewards/accuracies": 0.5, "rewards/chosen": 0.2161644846200943, "rewards/margins": -0.00569094717502594, "rewards/rejected": 0.22185544669628143, "step": 1328 }, { "epoch": 0.716731832277201, "grad_norm": 7.357330322265625, "learning_rate": 1.125790474201708e-07, "logits/chosen": 0.06432229280471802, "logits/rejected": -0.11215465515851974, "logps/chosen": -238.46070861816406, "logps/rejected": -230.6365966796875, "loss": 0.6328, "rewards/accuracies": 0.5, "rewards/chosen": 0.34125232696533203, "rewards/margins": 0.14675913751125336, "rewards/rejected": 0.19449320435523987, "step": 1329 }, { "epoch": 0.7172711338816233, "grad_norm": 7.015687465667725, "learning_rate": 1.1218594591523118e-07, "logits/chosen": 0.6511495113372803, "logits/rejected": -0.6759612560272217, "logps/chosen": -292.6010437011719, "logps/rejected": -276.9075012207031, "loss": 0.5816, "rewards/accuracies": 0.875, "rewards/chosen": 0.3640022277832031, "rewards/margins": 0.2423742264509201, "rewards/rejected": 0.12162800133228302, "step": 1330 }, { "epoch": 0.7178104354860456, "grad_norm": 8.359238624572754, "learning_rate": 1.1179333328889811e-07, "logits/chosen": -1.0460834503173828, "logits/rejected": -1.4716030359268188, "logps/chosen": -222.0667266845703, "logps/rejected": -237.23207092285156, "loss": 0.7135, "rewards/accuracies": 0.5, "rewards/chosen": 0.29125043749809265, "rewards/margins": -0.013602830469608307, "rewards/rejected": 0.30485326051712036, "step": 1331 }, { "epoch": 0.7183497370904679, "grad_norm": 6.656169891357422, "learning_rate": 1.1140121093391735e-07, "logits/chosen": 0.631911039352417, "logits/rejected": -1.1169148683547974, "logps/chosen": -205.0841064453125, "logps/rejected": -144.84336853027344, "loss": 0.6141, "rewards/accuracies": 0.75, "rewards/chosen": 0.32371941208839417, "rewards/margins": 0.1791432946920395, "rewards/rejected": 0.14457611739635468, "step": 1332 }, { "epoch": 0.7188890386948901, "grad_norm": 8.161630630493164, "learning_rate": 1.1100958024129515e-07, "logits/chosen": 0.205488920211792, "logits/rejected": 0.12986335158348083, "logps/chosen": -414.39703369140625, "logps/rejected": -360.05914306640625, "loss": 0.7252, "rewards/accuracies": 0.375, "rewards/chosen": 0.2195219099521637, "rewards/margins": -0.055901914834976196, "rewards/rejected": 0.2754238247871399, "step": 1333 }, { "epoch": 0.7194283402993124, "grad_norm": 5.687441349029541, "learning_rate": 1.1061844260029399e-07, "logits/chosen": 0.6232950687408447, "logits/rejected": 0.735084056854248, "logps/chosen": -287.7105407714844, "logps/rejected": -225.91262817382812, "loss": 0.6882, "rewards/accuracies": 0.375, "rewards/chosen": 0.2058342844247818, "rewards/margins": 0.017732243984937668, "rewards/rejected": 0.18810206651687622, "step": 1334 }, { "epoch": 0.7199676419037346, "grad_norm": 6.776767730712891, "learning_rate": 1.1022779939842703e-07, "logits/chosen": 0.9853708744049072, "logits/rejected": 0.2604091465473175, "logps/chosen": -343.759521484375, "logps/rejected": -271.9275817871094, "loss": 0.6209, "rewards/accuracies": 0.75, "rewards/chosen": 0.307956337928772, "rewards/margins": 0.1753009855747223, "rewards/rejected": 0.1326553374528885, "step": 1335 }, { "epoch": 0.7205069435081569, "grad_norm": 7.099707126617432, "learning_rate": 1.0983765202145351e-07, "logits/chosen": 0.41472676396369934, "logits/rejected": 0.36126023530960083, "logps/chosen": -355.32269287109375, "logps/rejected": -294.723876953125, "loss": 0.6304, "rewards/accuracies": 0.875, "rewards/chosen": 0.2991134524345398, "rewards/margins": 0.1486043930053711, "rewards/rejected": 0.1505090594291687, "step": 1336 }, { "epoch": 0.7210462451125792, "grad_norm": 6.683874607086182, "learning_rate": 1.0944800185337396e-07, "logits/chosen": -0.021093226969242096, "logits/rejected": 0.25371843576431274, "logps/chosen": -211.87454223632812, "logps/rejected": -271.14300537109375, "loss": 0.6819, "rewards/accuracies": 0.25, "rewards/chosen": 0.3472291827201843, "rewards/margins": 0.03149089217185974, "rewards/rejected": 0.3157382905483246, "step": 1337 }, { "epoch": 0.7215855467170015, "grad_norm": 6.2249345779418945, "learning_rate": 1.0905885027642483e-07, "logits/chosen": 0.14591333270072937, "logits/rejected": -1.1174235343933105, "logps/chosen": -315.09429931640625, "logps/rejected": -244.73992919921875, "loss": 0.6367, "rewards/accuracies": 0.875, "rewards/chosen": 0.2578480839729309, "rewards/margins": 0.12600211799144745, "rewards/rejected": 0.13184595108032227, "step": 1338 }, { "epoch": 0.7221248483214238, "grad_norm": 7.33906888961792, "learning_rate": 1.0867019867107408e-07, "logits/chosen": 0.2425117790699005, "logits/rejected": -0.45565783977508545, "logps/chosen": -233.02061462402344, "logps/rejected": -234.82701110839844, "loss": 0.6807, "rewards/accuracies": 0.625, "rewards/chosen": 0.1702098846435547, "rewards/margins": 0.03552798926830292, "rewards/rejected": 0.13468191027641296, "step": 1339 }, { "epoch": 0.722664149925846, "grad_norm": 6.93991756439209, "learning_rate": 1.0828204841601607e-07, "logits/chosen": -0.7806396484375, "logits/rejected": -1.6724371910095215, "logps/chosen": -270.618408203125, "logps/rejected": -184.5111083984375, "loss": 0.676, "rewards/accuracies": 0.625, "rewards/chosen": 0.16873396933078766, "rewards/margins": 0.04323539882898331, "rewards/rejected": 0.12549859285354614, "step": 1340 }, { "epoch": 0.7232034515302683, "grad_norm": 8.187214851379395, "learning_rate": 1.0789440088816665e-07, "logits/chosen": -0.0035116076469421387, "logits/rejected": 0.6822943091392517, "logps/chosen": -212.34490966796875, "logps/rejected": -234.7574462890625, "loss": 0.6882, "rewards/accuracies": 0.625, "rewards/chosen": 0.1566387265920639, "rewards/margins": 0.02554769068956375, "rewards/rejected": 0.13109102845191956, "step": 1341 }, { "epoch": 0.7237427531346906, "grad_norm": 7.802231788635254, "learning_rate": 1.0750725746265832e-07, "logits/chosen": 0.5483349561691284, "logits/rejected": 0.7822273373603821, "logps/chosen": -243.4710693359375, "logps/rejected": -288.2770690917969, "loss": 0.726, "rewards/accuracies": 0.375, "rewards/chosen": 0.24895581603050232, "rewards/margins": -0.03311721235513687, "rewards/rejected": 0.2820730209350586, "step": 1342 }, { "epoch": 0.7242820547391129, "grad_norm": 8.606239318847656, "learning_rate": 1.071206195128353e-07, "logits/chosen": 0.5304661989212036, "logits/rejected": 0.3963369131088257, "logps/chosen": -214.0516357421875, "logps/rejected": -275.83807373046875, "loss": 0.6575, "rewards/accuracies": 0.625, "rewards/chosen": 0.266344279050827, "rewards/margins": 0.08506736159324646, "rewards/rejected": 0.18127688765525818, "step": 1343 }, { "epoch": 0.7248213563435352, "grad_norm": 7.732877731323242, "learning_rate": 1.0673448841024874e-07, "logits/chosen": -0.23677700757980347, "logits/rejected": -0.33425796031951904, "logps/chosen": -283.24871826171875, "logps/rejected": -262.9806823730469, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": 0.33177700638771057, "rewards/margins": 0.1655716896057129, "rewards/rejected": 0.16620531678199768, "step": 1344 }, { "epoch": 0.7253606579479573, "grad_norm": 6.068892955780029, "learning_rate": 1.063488655246518e-07, "logits/chosen": 0.16082829236984253, "logits/rejected": -0.0977558046579361, "logps/chosen": -217.8330535888672, "logps/rejected": -199.903076171875, "loss": 0.6592, "rewards/accuracies": 0.5, "rewards/chosen": 0.2986106872558594, "rewards/margins": 0.09268254041671753, "rewards/rejected": 0.20592813193798065, "step": 1345 }, { "epoch": 0.7258999595523796, "grad_norm": 8.43028450012207, "learning_rate": 1.059637522239949e-07, "logits/chosen": -0.020685642957687378, "logits/rejected": -0.2417331039905548, "logps/chosen": -270.5685729980469, "logps/rejected": -285.7800598144531, "loss": 0.6378, "rewards/accuracies": 0.75, "rewards/chosen": 0.3249289393424988, "rewards/margins": 0.12375202775001526, "rewards/rejected": 0.20117692649364471, "step": 1346 }, { "epoch": 0.7264392611568019, "grad_norm": 7.054600238800049, "learning_rate": 1.0557914987442045e-07, "logits/chosen": 0.19460251927375793, "logits/rejected": -0.7903667688369751, "logps/chosen": -213.15545654296875, "logps/rejected": -203.59738159179688, "loss": 0.6371, "rewards/accuracies": 0.75, "rewards/chosen": 0.2782992422580719, "rewards/margins": 0.1271061897277832, "rewards/rejected": 0.1511930525302887, "step": 1347 }, { "epoch": 0.7269785627612242, "grad_norm": 8.594265937805176, "learning_rate": 1.0519505984025864e-07, "logits/chosen": -0.7918789982795715, "logits/rejected": -0.12080623209476471, "logps/chosen": -214.57528686523438, "logps/rejected": -202.79110717773438, "loss": 0.7025, "rewards/accuracies": 0.625, "rewards/chosen": 0.19172364473342896, "rewards/margins": -0.00016260147094726562, "rewards/rejected": 0.19188624620437622, "step": 1348 }, { "epoch": 0.7275178643656465, "grad_norm": 8.053592681884766, "learning_rate": 1.0481148348402222e-07, "logits/chosen": 1.1605721712112427, "logits/rejected": -0.03933970630168915, "logps/chosen": -263.39398193359375, "logps/rejected": -246.0771484375, "loss": 0.6229, "rewards/accuracies": 0.75, "rewards/chosen": 0.3033917546272278, "rewards/margins": 0.15400390326976776, "rewards/rejected": 0.14938783645629883, "step": 1349 }, { "epoch": 0.7280571659700688, "grad_norm": 8.878920555114746, "learning_rate": 1.0442842216640166e-07, "logits/chosen": -1.0421099662780762, "logits/rejected": 0.13742661476135254, "logps/chosen": -254.75341796875, "logps/rejected": -324.3850402832031, "loss": 0.7312, "rewards/accuracies": 0.5, "rewards/chosen": 0.1654689759016037, "rewards/margins": -0.06553001701831818, "rewards/rejected": 0.23099899291992188, "step": 1350 }, { "epoch": 0.728596467574491, "grad_norm": 6.5738325119018555, "learning_rate": 1.0404587724626044e-07, "logits/chosen": 0.614402174949646, "logits/rejected": 0.32547125220298767, "logps/chosen": -250.16958618164062, "logps/rejected": -255.8306427001953, "loss": 0.6223, "rewards/accuracies": 0.75, "rewards/chosen": 0.3388586938381195, "rewards/margins": 0.1686018854379654, "rewards/rejected": 0.1702568084001541, "step": 1351 }, { "epoch": 0.7291357691789133, "grad_norm": 7.992438793182373, "learning_rate": 1.0366385008063014e-07, "logits/chosen": 0.2748405933380127, "logits/rejected": -1.6568009853363037, "logps/chosen": -345.33123779296875, "logps/rejected": -211.19908142089844, "loss": 0.5884, "rewards/accuracies": 0.875, "rewards/chosen": 0.3164100646972656, "rewards/margins": 0.2330358475446701, "rewards/rejected": 0.08337420970201492, "step": 1352 }, { "epoch": 0.7296750707833356, "grad_norm": 6.651468276977539, "learning_rate": 1.0328234202470574e-07, "logits/chosen": 0.4681311249732971, "logits/rejected": -0.2595019042491913, "logps/chosen": -230.1802215576172, "logps/rejected": -205.39389038085938, "loss": 0.6342, "rewards/accuracies": 0.5, "rewards/chosen": 0.21050338447093964, "rewards/margins": 0.13615551590919495, "rewards/rejected": 0.07434788346290588, "step": 1353 }, { "epoch": 0.7302143723877579, "grad_norm": 6.525243759155273, "learning_rate": 1.0290135443184067e-07, "logits/chosen": 0.5196101069450378, "logits/rejected": -0.9016942977905273, "logps/chosen": -181.7476043701172, "logps/rejected": -144.55703735351562, "loss": 0.673, "rewards/accuracies": 0.5, "rewards/chosen": 0.15419907867908478, "rewards/margins": 0.05385570973157883, "rewards/rejected": 0.10034336894750595, "step": 1354 }, { "epoch": 0.7307536739921802, "grad_norm": 7.345156669616699, "learning_rate": 1.0252088865354222e-07, "logits/chosen": -0.3604549765586853, "logits/rejected": -0.38349661231040955, "logps/chosen": -191.9664306640625, "logps/rejected": -166.34376525878906, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": 0.29479292035102844, "rewards/margins": 0.24406062066555023, "rewards/rejected": 0.0507323332130909, "step": 1355 }, { "epoch": 0.7312929755966024, "grad_norm": 7.623080730438232, "learning_rate": 1.021409460394663e-07, "logits/chosen": 1.2286925315856934, "logits/rejected": 0.48738789558410645, "logps/chosen": -279.7274169921875, "logps/rejected": -230.98297119140625, "loss": 0.6106, "rewards/accuracies": 0.75, "rewards/chosen": 0.29063376784324646, "rewards/margins": 0.18760518729686737, "rewards/rejected": 0.10302858054637909, "step": 1356 }, { "epoch": 0.7318322772010247, "grad_norm": 7.4196014404296875, "learning_rate": 1.0176152793741324e-07, "logits/chosen": 1.2370193004608154, "logits/rejected": 0.4533580541610718, "logps/chosen": -379.0828552246094, "logps/rejected": -261.73394775390625, "loss": 0.7292, "rewards/accuracies": 0.25, "rewards/chosen": 0.27712422609329224, "rewards/margins": -0.04743403568863869, "rewards/rejected": 0.3245582580566406, "step": 1357 }, { "epoch": 0.7323715788054469, "grad_norm": 7.965549468994141, "learning_rate": 1.0138263569332267e-07, "logits/chosen": 0.034255251288414, "logits/rejected": -0.47566333413124084, "logps/chosen": -281.9853515625, "logps/rejected": -239.76220703125, "loss": 0.634, "rewards/accuracies": 0.625, "rewards/chosen": 0.22173500061035156, "rewards/margins": 0.13534802198410034, "rewards/rejected": 0.08638697117567062, "step": 1358 }, { "epoch": 0.7329108804098692, "grad_norm": 6.554652690887451, "learning_rate": 1.0100427065126874e-07, "logits/chosen": 0.824408769607544, "logits/rejected": -0.36087796092033386, "logps/chosen": -220.3868865966797, "logps/rejected": -188.98382568359375, "loss": 0.6421, "rewards/accuracies": 0.5, "rewards/chosen": 0.2485463172197342, "rewards/margins": 0.1378704011440277, "rewards/rejected": 0.11067591607570648, "step": 1359 }, { "epoch": 0.7334501820142915, "grad_norm": 6.4781646728515625, "learning_rate": 1.0062643415345545e-07, "logits/chosen": 0.906370997428894, "logits/rejected": -0.24737659096717834, "logps/chosen": -204.678466796875, "logps/rejected": -166.45260620117188, "loss": 0.6586, "rewards/accuracies": 0.5, "rewards/chosen": 0.24765563011169434, "rewards/margins": 0.09181571006774902, "rewards/rejected": 0.1558399200439453, "step": 1360 }, { "epoch": 0.7339894836187137, "grad_norm": 7.610514163970947, "learning_rate": 1.002491275402119e-07, "logits/chosen": 0.7217711806297302, "logits/rejected": 0.8912860155105591, "logps/chosen": -242.14776611328125, "logps/rejected": -239.1456756591797, "loss": 0.6933, "rewards/accuracies": 0.625, "rewards/chosen": 0.26219987869262695, "rewards/margins": 0.015446186065673828, "rewards/rejected": 0.24675369262695312, "step": 1361 }, { "epoch": 0.734528785223136, "grad_norm": 6.387144088745117, "learning_rate": 9.987235214998741e-08, "logits/chosen": -0.447578102350235, "logits/rejected": -1.2165837287902832, "logps/chosen": -217.4945068359375, "logps/rejected": -178.41514587402344, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": 0.18784251809120178, "rewards/margins": 0.12238411605358124, "rewards/rejected": 0.06545839458703995, "step": 1362 }, { "epoch": 0.7350680868275583, "grad_norm": 7.788753509521484, "learning_rate": 9.949610931934684e-08, "logits/chosen": -0.06434629112482071, "logits/rejected": -1.1706981658935547, "logps/chosen": -279.1871337890625, "logps/rejected": -200.81991577148438, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": 0.21616040170192719, "rewards/margins": -0.02350768819451332, "rewards/rejected": 0.2396680861711502, "step": 1363 }, { "epoch": 0.7356073884319806, "grad_norm": 6.775146484375, "learning_rate": 9.912040038296599e-08, "logits/chosen": 1.3267977237701416, "logits/rejected": 1.3553818464279175, "logps/chosen": -240.43701171875, "logps/rejected": -209.1160888671875, "loss": 0.7388, "rewards/accuracies": 0.5, "rewards/chosen": 0.1689978688955307, "rewards/margins": -0.08484792709350586, "rewards/rejected": 0.25384578108787537, "step": 1364 }, { "epoch": 0.7361466900364029, "grad_norm": 6.455729007720947, "learning_rate": 9.874522667362659e-08, "logits/chosen": 0.7125237584114075, "logits/rejected": -0.5146552324295044, "logps/chosen": -254.67147827148438, "logps/rejected": -187.05760192871094, "loss": 0.6412, "rewards/accuracies": 0.75, "rewards/chosen": 0.2060547024011612, "rewards/margins": 0.11621876060962677, "rewards/rejected": 0.08983592689037323, "step": 1365 }, { "epoch": 0.7366859916408252, "grad_norm": 7.306731700897217, "learning_rate": 9.83705895222118e-08, "logits/chosen": -0.043071165680885315, "logits/rejected": -0.4577459692955017, "logps/chosen": -506.15423583984375, "logps/rejected": -262.582763671875, "loss": 0.6996, "rewards/accuracies": 0.375, "rewards/chosen": 0.1608007550239563, "rewards/margins": -0.010656258091330528, "rewards/rejected": 0.17145700752735138, "step": 1366 }, { "epoch": 0.7372252932452474, "grad_norm": 7.870561122894287, "learning_rate": 9.799649025770135e-08, "logits/chosen": 0.9552145600318909, "logits/rejected": -0.36317598819732666, "logps/chosen": -353.52520751953125, "logps/rejected": -241.82896423339844, "loss": 0.6724, "rewards/accuracies": 0.75, "rewards/chosen": 0.22285252809524536, "rewards/margins": 0.05596313625574112, "rewards/rejected": 0.16688938438892365, "step": 1367 }, { "epoch": 0.7377645948496697, "grad_norm": 7.238237380981445, "learning_rate": 9.762293020716695e-08, "logits/chosen": -0.4897254705429077, "logits/rejected": -1.1011886596679688, "logps/chosen": -252.82437133789062, "logps/rejected": -272.1824035644531, "loss": 0.6673, "rewards/accuracies": 0.625, "rewards/chosen": 0.21555271744728088, "rewards/margins": 0.05801697075366974, "rewards/rejected": 0.15753574669361115, "step": 1368 }, { "epoch": 0.738303896454092, "grad_norm": 7.498645782470703, "learning_rate": 9.724991069576744e-08, "logits/chosen": 0.25413861870765686, "logits/rejected": 0.9985327124595642, "logps/chosen": -244.02476501464844, "logps/rejected": -229.99285888671875, "loss": 0.7824, "rewards/accuracies": 0.375, "rewards/chosen": 0.17121200263500214, "rewards/margins": -0.15887461602687836, "rewards/rejected": 0.3300866186618805, "step": 1369 }, { "epoch": 0.7388431980585142, "grad_norm": 7.090149402618408, "learning_rate": 9.68774330467442e-08, "logits/chosen": 0.3981061577796936, "logits/rejected": -0.27455711364746094, "logps/chosen": -302.6665344238281, "logps/rejected": -274.79937744140625, "loss": 0.6581, "rewards/accuracies": 0.75, "rewards/chosen": 0.28012391924858093, "rewards/margins": 0.07637844979763031, "rewards/rejected": 0.20374546945095062, "step": 1370 }, { "epoch": 0.7393824996629365, "grad_norm": 5.919923305511475, "learning_rate": 9.650549858141646e-08, "logits/chosen": 0.3078945577144623, "logits/rejected": -0.7213093638420105, "logps/chosen": -175.2801971435547, "logps/rejected": -131.47605895996094, "loss": 0.6576, "rewards/accuracies": 0.625, "rewards/chosen": 0.23090972006320953, "rewards/margins": 0.0830373764038086, "rewards/rejected": 0.14787235856056213, "step": 1371 }, { "epoch": 0.7399218012673587, "grad_norm": 6.190971851348877, "learning_rate": 9.613410861917659e-08, "logits/chosen": -0.5219405293464661, "logits/rejected": -0.4237954616546631, "logps/chosen": -274.0438232421875, "logps/rejected": -273.06292724609375, "loss": 0.6765, "rewards/accuracies": 0.5, "rewards/chosen": 0.30806541442871094, "rewards/margins": 0.06075438857078552, "rewards/rejected": 0.24731102585792542, "step": 1372 }, { "epoch": 0.740461102871781, "grad_norm": 7.456714153289795, "learning_rate": 9.576326447748518e-08, "logits/chosen": 0.26319271326065063, "logits/rejected": -0.8220334053039551, "logps/chosen": -221.6479034423828, "logps/rejected": -147.8134307861328, "loss": 0.6399, "rewards/accuracies": 0.75, "rewards/chosen": 0.2718687951564789, "rewards/margins": 0.15731993317604065, "rewards/rejected": 0.11454888433218002, "step": 1373 }, { "epoch": 0.7410004044762033, "grad_norm": 7.888545036315918, "learning_rate": 9.539296747186678e-08, "logits/chosen": -1.4066497087478638, "logits/rejected": -0.19263961911201477, "logps/chosen": -147.88612365722656, "logps/rejected": -171.45909118652344, "loss": 0.7144, "rewards/accuracies": 0.625, "rewards/chosen": 0.126926988363266, "rewards/margins": -6.207823753356934e-05, "rewards/rejected": 0.12698908150196075, "step": 1374 }, { "epoch": 0.7415397060806256, "grad_norm": 7.284392356872559, "learning_rate": 9.502321891590512e-08, "logits/chosen": -0.43479257822036743, "logits/rejected": -0.9653462171554565, "logps/chosen": -262.4296569824219, "logps/rejected": -201.35113525390625, "loss": 0.6502, "rewards/accuracies": 0.625, "rewards/chosen": 0.33451902866363525, "rewards/margins": 0.10569381713867188, "rewards/rejected": 0.228825181722641, "step": 1375 }, { "epoch": 0.7420790076850479, "grad_norm": 8.148335456848145, "learning_rate": 9.465402012123816e-08, "logits/chosen": 0.13281643390655518, "logits/rejected": -0.3785475492477417, "logps/chosen": -201.5509490966797, "logps/rejected": -193.915771484375, "loss": 0.6307, "rewards/accuracies": 0.875, "rewards/chosen": 0.2863731384277344, "rewards/margins": 0.1372075080871582, "rewards/rejected": 0.14916563034057617, "step": 1376 }, { "epoch": 0.7426183092894701, "grad_norm": 7.647682189941406, "learning_rate": 9.42853723975538e-08, "logits/chosen": -0.6469489336013794, "logits/rejected": -1.7213470935821533, "logps/chosen": -202.96914672851562, "logps/rejected": -200.67721557617188, "loss": 0.6552, "rewards/accuracies": 0.75, "rewards/chosen": 0.22307053208351135, "rewards/margins": 0.09154300391674042, "rewards/rejected": 0.13152751326560974, "step": 1377 }, { "epoch": 0.7431576108938924, "grad_norm": 6.3903374671936035, "learning_rate": 9.391727705258502e-08, "logits/chosen": -0.650823175907135, "logits/rejected": -0.41075584292411804, "logps/chosen": -202.33970642089844, "logps/rejected": -224.35723876953125, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": 0.19508543610572815, "rewards/margins": 0.003872588276863098, "rewards/rejected": 0.19121284782886505, "step": 1378 }, { "epoch": 0.7436969124983147, "grad_norm": 7.088081359863281, "learning_rate": 9.354973539210531e-08, "logits/chosen": 0.588981032371521, "logits/rejected": 0.34445035457611084, "logps/chosen": -263.52545166015625, "logps/rejected": -268.9878234863281, "loss": 0.7395, "rewards/accuracies": 0.25, "rewards/chosen": 0.17399626970291138, "rewards/margins": -0.08452969044446945, "rewards/rejected": 0.2585259675979614, "step": 1379 }, { "epoch": 0.744236214102737, "grad_norm": 9.238449096679688, "learning_rate": 9.318274871992407e-08, "logits/chosen": 1.369771957397461, "logits/rejected": -0.7691029906272888, "logps/chosen": -270.6404113769531, "logps/rejected": -218.12680053710938, "loss": 0.7351, "rewards/accuracies": 0.625, "rewards/chosen": 0.13136214017868042, "rewards/margins": -0.05881224572658539, "rewards/rejected": 0.190174400806427, "step": 1380 }, { "epoch": 0.7447755157071593, "grad_norm": 7.624963283538818, "learning_rate": 9.281631833788195e-08, "logits/chosen": -0.14307284355163574, "logits/rejected": -0.46436822414398193, "logps/chosen": -320.2354736328125, "logps/rejected": -320.87481689453125, "loss": 0.6608, "rewards/accuracies": 0.5, "rewards/chosen": 0.257686048746109, "rewards/margins": 0.07562046498060226, "rewards/rejected": 0.18206559121608734, "step": 1381 }, { "epoch": 0.7453148173115816, "grad_norm": 7.487799167633057, "learning_rate": 9.245044554584608e-08, "logits/chosen": -0.5194365382194519, "logits/rejected": 0.2307802140712738, "logps/chosen": -221.33306884765625, "logps/rejected": -217.1483154296875, "loss": 0.7036, "rewards/accuracies": 0.5, "rewards/chosen": 0.19499072432518005, "rewards/margins": -0.009496975690126419, "rewards/rejected": 0.20448771119117737, "step": 1382 }, { "epoch": 0.7458541189160037, "grad_norm": 7.699361801147461, "learning_rate": 9.208513164170579e-08, "logits/chosen": 0.8075704574584961, "logits/rejected": -0.4389949440956116, "logps/chosen": -248.39987182617188, "logps/rejected": -212.53765869140625, "loss": 0.6239, "rewards/accuracies": 0.75, "rewards/chosen": 0.26230868697166443, "rewards/margins": 0.1562459021806717, "rewards/rejected": 0.10606279224157333, "step": 1383 }, { "epoch": 0.746393420520426, "grad_norm": 7.164297580718994, "learning_rate": 9.172037792136772e-08, "logits/chosen": 0.22875043749809265, "logits/rejected": -0.17523184418678284, "logps/chosen": -354.04107666015625, "logps/rejected": -305.5180358886719, "loss": 0.6156, "rewards/accuracies": 0.625, "rewards/chosen": 0.3072393536567688, "rewards/margins": 0.193263441324234, "rewards/rejected": 0.1139759048819542, "step": 1384 }, { "epoch": 0.7469327221248483, "grad_norm": 8.234734535217285, "learning_rate": 9.135618567875139e-08, "logits/chosen": 0.1009921059012413, "logits/rejected": -0.09950077533721924, "logps/chosen": -259.0480651855469, "logps/rejected": -232.0410919189453, "loss": 0.7908, "rewards/accuracies": 0.125, "rewards/chosen": 0.13988018035888672, "rewards/margins": -0.18109989166259766, "rewards/rejected": 0.3209800720214844, "step": 1385 }, { "epoch": 0.7474720237292706, "grad_norm": 6.302849769592285, "learning_rate": 9.09925562057845e-08, "logits/chosen": 0.0654134452342987, "logits/rejected": -0.7046445608139038, "logps/chosen": -270.4934997558594, "logps/rejected": -247.03549194335938, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": 0.36374664306640625, "rewards/margins": 0.09497575461864471, "rewards/rejected": 0.26877087354660034, "step": 1386 }, { "epoch": 0.7480113253336929, "grad_norm": 7.010738372802734, "learning_rate": 9.062949079239842e-08, "logits/chosen": -0.5017832517623901, "logits/rejected": -1.3834574222564697, "logps/chosen": -256.6371765136719, "logps/rejected": -297.2452697753906, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": 0.27163830399513245, "rewards/margins": 0.1087687611579895, "rewards/rejected": 0.16286954283714294, "step": 1387 }, { "epoch": 0.7485506269381151, "grad_norm": 8.812493324279785, "learning_rate": 9.026699072652361e-08, "logits/chosen": -0.5580148100852966, "logits/rejected": 0.8602297306060791, "logps/chosen": -163.98448181152344, "logps/rejected": -231.29562377929688, "loss": 0.7649, "rewards/accuracies": 0.375, "rewards/chosen": 0.13821421563625336, "rewards/margins": -0.12189294397830963, "rewards/rejected": 0.260107159614563, "step": 1388 }, { "epoch": 0.7490899285425374, "grad_norm": 7.36083459854126, "learning_rate": 8.990505729408493e-08, "logits/chosen": 0.176707461476326, "logits/rejected": 0.3785398006439209, "logps/chosen": -238.62832641601562, "logps/rejected": -297.7940368652344, "loss": 0.7284, "rewards/accuracies": 0.5, "rewards/chosen": 0.20243024826049805, "rewards/margins": -0.06113366782665253, "rewards/rejected": 0.26356393098831177, "step": 1389 }, { "epoch": 0.7496292301469597, "grad_norm": 6.753448009490967, "learning_rate": 8.954369177899726e-08, "logits/chosen": 0.27699413895606995, "logits/rejected": -0.8269992470741272, "logps/chosen": -240.7025146484375, "logps/rejected": -172.64315795898438, "loss": 0.6602, "rewards/accuracies": 0.75, "rewards/chosen": 0.17663250863552094, "rewards/margins": 0.09700041264295578, "rewards/rejected": 0.07963208854198456, "step": 1390 }, { "epoch": 0.750168531751382, "grad_norm": 6.424726963043213, "learning_rate": 8.918289546316097e-08, "logits/chosen": 0.8049682378768921, "logits/rejected": -0.8055755496025085, "logps/chosen": -243.52700805664062, "logps/rejected": -229.1033477783203, "loss": 0.6081, "rewards/accuracies": 0.75, "rewards/chosen": 0.3167659640312195, "rewards/margins": 0.19316531717777252, "rewards/rejected": 0.12360067665576935, "step": 1391 }, { "epoch": 0.7507078333558043, "grad_norm": 7.720163345336914, "learning_rate": 8.882266962645693e-08, "logits/chosen": -0.012236826121807098, "logits/rejected": -0.31428760290145874, "logps/chosen": -211.06881713867188, "logps/rejected": -241.81396484375, "loss": 0.6759, "rewards/accuracies": 0.625, "rewards/chosen": 0.3163747191429138, "rewards/margins": 0.042090609669685364, "rewards/rejected": 0.27428409457206726, "step": 1392 }, { "epoch": 0.7512471349602265, "grad_norm": 8.098331451416016, "learning_rate": 8.846301554674251e-08, "logits/chosen": 0.1881575584411621, "logits/rejected": -0.023630261421203613, "logps/chosen": -201.0306396484375, "logps/rejected": -190.0765380859375, "loss": 0.7848, "rewards/accuracies": 0.25, "rewards/chosen": 0.13467693328857422, "rewards/margins": -0.16214028000831604, "rewards/rejected": 0.29681721329689026, "step": 1393 }, { "epoch": 0.7517864365646488, "grad_norm": 6.93440580368042, "learning_rate": 8.810393449984704e-08, "logits/chosen": 0.7333905696868896, "logits/rejected": -0.4584701359272003, "logps/chosen": -214.11019897460938, "logps/rejected": -226.06471252441406, "loss": 0.6171, "rewards/accuracies": 0.625, "rewards/chosen": 0.2738809585571289, "rewards/margins": 0.17640724778175354, "rewards/rejected": 0.09747371822595596, "step": 1394 }, { "epoch": 0.752325738169071, "grad_norm": 6.6061882972717285, "learning_rate": 8.774542775956679e-08, "logits/chosen": -0.826754629611969, "logits/rejected": -0.3314387798309326, "logps/chosen": -191.9614715576172, "logps/rejected": -196.91964721679688, "loss": 0.5991, "rewards/accuracies": 0.75, "rewards/chosen": 0.2534659206867218, "rewards/margins": 0.218978613615036, "rewards/rejected": 0.03448734059929848, "step": 1395 }, { "epoch": 0.7528650397734933, "grad_norm": 7.418744087219238, "learning_rate": 8.738749659766085e-08, "logits/chosen": 0.5656816959381104, "logits/rejected": -0.03998098522424698, "logps/chosen": -254.67332458496094, "logps/rejected": -285.6153564453125, "loss": 0.6415, "rewards/accuracies": 0.75, "rewards/chosen": 0.3324945569038391, "rewards/margins": 0.12411557137966156, "rewards/rejected": 0.20837897062301636, "step": 1396 }, { "epoch": 0.7534043413779156, "grad_norm": 8.945199966430664, "learning_rate": 8.70301422838465e-08, "logits/chosen": -1.1599764823913574, "logits/rejected": -0.5134150981903076, "logps/chosen": -251.00790405273438, "logps/rejected": -266.7567138671875, "loss": 0.695, "rewards/accuracies": 0.5, "rewards/chosen": 0.145392045378685, "rewards/margins": 0.00656919926404953, "rewards/rejected": 0.13882285356521606, "step": 1397 }, { "epoch": 0.7539436429823378, "grad_norm": 7.2838239669799805, "learning_rate": 8.667336608579487e-08, "logits/chosen": -0.27174869179725647, "logits/rejected": -0.5072461366653442, "logps/chosen": -258.10394287109375, "logps/rejected": -253.23919677734375, "loss": 0.6904, "rewards/accuracies": 0.625, "rewards/chosen": 0.2196463644504547, "rewards/margins": 0.011559583246707916, "rewards/rejected": 0.2080867737531662, "step": 1398 }, { "epoch": 0.7544829445867601, "grad_norm": 6.5754547119140625, "learning_rate": 8.631716926912591e-08, "logits/chosen": 0.650461733341217, "logits/rejected": 0.9231566190719604, "logps/chosen": -163.17181396484375, "logps/rejected": -174.20289611816406, "loss": 0.6711, "rewards/accuracies": 0.375, "rewards/chosen": 0.22486992180347443, "rewards/margins": 0.06099052354693413, "rewards/rejected": 0.16387939453125, "step": 1399 }, { "epoch": 0.7550222461911824, "grad_norm": 6.437112808227539, "learning_rate": 8.596155309740469e-08, "logits/chosen": 0.06093952804803848, "logits/rejected": -0.5894863605499268, "logps/chosen": -256.7097473144531, "logps/rejected": -247.5192413330078, "loss": 0.6651, "rewards/accuracies": 0.375, "rewards/chosen": 0.306488037109375, "rewards/margins": 0.06481695920228958, "rewards/rejected": 0.24167108535766602, "step": 1400 }, { "epoch": 0.7555615477956047, "grad_norm": 7.6897149085998535, "learning_rate": 8.560651883213632e-08, "logits/chosen": -0.5203779935836792, "logits/rejected": -1.1673649549484253, "logps/chosen": -247.29888916015625, "logps/rejected": -243.68052673339844, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": 0.3211645185947418, "rewards/margins": 0.07850837707519531, "rewards/rejected": 0.2426561415195465, "step": 1401 }, { "epoch": 0.756100849400027, "grad_norm": 5.363736629486084, "learning_rate": 8.525206773276172e-08, "logits/chosen": 0.8216671347618103, "logits/rejected": -0.6621363162994385, "logps/chosen": -223.57530212402344, "logps/rejected": -149.89193725585938, "loss": 0.6427, "rewards/accuracies": 0.75, "rewards/chosen": 0.2525984048843384, "rewards/margins": 0.11178398132324219, "rewards/rejected": 0.1408143937587738, "step": 1402 }, { "epoch": 0.7566401510044493, "grad_norm": 6.977097511291504, "learning_rate": 8.489820105665307e-08, "logits/chosen": 0.5382859110832214, "logits/rejected": -0.33003753423690796, "logps/chosen": -215.10040283203125, "logps/rejected": -161.07656860351562, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": 0.2549822926521301, "rewards/margins": 0.06412382423877716, "rewards/rejected": 0.19085845351219177, "step": 1403 }, { "epoch": 0.7571794526088715, "grad_norm": 15.635050773620605, "learning_rate": 8.454492005910941e-08, "logits/chosen": -0.36054497957229614, "logits/rejected": -1.5838687419891357, "logps/chosen": -319.12054443359375, "logps/rejected": -245.77951049804688, "loss": 0.649, "rewards/accuracies": 0.625, "rewards/chosen": 0.2707436680793762, "rewards/margins": 0.10531435906887054, "rewards/rejected": 0.16542932391166687, "step": 1404 }, { "epoch": 0.7577187542132938, "grad_norm": 7.55141544342041, "learning_rate": 8.41922259933521e-08, "logits/chosen": 0.5272781848907471, "logits/rejected": 0.13479843735694885, "logps/chosen": -198.82101440429688, "logps/rejected": -233.26303100585938, "loss": 0.5937, "rewards/accuracies": 0.75, "rewards/chosen": 0.3474300503730774, "rewards/margins": 0.2275480329990387, "rewards/rejected": 0.1198820173740387, "step": 1405 }, { "epoch": 0.7582580558177161, "grad_norm": 7.873669147491455, "learning_rate": 8.384012011052053e-08, "logits/chosen": -0.09347753971815109, "logits/rejected": -1.5546801090240479, "logps/chosen": -244.65420532226562, "logps/rejected": -144.35572814941406, "loss": 0.6501, "rewards/accuracies": 0.75, "rewards/chosen": 0.2294166535139084, "rewards/margins": 0.09483051300048828, "rewards/rejected": 0.1345861405134201, "step": 1406 }, { "epoch": 0.7587973574221384, "grad_norm": 6.595964431762695, "learning_rate": 8.34886036596676e-08, "logits/chosen": 0.2712917625904083, "logits/rejected": -0.17333661019802094, "logps/chosen": -247.95452880859375, "logps/rejected": -305.45001220703125, "loss": 0.6603, "rewards/accuracies": 0.75, "rewards/chosen": 0.3232812285423279, "rewards/margins": 0.08142424374818802, "rewards/rejected": 0.24185696244239807, "step": 1407 }, { "epoch": 0.7593366590265606, "grad_norm": 6.568453311920166, "learning_rate": 8.313767788775498e-08, "logits/chosen": -0.29226359724998474, "logits/rejected": 0.15315616130828857, "logps/chosen": -220.67391967773438, "logps/rejected": -279.4277038574219, "loss": 0.6481, "rewards/accuracies": 0.75, "rewards/chosen": 0.32994937896728516, "rewards/margins": 0.09874744713306427, "rewards/rejected": 0.23120194673538208, "step": 1408 }, { "epoch": 0.7598759606309828, "grad_norm": 7.845480442047119, "learning_rate": 8.27873440396493e-08, "logits/chosen": 1.2851179838180542, "logits/rejected": 0.3845193088054657, "logps/chosen": -200.94293212890625, "logps/rejected": -165.29812622070312, "loss": 0.6681, "rewards/accuracies": 0.5, "rewards/chosen": 0.2103586196899414, "rewards/margins": 0.06525488197803497, "rewards/rejected": 0.14510375261306763, "step": 1409 }, { "epoch": 0.7604152622354051, "grad_norm": 7.060963153839111, "learning_rate": 8.243760335811734e-08, "logits/chosen": 1.6224415302276611, "logits/rejected": 0.4861796498298645, "logps/chosen": -224.0515594482422, "logps/rejected": -184.25613403320312, "loss": 0.6133, "rewards/accuracies": 0.75, "rewards/chosen": 0.26217538118362427, "rewards/margins": 0.1875239908695221, "rewards/rejected": 0.07465139031410217, "step": 1410 }, { "epoch": 0.7609545638398274, "grad_norm": 6.742592811584473, "learning_rate": 8.208845708382162e-08, "logits/chosen": -0.33583447337150574, "logits/rejected": -0.9555658102035522, "logps/chosen": -235.91091918945312, "logps/rejected": -221.1014404296875, "loss": 0.6602, "rewards/accuracies": 0.625, "rewards/chosen": 0.2191978543996811, "rewards/margins": 0.08270206302404404, "rewards/rejected": 0.13649578392505646, "step": 1411 }, { "epoch": 0.7614938654442497, "grad_norm": 7.542926788330078, "learning_rate": 8.173990645531612e-08, "logits/chosen": -0.45314040780067444, "logits/rejected": 0.6713016033172607, "logps/chosen": -180.01248168945312, "logps/rejected": -263.86553955078125, "loss": 0.7421, "rewards/accuracies": 0.125, "rewards/chosen": 0.17482557892799377, "rewards/margins": -0.0703967958688736, "rewards/rejected": 0.24522238969802856, "step": 1412 }, { "epoch": 0.762033167048672, "grad_norm": 7.889143466949463, "learning_rate": 8.139195270904181e-08, "logits/chosen": 0.1813853532075882, "logits/rejected": 0.34331950545310974, "logps/chosen": -261.51483154296875, "logps/rejected": -341.5196228027344, "loss": 0.701, "rewards/accuracies": 0.375, "rewards/chosen": 0.201878160238266, "rewards/margins": 0.004114435985684395, "rewards/rejected": 0.19776374101638794, "step": 1413 }, { "epoch": 0.7625724686530942, "grad_norm": 8.507914543151855, "learning_rate": 8.104459707932238e-08, "logits/chosen": -1.5884288549423218, "logits/rejected": -0.33574286103248596, "logps/chosen": -195.80557250976562, "logps/rejected": -230.83895874023438, "loss": 0.7587, "rewards/accuracies": 0.5, "rewards/chosen": 0.08599080890417099, "rewards/margins": -0.1055041253566742, "rewards/rejected": 0.19149494171142578, "step": 1414 }, { "epoch": 0.7631117702575165, "grad_norm": 8.0756254196167, "learning_rate": 8.069784079835964e-08, "logits/chosen": 1.2844409942626953, "logits/rejected": 0.5785068869590759, "logps/chosen": -331.93853759765625, "logps/rejected": -278.45513916015625, "loss": 0.7269, "rewards/accuracies": 0.625, "rewards/chosen": 0.30523234605789185, "rewards/margins": -0.05907468870282173, "rewards/rejected": 0.3643070459365845, "step": 1415 }, { "epoch": 0.7636510718619388, "grad_norm": 8.265884399414062, "learning_rate": 8.035168509622947e-08, "logits/chosen": 0.4877611994743347, "logits/rejected": -0.7051863670349121, "logps/chosen": -222.9419403076172, "logps/rejected": -178.67596435546875, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": 0.28072795271873474, "rewards/margins": 0.14806443452835083, "rewards/rejected": 0.1326635330915451, "step": 1416 }, { "epoch": 0.7641903734663611, "grad_norm": 7.908290386199951, "learning_rate": 8.000613120087698e-08, "logits/chosen": -0.0682934820652008, "logits/rejected": -0.5835909247398376, "logps/chosen": -172.6353759765625, "logps/rejected": -153.94566345214844, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.18115893006324768, "rewards/margins": 0.0315668061375618, "rewards/rejected": 0.14959211647510529, "step": 1417 }, { "epoch": 0.7647296750707834, "grad_norm": 7.6476593017578125, "learning_rate": 7.96611803381127e-08, "logits/chosen": -0.20751681923866272, "logits/rejected": -0.9908617734909058, "logps/chosen": -159.70835876464844, "logps/rejected": -173.8741912841797, "loss": 0.626, "rewards/accuracies": 0.625, "rewards/chosen": 0.2447761595249176, "rewards/margins": 0.16534072160720825, "rewards/rejected": 0.07943543791770935, "step": 1418 }, { "epoch": 0.7652689766752057, "grad_norm": 7.983841419219971, "learning_rate": 7.931683373160788e-08, "logits/chosen": -0.2537631094455719, "logits/rejected": -0.6095055937767029, "logps/chosen": -290.51751708984375, "logps/rejected": -260.91851806640625, "loss": 0.6691, "rewards/accuracies": 0.625, "rewards/chosen": 0.24907809495925903, "rewards/margins": 0.06089086830615997, "rewards/rejected": 0.18818722665309906, "step": 1419 }, { "epoch": 0.7658082782796278, "grad_norm": 7.055339813232422, "learning_rate": 7.897309260289026e-08, "logits/chosen": 1.2252767086029053, "logits/rejected": 0.6565641164779663, "logps/chosen": -217.9817657470703, "logps/rejected": -193.72866821289062, "loss": 0.6687, "rewards/accuracies": 0.625, "rewards/chosen": 0.198161318898201, "rewards/margins": 0.07464122027158737, "rewards/rejected": 0.12352009117603302, "step": 1420 }, { "epoch": 0.7663475798840501, "grad_norm": 7.139714241027832, "learning_rate": 7.862995817133972e-08, "logits/chosen": 1.5395557880401611, "logits/rejected": -0.3614404797554016, "logps/chosen": -329.19921875, "logps/rejected": -182.62539672851562, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": 0.22906751930713654, "rewards/margins": 0.10010361671447754, "rewards/rejected": 0.128963902592659, "step": 1421 }, { "epoch": 0.7668868814884724, "grad_norm": 7.723531723022461, "learning_rate": 7.828743165418392e-08, "logits/chosen": 0.49756014347076416, "logits/rejected": -0.14719338715076447, "logps/chosen": -230.30726623535156, "logps/rejected": -198.77484130859375, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.20742206275463104, "rewards/margins": 0.02544344961643219, "rewards/rejected": 0.18197861313819885, "step": 1422 }, { "epoch": 0.7674261830928947, "grad_norm": 7.85009765625, "learning_rate": 7.794551426649401e-08, "logits/chosen": -0.9381887316703796, "logits/rejected": -0.32102659344673157, "logps/chosen": -250.78106689453125, "logps/rejected": -261.6895751953125, "loss": 0.6993, "rewards/accuracies": 0.375, "rewards/chosen": 0.2638409435749054, "rewards/margins": 0.0057004764676094055, "rewards/rejected": 0.2581404745578766, "step": 1423 }, { "epoch": 0.767965484697317, "grad_norm": 6.73705530166626, "learning_rate": 7.760420722118057e-08, "logits/chosen": 1.497875690460205, "logits/rejected": -0.13364458084106445, "logps/chosen": -266.361083984375, "logps/rejected": -197.47581481933594, "loss": 0.6696, "rewards/accuracies": 0.625, "rewards/chosen": 0.2208835631608963, "rewards/margins": 0.06842097640037537, "rewards/rejected": 0.15246258676052094, "step": 1424 }, { "epoch": 0.7685047863017392, "grad_norm": 7.487432479858398, "learning_rate": 7.726351172898868e-08, "logits/chosen": -0.2628781795501709, "logits/rejected": -0.16591669619083405, "logps/chosen": -223.40045166015625, "logps/rejected": -239.82760620117188, "loss": 0.7283, "rewards/accuracies": 0.5, "rewards/chosen": 0.2436559796333313, "rewards/margins": -0.0569426529109478, "rewards/rejected": 0.3005986213684082, "step": 1425 }, { "epoch": 0.7690440879061615, "grad_norm": 7.098363399505615, "learning_rate": 7.692342899849419e-08, "logits/chosen": 0.11226321756839752, "logits/rejected": -0.4422336220741272, "logps/chosen": -191.4561767578125, "logps/rejected": -223.4169921875, "loss": 0.6477, "rewards/accuracies": 0.75, "rewards/chosen": 0.23550301790237427, "rewards/margins": 0.11379576474428177, "rewards/rejected": 0.1217072531580925, "step": 1426 }, { "epoch": 0.7695833895105838, "grad_norm": 6.997533798217773, "learning_rate": 7.658396023609931e-08, "logits/chosen": 0.413054883480072, "logits/rejected": -0.5517896413803101, "logps/chosen": -206.50582885742188, "logps/rejected": -152.651123046875, "loss": 0.64, "rewards/accuracies": 1.0, "rewards/chosen": 0.265434205532074, "rewards/margins": 0.11069728434085846, "rewards/rejected": 0.15473690629005432, "step": 1427 }, { "epoch": 0.7701226911150061, "grad_norm": 8.706216812133789, "learning_rate": 7.624510664602818e-08, "logits/chosen": 0.35216203331947327, "logits/rejected": 0.21802952885627747, "logps/chosen": -263.96905517578125, "logps/rejected": -264.9402160644531, "loss": 0.6742, "rewards/accuracies": 0.5, "rewards/chosen": 0.33639031648635864, "rewards/margins": 0.05766792595386505, "rewards/rejected": 0.2787223756313324, "step": 1428 }, { "epoch": 0.7706619927194284, "grad_norm": 8.145230293273926, "learning_rate": 7.590686943032273e-08, "logits/chosen": 0.7171145677566528, "logits/rejected": 0.7856228351593018, "logps/chosen": -194.6008758544922, "logps/rejected": -233.3496856689453, "loss": 0.7007, "rewards/accuracies": 0.375, "rewards/chosen": 0.16290999948978424, "rewards/margins": -0.00857972726225853, "rewards/rejected": 0.17148971557617188, "step": 1429 }, { "epoch": 0.7712012943238507, "grad_norm": 7.283442497253418, "learning_rate": 7.556924978883842e-08, "logits/chosen": 0.1494470238685608, "logits/rejected": -0.545466423034668, "logps/chosen": -295.2892761230469, "logps/rejected": -223.060546875, "loss": 0.6692, "rewards/accuracies": 0.625, "rewards/chosen": 0.2460257112979889, "rewards/margins": 0.0617525577545166, "rewards/rejected": 0.1842731535434723, "step": 1430 }, { "epoch": 0.7717405959282729, "grad_norm": 7.176407814025879, "learning_rate": 7.523224891923983e-08, "logits/chosen": -0.17572075128555298, "logits/rejected": -0.6066306233406067, "logps/chosen": -266.84918212890625, "logps/rejected": -228.87481689453125, "loss": 0.6682, "rewards/accuracies": 0.75, "rewards/chosen": 0.15324993431568146, "rewards/margins": 0.0826636403799057, "rewards/rejected": 0.07058630138635635, "step": 1431 }, { "epoch": 0.7722798975326952, "grad_norm": 8.208709716796875, "learning_rate": 7.48958680169966e-08, "logits/chosen": -0.9896048307418823, "logits/rejected": 0.010866612195968628, "logps/chosen": -348.8328552246094, "logps/rejected": -369.9046936035156, "loss": 0.7068, "rewards/accuracies": 0.5, "rewards/chosen": 0.2767467498779297, "rewards/margins": 0.0043814517557621, "rewards/rejected": 0.2723653018474579, "step": 1432 }, { "epoch": 0.7728191991371174, "grad_norm": 7.149876117706299, "learning_rate": 7.456010827537921e-08, "logits/chosen": 0.13861575722694397, "logits/rejected": -0.7600127458572388, "logps/chosen": -169.12904357910156, "logps/rejected": -154.69265747070312, "loss": 0.5897, "rewards/accuracies": 0.875, "rewards/chosen": 0.24771100282669067, "rewards/margins": 0.22643804550170898, "rewards/rejected": 0.02127295359969139, "step": 1433 }, { "epoch": 0.7733585007415397, "grad_norm": 6.682015895843506, "learning_rate": 7.422497088545435e-08, "logits/chosen": -0.24329957365989685, "logits/rejected": -0.7062938213348389, "logps/chosen": -260.015380859375, "logps/rejected": -309.20880126953125, "loss": 0.6772, "rewards/accuracies": 0.625, "rewards/chosen": 0.2335960566997528, "rewards/margins": 0.06309824436903, "rewards/rejected": 0.1704978048801422, "step": 1434 }, { "epoch": 0.773897802345962, "grad_norm": 6.642261505126953, "learning_rate": 7.389045703608126e-08, "logits/chosen": 0.21773508191108704, "logits/rejected": -0.24540966749191284, "logps/chosen": -227.904541015625, "logps/rejected": -262.2426452636719, "loss": 0.6073, "rewards/accuracies": 0.625, "rewards/chosen": 0.28521737456321716, "rewards/margins": 0.1989714503288269, "rewards/rejected": 0.08624592423439026, "step": 1435 }, { "epoch": 0.7744371039503842, "grad_norm": 7.706037521362305, "learning_rate": 7.355656791390716e-08, "logits/chosen": -0.22024352848529816, "logits/rejected": 0.2271576225757599, "logps/chosen": -208.94888305664062, "logps/rejected": -254.05157470703125, "loss": 0.6714, "rewards/accuracies": 0.625, "rewards/chosen": 0.11922255158424377, "rewards/margins": 0.06062335893511772, "rewards/rejected": 0.058599188923835754, "step": 1436 }, { "epoch": 0.7749764055548065, "grad_norm": 7.801687240600586, "learning_rate": 7.322330470336313e-08, "logits/chosen": -0.4930424392223358, "logits/rejected": -0.9692047238349915, "logps/chosen": -164.733154296875, "logps/rejected": -194.02943420410156, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": 0.14667072892189026, "rewards/margins": -0.03310614079236984, "rewards/rejected": 0.1797768771648407, "step": 1437 }, { "epoch": 0.7755157071592288, "grad_norm": 7.054015159606934, "learning_rate": 7.28906685866599e-08, "logits/chosen": 0.9838512539863586, "logits/rejected": -0.17552316188812256, "logps/chosen": -234.28236389160156, "logps/rejected": -189.8896484375, "loss": 0.6253, "rewards/accuracies": 0.5, "rewards/chosen": 0.271780788898468, "rewards/margins": 0.1573042869567871, "rewards/rejected": 0.11447648704051971, "step": 1438 }, { "epoch": 0.7760550087636511, "grad_norm": 8.015451431274414, "learning_rate": 7.25586607437837e-08, "logits/chosen": 0.5544331073760986, "logits/rejected": -0.1882704496383667, "logps/chosen": -250.51766967773438, "logps/rejected": -220.6772918701172, "loss": 0.6717, "rewards/accuracies": 0.375, "rewards/chosen": 0.3014402389526367, "rewards/margins": 0.06594448536634445, "rewards/rejected": 0.23549576103687286, "step": 1439 }, { "epoch": 0.7765943103680734, "grad_norm": 7.820230484008789, "learning_rate": 7.222728235249195e-08, "logits/chosen": 0.15975403785705566, "logits/rejected": 0.42706063389778137, "logps/chosen": -199.6822967529297, "logps/rejected": -290.0102233886719, "loss": 0.739, "rewards/accuracies": 0.5, "rewards/chosen": 0.18177220225334167, "rewards/margins": -0.06935258209705353, "rewards/rejected": 0.251124769449234, "step": 1440 }, { "epoch": 0.7771336119724956, "grad_norm": 7.8336100578308105, "learning_rate": 7.189653458830924e-08, "logits/chosen": -0.7224916815757751, "logits/rejected": -1.3861020803451538, "logps/chosen": -187.13558959960938, "logps/rejected": -201.63401794433594, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.16979855298995972, "rewards/margins": 0.04015979915857315, "rewards/rejected": 0.12963876128196716, "step": 1441 }, { "epoch": 0.7776729135769179, "grad_norm": 7.854916095733643, "learning_rate": 7.156641862452315e-08, "logits/chosen": -0.32137006521224976, "logits/rejected": -0.6639515161514282, "logps/chosen": -307.03619384765625, "logps/rejected": -257.4342041015625, "loss": 0.664, "rewards/accuracies": 0.5, "rewards/chosen": 0.24678468704223633, "rewards/margins": 0.0684208869934082, "rewards/rejected": 0.17836380004882812, "step": 1442 }, { "epoch": 0.7782122151813402, "grad_norm": 7.285402774810791, "learning_rate": 7.123693563217978e-08, "logits/chosen": -0.33075442910194397, "logits/rejected": 0.15344861149787903, "logps/chosen": -206.30667114257812, "logps/rejected": -256.36505126953125, "loss": 0.7097, "rewards/accuracies": 0.25, "rewards/chosen": 0.2218114733695984, "rewards/margins": -0.022547252476215363, "rewards/rejected": 0.24435874819755554, "step": 1443 }, { "epoch": 0.7787515167857625, "grad_norm": 8.141386985778809, "learning_rate": 7.090808678008003e-08, "logits/chosen": 0.40441784262657166, "logits/rejected": -0.28385215997695923, "logps/chosen": -326.989013671875, "logps/rejected": -302.2472229003906, "loss": 0.6557, "rewards/accuracies": 0.75, "rewards/chosen": 0.2409299910068512, "rewards/margins": 0.10445456206798553, "rewards/rejected": 0.13647542893886566, "step": 1444 }, { "epoch": 0.7792908183901847, "grad_norm": 8.538065910339355, "learning_rate": 7.057987323477533e-08, "logits/chosen": 0.13334716856479645, "logits/rejected": -0.0372653603553772, "logps/chosen": -452.19720458984375, "logps/rejected": -252.94744873046875, "loss": 0.7717, "rewards/accuracies": 0.5, "rewards/chosen": 0.20859098434448242, "rewards/margins": -0.13117437064647675, "rewards/rejected": 0.33976536989212036, "step": 1445 }, { "epoch": 0.7798301199946069, "grad_norm": 6.859841346740723, "learning_rate": 7.025229616056325e-08, "logits/chosen": 0.9776374101638794, "logits/rejected": -0.05445854365825653, "logps/chosen": -250.8380889892578, "logps/rejected": -269.46661376953125, "loss": 0.6091, "rewards/accuracies": 1.0, "rewards/chosen": 0.2705671191215515, "rewards/margins": 0.18062978982925415, "rewards/rejected": 0.08993735909461975, "step": 1446 }, { "epoch": 0.7803694215990292, "grad_norm": 5.922843933105469, "learning_rate": 6.992535671948369e-08, "logits/chosen": 1.1269402503967285, "logits/rejected": 0.3075302541255951, "logps/chosen": -195.26046752929688, "logps/rejected": -189.00624084472656, "loss": 0.6354, "rewards/accuracies": 0.5, "rewards/chosen": 0.2079952359199524, "rewards/margins": 0.13365650177001953, "rewards/rejected": 0.07433871924877167, "step": 1447 }, { "epoch": 0.7809087232034515, "grad_norm": 7.388822555541992, "learning_rate": 6.959905607131455e-08, "logits/chosen": 1.6067208051681519, "logits/rejected": 0.6314756274223328, "logps/chosen": -245.87655639648438, "logps/rejected": -262.77447509765625, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.21924267709255219, "rewards/margins": 0.010635752230882645, "rewards/rejected": 0.20860691368579865, "step": 1448 }, { "epoch": 0.7814480248078738, "grad_norm": 7.525094032287598, "learning_rate": 6.927339537356777e-08, "logits/chosen": 0.4276154935359955, "logits/rejected": -0.07019281387329102, "logps/chosen": -292.3406066894531, "logps/rejected": -247.23324584960938, "loss": 0.6347, "rewards/accuracies": 0.625, "rewards/chosen": 0.30999070405960083, "rewards/margins": 0.1394285261631012, "rewards/rejected": 0.17056219279766083, "step": 1449 }, { "epoch": 0.7819873264122961, "grad_norm": 7.737863540649414, "learning_rate": 6.894837578148505e-08, "logits/chosen": -0.35280317068099976, "logits/rejected": -1.2857270240783691, "logps/chosen": -249.60972595214844, "logps/rejected": -185.20054626464844, "loss": 0.6419, "rewards/accuracies": 0.75, "rewards/chosen": 0.21171551942825317, "rewards/margins": 0.10947523266077042, "rewards/rejected": 0.10224027931690216, "step": 1450 }, { "epoch": 0.7825266280167184, "grad_norm": 8.549269676208496, "learning_rate": 6.862399844803399e-08, "logits/chosen": -0.20041507482528687, "logits/rejected": -1.479318618774414, "logps/chosen": -244.22262573242188, "logps/rejected": -171.12045288085938, "loss": 0.702, "rewards/accuracies": 0.375, "rewards/chosen": 0.2562801241874695, "rewards/margins": 0.01617007702589035, "rewards/rejected": 0.24011006951332092, "step": 1451 }, { "epoch": 0.7830659296211406, "grad_norm": 6.384467601776123, "learning_rate": 6.830026452390352e-08, "logits/chosen": 0.5860675573348999, "logits/rejected": 0.6623389720916748, "logps/chosen": -195.2073516845703, "logps/rejected": -197.3836212158203, "loss": 0.7424, "rewards/accuracies": 0.375, "rewards/chosen": 0.21971769630908966, "rewards/margins": -0.08625898510217667, "rewards/rejected": 0.3059766888618469, "step": 1452 }, { "epoch": 0.7836052312255629, "grad_norm": 8.202244758605957, "learning_rate": 6.797717515750059e-08, "logits/chosen": -0.009088411927223206, "logits/rejected": -0.2528126537799835, "logps/chosen": -204.90692138671875, "logps/rejected": -196.6551971435547, "loss": 0.69, "rewards/accuracies": 0.75, "rewards/chosen": 0.18189215660095215, "rewards/margins": 0.024048279970884323, "rewards/rejected": 0.15784388780593872, "step": 1453 }, { "epoch": 0.7841445328299852, "grad_norm": 7.448551177978516, "learning_rate": 6.765473149494543e-08, "logits/chosen": 0.5237464904785156, "logits/rejected": -0.5044668912887573, "logps/chosen": -268.8206481933594, "logps/rejected": -225.84548950195312, "loss": 0.6493, "rewards/accuracies": 0.5, "rewards/chosen": 0.25292283296585083, "rewards/margins": 0.09608583897352219, "rewards/rejected": 0.15683698654174805, "step": 1454 }, { "epoch": 0.7846838344344075, "grad_norm": 6.418224811553955, "learning_rate": 6.733293468006773e-08, "logits/chosen": 0.396345317363739, "logits/rejected": -0.8478541970252991, "logps/chosen": -255.41213989257812, "logps/rejected": -153.31063842773438, "loss": 0.6496, "rewards/accuracies": 0.625, "rewards/chosen": 0.2263786494731903, "rewards/margins": 0.09904412925243378, "rewards/rejected": 0.12733450531959534, "step": 1455 }, { "epoch": 0.7852231360388298, "grad_norm": 6.556121349334717, "learning_rate": 6.701178585440256e-08, "logits/chosen": 0.6646672487258911, "logits/rejected": -1.2659058570861816, "logps/chosen": -175.9944305419922, "logps/rejected": -131.21847534179688, "loss": 0.6664, "rewards/accuracies": 0.375, "rewards/chosen": 0.2449638992547989, "rewards/margins": 0.07518865168094635, "rewards/rejected": 0.16977524757385254, "step": 1456 }, { "epoch": 0.785762437643252, "grad_norm": 6.118531227111816, "learning_rate": 6.669128615718633e-08, "logits/chosen": 1.2890297174453735, "logits/rejected": -0.25423896312713623, "logps/chosen": -251.40728759765625, "logps/rejected": -177.22430419921875, "loss": 0.6033, "rewards/accuracies": 0.75, "rewards/chosen": 0.28043681383132935, "rewards/margins": 0.2024584859609604, "rewards/rejected": 0.07797833532094955, "step": 1457 }, { "epoch": 0.7863017392476742, "grad_norm": 8.855640411376953, "learning_rate": 6.637143672535281e-08, "logits/chosen": -0.12075701355934143, "logits/rejected": -0.05087415874004364, "logps/chosen": -258.4335632324219, "logps/rejected": -275.59814453125, "loss": 0.7344, "rewards/accuracies": 0.25, "rewards/chosen": 0.2130487561225891, "rewards/margins": -0.056669048964977264, "rewards/rejected": 0.2697177827358246, "step": 1458 }, { "epoch": 0.7868410408520965, "grad_norm": 7.599788188934326, "learning_rate": 6.605223869352902e-08, "logits/chosen": 0.41489535570144653, "logits/rejected": 0.818354606628418, "logps/chosen": -210.96926879882812, "logps/rejected": -259.73388671875, "loss": 0.6541, "rewards/accuracies": 0.625, "rewards/chosen": 0.2636564373970032, "rewards/margins": 0.09641657769680023, "rewards/rejected": 0.16723985970020294, "step": 1459 }, { "epoch": 0.7873803424565188, "grad_norm": 8.870309829711914, "learning_rate": 6.573369319403108e-08, "logits/chosen": 0.5376191139221191, "logits/rejected": 0.2935018539428711, "logps/chosen": -281.5422058105469, "logps/rejected": -280.4180908203125, "loss": 0.6685, "rewards/accuracies": 0.375, "rewards/chosen": 0.2558395564556122, "rewards/margins": 0.06569405645132065, "rewards/rejected": 0.19014549255371094, "step": 1460 }, { "epoch": 0.7879196440609411, "grad_norm": 6.905012607574463, "learning_rate": 6.541580135686045e-08, "logits/chosen": 0.6927992701530457, "logits/rejected": -0.11877420544624329, "logps/chosen": -192.58189392089844, "logps/rejected": -186.9796905517578, "loss": 0.7027, "rewards/accuracies": 0.625, "rewards/chosen": 0.25088921189308167, "rewards/margins": -0.009848885238170624, "rewards/rejected": 0.2607380747795105, "step": 1461 }, { "epoch": 0.7884589456653633, "grad_norm": 7.746665000915527, "learning_rate": 6.509856430969982e-08, "logits/chosen": 0.4197014570236206, "logits/rejected": -0.15237730741500854, "logps/chosen": -270.75201416015625, "logps/rejected": -229.3333282470703, "loss": 0.7164, "rewards/accuracies": 0.5, "rewards/chosen": 0.3011336326599121, "rewards/margins": -0.03568820655345917, "rewards/rejected": 0.33682185411453247, "step": 1462 }, { "epoch": 0.7889982472697856, "grad_norm": 7.154517650604248, "learning_rate": 6.478198317790904e-08, "logits/chosen": -0.338360458612442, "logits/rejected": -1.6600345373153687, "logps/chosen": -263.4886474609375, "logps/rejected": -187.89830017089844, "loss": 0.6458, "rewards/accuracies": 0.625, "rewards/chosen": 0.1991174817085266, "rewards/margins": 0.10323606431484222, "rewards/rejected": 0.09588141739368439, "step": 1463 }, { "epoch": 0.7895375488742079, "grad_norm": 7.53279447555542, "learning_rate": 6.446605908452121e-08, "logits/chosen": -0.4830808639526367, "logits/rejected": 0.0937836617231369, "logps/chosen": -251.97396850585938, "logps/rejected": -437.283447265625, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": 0.20213738083839417, "rewards/margins": 0.08437047898769379, "rewards/rejected": 0.11776690185070038, "step": 1464 }, { "epoch": 0.7900768504786302, "grad_norm": 7.251241683959961, "learning_rate": 6.415079315023864e-08, "logits/chosen": 0.5959858894348145, "logits/rejected": -0.45296162366867065, "logps/chosen": -185.00149536132812, "logps/rejected": -215.93528747558594, "loss": 0.6745, "rewards/accuracies": 0.75, "rewards/chosen": 0.27737924456596375, "rewards/margins": 0.05470933020114899, "rewards/rejected": 0.22266989946365356, "step": 1465 }, { "epoch": 0.7906161520830525, "grad_norm": 8.237611770629883, "learning_rate": 6.383618649342892e-08, "logits/chosen": 0.2974909543991089, "logits/rejected": 0.3914446234703064, "logps/chosen": -238.87615966796875, "logps/rejected": -220.64996337890625, "loss": 0.7218, "rewards/accuracies": 0.625, "rewards/chosen": 0.28392642736434937, "rewards/margins": -0.022434763610363007, "rewards/rejected": 0.30636119842529297, "step": 1466 }, { "epoch": 0.7911554536874748, "grad_norm": 9.157388687133789, "learning_rate": 6.352224023012096e-08, "logits/chosen": 0.9698801040649414, "logits/rejected": 0.20161345601081848, "logps/chosen": -200.4534149169922, "logps/rejected": -204.5070037841797, "loss": 0.7133, "rewards/accuracies": 0.625, "rewards/chosen": 0.12299013882875443, "rewards/margins": -0.001926332712173462, "rewards/rejected": 0.12491646409034729, "step": 1467 }, { "epoch": 0.791694755291897, "grad_norm": 6.585491180419922, "learning_rate": 6.320895547400099e-08, "logits/chosen": 0.14798188209533691, "logits/rejected": -1.2951736450195312, "logps/chosen": -262.311767578125, "logps/rejected": -174.74754333496094, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": 0.2867639660835266, "rewards/margins": 0.14735566079616547, "rewards/rejected": 0.13940830528736115, "step": 1468 }, { "epoch": 0.7922340568963193, "grad_norm": 7.102452278137207, "learning_rate": 6.289633333640848e-08, "logits/chosen": 1.1886839866638184, "logits/rejected": -0.030452042818069458, "logps/chosen": -264.204833984375, "logps/rejected": -219.05397033691406, "loss": 0.6712, "rewards/accuracies": 0.75, "rewards/chosen": 0.2016405165195465, "rewards/margins": 0.06302347779273987, "rewards/rejected": 0.13861703872680664, "step": 1469 }, { "epoch": 0.7927733585007415, "grad_norm": 6.067802906036377, "learning_rate": 6.258437492633254e-08, "logits/chosen": 0.12506207823753357, "logits/rejected": 0.050690650939941406, "logps/chosen": -180.9417266845703, "logps/rejected": -223.70773315429688, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": 0.25350314378738403, "rewards/margins": 0.1162189468741417, "rewards/rejected": 0.13728418946266174, "step": 1470 }, { "epoch": 0.7933126601051638, "grad_norm": 7.302490711212158, "learning_rate": 6.227308135040773e-08, "logits/chosen": -0.15522176027297974, "logits/rejected": -0.16931690275669098, "logps/chosen": -235.03659057617188, "logps/rejected": -195.038330078125, "loss": 0.6365, "rewards/accuracies": 0.625, "rewards/chosen": 0.3089289665222168, "rewards/margins": 0.14474916458129883, "rewards/rejected": 0.16417980194091797, "step": 1471 }, { "epoch": 0.793851961709586, "grad_norm": 7.672072887420654, "learning_rate": 6.196245371291015e-08, "logits/chosen": 0.18702872097492218, "logits/rejected": -0.3933962285518646, "logps/chosen": -233.07325744628906, "logps/rejected": -135.80010986328125, "loss": 0.5801, "rewards/accuracies": 0.875, "rewards/chosen": 0.32516270875930786, "rewards/margins": 0.2609066963195801, "rewards/rejected": 0.06425599753856659, "step": 1472 }, { "epoch": 0.7943912633140083, "grad_norm": 9.86580753326416, "learning_rate": 6.165249311575361e-08, "logits/chosen": -0.7089015245437622, "logits/rejected": -0.18815171718597412, "logps/chosen": -189.3513946533203, "logps/rejected": -237.21868896484375, "loss": 0.7647, "rewards/accuracies": 0.375, "rewards/chosen": 0.17218467593193054, "rewards/margins": -0.10867471992969513, "rewards/rejected": 0.2808593511581421, "step": 1473 }, { "epoch": 0.7949305649184306, "grad_norm": 7.654724597930908, "learning_rate": 6.134320065848564e-08, "logits/chosen": -0.6068696975708008, "logits/rejected": -1.3504595756530762, "logps/chosen": -260.65496826171875, "logps/rejected": -216.5692901611328, "loss": 0.7361, "rewards/accuracies": 0.375, "rewards/chosen": 0.2131868451833725, "rewards/margins": -0.05636681988835335, "rewards/rejected": 0.26955366134643555, "step": 1474 }, { "epoch": 0.7954698665228529, "grad_norm": 7.083737373352051, "learning_rate": 6.103457743828366e-08, "logits/chosen": 0.527350127696991, "logits/rejected": -0.09593933820724487, "logps/chosen": -275.7688293457031, "logps/rejected": -226.47752380371094, "loss": 0.637, "rewards/accuracies": 0.75, "rewards/chosen": 0.28563234210014343, "rewards/margins": 0.13230818510055542, "rewards/rejected": 0.15332412719726562, "step": 1475 }, { "epoch": 0.7960091681272752, "grad_norm": 8.112215995788574, "learning_rate": 6.0726624549951e-08, "logits/chosen": -0.040202319622039795, "logits/rejected": -0.5789554119110107, "logps/chosen": -318.7373962402344, "logps/rejected": -322.9298400878906, "loss": 0.6599, "rewards/accuracies": 0.75, "rewards/chosen": 0.2795621156692505, "rewards/margins": 0.08331498503684998, "rewards/rejected": 0.19624710083007812, "step": 1476 }, { "epoch": 0.7965484697316975, "grad_norm": 8.758994102478027, "learning_rate": 6.041934308591324e-08, "logits/chosen": 0.3723578155040741, "logits/rejected": 0.34323593974113464, "logps/chosen": -260.2291259765625, "logps/rejected": -286.6726989746094, "loss": 0.7013, "rewards/accuracies": 0.25, "rewards/chosen": 0.2222227156162262, "rewards/margins": -0.011701392009854317, "rewards/rejected": 0.23392410576343536, "step": 1477 }, { "epoch": 0.7970877713361197, "grad_norm": 6.959707260131836, "learning_rate": 6.01127341362138e-08, "logits/chosen": 0.833451509475708, "logits/rejected": -0.8452929258346558, "logps/chosen": -397.90521240234375, "logps/rejected": -242.45449829101562, "loss": 0.6457, "rewards/accuracies": 0.625, "rewards/chosen": 0.2996227443218231, "rewards/margins": 0.11291142553091049, "rewards/rejected": 0.18671131134033203, "step": 1478 }, { "epoch": 0.797627072940542, "grad_norm": 6.278786659240723, "learning_rate": 5.980679878851075e-08, "logits/chosen": -0.22487768530845642, "logits/rejected": -0.05418160557746887, "logps/chosen": -173.23385620117188, "logps/rejected": -192.53363037109375, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": 0.2169780731201172, "rewards/margins": 0.07447223365306854, "rewards/rejected": 0.14250582456588745, "step": 1479 }, { "epoch": 0.7981663745449643, "grad_norm": 6.5141072273254395, "learning_rate": 5.9501538128072597e-08, "logits/chosen": 0.5405345559120178, "logits/rejected": -0.3997494578361511, "logps/chosen": -206.27548217773438, "logps/rejected": -213.97372436523438, "loss": 0.6958, "rewards/accuracies": 0.375, "rewards/chosen": 0.27698472142219543, "rewards/margins": 0.00748424232006073, "rewards/rejected": 0.2695004343986511, "step": 1480 }, { "epoch": 0.7987056761493866, "grad_norm": 8.48796272277832, "learning_rate": 5.9196953237774436e-08, "logits/chosen": 0.14068828523159027, "logits/rejected": 0.2916799485683441, "logps/chosen": -187.6827850341797, "logps/rejected": -253.22418212890625, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": 0.15086641907691956, "rewards/margins": -0.004661083221435547, "rewards/rejected": 0.1555275022983551, "step": 1481 }, { "epoch": 0.7992449777538089, "grad_norm": 7.468092441558838, "learning_rate": 5.8893045198094015e-08, "logits/chosen": 0.057533591985702515, "logits/rejected": -0.05048714578151703, "logps/chosen": -310.3269348144531, "logps/rejected": -254.0272216796875, "loss": 0.6955, "rewards/accuracies": 0.375, "rewards/chosen": 0.24230575561523438, "rewards/margins": 0.009255975484848022, "rewards/rejected": 0.23304978013038635, "step": 1482 }, { "epoch": 0.799784279358231, "grad_norm": 7.374403476715088, "learning_rate": 5.8589815087108333e-08, "logits/chosen": 0.851414680480957, "logits/rejected": -0.48211270570755005, "logps/chosen": -297.86956787109375, "logps/rejected": -204.8619384765625, "loss": 0.6497, "rewards/accuracies": 0.5, "rewards/chosen": 0.2932373285293579, "rewards/margins": 0.11079750955104828, "rewards/rejected": 0.18243980407714844, "step": 1483 }, { "epoch": 0.8003235809626533, "grad_norm": 7.395756721496582, "learning_rate": 5.8287263980489386e-08, "logits/chosen": 0.16941490769386292, "logits/rejected": 0.24294134974479675, "logps/chosen": -233.9009552001953, "logps/rejected": -207.06515502929688, "loss": 0.6962, "rewards/accuracies": 0.625, "rewards/chosen": 0.2352502942085266, "rewards/margins": 0.004476653411984444, "rewards/rejected": 0.23077362775802612, "step": 1484 }, { "epoch": 0.8008628825670756, "grad_norm": 8.086187362670898, "learning_rate": 5.798539295150026e-08, "logits/chosen": 0.06766346096992493, "logits/rejected": -2.248955488204956, "logps/chosen": -254.50836181640625, "logps/rejected": -196.62574768066406, "loss": 0.5747, "rewards/accuracies": 0.875, "rewards/chosen": 0.3263603448867798, "rewards/margins": 0.25847500562667847, "rewards/rejected": 0.06788530945777893, "step": 1485 }, { "epoch": 0.8014021841714979, "grad_norm": 6.661515235900879, "learning_rate": 5.768420307099187e-08, "logits/chosen": 0.8165938854217529, "logits/rejected": -0.3788672387599945, "logps/chosen": -307.20062255859375, "logps/rejected": -256.48980712890625, "loss": 0.6184, "rewards/accuracies": 0.875, "rewards/chosen": 0.23257417976856232, "rewards/margins": 0.16345366835594177, "rewards/rejected": 0.06912050396203995, "step": 1486 }, { "epoch": 0.8019414857759202, "grad_norm": 7.346641540527344, "learning_rate": 5.738369540739865e-08, "logits/chosen": 1.0327621698379517, "logits/rejected": 1.0962777137756348, "logps/chosen": -318.11907958984375, "logps/rejected": -303.84783935546875, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.22855108976364136, "rewards/margins": 0.019564010202884674, "rewards/rejected": 0.20898710191249847, "step": 1487 }, { "epoch": 0.8024807873803425, "grad_norm": 6.628454685211182, "learning_rate": 5.7083871026735065e-08, "logits/chosen": 0.7556992173194885, "logits/rejected": 0.12994921207427979, "logps/chosen": -248.8976593017578, "logps/rejected": -201.3057403564453, "loss": 0.6457, "rewards/accuracies": 0.625, "rewards/chosen": 0.2779771685600281, "rewards/margins": 0.1163863092660904, "rewards/rejected": 0.16159087419509888, "step": 1488 }, { "epoch": 0.8030200889847647, "grad_norm": 6.351442813873291, "learning_rate": 5.6784730992591624e-08, "logits/chosen": -0.517838716506958, "logits/rejected": -1.5752564668655396, "logps/chosen": -201.23736572265625, "logps/rejected": -148.58529663085938, "loss": 0.6271, "rewards/accuracies": 0.75, "rewards/chosen": 0.21771831810474396, "rewards/margins": 0.1483418494462967, "rewards/rejected": 0.06937646865844727, "step": 1489 }, { "epoch": 0.803559390589187, "grad_norm": 7.036858081817627, "learning_rate": 5.6486276366131264e-08, "logits/chosen": 0.23951226472854614, "logits/rejected": 0.21370121836662292, "logps/chosen": -239.26585388183594, "logps/rejected": -242.53089904785156, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": 0.3577348589897156, "rewards/margins": 0.058123305439949036, "rewards/rejected": 0.29961156845092773, "step": 1490 }, { "epoch": 0.8040986921936093, "grad_norm": 6.051412105560303, "learning_rate": 5.6188508206085475e-08, "logits/chosen": -0.921517014503479, "logits/rejected": -0.8056782484054565, "logps/chosen": -240.15322875976562, "logps/rejected": -212.95379638671875, "loss": 0.6671, "rewards/accuracies": 0.625, "rewards/chosen": 0.26670151948928833, "rewards/margins": 0.05828629434108734, "rewards/rejected": 0.20841524004936218, "step": 1491 }, { "epoch": 0.8046379937980316, "grad_norm": 7.523355484008789, "learning_rate": 5.589142756875065e-08, "logits/chosen": 0.782548725605011, "logits/rejected": 0.18682409822940826, "logps/chosen": -332.846435546875, "logps/rejected": -236.76588439941406, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": 0.25431346893310547, "rewards/margins": 0.08738036453723907, "rewards/rejected": 0.1669331192970276, "step": 1492 }, { "epoch": 0.8051772954024539, "grad_norm": 8.806517601013184, "learning_rate": 5.55950355079842e-08, "logits/chosen": 0.15090854465961456, "logits/rejected": -1.161678433418274, "logps/chosen": -287.838623046875, "logps/rejected": -295.51123046875, "loss": 0.6969, "rewards/accuracies": 0.625, "rewards/chosen": 0.23887042701244354, "rewards/margins": 0.0028279293328523636, "rewards/rejected": 0.23604251444339752, "step": 1493 }, { "epoch": 0.8057165970068761, "grad_norm": 7.049385070800781, "learning_rate": 5.5299333075201016e-08, "logits/chosen": -0.6665620803833008, "logits/rejected": -0.05932298302650452, "logps/chosen": -225.55726623535156, "logps/rejected": -248.33111572265625, "loss": 0.7231, "rewards/accuracies": 0.25, "rewards/chosen": 0.17033754289150238, "rewards/margins": -0.054001860320568085, "rewards/rejected": 0.22433939576148987, "step": 1494 }, { "epoch": 0.8062558986112983, "grad_norm": 7.364766597747803, "learning_rate": 5.500432131936944e-08, "logits/chosen": 0.2582981586456299, "logits/rejected": -0.699051558971405, "logps/chosen": -239.72166442871094, "logps/rejected": -185.88783264160156, "loss": 0.6875, "rewards/accuracies": 0.375, "rewards/chosen": 0.18558406829833984, "rewards/margins": 0.02388487011194229, "rewards/rejected": 0.16169919073581696, "step": 1495 }, { "epoch": 0.8067952002157206, "grad_norm": 8.15445327758789, "learning_rate": 5.471000128700784e-08, "logits/chosen": -0.1381845772266388, "logits/rejected": 1.50877046585083, "logps/chosen": -241.77554321289062, "logps/rejected": -263.58026123046875, "loss": 0.7447, "rewards/accuracies": 0.5, "rewards/chosen": 0.14317761361598969, "rewards/margins": -0.08557567000389099, "rewards/rejected": 0.22875328361988068, "step": 1496 }, { "epoch": 0.8073345018201429, "grad_norm": 7.921571254730225, "learning_rate": 5.441637402218077e-08, "logits/chosen": -0.1351366937160492, "logits/rejected": -0.7748820185661316, "logps/chosen": -237.97865295410156, "logps/rejected": -192.0287628173828, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": 0.08462628722190857, "rewards/margins": 0.042085547000169754, "rewards/rejected": 0.042540740221738815, "step": 1497 }, { "epoch": 0.8078738034245652, "grad_norm": 6.641026973724365, "learning_rate": 5.412344056649526e-08, "logits/chosen": 0.3664908707141876, "logits/rejected": -0.031974583864212036, "logps/chosen": -422.6947021484375, "logps/rejected": -318.3591003417969, "loss": 0.6503, "rewards/accuracies": 0.5, "rewards/chosen": 0.30654555559158325, "rewards/margins": 0.09901820123195648, "rewards/rejected": 0.20752735435962677, "step": 1498 }, { "epoch": 0.8084131050289874, "grad_norm": 7.3334760665893555, "learning_rate": 5.383120195909718e-08, "logits/chosen": -1.0684643983840942, "logits/rejected": -0.3663855195045471, "logps/chosen": -203.88095092773438, "logps/rejected": -247.63768005371094, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": 0.2499866485595703, "rewards/margins": -0.02145405113697052, "rewards/rejected": 0.27144068479537964, "step": 1499 }, { "epoch": 0.8089524066334097, "grad_norm": 7.985299587249756, "learning_rate": 5.353965923666742e-08, "logits/chosen": -0.7877472043037415, "logits/rejected": -0.2795852720737457, "logps/chosen": -313.05560302734375, "logps/rejected": -350.213134765625, "loss": 0.7023, "rewards/accuracies": 0.375, "rewards/chosen": 0.22274190187454224, "rewards/margins": 0.017535602673888206, "rewards/rejected": 0.20520630478858948, "step": 1500 }, { "epoch": 0.8089524066334097, "eval_logits/chosen": 1.3644288778305054, "eval_logits/rejected": 1.0909603834152222, "eval_logps/chosen": -248.90052795410156, "eval_logps/rejected": -233.94927978515625, "eval_loss": 0.6706193685531616, "eval_rewards/accuracies": 0.595652163028717, "eval_rewards/chosen": 0.2550937831401825, "eval_rewards/margins": 0.0637654960155487, "eval_rewards/rejected": 0.1913282871246338, "eval_runtime": 836.6126, "eval_samples_per_second": 1.924, "eval_steps_per_second": 0.962, "step": 1500 }, { "epoch": 0.809491708237832, "grad_norm": 6.16612434387207, "learning_rate": 5.324881343341839e-08, "logits/chosen": 1.0902577638626099, "logits/rejected": 0.5503213405609131, "logps/chosen": -241.94781494140625, "logps/rejected": -250.4031982421875, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": 0.3599034249782562, "rewards/margins": 0.20127803087234497, "rewards/rejected": 0.15862542390823364, "step": 1501 }, { "epoch": 0.8100310098422543, "grad_norm": 8.806445121765137, "learning_rate": 5.295866558109022e-08, "logits/chosen": -0.56241375207901, "logits/rejected": -0.9480810165405273, "logps/chosen": -275.30438232421875, "logps/rejected": -254.8780975341797, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 0.2944112718105316, "rewards/margins": 0.07578430324792862, "rewards/rejected": 0.2186269760131836, "step": 1502 }, { "epoch": 0.8105703114466766, "grad_norm": 6.272574424743652, "learning_rate": 5.26692167089472e-08, "logits/chosen": 0.16793179512023926, "logits/rejected": -0.6036605834960938, "logps/chosen": -292.10223388671875, "logps/rejected": -260.27496337890625, "loss": 0.6106, "rewards/accuracies": 0.75, "rewards/chosen": 0.29798728227615356, "rewards/margins": 0.17995098233222961, "rewards/rejected": 0.11803627014160156, "step": 1503 }, { "epoch": 0.8111096130510989, "grad_norm": 7.40969705581665, "learning_rate": 5.2380467843773865e-08, "logits/chosen": 0.01262897253036499, "logits/rejected": -0.41619545221328735, "logps/chosen": -208.77822875976562, "logps/rejected": -182.93482971191406, "loss": 0.6035, "rewards/accuracies": 0.75, "rewards/chosen": 0.18902501463890076, "rewards/margins": 0.20403628051280975, "rewards/rejected": -0.015011262148618698, "step": 1504 }, { "epoch": 0.8116489146555211, "grad_norm": 7.782901287078857, "learning_rate": 5.2092420009871854e-08, "logits/chosen": 1.4273788928985596, "logits/rejected": -0.925783634185791, "logps/chosen": -267.1810302734375, "logps/rejected": -180.0823516845703, "loss": 0.6852, "rewards/accuracies": 0.75, "rewards/chosen": 0.1858808696269989, "rewards/margins": 0.05905924737453461, "rewards/rejected": 0.1268216222524643, "step": 1505 }, { "epoch": 0.8121882162599434, "grad_norm": 7.161984920501709, "learning_rate": 5.180507422905583e-08, "logits/chosen": -0.6532650589942932, "logits/rejected": -0.5577034950256348, "logps/chosen": -211.00680541992188, "logps/rejected": -197.302978515625, "loss": 0.6516, "rewards/accuracies": 0.75, "rewards/chosen": 0.2479148805141449, "rewards/margins": 0.09731779992580414, "rewards/rejected": 0.15059709548950195, "step": 1506 }, { "epoch": 0.8127275178643657, "grad_norm": 6.851113796234131, "learning_rate": 5.151843152064997e-08, "logits/chosen": 0.04912838339805603, "logits/rejected": -0.6879080533981323, "logps/chosen": -239.73492431640625, "logps/rejected": -281.8937072753906, "loss": 0.7145, "rewards/accuracies": 0.5, "rewards/chosen": 0.19157105684280396, "rewards/margins": -0.010395810008049011, "rewards/rejected": 0.20196686685085297, "step": 1507 }, { "epoch": 0.8132668194687879, "grad_norm": 8.208599090576172, "learning_rate": 5.1232492901484516e-08, "logits/chosen": 0.112160325050354, "logits/rejected": -1.1757173538208008, "logps/chosen": -304.5999450683594, "logps/rejected": -242.610595703125, "loss": 0.6447, "rewards/accuracies": 0.5, "rewards/chosen": 0.3613402545452118, "rewards/margins": 0.11618128418922424, "rewards/rejected": 0.24515897035598755, "step": 1508 }, { "epoch": 0.8138061210732102, "grad_norm": 7.326775550842285, "learning_rate": 5.0947259385891925e-08, "logits/chosen": 0.28489595651626587, "logits/rejected": 0.3681371808052063, "logps/chosen": -241.0958251953125, "logps/rejected": -318.0062561035156, "loss": 0.6418, "rewards/accuracies": 0.875, "rewards/chosen": 0.2598203718662262, "rewards/margins": 0.12311368435621262, "rewards/rejected": 0.13670669496059418, "step": 1509 }, { "epoch": 0.8143454226776324, "grad_norm": 8.32992935180664, "learning_rate": 5.066273198570342e-08, "logits/chosen": -1.5433549880981445, "logits/rejected": -0.13147777318954468, "logps/chosen": -298.4884948730469, "logps/rejected": -533.9010009765625, "loss": 0.7404, "rewards/accuracies": 0.375, "rewards/chosen": 0.24847659468650818, "rewards/margins": -0.07945843040943146, "rewards/rejected": 0.32793503999710083, "step": 1510 }, { "epoch": 0.8148847242820547, "grad_norm": 7.016333103179932, "learning_rate": 5.037891171024542e-08, "logits/chosen": 0.36626672744750977, "logits/rejected": -0.2922482490539551, "logps/chosen": -209.74778747558594, "logps/rejected": -182.396240234375, "loss": 0.638, "rewards/accuracies": 0.625, "rewards/chosen": 0.3185363709926605, "rewards/margins": 0.1267329603433609, "rewards/rejected": 0.19180341064929962, "step": 1511 }, { "epoch": 0.815424025886477, "grad_norm": 6.355964660644531, "learning_rate": 5.009579956633578e-08, "logits/chosen": -0.09849553555250168, "logits/rejected": -0.6075491309165955, "logps/chosen": -200.34158325195312, "logps/rejected": -250.5266571044922, "loss": 0.6794, "rewards/accuracies": 0.75, "rewards/chosen": 0.2183246612548828, "rewards/margins": 0.02994728460907936, "rewards/rejected": 0.18837738037109375, "step": 1512 }, { "epoch": 0.8159633274908993, "grad_norm": 8.24234676361084, "learning_rate": 4.9813396558280486e-08, "logits/chosen": 0.44060254096984863, "logits/rejected": -0.5617737174034119, "logps/chosen": -263.46563720703125, "logps/rejected": -225.0086212158203, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": 0.2882663607597351, "rewards/margins": 0.0706634521484375, "rewards/rejected": 0.2176029235124588, "step": 1513 }, { "epoch": 0.8165026290953216, "grad_norm": 6.095864772796631, "learning_rate": 4.953170368786985e-08, "logits/chosen": 0.658421516418457, "logits/rejected": -0.6155095100402832, "logps/chosen": -166.83340454101562, "logps/rejected": -143.41036987304688, "loss": 0.6502, "rewards/accuracies": 0.625, "rewards/chosen": 0.2423071265220642, "rewards/margins": 0.11073131859302521, "rewards/rejected": 0.1315758228302002, "step": 1514 }, { "epoch": 0.8170419306997438, "grad_norm": 6.614131450653076, "learning_rate": 4.92507219543751e-08, "logits/chosen": 0.8454495072364807, "logits/rejected": -0.5330227017402649, "logps/chosen": -260.45379638671875, "logps/rejected": -194.5661163330078, "loss": 0.6109, "rewards/accuracies": 0.75, "rewards/chosen": 0.5049072504043579, "rewards/margins": 0.18822717666625977, "rewards/rejected": 0.31668004393577576, "step": 1515 }, { "epoch": 0.8175812323041661, "grad_norm": 8.954354286193848, "learning_rate": 4.8970452354544804e-08, "logits/chosen": 0.2077130526304245, "logits/rejected": -0.5341324806213379, "logps/chosen": -187.4531707763672, "logps/rejected": -191.74957275390625, "loss": 0.603, "rewards/accuracies": 0.75, "rewards/chosen": 0.21332377195358276, "rewards/margins": 0.20034456253051758, "rewards/rejected": 0.01297922432422638, "step": 1516 }, { "epoch": 0.8181205339085884, "grad_norm": 8.308403015136719, "learning_rate": 4.869089588260128e-08, "logits/chosen": 1.13260817527771, "logits/rejected": -0.00880444049835205, "logps/chosen": -281.69390869140625, "logps/rejected": -227.61036682128906, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.23011940717697144, "rewards/margins": 0.04008418321609497, "rewards/rejected": 0.19003523886203766, "step": 1517 }, { "epoch": 0.8186598355130107, "grad_norm": 6.958199501037598, "learning_rate": 4.841205353023714e-08, "logits/chosen": 0.08441516757011414, "logits/rejected": -0.978095293045044, "logps/chosen": -217.54598999023438, "logps/rejected": -188.137939453125, "loss": 0.6137, "rewards/accuracies": 0.875, "rewards/chosen": 0.24934786558151245, "rewards/margins": 0.1718616485595703, "rewards/rejected": 0.07748623192310333, "step": 1518 }, { "epoch": 0.819199137117433, "grad_norm": 8.025626182556152, "learning_rate": 4.813392628661175e-08, "logits/chosen": 0.4417731761932373, "logits/rejected": 0.14353448152542114, "logps/chosen": -260.78729248046875, "logps/rejected": -222.66526794433594, "loss": 0.7151, "rewards/accuracies": 0.5, "rewards/chosen": 0.25517216324806213, "rewards/margins": -0.015723424032330513, "rewards/rejected": 0.2708956003189087, "step": 1519 }, { "epoch": 0.8197384387218551, "grad_norm": 8.046648025512695, "learning_rate": 4.7856515138347735e-08, "logits/chosen": -0.5610284805297852, "logits/rejected": -0.3621827960014343, "logps/chosen": -247.22955322265625, "logps/rejected": -219.96356201171875, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.260010689496994, "rewards/margins": 0.016847949475049973, "rewards/rejected": 0.24316272139549255, "step": 1520 }, { "epoch": 0.8202777403262774, "grad_norm": 7.183707237243652, "learning_rate": 4.757982106952735e-08, "logits/chosen": 0.13348177075386047, "logits/rejected": 0.36425548791885376, "logps/chosen": -218.95408630371094, "logps/rejected": -198.431884765625, "loss": 0.7348, "rewards/accuracies": 0.25, "rewards/chosen": 0.09122113883495331, "rewards/margins": -0.07182732224464417, "rewards/rejected": 0.16304844617843628, "step": 1521 }, { "epoch": 0.8208170419306997, "grad_norm": 7.318119525909424, "learning_rate": 4.730384506168919e-08, "logits/chosen": -0.6569498777389526, "logits/rejected": -0.31048426032066345, "logps/chosen": -288.2287902832031, "logps/rejected": -280.94573974609375, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.2696557939052582, "rewards/margins": 0.007683757692575455, "rewards/rejected": 0.2619720697402954, "step": 1522 }, { "epoch": 0.821356343535122, "grad_norm": 6.659022808074951, "learning_rate": 4.702858809382462e-08, "logits/chosen": -0.3904348611831665, "logits/rejected": -0.52785325050354, "logps/chosen": -277.5850524902344, "logps/rejected": -303.72705078125, "loss": 0.6179, "rewards/accuracies": 0.875, "rewards/chosen": 0.3309403657913208, "rewards/margins": 0.16070489585399628, "rewards/rejected": 0.17023545503616333, "step": 1523 }, { "epoch": 0.8218956451395443, "grad_norm": 6.8238091468811035, "learning_rate": 4.675405114237427e-08, "logits/chosen": 0.4530567526817322, "logits/rejected": 0.5241587162017822, "logps/chosen": -254.58799743652344, "logps/rejected": -251.01931762695312, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.2617396116256714, "rewards/margins": 0.03881064057350159, "rewards/rejected": 0.2229290008544922, "step": 1524 }, { "epoch": 0.8224349467439666, "grad_norm": 6.484152793884277, "learning_rate": 4.648023518122463e-08, "logits/chosen": 0.11652401089668274, "logits/rejected": -0.6295149326324463, "logps/chosen": -270.31439208984375, "logps/rejected": -240.67349243164062, "loss": 0.6791, "rewards/accuracies": 0.375, "rewards/chosen": 0.2061534821987152, "rewards/margins": 0.04727354645729065, "rewards/rejected": 0.15887995064258575, "step": 1525 }, { "epoch": 0.8229742483483888, "grad_norm": 8.469347953796387, "learning_rate": 4.6207141181704515e-08, "logits/chosen": -0.4381110668182373, "logits/rejected": 0.7263089418411255, "logps/chosen": -125.52745819091797, "logps/rejected": -175.3401641845703, "loss": 0.7659, "rewards/accuracies": 0.5, "rewards/chosen": 0.0973232239484787, "rewards/margins": -0.12431801855564117, "rewards/rejected": 0.22164124250411987, "step": 1526 }, { "epoch": 0.8235135499528111, "grad_norm": 6.825808048248291, "learning_rate": 4.5934770112581704e-08, "logits/chosen": 0.4113348126411438, "logits/rejected": -0.47059962153434753, "logps/chosen": -231.22158813476562, "logps/rejected": -198.5060272216797, "loss": 0.6334, "rewards/accuracies": 0.75, "rewards/chosen": 0.3436829447746277, "rewards/margins": 0.13743534684181213, "rewards/rejected": 0.20624762773513794, "step": 1527 }, { "epoch": 0.8240528515572334, "grad_norm": 7.573840141296387, "learning_rate": 4.5663122940059475e-08, "logits/chosen": 0.30036646127700806, "logits/rejected": -0.9397674202919006, "logps/chosen": -349.7419738769531, "logps/rejected": -326.76885986328125, "loss": 0.6744, "rewards/accuracies": 0.5, "rewards/chosen": 0.1856524497270584, "rewards/margins": 0.05788449943065643, "rewards/rejected": 0.12776795029640198, "step": 1528 }, { "epoch": 0.8245921531616557, "grad_norm": 7.038821697235107, "learning_rate": 4.5392200627773186e-08, "logits/chosen": 0.7834839820861816, "logits/rejected": 0.2704179286956787, "logps/chosen": -264.326904296875, "logps/rejected": -231.71630859375, "loss": 0.6706, "rewards/accuracies": 0.75, "rewards/chosen": 0.20977783203125, "rewards/margins": 0.06547661125659943, "rewards/rejected": 0.14430123567581177, "step": 1529 }, { "epoch": 0.825131454766078, "grad_norm": 6.7296223640441895, "learning_rate": 4.5122004136786715e-08, "logits/chosen": 0.2532747685909271, "logits/rejected": -0.2254493534564972, "logps/chosen": -284.35870361328125, "logps/rejected": -261.2187194824219, "loss": 0.674, "rewards/accuracies": 0.375, "rewards/chosen": 0.2769010663032532, "rewards/margins": 0.07008494436740875, "rewards/rejected": 0.20681610703468323, "step": 1530 }, { "epoch": 0.8256707563705002, "grad_norm": 7.8619184494018555, "learning_rate": 4.485253442558934e-08, "logits/chosen": 0.5782453417778015, "logits/rejected": -1.058072805404663, "logps/chosen": -232.81748962402344, "logps/rejected": -167.57998657226562, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": 0.2341223657131195, "rewards/margins": 0.04378795623779297, "rewards/rejected": 0.19033440947532654, "step": 1531 }, { "epoch": 0.8262100579749225, "grad_norm": 5.439651966094971, "learning_rate": 4.4583792450092083e-08, "logits/chosen": 0.9400855302810669, "logits/rejected": 0.6340286731719971, "logps/chosen": -194.89309692382812, "logps/rejected": -235.24354553222656, "loss": 0.6214, "rewards/accuracies": 0.75, "rewards/chosen": 0.28396493196487427, "rewards/margins": 0.1598789393901825, "rewards/rejected": 0.12408600747585297, "step": 1532 }, { "epoch": 0.8267493595793447, "grad_norm": 8.309202194213867, "learning_rate": 4.4315779163624475e-08, "logits/chosen": -0.4574386179447174, "logits/rejected": -0.678265392780304, "logps/chosen": -206.63943481445312, "logps/rejected": -245.84982299804688, "loss": 0.6473, "rewards/accuracies": 0.75, "rewards/chosen": 0.33723515272140503, "rewards/margins": 0.11906462907791138, "rewards/rejected": 0.21817055344581604, "step": 1533 }, { "epoch": 0.827288661183767, "grad_norm": 6.627256393432617, "learning_rate": 4.404849551693102e-08, "logits/chosen": 0.18793153762817383, "logits/rejected": -1.0673259496688843, "logps/chosen": -221.40419006347656, "logps/rejected": -200.83120727539062, "loss": 0.6282, "rewards/accuracies": 0.625, "rewards/chosen": 0.3267834186553955, "rewards/margins": 0.17426753044128418, "rewards/rejected": 0.15251588821411133, "step": 1534 }, { "epoch": 0.8278279627881893, "grad_norm": 6.374931335449219, "learning_rate": 4.378194245816802e-08, "logits/chosen": -0.0935201495885849, "logits/rejected": -0.44543570280075073, "logps/chosen": -230.50367736816406, "logps/rejected": -212.4730224609375, "loss": 0.7091, "rewards/accuracies": 0.375, "rewards/chosen": 0.2247861921787262, "rewards/margins": -0.02813454158604145, "rewards/rejected": 0.2529207468032837, "step": 1535 }, { "epoch": 0.8283672643926115, "grad_norm": 8.387415885925293, "learning_rate": 4.3516120932900055e-08, "logits/chosen": -0.16141976416110992, "logits/rejected": -0.6505376100540161, "logps/chosen": -327.08868408203125, "logps/rejected": -248.42454528808594, "loss": 0.7235, "rewards/accuracies": 0.375, "rewards/chosen": 0.17883911728858948, "rewards/margins": -0.03438877686858177, "rewards/rejected": 0.21322789788246155, "step": 1536 }, { "epoch": 0.8289065659970338, "grad_norm": 6.529022693634033, "learning_rate": 4.325103188409665e-08, "logits/chosen": 0.07110065221786499, "logits/rejected": -0.5301898121833801, "logps/chosen": -304.9180908203125, "logps/rejected": -317.0077209472656, "loss": 0.6493, "rewards/accuracies": 0.625, "rewards/chosen": 0.3526701033115387, "rewards/margins": 0.0984681099653244, "rewards/rejected": 0.2542020082473755, "step": 1537 }, { "epoch": 0.8294458676014561, "grad_norm": 7.00029182434082, "learning_rate": 4.298667625212904e-08, "logits/chosen": -0.05659948289394379, "logits/rejected": -0.6401269435882568, "logps/chosen": -236.87557983398438, "logps/rejected": -188.65467834472656, "loss": 0.7283, "rewards/accuracies": 0.25, "rewards/chosen": 0.25758305191993713, "rewards/margins": -0.04404252767562866, "rewards/rejected": 0.3016255795955658, "step": 1538 }, { "epoch": 0.8299851692058784, "grad_norm": 7.066158294677734, "learning_rate": 4.272305497476658e-08, "logits/chosen": 0.04439398646354675, "logits/rejected": 0.7298094034194946, "logps/chosen": -223.31561279296875, "logps/rejected": -266.5920715332031, "loss": 0.6995, "rewards/accuracies": 0.5, "rewards/chosen": 0.21659022569656372, "rewards/margins": 0.0017612501978874207, "rewards/rejected": 0.2148289680480957, "step": 1539 }, { "epoch": 0.8305244708103007, "grad_norm": 7.898696422576904, "learning_rate": 4.2460168987173806e-08, "logits/chosen": 0.16367889940738678, "logits/rejected": 0.6667635440826416, "logps/chosen": -202.9391326904297, "logps/rejected": -190.25949096679688, "loss": 0.7559, "rewards/accuracies": 0.125, "rewards/chosen": 0.16274681687355042, "rewards/margins": -0.1055203378200531, "rewards/rejected": 0.2682671546936035, "step": 1540 }, { "epoch": 0.831063772414723, "grad_norm": 7.433077812194824, "learning_rate": 4.21980192219068e-08, "logits/chosen": 0.010634660720825195, "logits/rejected": 0.19654937088489532, "logps/chosen": -302.99560546875, "logps/rejected": -365.6701354980469, "loss": 0.6376, "rewards/accuracies": 0.5, "rewards/chosen": 0.3125530481338501, "rewards/margins": 0.13067880272865295, "rewards/rejected": 0.18187421560287476, "step": 1541 }, { "epoch": 0.8316030740191452, "grad_norm": 6.962863445281982, "learning_rate": 4.193660660890988e-08, "logits/chosen": 0.16392749547958374, "logits/rejected": -1.4221265316009521, "logps/chosen": -263.6744384765625, "logps/rejected": -151.90093994140625, "loss": 0.6325, "rewards/accuracies": 0.75, "rewards/chosen": 0.2032245695590973, "rewards/margins": 0.15018586814403534, "rewards/rejected": 0.053038693964481354, "step": 1542 }, { "epoch": 0.8321423756235675, "grad_norm": 6.886363983154297, "learning_rate": 4.16759320755127e-08, "logits/chosen": -0.6750422716140747, "logits/rejected": -1.1388343572616577, "logps/chosen": -179.93960571289062, "logps/rejected": -214.1401824951172, "loss": 0.6294, "rewards/accuracies": 0.625, "rewards/chosen": 0.25351160764694214, "rewards/margins": 0.15311679244041443, "rewards/rejected": 0.1003948226571083, "step": 1543 }, { "epoch": 0.8326816772279898, "grad_norm": 7.321399688720703, "learning_rate": 4.141599654642641e-08, "logits/chosen": 1.0929063558578491, "logits/rejected": -0.1345224231481552, "logps/chosen": -265.77630615234375, "logps/rejected": -180.98956298828125, "loss": 0.643, "rewards/accuracies": 0.5, "rewards/chosen": 0.242436945438385, "rewards/margins": 0.1467624306678772, "rewards/rejected": 0.09567450731992722, "step": 1544 }, { "epoch": 0.833220978832412, "grad_norm": 7.820748805999756, "learning_rate": 4.115680094374074e-08, "logits/chosen": -0.34461352229118347, "logits/rejected": 0.09632280468940735, "logps/chosen": -248.77847290039062, "logps/rejected": -309.5677490234375, "loss": 0.7096, "rewards/accuracies": 0.375, "rewards/chosen": 0.2578086853027344, "rewards/margins": -0.029925543814897537, "rewards/rejected": 0.2877342104911804, "step": 1545 }, { "epoch": 0.8337602804368343, "grad_norm": 7.745112895965576, "learning_rate": 4.089834618692048e-08, "logits/chosen": -0.0727340579032898, "logits/rejected": 1.0516598224639893, "logps/chosen": -270.565185546875, "logps/rejected": -328.80859375, "loss": 0.768, "rewards/accuracies": 0.375, "rewards/chosen": 0.20516642928123474, "rewards/margins": -0.13813582062721252, "rewards/rejected": 0.34330224990844727, "step": 1546 }, { "epoch": 0.8342995820412565, "grad_norm": 9.002788543701172, "learning_rate": 4.064063319280253e-08, "logits/chosen": 0.36846664547920227, "logits/rejected": -0.3075871765613556, "logps/chosen": -314.9609375, "logps/rejected": -366.2476806640625, "loss": 0.6595, "rewards/accuracies": 0.75, "rewards/chosen": 0.2337503433227539, "rewards/margins": 0.07467660307884216, "rewards/rejected": 0.15907374024391174, "step": 1547 }, { "epoch": 0.8348388836456788, "grad_norm": 8.532106399536133, "learning_rate": 4.038366287559245e-08, "logits/chosen": 1.3695873022079468, "logits/rejected": 0.4778316617012024, "logps/chosen": -295.4178466796875, "logps/rejected": -246.23654174804688, "loss": 0.6843, "rewards/accuracies": 0.375, "rewards/chosen": 0.23040199279785156, "rewards/margins": 0.025692373514175415, "rewards/rejected": 0.20470963418483734, "step": 1548 }, { "epoch": 0.8353781852501011, "grad_norm": 8.323247909545898, "learning_rate": 4.012743614686112e-08, "logits/chosen": 0.13278323411941528, "logits/rejected": 0.25211265683174133, "logps/chosen": -223.73834228515625, "logps/rejected": -291.1574401855469, "loss": 0.7164, "rewards/accuracies": 0.625, "rewards/chosen": 0.19488868117332458, "rewards/margins": -0.019174952059984207, "rewards/rejected": 0.2140636444091797, "step": 1549 }, { "epoch": 0.8359174868545234, "grad_norm": 8.549795150756836, "learning_rate": 3.9871953915541795e-08, "logits/chosen": -0.7968780398368835, "logits/rejected": 0.6852686405181885, "logps/chosen": -150.533447265625, "logps/rejected": -219.53558349609375, "loss": 0.7842, "rewards/accuracies": 0.125, "rewards/chosen": 0.1858028769493103, "rewards/margins": -0.16712164878845215, "rewards/rejected": 0.35292455554008484, "step": 1550 }, { "epoch": 0.8364567884589457, "grad_norm": 7.3432135581970215, "learning_rate": 3.961721708792662e-08, "logits/chosen": 1.083378791809082, "logits/rejected": -0.09672385454177856, "logps/chosen": -238.81076049804688, "logps/rejected": -180.4684600830078, "loss": 0.6307, "rewards/accuracies": 0.75, "rewards/chosen": 0.35181131958961487, "rewards/margins": 0.1344941109418869, "rewards/rejected": 0.21731719374656677, "step": 1551 }, { "epoch": 0.836996090063368, "grad_norm": 7.491903781890869, "learning_rate": 3.93632265676635e-08, "logits/chosen": 0.09359708428382874, "logits/rejected": -1.1700609922409058, "logps/chosen": -317.49322509765625, "logps/rejected": -237.38027954101562, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": 0.30489102005958557, "rewards/margins": 0.08435249328613281, "rewards/rejected": 0.22053852677345276, "step": 1552 }, { "epoch": 0.8375353916677902, "grad_norm": 6.174464702606201, "learning_rate": 3.9109983255752985e-08, "logits/chosen": 1.122245192527771, "logits/rejected": -0.010372698307037354, "logps/chosen": -285.18316650390625, "logps/rejected": -207.488037109375, "loss": 0.6628, "rewards/accuracies": 0.625, "rewards/chosen": 0.3274610638618469, "rewards/margins": 0.07153140008449554, "rewards/rejected": 0.2559296786785126, "step": 1553 }, { "epoch": 0.8380746932722125, "grad_norm": 7.349976062774658, "learning_rate": 3.8857488050544896e-08, "logits/chosen": 1.0300531387329102, "logits/rejected": -0.25105416774749756, "logps/chosen": -296.4185791015625, "logps/rejected": -220.75131225585938, "loss": 0.6701, "rewards/accuracies": 0.75, "rewards/chosen": 0.24247227609157562, "rewards/margins": 0.05869865417480469, "rewards/rejected": 0.18377360701560974, "step": 1554 }, { "epoch": 0.8386139948766348, "grad_norm": 7.736039638519287, "learning_rate": 3.860574184773538e-08, "logits/chosen": -0.5988274216651917, "logits/rejected": -1.2986226081848145, "logps/chosen": -264.3452453613281, "logps/rejected": -270.1436767578125, "loss": 0.6476, "rewards/accuracies": 0.75, "rewards/chosen": 0.2726137340068817, "rewards/margins": 0.10501141846179962, "rewards/rejected": 0.1676023155450821, "step": 1555 }, { "epoch": 0.8391532964810571, "grad_norm": 5.840487003326416, "learning_rate": 3.835474554036336e-08, "logits/chosen": -0.15339437127113342, "logits/rejected": -0.05806181579828262, "logps/chosen": -198.60105895996094, "logps/rejected": -214.3849334716797, "loss": 0.6528, "rewards/accuracies": 0.625, "rewards/chosen": 0.2481909692287445, "rewards/margins": 0.09775114804506302, "rewards/rejected": 0.15043982863426208, "step": 1556 }, { "epoch": 0.8396925980854794, "grad_norm": 7.163537502288818, "learning_rate": 3.81045000188078e-08, "logits/chosen": 0.26818907260894775, "logits/rejected": 0.5105207562446594, "logps/chosen": -188.8400115966797, "logps/rejected": -214.2478485107422, "loss": 0.714, "rewards/accuracies": 0.25, "rewards/chosen": 0.19754953682422638, "rewards/margins": -0.03544378653168678, "rewards/rejected": 0.23299333453178406, "step": 1557 }, { "epoch": 0.8402318996899015, "grad_norm": 6.538573741912842, "learning_rate": 3.785500617078424e-08, "logits/chosen": 0.24845777451992035, "logits/rejected": -0.40216395258903503, "logps/chosen": -223.09140014648438, "logps/rejected": -191.33750915527344, "loss": 0.6354, "rewards/accuracies": 0.625, "rewards/chosen": 0.2403464913368225, "rewards/margins": 0.13526567816734314, "rewards/rejected": 0.10508079826831818, "step": 1558 }, { "epoch": 0.8407712012943238, "grad_norm": 8.976716995239258, "learning_rate": 3.76062648813418e-08, "logits/chosen": -0.3085877299308777, "logits/rejected": -1.1839009523391724, "logps/chosen": -254.43930053710938, "logps/rejected": -227.018310546875, "loss": 0.7358, "rewards/accuracies": 0.625, "rewards/chosen": 0.16949531435966492, "rewards/margins": -0.06653708219528198, "rewards/rejected": 0.2360323965549469, "step": 1559 }, { "epoch": 0.8413105028987461, "grad_norm": 7.327898025512695, "learning_rate": 3.7358277032860016e-08, "logits/chosen": 0.8374137878417969, "logits/rejected": -0.8470154404640198, "logps/chosen": -364.853759765625, "logps/rejected": -248.35948181152344, "loss": 0.6684, "rewards/accuracies": 0.5, "rewards/chosen": 0.23716622591018677, "rewards/margins": 0.056723594665527344, "rewards/rejected": 0.18044263124465942, "step": 1560 }, { "epoch": 0.8418498045031684, "grad_norm": 6.9659576416015625, "learning_rate": 3.711104350504557e-08, "logits/chosen": 0.9646937251091003, "logits/rejected": 0.851036548614502, "logps/chosen": -260.8197937011719, "logps/rejected": -304.53411865234375, "loss": 0.6756, "rewards/accuracies": 0.625, "rewards/chosen": 0.2155403047800064, "rewards/margins": 0.04663124307990074, "rewards/rejected": 0.16890907287597656, "step": 1561 }, { "epoch": 0.8423891061075907, "grad_norm": 6.276204586029053, "learning_rate": 3.686456517492939e-08, "logits/chosen": 0.5981194972991943, "logits/rejected": 0.5547270774841309, "logps/chosen": -197.15696716308594, "logps/rejected": -231.12171936035156, "loss": 0.6736, "rewards/accuracies": 0.375, "rewards/chosen": 0.23140759766101837, "rewards/margins": 0.06698660552501678, "rewards/rejected": 0.1644209921360016, "step": 1562 }, { "epoch": 0.8429284077120129, "grad_norm": 6.835249423980713, "learning_rate": 3.661884291686338e-08, "logits/chosen": 0.09006568789482117, "logits/rejected": -1.6418002843856812, "logps/chosen": -271.123779296875, "logps/rejected": -200.31463623046875, "loss": 0.7126, "rewards/accuracies": 0.25, "rewards/chosen": 0.21873359382152557, "rewards/margins": -0.02113962173461914, "rewards/rejected": 0.2398732304573059, "step": 1563 }, { "epoch": 0.8434677093164352, "grad_norm": 7.717635631561279, "learning_rate": 3.637387760251745e-08, "logits/chosen": 0.7738257646560669, "logits/rejected": 0.7441388964653015, "logps/chosen": -267.8015441894531, "logps/rejected": -292.7160949707031, "loss": 0.7057, "rewards/accuracies": 0.375, "rewards/chosen": 0.2737741768360138, "rewards/margins": -0.013341903686523438, "rewards/rejected": 0.28711605072021484, "step": 1564 }, { "epoch": 0.8440070109208575, "grad_norm": 8.361268997192383, "learning_rate": 3.612967010087617e-08, "logits/chosen": 0.2811349630355835, "logits/rejected": -0.3353443741798401, "logps/chosen": -209.34732055664062, "logps/rejected": -213.5775146484375, "loss": 0.7576, "rewards/accuracies": 0.375, "rewards/chosen": 0.22113484144210815, "rewards/margins": -0.1072581335902214, "rewards/rejected": 0.32839298248291016, "step": 1565 }, { "epoch": 0.8445463125252798, "grad_norm": 7.888171672821045, "learning_rate": 3.588622127823604e-08, "logits/chosen": -0.022125646471977234, "logits/rejected": -1.0767699480056763, "logps/chosen": -321.876953125, "logps/rejected": -193.62094116210938, "loss": 0.628, "rewards/accuracies": 0.5, "rewards/chosen": 0.2752220034599304, "rewards/margins": 0.1532144546508789, "rewards/rejected": 0.12200756371021271, "step": 1566 }, { "epoch": 0.8450856141297021, "grad_norm": 6.733707904815674, "learning_rate": 3.5643531998202134e-08, "logits/chosen": 0.010833993554115295, "logits/rejected": -0.3777913451194763, "logps/chosen": -212.3902130126953, "logps/rejected": -194.50946044921875, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": 0.19806616008281708, "rewards/margins": 0.04588795080780983, "rewards/rejected": 0.15217819809913635, "step": 1567 }, { "epoch": 0.8456249157341243, "grad_norm": 9.640110969543457, "learning_rate": 3.5401603121685194e-08, "logits/chosen": -0.03644596040248871, "logits/rejected": -0.7404294013977051, "logps/chosen": -301.7503356933594, "logps/rejected": -218.9658203125, "loss": 0.7175, "rewards/accuracies": 0.625, "rewards/chosen": 0.19367781281471252, "rewards/margins": -0.02138805016875267, "rewards/rejected": 0.2150658518075943, "step": 1568 }, { "epoch": 0.8461642173385466, "grad_norm": 7.502255916595459, "learning_rate": 3.516043550689851e-08, "logits/chosen": -0.36370694637298584, "logits/rejected": -2.1346845626831055, "logps/chosen": -301.68017578125, "logps/rejected": -168.9670867919922, "loss": 0.6377, "rewards/accuracies": 0.625, "rewards/chosen": 0.26945629715919495, "rewards/margins": 0.13396921753883362, "rewards/rejected": 0.13548707962036133, "step": 1569 }, { "epoch": 0.8467035189429688, "grad_norm": 6.481011867523193, "learning_rate": 3.4920030009354864e-08, "logits/chosen": 0.34968069195747375, "logits/rejected": -0.3307953476905823, "logps/chosen": -188.87188720703125, "logps/rejected": -176.07150268554688, "loss": 0.6935, "rewards/accuracies": 0.375, "rewards/chosen": 0.22394362092018127, "rewards/margins": 0.01609477587044239, "rewards/rejected": 0.20784883201122284, "step": 1570 }, { "epoch": 0.8472428205473911, "grad_norm": 6.097122669219971, "learning_rate": 3.4680387481863555e-08, "logits/chosen": -0.35032832622528076, "logits/rejected": -0.026812881231307983, "logps/chosen": -185.1141357421875, "logps/rejected": -221.28887939453125, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": 0.22473669052124023, "rewards/margins": -0.005911154672503471, "rewards/rejected": 0.23064784705638885, "step": 1571 }, { "epoch": 0.8477821221518134, "grad_norm": 7.622925281524658, "learning_rate": 3.444150877452734e-08, "logits/chosen": 0.4143519401550293, "logits/rejected": -0.4958032965660095, "logps/chosen": -377.86187744140625, "logps/rejected": -279.4939880371094, "loss": 0.6624, "rewards/accuracies": 0.625, "rewards/chosen": 0.2706700563430786, "rewards/margins": 0.07253807783126831, "rewards/rejected": 0.19813194870948792, "step": 1572 }, { "epoch": 0.8483214237562356, "grad_norm": 9.392786979675293, "learning_rate": 3.4203394734739346e-08, "logits/chosen": -0.21256712079048157, "logits/rejected": -0.07079240679740906, "logps/chosen": -256.80279541015625, "logps/rejected": -240.97769165039062, "loss": 0.7674, "rewards/accuracies": 0.125, "rewards/chosen": 0.2704053819179535, "rewards/margins": -0.1255132108926773, "rewards/rejected": 0.395918607711792, "step": 1573 }, { "epoch": 0.8488607253606579, "grad_norm": 6.418622970581055, "learning_rate": 3.3966046207180245e-08, "logits/chosen": 0.5036191940307617, "logits/rejected": 0.19832172989845276, "logps/chosen": -203.32546997070312, "logps/rejected": -194.8690185546875, "loss": 0.6659, "rewards/accuracies": 0.75, "rewards/chosen": 0.25124064087867737, "rewards/margins": 0.06601109355688095, "rewards/rejected": 0.18522953987121582, "step": 1574 }, { "epoch": 0.8494000269650802, "grad_norm": 6.897183418273926, "learning_rate": 3.372946403381508e-08, "logits/chosen": 0.9012309908866882, "logits/rejected": 0.07424348592758179, "logps/chosen": -183.98033142089844, "logps/rejected": -128.1854705810547, "loss": 0.6577, "rewards/accuracies": 0.625, "rewards/chosen": 0.20650748908519745, "rewards/margins": 0.10695524513721466, "rewards/rejected": 0.09955225139856339, "step": 1575 }, { "epoch": 0.8499393285695025, "grad_norm": 8.141575813293457, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.2045000195503235, "logits/rejected": 0.26617521047592163, "logps/chosen": -161.33694458007812, "logps/rejected": -190.56365966796875, "loss": 0.684, "rewards/accuracies": 0.75, "rewards/chosen": 0.3031044006347656, "rewards/margins": 0.036046408116817474, "rewards/rejected": 0.26705801486968994, "step": 1576 }, { "epoch": 0.8504786301739248, "grad_norm": 7.171656608581543, "learning_rate": 3.3258602103931e-08, "logits/chosen": -0.6077585220336914, "logits/rejected": -0.29163858294487, "logps/chosen": -240.42276000976562, "logps/rejected": -236.15237426757812, "loss": 0.6277, "rewards/accuracies": 0.875, "rewards/chosen": 0.2600136995315552, "rewards/margins": 0.15104132890701294, "rewards/rejected": 0.10897236317396164, "step": 1577 }, { "epoch": 0.8510179317783471, "grad_norm": 7.447567939758301, "learning_rate": 3.3024324017737554e-08, "logits/chosen": 0.06719404458999634, "logits/rejected": 0.19534572958946228, "logps/chosen": -228.7930908203125, "logps/rejected": -287.7898864746094, "loss": 0.7059, "rewards/accuracies": 0.5, "rewards/chosen": 0.28234025835990906, "rewards/margins": -0.002384176477789879, "rewards/rejected": 0.2847244143486023, "step": 1578 }, { "epoch": 0.8515572333827693, "grad_norm": 6.713082790374756, "learning_rate": 3.279081562638306e-08, "logits/chosen": 0.029655814170837402, "logits/rejected": -1.0665861368179321, "logps/chosen": -222.5806427001953, "logps/rejected": -183.12030029296875, "loss": 0.7123, "rewards/accuracies": 0.5, "rewards/chosen": 0.2300117462873459, "rewards/margins": -0.032271858304739, "rewards/rejected": 0.2622836232185364, "step": 1579 }, { "epoch": 0.8520965349871916, "grad_norm": 7.410712242126465, "learning_rate": 3.255807775821015e-08, "logits/chosen": 0.5799064636230469, "logits/rejected": -0.8826082944869995, "logps/chosen": -363.65435791015625, "logps/rejected": -212.70587158203125, "loss": 0.6238, "rewards/accuracies": 0.625, "rewards/chosen": 0.30719003081321716, "rewards/margins": 0.16989582777023315, "rewards/rejected": 0.137294203042984, "step": 1580 }, { "epoch": 0.8526358365916139, "grad_norm": 6.736196041107178, "learning_rate": 3.2326111238828086e-08, "logits/chosen": 1.4674490690231323, "logits/rejected": 0.6193886399269104, "logps/chosen": -286.7793273925781, "logps/rejected": -272.2789611816406, "loss": 0.6225, "rewards/accuracies": 0.75, "rewards/chosen": 0.3280349671840668, "rewards/margins": 0.16156768798828125, "rewards/rejected": 0.16646729409694672, "step": 1581 }, { "epoch": 0.8531751381960362, "grad_norm": 8.401603698730469, "learning_rate": 3.209491689110994e-08, "logits/chosen": -0.5046306252479553, "logits/rejected": -0.8903576135635376, "logps/chosen": -244.5380859375, "logps/rejected": -236.1622314453125, "loss": 0.7091, "rewards/accuracies": 0.25, "rewards/chosen": 0.1709057092666626, "rewards/margins": -0.023224541917443275, "rewards/rejected": 0.19413022696971893, "step": 1582 }, { "epoch": 0.8537144398004584, "grad_norm": 7.308312892913818, "learning_rate": 3.1864495535189525e-08, "logits/chosen": 0.6776459217071533, "logits/rejected": 0.7240502238273621, "logps/chosen": -224.27096557617188, "logps/rejected": -248.77024841308594, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": 0.2591365873813629, "rewards/margins": 0.019218822941184044, "rewards/rejected": 0.23991775512695312, "step": 1583 }, { "epoch": 0.8542537414048806, "grad_norm": 7.466114521026611, "learning_rate": 3.163484798845861e-08, "logits/chosen": -0.09742903709411621, "logits/rejected": -0.3346102833747864, "logps/chosen": -196.61380004882812, "logps/rejected": -248.37989807128906, "loss": 0.6517, "rewards/accuracies": 0.75, "rewards/chosen": 0.2667257785797119, "rewards/margins": 0.10445752739906311, "rewards/rejected": 0.1622682511806488, "step": 1584 }, { "epoch": 0.8547930430093029, "grad_norm": 7.66231107711792, "learning_rate": 3.1405975065563975e-08, "logits/chosen": -0.11766301095485687, "logits/rejected": -0.563469409942627, "logps/chosen": -186.313232421875, "logps/rejected": -218.1179962158203, "loss": 0.6305, "rewards/accuracies": 0.375, "rewards/chosen": 0.24362564086914062, "rewards/margins": 0.14691276848316193, "rewards/rejected": 0.0967128798365593, "step": 1585 }, { "epoch": 0.8553323446137252, "grad_norm": 6.824010372161865, "learning_rate": 3.117787757840448e-08, "logits/chosen": 0.2033562809228897, "logits/rejected": 0.0643535852432251, "logps/chosen": -213.07205200195312, "logps/rejected": -192.02029418945312, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": 0.25093191862106323, "rewards/margins": 0.18118000030517578, "rewards/rejected": 0.06975193321704865, "step": 1586 }, { "epoch": 0.8558716462181475, "grad_norm": 7.241494655609131, "learning_rate": 3.095055633612825e-08, "logits/chosen": 1.2597877979278564, "logits/rejected": 0.32481104135513306, "logps/chosen": -317.64434814453125, "logps/rejected": -304.3688049316406, "loss": 0.6737, "rewards/accuracies": 0.5, "rewards/chosen": 0.2820562422275543, "rewards/margins": 0.045191094279289246, "rewards/rejected": 0.23686513304710388, "step": 1587 }, { "epoch": 0.8564109478225698, "grad_norm": 6.291884899139404, "learning_rate": 3.0724012145129733e-08, "logits/chosen": 0.40768536925315857, "logits/rejected": 0.1234678328037262, "logps/chosen": -214.63433837890625, "logps/rejected": -250.09771728515625, "loss": 0.6409, "rewards/accuracies": 0.625, "rewards/chosen": 0.25575026869773865, "rewards/margins": 0.12538012862205505, "rewards/rejected": 0.1303701400756836, "step": 1588 }, { "epoch": 0.856950249426992, "grad_norm": 7.301852226257324, "learning_rate": 3.049824580904695e-08, "logits/chosen": 0.054133713245391846, "logits/rejected": -1.5349106788635254, "logps/chosen": -221.9153289794922, "logps/rejected": -150.22311401367188, "loss": 0.66, "rewards/accuracies": 0.625, "rewards/chosen": 0.2517850995063782, "rewards/margins": 0.09227582067251205, "rewards/rejected": 0.15950927138328552, "step": 1589 }, { "epoch": 0.8574895510314143, "grad_norm": 7.962097644805908, "learning_rate": 3.027325812875858e-08, "logits/chosen": 0.25351032614707947, "logits/rejected": -0.08162425458431244, "logps/chosen": -229.8039093017578, "logps/rejected": -219.1312255859375, "loss": 0.6953, "rewards/accuracies": 0.375, "rewards/chosen": 0.16337966918945312, "rewards/margins": 0.002064228057861328, "rewards/rejected": 0.1613154411315918, "step": 1590 }, { "epoch": 0.8580288526358366, "grad_norm": 6.749392986297607, "learning_rate": 3.004904990238094e-08, "logits/chosen": 0.4736559987068176, "logits/rejected": -0.07569704949855804, "logps/chosen": -218.9084014892578, "logps/rejected": -212.61859130859375, "loss": 0.6138, "rewards/accuracies": 0.75, "rewards/chosen": 0.357858270406723, "rewards/margins": 0.17969931662082672, "rewards/rejected": 0.1781589537858963, "step": 1591 }, { "epoch": 0.8585681542402589, "grad_norm": 7.326657772064209, "learning_rate": 2.982562192526555e-08, "logits/chosen": -0.10729513317346573, "logits/rejected": -0.45852434635162354, "logps/chosen": -229.1967010498047, "logps/rejected": -229.23519897460938, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.20799237489700317, "rewards/margins": 0.09478645026683807, "rewards/rejected": 0.1132059097290039, "step": 1592 }, { "epoch": 0.8591074558446812, "grad_norm": 8.226881980895996, "learning_rate": 2.9602974989996004e-08, "logits/chosen": -0.14096881449222565, "logits/rejected": -0.5024330615997314, "logps/chosen": -283.447021484375, "logps/rejected": -237.6868438720703, "loss": 0.5939, "rewards/accuracies": 1.0, "rewards/chosen": 0.36568450927734375, "rewards/margins": 0.21857833862304688, "rewards/rejected": 0.14710617065429688, "step": 1593 }, { "epoch": 0.8596467574491035, "grad_norm": 6.77661657333374, "learning_rate": 2.9381109886385203e-08, "logits/chosen": 0.13719511032104492, "logits/rejected": 0.6437448859214783, "logps/chosen": -226.41152954101562, "logps/rejected": -321.82330322265625, "loss": 0.6272, "rewards/accuracies": 0.75, "rewards/chosen": 0.3165569305419922, "rewards/margins": 0.15008297562599182, "rewards/rejected": 0.16647396981716156, "step": 1594 }, { "epoch": 0.8601860590535256, "grad_norm": 6.037734031677246, "learning_rate": 2.9160027401472692e-08, "logits/chosen": 0.758987307548523, "logits/rejected": 0.7057100534439087, "logps/chosen": -214.8432159423828, "logps/rejected": -241.42999267578125, "loss": 0.6506, "rewards/accuracies": 0.75, "rewards/chosen": 0.2832489013671875, "rewards/margins": 0.1189032644033432, "rewards/rejected": 0.1643456518650055, "step": 1595 }, { "epoch": 0.8607253606579479, "grad_norm": 7.152924537658691, "learning_rate": 2.8939728319521655e-08, "logits/chosen": 0.9773836731910706, "logits/rejected": -1.4473496675491333, "logps/chosen": -339.57098388671875, "logps/rejected": -208.48953247070312, "loss": 0.6145, "rewards/accuracies": 0.875, "rewards/chosen": 0.3495796322822571, "rewards/margins": 0.19534578919410706, "rewards/rejected": 0.15423384308815002, "step": 1596 }, { "epoch": 0.8612646622623702, "grad_norm": 7.00665807723999, "learning_rate": 2.872021342201636e-08, "logits/chosen": 0.8782666921615601, "logits/rejected": -0.6422200202941895, "logps/chosen": -175.5957794189453, "logps/rejected": -133.8526611328125, "loss": 0.5632, "rewards/accuracies": 0.875, "rewards/chosen": 0.3158903121948242, "rewards/margins": 0.3025516867637634, "rewards/rejected": 0.013338612392544746, "step": 1597 }, { "epoch": 0.8618039638667925, "grad_norm": 7.495398044586182, "learning_rate": 2.850148348765921e-08, "logits/chosen": 0.5802465677261353, "logits/rejected": -0.3401373624801636, "logps/chosen": -297.8285827636719, "logps/rejected": -245.44906616210938, "loss": 0.6689, "rewards/accuracies": 0.375, "rewards/chosen": 0.2971050441265106, "rewards/margins": 0.06903428584337234, "rewards/rejected": 0.22807073593139648, "step": 1598 }, { "epoch": 0.8623432654712148, "grad_norm": 7.141726016998291, "learning_rate": 2.8283539292368098e-08, "logits/chosen": 0.11600557714700699, "logits/rejected": 0.17835023999214172, "logps/chosen": -172.0159454345703, "logps/rejected": -228.1461944580078, "loss": 0.7125, "rewards/accuracies": 0.75, "rewards/chosen": 0.1717667579650879, "rewards/margins": -0.006334681063890457, "rewards/rejected": 0.17810145020484924, "step": 1599 }, { "epoch": 0.862882567075637, "grad_norm": 6.665175914764404, "learning_rate": 2.8066381609273493e-08, "logits/chosen": 0.9455461502075195, "logits/rejected": -0.889117956161499, "logps/chosen": -248.007568359375, "logps/rejected": -241.1127166748047, "loss": 0.5871, "rewards/accuracies": 0.5, "rewards/chosen": 0.4051628112792969, "rewards/margins": 0.2719717025756836, "rewards/rejected": 0.13319110870361328, "step": 1600 }, { "epoch": 0.8634218686800593, "grad_norm": 6.988154411315918, "learning_rate": 2.7850011208715858e-08, "logits/chosen": -1.0519623756408691, "logits/rejected": -0.13943159580230713, "logps/chosen": -273.52490234375, "logps/rejected": -316.80975341796875, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": 0.264255166053772, "rewards/margins": 0.1342584639787674, "rewards/rejected": 0.12999668717384338, "step": 1601 }, { "epoch": 0.8639611702844816, "grad_norm": 8.304727554321289, "learning_rate": 2.7634428858242993e-08, "logits/chosen": -0.5551722049713135, "logits/rejected": -0.1682998687028885, "logps/chosen": -193.34548950195312, "logps/rejected": -242.62863159179688, "loss": 0.7277, "rewards/accuracies": 0.375, "rewards/chosen": 0.22767162322998047, "rewards/margins": -0.05039206147193909, "rewards/rejected": 0.27806368470191956, "step": 1602 }, { "epoch": 0.8645004718889039, "grad_norm": 7.982570648193359, "learning_rate": 2.741963532260705e-08, "logits/chosen": -0.09140445291996002, "logits/rejected": -0.9240220189094543, "logps/chosen": -236.47447204589844, "logps/rejected": -195.14486694335938, "loss": 0.6575, "rewards/accuracies": 0.75, "rewards/chosen": 0.21004372835159302, "rewards/margins": 0.0838659331202507, "rewards/rejected": 0.12617778778076172, "step": 1603 }, { "epoch": 0.8650397734933262, "grad_norm": 8.096760749816895, "learning_rate": 2.7205631363761972e-08, "logits/chosen": -0.01048995926976204, "logits/rejected": -0.37613973021507263, "logps/chosen": -232.79690551757812, "logps/rejected": -245.20138549804688, "loss": 0.6619, "rewards/accuracies": 0.75, "rewards/chosen": 0.24909105896949768, "rewards/margins": 0.10181894898414612, "rewards/rejected": 0.14727210998535156, "step": 1604 }, { "epoch": 0.8655790750977485, "grad_norm": 7.260157585144043, "learning_rate": 2.6992417740860807e-08, "logits/chosen": 0.6578934192657471, "logits/rejected": -0.6209515333175659, "logps/chosen": -270.0917053222656, "logps/rejected": -202.6220703125, "loss": 0.6087, "rewards/accuracies": 0.625, "rewards/chosen": 0.3588293194770813, "rewards/margins": 0.1954685002565384, "rewards/rejected": 0.16336078941822052, "step": 1605 }, { "epoch": 0.8661183767021707, "grad_norm": 6.805848598480225, "learning_rate": 2.677999521025301e-08, "logits/chosen": 0.8723041415214539, "logits/rejected": -0.8204317092895508, "logps/chosen": -225.11795043945312, "logps/rejected": -171.96682739257812, "loss": 0.6177, "rewards/accuracies": 0.875, "rewards/chosen": 0.27036744356155396, "rewards/margins": 0.165422722697258, "rewards/rejected": 0.10494471341371536, "step": 1606 }, { "epoch": 0.866657678306593, "grad_norm": 10.013739585876465, "learning_rate": 2.6568364525481612e-08, "logits/chosen": 0.37887704372406006, "logits/rejected": 0.6204221248626709, "logps/chosen": -309.5977783203125, "logps/rejected": -357.17852783203125, "loss": 0.8347, "rewards/accuracies": 0.375, "rewards/chosen": 0.14134922623634338, "rewards/margins": -0.24982531368732452, "rewards/rejected": 0.3911745250225067, "step": 1607 }, { "epoch": 0.8671969799110152, "grad_norm": 7.156760215759277, "learning_rate": 2.6357526437280757e-08, "logits/chosen": 1.3380461931228638, "logits/rejected": 0.2068013697862625, "logps/chosen": -372.70074462890625, "logps/rejected": -232.39308166503906, "loss": 0.6332, "rewards/accuracies": 0.625, "rewards/chosen": 0.31356582045555115, "rewards/margins": 0.1371661126613617, "rewards/rejected": 0.17639970779418945, "step": 1608 }, { "epoch": 0.8677362815154375, "grad_norm": 8.734373092651367, "learning_rate": 2.6147481693572974e-08, "logits/chosen": 0.940714955329895, "logits/rejected": 0.9116564393043518, "logps/chosen": -383.1006774902344, "logps/rejected": -349.9525146484375, "loss": 0.7819, "rewards/accuracies": 0.25, "rewards/chosen": 0.14371949434280396, "rewards/margins": -0.151676744222641, "rewards/rejected": 0.29539623856544495, "step": 1609 }, { "epoch": 0.8682755831198598, "grad_norm": 6.164621353149414, "learning_rate": 2.5938231039466436e-08, "logits/chosen": -0.7551496028900146, "logits/rejected": 0.45775049924850464, "logps/chosen": -214.45155334472656, "logps/rejected": -274.27642822265625, "loss": 0.6145, "rewards/accuracies": 0.75, "rewards/chosen": 0.3569208085536957, "rewards/margins": 0.18460962176322937, "rewards/rejected": 0.1723112165927887, "step": 1610 }, { "epoch": 0.868814884724282, "grad_norm": 6.968345642089844, "learning_rate": 2.572977521725242e-08, "logits/chosen": 0.3525431752204895, "logits/rejected": 0.487393319606781, "logps/chosen": -251.0069580078125, "logps/rejected": -226.0507049560547, "loss": 0.731, "rewards/accuracies": 0.5, "rewards/chosen": 0.188482865691185, "rewards/margins": -0.06780730187892914, "rewards/rejected": 0.25629016757011414, "step": 1611 }, { "epoch": 0.8693541863287043, "grad_norm": 6.771980285644531, "learning_rate": 2.552211496640261e-08, "logits/chosen": 0.5525929927825928, "logits/rejected": 0.9968878626823425, "logps/chosen": -183.65731811523438, "logps/rejected": -256.859375, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": 0.21771679818630219, "rewards/margins": 0.011010181158781052, "rewards/rejected": 0.20670661330223083, "step": 1612 }, { "epoch": 0.8698934879331266, "grad_norm": 6.927292823791504, "learning_rate": 2.531525102356649e-08, "logits/chosen": -0.013709111139178276, "logits/rejected": -0.12198221683502197, "logps/chosen": -251.76126098632812, "logps/rejected": -206.790283203125, "loss": 0.6473, "rewards/accuracies": 0.625, "rewards/chosen": 0.2702619433403015, "rewards/margins": 0.10498585551977158, "rewards/rejected": 0.16527611017227173, "step": 1613 }, { "epoch": 0.8704327895375489, "grad_norm": 6.840063571929932, "learning_rate": 2.5109184122568793e-08, "logits/chosen": -0.7108164429664612, "logits/rejected": -0.428788959980011, "logps/chosen": -185.56480407714844, "logps/rejected": -190.939453125, "loss": 0.7239, "rewards/accuracies": 0.5, "rewards/chosen": 0.171251580119133, "rewards/margins": -0.05244303494691849, "rewards/rejected": 0.22369462251663208, "step": 1614 }, { "epoch": 0.8709720911419712, "grad_norm": 6.147156238555908, "learning_rate": 2.4903914994406805e-08, "logits/chosen": 1.0073046684265137, "logits/rejected": -0.6331090927124023, "logps/chosen": -310.70343017578125, "logps/rejected": -179.25518798828125, "loss": 0.6255, "rewards/accuracies": 0.875, "rewards/chosen": 0.3613359332084656, "rewards/margins": 0.14590173959732056, "rewards/rejected": 0.2154342085123062, "step": 1615 }, { "epoch": 0.8715113927463934, "grad_norm": 7.45594596862793, "learning_rate": 2.4699444367247828e-08, "logits/chosen": -0.9124006032943726, "logits/rejected": 0.14431118965148926, "logps/chosen": -195.12789916992188, "logps/rejected": -258.41400146484375, "loss": 0.7359, "rewards/accuracies": 0.5, "rewards/chosen": 0.20492029190063477, "rewards/margins": -0.06865892559289932, "rewards/rejected": 0.2735792100429535, "step": 1616 }, { "epoch": 0.8720506943508157, "grad_norm": 9.512672424316406, "learning_rate": 2.4495772966426463e-08, "logits/chosen": -0.03163626044988632, "logits/rejected": -0.8259420394897461, "logps/chosen": -221.9586181640625, "logps/rejected": -238.166748046875, "loss": 0.6973, "rewards/accuracies": 0.375, "rewards/chosen": 0.27945852279663086, "rewards/margins": 0.006973356008529663, "rewards/rejected": 0.2724851965904236, "step": 1617 }, { "epoch": 0.872589995955238, "grad_norm": 6.987581253051758, "learning_rate": 2.4292901514442327e-08, "logits/chosen": 0.5331758260726929, "logits/rejected": 0.4846418499946594, "logps/chosen": -255.74815368652344, "logps/rejected": -258.4261779785156, "loss": 0.7216, "rewards/accuracies": 0.375, "rewards/chosen": 0.20801860094070435, "rewards/margins": -0.045854851603507996, "rewards/rejected": 0.25387346744537354, "step": 1618 }, { "epoch": 0.8731292975596603, "grad_norm": 8.437186241149902, "learning_rate": 2.409083073095719e-08, "logits/chosen": -0.27341675758361816, "logits/rejected": -0.43843966722488403, "logps/chosen": -180.16177368164062, "logps/rejected": -207.1761932373047, "loss": 0.7303, "rewards/accuracies": 0.625, "rewards/chosen": 0.12694720923900604, "rewards/margins": -0.0601532906293869, "rewards/rejected": 0.18710049986839294, "step": 1619 }, { "epoch": 0.8736685991640825, "grad_norm": 7.483500003814697, "learning_rate": 2.3889561332792657e-08, "logits/chosen": 0.21130573749542236, "logits/rejected": -0.2281467467546463, "logps/chosen": -260.9106140136719, "logps/rejected": -223.88247680664062, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": 0.3092266321182251, "rewards/margins": -0.0076551418751478195, "rewards/rejected": 0.31688177585601807, "step": 1620 }, { "epoch": 0.8742079007685047, "grad_norm": 8.134014129638672, "learning_rate": 2.368909403392741e-08, "logits/chosen": -0.254633367061615, "logits/rejected": 0.08057373762130737, "logps/chosen": -223.22789001464844, "logps/rejected": -325.834716796875, "loss": 0.6079, "rewards/accuracies": 0.75, "rewards/chosen": 0.2918393909931183, "rewards/margins": 0.204655259847641, "rewards/rejected": 0.08718414604663849, "step": 1621 }, { "epoch": 0.874747202372927, "grad_norm": 8.332819938659668, "learning_rate": 2.348942954549485e-08, "logits/chosen": 0.5670070052146912, "logits/rejected": -0.26283109188079834, "logps/chosen": -225.5873260498047, "logps/rejected": -233.56600952148438, "loss": 0.5813, "rewards/accuracies": 0.625, "rewards/chosen": 0.34183406829833984, "rewards/margins": 0.26938843727111816, "rewards/rejected": 0.07244562357664108, "step": 1622 }, { "epoch": 0.8752865039773493, "grad_norm": 7.623291015625, "learning_rate": 2.329056857578049e-08, "logits/chosen": -0.3930942416191101, "logits/rejected": -0.06643450260162354, "logps/chosen": -213.8507537841797, "logps/rejected": -322.59234619140625, "loss": 0.7307, "rewards/accuracies": 0.25, "rewards/chosen": 0.23207010328769684, "rewards/margins": -0.06714987754821777, "rewards/rejected": 0.2992199957370758, "step": 1623 }, { "epoch": 0.8758258055817716, "grad_norm": 7.000836372375488, "learning_rate": 2.3092511830219403e-08, "logits/chosen": 0.19512373208999634, "logits/rejected": -0.2576231062412262, "logps/chosen": -249.07537841796875, "logps/rejected": -270.4708557128906, "loss": 0.6446, "rewards/accuracies": 0.75, "rewards/chosen": 0.31286728382110596, "rewards/margins": 0.10747271031141281, "rewards/rejected": 0.20539455115795135, "step": 1624 }, { "epoch": 0.8763651071861939, "grad_norm": 8.91315746307373, "learning_rate": 2.289526001139394e-08, "logits/chosen": 0.8935835361480713, "logits/rejected": 0.6171073913574219, "logps/chosen": -285.65814208984375, "logps/rejected": -274.146484375, "loss": 0.7619, "rewards/accuracies": 0.25, "rewards/chosen": 0.14260168373584747, "rewards/margins": -0.11890001595020294, "rewards/rejected": 0.2615016996860504, "step": 1625 }, { "epoch": 0.8769044087906162, "grad_norm": 6.956132411956787, "learning_rate": 2.26988138190308e-08, "logits/chosen": 1.859009027481079, "logits/rejected": 0.9051089286804199, "logps/chosen": -304.2218017578125, "logps/rejected": -259.10498046875, "loss": 0.6109, "rewards/accuracies": 0.75, "rewards/chosen": 0.32264405488967896, "rewards/margins": 0.19791856408119202, "rewards/rejected": 0.12472549080848694, "step": 1626 }, { "epoch": 0.8774437103950384, "grad_norm": 6.390243053436279, "learning_rate": 2.250317394999904e-08, "logits/chosen": 0.2910573482513428, "logits/rejected": -0.09433993697166443, "logps/chosen": -212.85592651367188, "logps/rejected": -220.34182739257812, "loss": 0.6571, "rewards/accuracies": 0.75, "rewards/chosen": 0.26526832580566406, "rewards/margins": 0.08241082727909088, "rewards/rejected": 0.18285751342773438, "step": 1627 }, { "epoch": 0.8779830119994607, "grad_norm": 7.353758811950684, "learning_rate": 2.2308341098307315e-08, "logits/chosen": 0.7232913374900818, "logits/rejected": 0.9679633975028992, "logps/chosen": -259.44635009765625, "logps/rejected": -265.54742431640625, "loss": 0.7616, "rewards/accuracies": 0.125, "rewards/chosen": 0.27689629793167114, "rewards/margins": -0.12677878141403198, "rewards/rejected": 0.4036750793457031, "step": 1628 }, { "epoch": 0.878522313603883, "grad_norm": 6.50864315032959, "learning_rate": 2.2114315955101493e-08, "logits/chosen": 0.3496047556400299, "logits/rejected": 0.3703112006187439, "logps/chosen": -230.3781280517578, "logps/rejected": -240.84596252441406, "loss": 0.6376, "rewards/accuracies": 0.625, "rewards/chosen": 0.2937309741973877, "rewards/margins": 0.1226232647895813, "rewards/rejected": 0.171107679605484, "step": 1629 }, { "epoch": 0.8790616152083053, "grad_norm": 7.091867923736572, "learning_rate": 2.192109920866217e-08, "logits/chosen": 0.4416024088859558, "logits/rejected": -0.0012230873107910156, "logps/chosen": -253.4104766845703, "logps/rejected": -204.34854125976562, "loss": 0.6775, "rewards/accuracies": 0.625, "rewards/chosen": 0.2434835582971573, "rewards/margins": 0.05501842871308327, "rewards/rejected": 0.18846511840820312, "step": 1630 }, { "epoch": 0.8796009168127276, "grad_norm": 6.968080520629883, "learning_rate": 2.1728691544402266e-08, "logits/chosen": 1.0527273416519165, "logits/rejected": -0.38357943296432495, "logps/chosen": -259.85723876953125, "logps/rejected": -196.43179321289062, "loss": 0.5965, "rewards/accuracies": 0.875, "rewards/chosen": 0.32308536767959595, "rewards/margins": 0.2107049971818924, "rewards/rejected": 0.11238041520118713, "step": 1631 }, { "epoch": 0.8801402184171498, "grad_norm": 7.366759300231934, "learning_rate": 2.1537093644864668e-08, "logits/chosen": 0.13251277804374695, "logits/rejected": -0.026298947632312775, "logps/chosen": -302.6383056640625, "logps/rejected": -312.30816650390625, "loss": 0.7089, "rewards/accuracies": 0.625, "rewards/chosen": 0.20708903670310974, "rewards/margins": -0.020881274715065956, "rewards/rejected": 0.22797033190727234, "step": 1632 }, { "epoch": 0.880679520021572, "grad_norm": 7.124780178070068, "learning_rate": 2.1346306189719554e-08, "logits/chosen": 0.11103558540344238, "logits/rejected": -0.04975214600563049, "logps/chosen": -211.98171997070312, "logps/rejected": -260.748291015625, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": 0.2993168830871582, "rewards/margins": 0.04297065734863281, "rewards/rejected": 0.2563462257385254, "step": 1633 }, { "epoch": 0.8812188216259943, "grad_norm": 7.219688415527344, "learning_rate": 2.115632985576224e-08, "logits/chosen": -0.3542371988296509, "logits/rejected": 0.4713117182254791, "logps/chosen": -241.05364990234375, "logps/rejected": -329.61749267578125, "loss": 0.7178, "rewards/accuracies": 0.75, "rewards/chosen": 0.19956570863723755, "rewards/margins": -0.03729839250445366, "rewards/rejected": 0.2368640899658203, "step": 1634 }, { "epoch": 0.8817581232304166, "grad_norm": 7.581539154052734, "learning_rate": 2.0967165316910672e-08, "logits/chosen": -0.4707740843296051, "logits/rejected": 0.25281035900115967, "logps/chosen": -285.511962890625, "logps/rejected": -324.5328063964844, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": 0.32349205017089844, "rewards/margins": 0.08753861486911774, "rewards/rejected": 0.2359534353017807, "step": 1635 }, { "epoch": 0.8822974248348389, "grad_norm": 8.675339698791504, "learning_rate": 2.077881324420311e-08, "logits/chosen": -0.5175249576568604, "logits/rejected": -1.2111495733261108, "logps/chosen": -219.2646484375, "logps/rejected": -203.4996337890625, "loss": 0.7685, "rewards/accuracies": 0.375, "rewards/chosen": 0.12629690766334534, "rewards/margins": -0.10851754993200302, "rewards/rejected": 0.23481445014476776, "step": 1636 }, { "epoch": 0.8828367264392611, "grad_norm": 7.997865676879883, "learning_rate": 2.0591274305795577e-08, "logits/chosen": 0.5754443407058716, "logits/rejected": -0.6447864174842834, "logps/chosen": -250.74960327148438, "logps/rejected": -203.65927124023438, "loss": 0.7115, "rewards/accuracies": 0.375, "rewards/chosen": 0.2423701286315918, "rewards/margins": -0.020321551710367203, "rewards/rejected": 0.2626916766166687, "step": 1637 }, { "epoch": 0.8833760280436834, "grad_norm": 8.157695770263672, "learning_rate": 2.0404549166959718e-08, "logits/chosen": 0.22607208788394928, "logits/rejected": -0.859481155872345, "logps/chosen": -265.0055847167969, "logps/rejected": -282.8481140136719, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": 0.29339855909347534, "rewards/margins": 0.0689535140991211, "rewards/rejected": 0.22444505989551544, "step": 1638 }, { "epoch": 0.8839153296481057, "grad_norm": 7.4526495933532715, "learning_rate": 2.0218638490080238e-08, "logits/chosen": 0.4218762218952179, "logits/rejected": 0.1817498356103897, "logps/chosen": -280.2963562011719, "logps/rejected": -245.20721435546875, "loss": 0.6779, "rewards/accuracies": 0.5, "rewards/chosen": 0.33788052201271057, "rewards/margins": 0.053662970662117004, "rewards/rejected": 0.2842175364494324, "step": 1639 }, { "epoch": 0.884454631252528, "grad_norm": 8.114139556884766, "learning_rate": 2.0033542934652675e-08, "logits/chosen": 0.8867239952087402, "logits/rejected": -0.14461348950862885, "logps/chosen": -274.5006408691406, "logps/rejected": -219.1727294921875, "loss": 0.6391, "rewards/accuracies": 0.75, "rewards/chosen": 0.3085615634918213, "rewards/margins": 0.1285381317138672, "rewards/rejected": 0.18002337217330933, "step": 1640 }, { "epoch": 0.8849939328569503, "grad_norm": 8.122736930847168, "learning_rate": 1.9849263157281054e-08, "logits/chosen": -1.7947874069213867, "logits/rejected": -1.5474052429199219, "logps/chosen": -288.5297546386719, "logps/rejected": -298.92767333984375, "loss": 0.7, "rewards/accuracies": 0.5, "rewards/chosen": 0.31191474199295044, "rewards/margins": 0.003275100141763687, "rewards/rejected": 0.30863964557647705, "step": 1641 }, { "epoch": 0.8855332344613726, "grad_norm": 7.074897766113281, "learning_rate": 1.9665799811675404e-08, "logits/chosen": 0.5868040919303894, "logits/rejected": 0.40494269132614136, "logps/chosen": -164.26380920410156, "logps/rejected": -181.92276000976562, "loss": 0.7199, "rewards/accuracies": 0.5, "rewards/chosen": 0.24214068055152893, "rewards/margins": -0.040933042764663696, "rewards/rejected": 0.2830737233161926, "step": 1642 }, { "epoch": 0.8860725360657948, "grad_norm": 8.312529563903809, "learning_rate": 1.9483153548649712e-08, "logits/chosen": -0.38155117630958557, "logits/rejected": -0.6312244534492493, "logps/chosen": -320.626708984375, "logps/rejected": -424.8695373535156, "loss": 0.6856, "rewards/accuracies": 0.625, "rewards/chosen": 0.20448656380176544, "rewards/margins": 0.0397271104156971, "rewards/rejected": 0.16475944221019745, "step": 1643 }, { "epoch": 0.8866118376702171, "grad_norm": 7.339071273803711, "learning_rate": 1.9301325016119334e-08, "logits/chosen": 0.25325071811676025, "logits/rejected": -1.169724941253662, "logps/chosen": -241.88031005859375, "logps/rejected": -192.86781311035156, "loss": 0.6467, "rewards/accuracies": 0.875, "rewards/chosen": 0.37763750553131104, "rewards/margins": 0.10175161808729172, "rewards/rejected": 0.2758858799934387, "step": 1644 }, { "epoch": 0.8871511392746393, "grad_norm": 6.413866996765137, "learning_rate": 1.9120314859098957e-08, "logits/chosen": 0.674241840839386, "logits/rejected": 0.3979926109313965, "logps/chosen": -232.45916748046875, "logps/rejected": -224.69668579101562, "loss": 0.6452, "rewards/accuracies": 0.625, "rewards/chosen": 0.3631477355957031, "rewards/margins": 0.1110575795173645, "rewards/rejected": 0.252090185880661, "step": 1645 }, { "epoch": 0.8876904408790616, "grad_norm": 7.044876575469971, "learning_rate": 1.8940123719700078e-08, "logits/chosen": 0.6140847206115723, "logits/rejected": 0.23939450085163116, "logps/chosen": -217.8841094970703, "logps/rejected": -232.4615936279297, "loss": 0.6908, "rewards/accuracies": 0.75, "rewards/chosen": 0.2875026762485504, "rewards/margins": 0.015390023589134216, "rewards/rejected": 0.2721126675605774, "step": 1646 }, { "epoch": 0.8882297424834839, "grad_norm": 6.9399261474609375, "learning_rate": 1.8760752237128864e-08, "logits/chosen": 0.6775317788124084, "logits/rejected": 0.18409299850463867, "logps/chosen": -196.3712921142578, "logps/rejected": -188.77027893066406, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.19864006340503693, "rewards/margins": 0.04329700767993927, "rewards/rejected": 0.15534304082393646, "step": 1647 }, { "epoch": 0.8887690440879061, "grad_norm": 7.745382308959961, "learning_rate": 1.8582201047683847e-08, "logits/chosen": -0.8022178411483765, "logits/rejected": -0.34628602862358093, "logps/chosen": -195.46795654296875, "logps/rejected": -262.03753662109375, "loss": 0.6481, "rewards/accuracies": 0.5, "rewards/chosen": 0.24937696754932404, "rewards/margins": 0.12205763161182404, "rewards/rejected": 0.1273193359375, "step": 1648 }, { "epoch": 0.8893083456923284, "grad_norm": 7.115649223327637, "learning_rate": 1.8404470784753713e-08, "logits/chosen": 1.0180079936981201, "logits/rejected": -0.14946621656417847, "logps/chosen": -249.47845458984375, "logps/rejected": -186.5915985107422, "loss": 0.6226, "rewards/accuracies": 0.75, "rewards/chosen": 0.2532207667827606, "rewards/margins": 0.15348616242408752, "rewards/rejected": 0.0997345969080925, "step": 1649 }, { "epoch": 0.8898476472967507, "grad_norm": 7.574265003204346, "learning_rate": 1.8227562078814903e-08, "logits/chosen": 0.4497467279434204, "logits/rejected": 0.15866227447986603, "logps/chosen": -267.5225830078125, "logps/rejected": -265.99285888671875, "loss": 0.7379, "rewards/accuracies": 0.375, "rewards/chosen": 0.17064592242240906, "rewards/margins": -0.07917499542236328, "rewards/rejected": 0.24982090294361115, "step": 1650 }, { "epoch": 0.890386948901173, "grad_norm": 7.162780284881592, "learning_rate": 1.8051475557429613e-08, "logits/chosen": 0.8463422656059265, "logits/rejected": -0.22270111739635468, "logps/chosen": -254.0099334716797, "logps/rejected": -250.72903442382812, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": 0.3162956237792969, "rewards/margins": 0.22548776865005493, "rewards/rejected": 0.09080787003040314, "step": 1651 }, { "epoch": 0.8909262505055953, "grad_norm": 7.611424922943115, "learning_rate": 1.787621184524332e-08, "logits/chosen": -0.007685758173465729, "logits/rejected": 0.43369877338409424, "logps/chosen": -265.44384765625, "logps/rejected": -268.0870361328125, "loss": 0.7496, "rewards/accuracies": 0.375, "rewards/chosen": 0.2303321361541748, "rewards/margins": -0.09761244058609009, "rewards/rejected": 0.3279445767402649, "step": 1652 }, { "epoch": 0.8914655521100175, "grad_norm": 6.544556140899658, "learning_rate": 1.7701771563982755e-08, "logits/chosen": 0.2274748533964157, "logits/rejected": 0.7989020943641663, "logps/chosen": -188.4969482421875, "logps/rejected": -231.357177734375, "loss": 0.7174, "rewards/accuracies": 0.25, "rewards/chosen": 0.23688077926635742, "rewards/margins": -0.041979413479566574, "rewards/rejected": 0.2788602113723755, "step": 1653 }, { "epoch": 0.8920048537144398, "grad_norm": 6.892899036407471, "learning_rate": 1.7528155332453635e-08, "logits/chosen": 0.39875149726867676, "logits/rejected": 0.6802717447280884, "logps/chosen": -275.2698059082031, "logps/rejected": -355.36273193359375, "loss": 0.6524, "rewards/accuracies": 0.625, "rewards/chosen": 0.27626943588256836, "rewards/margins": 0.08845014870166779, "rewards/rejected": 0.18781930208206177, "step": 1654 }, { "epoch": 0.8925441553188621, "grad_norm": 7.389105319976807, "learning_rate": 1.73553637665384e-08, "logits/chosen": -0.015084236860275269, "logits/rejected": -0.32747459411621094, "logps/chosen": -274.14111328125, "logps/rejected": -207.2670135498047, "loss": 0.7358, "rewards/accuracies": 0.5, "rewards/chosen": 0.053168393671512604, "rewards/margins": -0.0673866793513298, "rewards/rejected": 0.12055506557226181, "step": 1655 }, { "epoch": 0.8930834569232844, "grad_norm": 7.774374961853027, "learning_rate": 1.7183397479194173e-08, "logits/chosen": 0.4008762836456299, "logits/rejected": -1.2109637260437012, "logps/chosen": -206.35110473632812, "logps/rejected": -152.44618225097656, "loss": 0.6631, "rewards/accuracies": 0.625, "rewards/chosen": 0.27653947472572327, "rewards/margins": 0.0896887257695198, "rewards/rejected": 0.18685075640678406, "step": 1656 }, { "epoch": 0.8936227585277067, "grad_norm": 7.918209075927734, "learning_rate": 1.7012257080450454e-08, "logits/chosen": -0.49674054980278015, "logits/rejected": -0.04616335779428482, "logps/chosen": -273.26318359375, "logps/rejected": -253.5360107421875, "loss": 0.7026, "rewards/accuracies": 0.375, "rewards/chosen": 0.3079378008842468, "rewards/margins": -0.010011684149503708, "rewards/rejected": 0.31794947385787964, "step": 1657 }, { "epoch": 0.8941620601321288, "grad_norm": 9.556459426879883, "learning_rate": 1.6841943177406976e-08, "logits/chosen": 0.8259072303771973, "logits/rejected": -0.25312870740890503, "logps/chosen": -280.69329833984375, "logps/rejected": -198.08079528808594, "loss": 0.6964, "rewards/accuracies": 0.25, "rewards/chosen": 0.2700607180595398, "rewards/margins": 0.0033235549926757812, "rewards/rejected": 0.2667371928691864, "step": 1658 }, { "epoch": 0.8947013617365511, "grad_norm": 7.400942325592041, "learning_rate": 1.667245637423162e-08, "logits/chosen": 0.3032877743244171, "logits/rejected": -0.020560532808303833, "logps/chosen": -217.05088806152344, "logps/rejected": -251.8738555908203, "loss": 0.7237, "rewards/accuracies": 0.375, "rewards/chosen": 0.1748451292514801, "rewards/margins": -0.050959400832653046, "rewards/rejected": 0.22580452263355255, "step": 1659 }, { "epoch": 0.8952406633409734, "grad_norm": 6.033153057098389, "learning_rate": 1.6503797272158282e-08, "logits/chosen": 0.9346306324005127, "logits/rejected": 0.561007022857666, "logps/chosen": -208.02467346191406, "logps/rejected": -201.76446533203125, "loss": 0.6713, "rewards/accuracies": 0.75, "rewards/chosen": 0.37112313508987427, "rewards/margins": 0.05609874427318573, "rewards/rejected": 0.31502437591552734, "step": 1660 }, { "epoch": 0.8957799649453957, "grad_norm": 7.541969299316406, "learning_rate": 1.6335966469484486e-08, "logits/chosen": 0.8333567380905151, "logits/rejected": -0.4657711088657379, "logps/chosen": -255.92755126953125, "logps/rejected": -277.0511169433594, "loss": 0.6947, "rewards/accuracies": 0.5, "rewards/chosen": 0.23766937851905823, "rewards/margins": -0.00022544898092746735, "rewards/rejected": 0.23789483308792114, "step": 1661 }, { "epoch": 0.896319266549818, "grad_norm": 8.596410751342773, "learning_rate": 1.6168964561569714e-08, "logits/chosen": -0.03151007741689682, "logits/rejected": -0.4644593596458435, "logps/chosen": -161.93524169921875, "logps/rejected": -207.25454711914062, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.1944970190525055, "rewards/margins": 0.03438558802008629, "rewards/rejected": 0.1601114273071289, "step": 1662 }, { "epoch": 0.8968585681542403, "grad_norm": 7.226507663726807, "learning_rate": 1.6002792140832943e-08, "logits/chosen": -0.4380776882171631, "logits/rejected": -1.2933909893035889, "logps/chosen": -240.30673217773438, "logps/rejected": -182.753662109375, "loss": 0.6494, "rewards/accuracies": 0.625, "rewards/chosen": 0.22752398252487183, "rewards/margins": 0.10487060993909836, "rewards/rejected": 0.12265338748693466, "step": 1663 }, { "epoch": 0.8973978697586625, "grad_norm": 6.548976898193359, "learning_rate": 1.5837449796750586e-08, "logits/chosen": 0.3847573697566986, "logits/rejected": -0.315012663602829, "logps/chosen": -219.61148071289062, "logps/rejected": -179.8965301513672, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": 0.2559608519077301, "rewards/margins": 0.07280654460191727, "rewards/rejected": 0.18315429985523224, "step": 1664 }, { "epoch": 0.8979371713630848, "grad_norm": 9.33441162109375, "learning_rate": 1.5672938115854544e-08, "logits/chosen": 0.4187481999397278, "logits/rejected": -0.01477968692779541, "logps/chosen": -259.35552978515625, "logps/rejected": -258.7119445800781, "loss": 0.6823, "rewards/accuracies": 0.5, "rewards/chosen": 0.2249288558959961, "rewards/margins": 0.03597278147935867, "rewards/rejected": 0.18895606696605682, "step": 1665 }, { "epoch": 0.8984764729675071, "grad_norm": 6.655388355255127, "learning_rate": 1.550925768173003e-08, "logits/chosen": 0.9366920590400696, "logits/rejected": 1.0546314716339111, "logps/chosen": -229.87762451171875, "logps/rejected": -283.80462646484375, "loss": 0.6781, "rewards/accuracies": 0.625, "rewards/chosen": 0.29735812544822693, "rewards/margins": 0.04976825416088104, "rewards/rejected": 0.2475898712873459, "step": 1666 }, { "epoch": 0.8990157745719294, "grad_norm": 7.601930141448975, "learning_rate": 1.534640907501347e-08, "logits/chosen": -0.36324256658554077, "logits/rejected": 0.4735521376132965, "logps/chosen": -173.45816040039062, "logps/rejected": -246.74954223632812, "loss": 0.7266, "rewards/accuracies": 0.375, "rewards/chosen": 0.24385342001914978, "rewards/margins": -0.046910807490348816, "rewards/rejected": 0.2907642424106598, "step": 1667 }, { "epoch": 0.8995550761763517, "grad_norm": 6.728512287139893, "learning_rate": 1.518439287339046e-08, "logits/chosen": 0.4187816083431244, "logits/rejected": 0.7370437383651733, "logps/chosen": -231.80723571777344, "logps/rejected": -302.82208251953125, "loss": 0.6288, "rewards/accuracies": 0.75, "rewards/chosen": 0.3638744354248047, "rewards/margins": 0.14327087998390198, "rewards/rejected": 0.2206035852432251, "step": 1668 }, { "epoch": 0.900094377780774, "grad_norm": 7.148212909698486, "learning_rate": 1.5023209651593844e-08, "logits/chosen": -1.0399863719940186, "logits/rejected": 0.19358977675437927, "logps/chosen": -135.19119262695312, "logps/rejected": -187.4426727294922, "loss": 0.7138, "rewards/accuracies": 0.25, "rewards/chosen": 0.22637969255447388, "rewards/margins": -0.01986059732735157, "rewards/rejected": 0.2462402880191803, "step": 1669 }, { "epoch": 0.9006336793851961, "grad_norm": 7.26365852355957, "learning_rate": 1.4862859981401465e-08, "logits/chosen": -0.8004997372627258, "logits/rejected": -0.5200403332710266, "logps/chosen": -222.19142150878906, "logps/rejected": -298.8789367675781, "loss": 0.6728, "rewards/accuracies": 0.625, "rewards/chosen": 0.21567153930664062, "rewards/margins": 0.061758991330862045, "rewards/rejected": 0.15391254425048828, "step": 1670 }, { "epoch": 0.9011729809896184, "grad_norm": 7.312172889709473, "learning_rate": 1.4703344431634318e-08, "logits/chosen": 0.3076868951320648, "logits/rejected": 0.32324427366256714, "logps/chosen": -244.26626586914062, "logps/rejected": -272.1744384765625, "loss": 0.7159, "rewards/accuracies": 0.5, "rewards/chosen": 0.2804323434829712, "rewards/margins": -0.0037515610456466675, "rewards/rejected": 0.28418388962745667, "step": 1671 }, { "epoch": 0.9017122825940407, "grad_norm": 7.07871150970459, "learning_rate": 1.4544663568154426e-08, "logits/chosen": 0.9535889625549316, "logits/rejected": 0.6223064064979553, "logps/chosen": -323.0877685546875, "logps/rejected": -247.43646240234375, "loss": 0.6974, "rewards/accuracies": 0.5, "rewards/chosen": 0.2746501863002777, "rewards/margins": 0.005873285233974457, "rewards/rejected": 0.26877689361572266, "step": 1672 }, { "epoch": 0.902251584198463, "grad_norm": 7.8316874504089355, "learning_rate": 1.4386817953862845e-08, "logits/chosen": 0.45475736260414124, "logits/rejected": 0.8952723741531372, "logps/chosen": -277.4761047363281, "logps/rejected": -348.3894958496094, "loss": 0.6583, "rewards/accuracies": 0.5, "rewards/chosen": 0.2749297022819519, "rewards/margins": 0.08360090851783752, "rewards/rejected": 0.19132882356643677, "step": 1673 }, { "epoch": 0.9027908858028852, "grad_norm": 8.056811332702637, "learning_rate": 1.422980814869773e-08, "logits/chosen": 1.2364213466644287, "logits/rejected": -0.058146923780441284, "logps/chosen": -267.21136474609375, "logps/rejected": -210.52655029296875, "loss": 0.6492, "rewards/accuracies": 0.75, "rewards/chosen": 0.20810289680957794, "rewards/margins": 0.11105643212795258, "rewards/rejected": 0.09704647958278656, "step": 1674 }, { "epoch": 0.9033301874073075, "grad_norm": 6.989512920379639, "learning_rate": 1.407363470963227e-08, "logits/chosen": 0.2997710108757019, "logits/rejected": -0.358466774225235, "logps/chosen": -233.57589721679688, "logps/rejected": -229.28533935546875, "loss": 0.6262, "rewards/accuracies": 0.625, "rewards/chosen": 0.3478964865207672, "rewards/margins": 0.16574038565158844, "rewards/rejected": 0.18215610086917877, "step": 1675 }, { "epoch": 0.9038694890117298, "grad_norm": 6.0396037101745605, "learning_rate": 1.3918298190672806e-08, "logits/chosen": 0.17309585213661194, "logits/rejected": 0.7517809867858887, "logps/chosen": -278.2210693359375, "logps/rejected": -263.9965515136719, "loss": 0.6961, "rewards/accuracies": 0.375, "rewards/chosen": 0.2881452441215515, "rewards/margins": 0.0033377204090356827, "rewards/rejected": 0.28480756282806396, "step": 1676 }, { "epoch": 0.9044087906161521, "grad_norm": 7.806862831115723, "learning_rate": 1.3763799142856691e-08, "logits/chosen": 0.5490881204605103, "logits/rejected": -0.010938301682472229, "logps/chosen": -321.0225524902344, "logps/rejected": -256.1401062011719, "loss": 0.6228, "rewards/accuracies": 0.875, "rewards/chosen": 0.3921663463115692, "rewards/margins": 0.15212516486644745, "rewards/rejected": 0.24004116654396057, "step": 1677 }, { "epoch": 0.9049480922205744, "grad_norm": 6.652557849884033, "learning_rate": 1.3610138114250519e-08, "logits/chosen": 0.20441976189613342, "logits/rejected": -0.01617315411567688, "logps/chosen": -189.910888671875, "logps/rejected": -214.62432861328125, "loss": 0.6265, "rewards/accuracies": 0.625, "rewards/chosen": 0.3009962737560272, "rewards/margins": 0.15509414672851562, "rewards/rejected": 0.1459021121263504, "step": 1678 }, { "epoch": 0.9054873938249967, "grad_norm": 8.227704048156738, "learning_rate": 1.3457315649948147e-08, "logits/chosen": 0.4354962408542633, "logits/rejected": -0.4568574130535126, "logps/chosen": -295.0381164550781, "logps/rejected": -217.90182495117188, "loss": 0.6347, "rewards/accuracies": 0.75, "rewards/chosen": 0.30458909273147583, "rewards/margins": 0.12420153617858887, "rewards/rejected": 0.18038754165172577, "step": 1679 }, { "epoch": 0.9060266954294189, "grad_norm": 8.824984550476074, "learning_rate": 1.3305332292068705e-08, "logits/chosen": -1.5967636108398438, "logits/rejected": -0.7811694145202637, "logps/chosen": -190.8035125732422, "logps/rejected": -210.84625244140625, "loss": 0.7818, "rewards/accuracies": 0.125, "rewards/chosen": 0.12105798721313477, "rewards/margins": -0.1487535536289215, "rewards/rejected": 0.2698115408420563, "step": 1680 }, { "epoch": 0.9065659970338412, "grad_norm": 6.820858955383301, "learning_rate": 1.3154188579754616e-08, "logits/chosen": -0.7625322937965393, "logits/rejected": -1.230787992477417, "logps/chosen": -275.04010009765625, "logps/rejected": -237.26678466796875, "loss": 0.6432, "rewards/accuracies": 0.625, "rewards/chosen": 0.32272589206695557, "rewards/margins": 0.10887356847524643, "rewards/rejected": 0.21385231614112854, "step": 1681 }, { "epoch": 0.9071052986382635, "grad_norm": 8.167036056518555, "learning_rate": 1.300388504916991e-08, "logits/chosen": 1.0061705112457275, "logits/rejected": 1.4134619235992432, "logps/chosen": -280.45574951171875, "logps/rejected": -300.720458984375, "loss": 0.7122, "rewards/accuracies": 0.75, "rewards/chosen": 0.24851742386817932, "rewards/margins": -0.008048254996538162, "rewards/rejected": 0.2565656900405884, "step": 1682 }, { "epoch": 0.9076446002426857, "grad_norm": 7.24718713760376, "learning_rate": 1.2854422233498058e-08, "logits/chosen": -1.2435314655303955, "logits/rejected": 0.09624656289815903, "logps/chosen": -214.01431274414062, "logps/rejected": -268.31011962890625, "loss": 0.6551, "rewards/accuracies": 0.75, "rewards/chosen": 0.30542460083961487, "rewards/margins": 0.08491606265306473, "rewards/rejected": 0.22050853073596954, "step": 1683 }, { "epoch": 0.908183901847108, "grad_norm": 8.51020336151123, "learning_rate": 1.270580066294022e-08, "logits/chosen": 0.5257039070129395, "logits/rejected": -0.9340826869010925, "logps/chosen": -279.9271240234375, "logps/rejected": -243.28492736816406, "loss": 0.6328, "rewards/accuracies": 0.625, "rewards/chosen": 0.2945888638496399, "rewards/margins": 0.15038223564624786, "rewards/rejected": 0.14420661330223083, "step": 1684 }, { "epoch": 0.9087232034515302, "grad_norm": 19.226547241210938, "learning_rate": 1.255802086471336e-08, "logits/chosen": 0.42046624422073364, "logits/rejected": 1.2294731140136719, "logps/chosen": -219.69007873535156, "logps/rejected": -223.86776733398438, "loss": 0.7204, "rewards/accuracies": 0.375, "rewards/chosen": 0.2797779142856598, "rewards/margins": -0.049114421010017395, "rewards/rejected": 0.3288923501968384, "step": 1685 }, { "epoch": 0.9092625050559525, "grad_norm": 7.7528839111328125, "learning_rate": 1.2411083363048386e-08, "logits/chosen": -0.51710045337677, "logits/rejected": -0.43775585293769836, "logps/chosen": -214.32302856445312, "logps/rejected": -232.8851776123047, "loss": 0.7174, "rewards/accuracies": 0.375, "rewards/chosen": 0.20971402525901794, "rewards/margins": -0.01894693821668625, "rewards/rejected": 0.2286609709262848, "step": 1686 }, { "epoch": 0.9098018066603748, "grad_norm": 7.858645439147949, "learning_rate": 1.2264988679188181e-08, "logits/chosen": 0.16503584384918213, "logits/rejected": 0.191541850566864, "logps/chosen": -204.74139404296875, "logps/rejected": -213.55029296875, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": 0.28348296880722046, "rewards/margins": 0.11813975125551224, "rewards/rejected": 0.16534319519996643, "step": 1687 }, { "epoch": 0.9103411082647971, "grad_norm": 5.999725818634033, "learning_rate": 1.2119737331385882e-08, "logits/chosen": 0.033794939517974854, "logits/rejected": -0.553809404373169, "logps/chosen": -200.84429931640625, "logps/rejected": -214.52371215820312, "loss": 0.6225, "rewards/accuracies": 0.5, "rewards/chosen": 0.2795163094997406, "rewards/margins": 0.16574811935424805, "rewards/rejected": 0.11376819759607315, "step": 1688 }, { "epoch": 0.9108804098692194, "grad_norm": 6.579035758972168, "learning_rate": 1.1975329834903014e-08, "logits/chosen": 0.1308399885892868, "logits/rejected": -0.5148248672485352, "logps/chosen": -214.76727294921875, "logps/rejected": -192.6377410888672, "loss": 0.6636, "rewards/accuracies": 0.75, "rewards/chosen": 0.35003727674484253, "rewards/margins": 0.06164809316396713, "rewards/rejected": 0.2883892059326172, "step": 1689 }, { "epoch": 0.9114197114736416, "grad_norm": 7.73162841796875, "learning_rate": 1.1831766702007612e-08, "logits/chosen": 0.6378284096717834, "logits/rejected": 0.5136896967887878, "logps/chosen": -249.83157348632812, "logps/rejected": -258.9708251953125, "loss": 0.5941, "rewards/accuracies": 0.875, "rewards/chosen": 0.3015413284301758, "rewards/margins": 0.2178056836128235, "rewards/rejected": 0.08373565226793289, "step": 1690 }, { "epoch": 0.9119590130780639, "grad_norm": 6.919564247131348, "learning_rate": 1.168904844197241e-08, "logits/chosen": 0.28972798585891724, "logits/rejected": 0.6803395748138428, "logps/chosen": -320.634765625, "logps/rejected": -382.572509765625, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": 0.30290302634239197, "rewards/margins": 0.08975791186094284, "rewards/rejected": 0.21314513683319092, "step": 1691 }, { "epoch": 0.9124983146824862, "grad_norm": 7.293718338012695, "learning_rate": 1.1547175561073152e-08, "logits/chosen": 0.5000752210617065, "logits/rejected": -0.363353431224823, "logps/chosen": -181.34738159179688, "logps/rejected": -180.50035095214844, "loss": 0.6564, "rewards/accuracies": 0.375, "rewards/chosen": 0.23435088992118835, "rewards/margins": 0.09436417371034622, "rewards/rejected": 0.13998670876026154, "step": 1692 }, { "epoch": 0.9130376162869085, "grad_norm": 6.835783958435059, "learning_rate": 1.140614856258662e-08, "logits/chosen": 0.041316092014312744, "logits/rejected": 0.388016939163208, "logps/chosen": -247.3732452392578, "logps/rejected": -415.782470703125, "loss": 0.6312, "rewards/accuracies": 0.625, "rewards/chosen": 0.29027968645095825, "rewards/margins": 0.14293891191482544, "rewards/rejected": 0.1473407745361328, "step": 1693 }, { "epoch": 0.9135769178913308, "grad_norm": 7.657758712768555, "learning_rate": 1.1265967946788912e-08, "logits/chosen": -0.4498627781867981, "logits/rejected": -1.0647801160812378, "logps/chosen": -248.78810119628906, "logps/rejected": -260.72222900390625, "loss": 0.7074, "rewards/accuracies": 0.5, "rewards/chosen": 0.2189485728740692, "rewards/margins": 0.0014228839427232742, "rewards/rejected": 0.2175256758928299, "step": 1694 }, { "epoch": 0.914116219495753, "grad_norm": 6.187150001525879, "learning_rate": 1.1126634210953751e-08, "logits/chosen": 1.3947062492370605, "logits/rejected": 0.44897791743278503, "logps/chosen": -241.98068237304688, "logps/rejected": -176.5337677001953, "loss": 0.6236, "rewards/accuracies": 0.625, "rewards/chosen": 0.3070608377456665, "rewards/margins": 0.1565670222043991, "rewards/rejected": 0.1504938155412674, "step": 1695 }, { "epoch": 0.9146555211001752, "grad_norm": 7.868348598480225, "learning_rate": 1.098814784935062e-08, "logits/chosen": -0.007250398397445679, "logits/rejected": -0.6863348484039307, "logps/chosen": -254.1373748779297, "logps/rejected": -198.73690795898438, "loss": 0.7203, "rewards/accuracies": 0.25, "rewards/chosen": 0.17806874215602875, "rewards/margins": -0.027891837060451508, "rewards/rejected": 0.20596055686473846, "step": 1696 }, { "epoch": 0.9151948227045975, "grad_norm": 7.342476844787598, "learning_rate": 1.0850509353243081e-08, "logits/chosen": 0.5744562149047852, "logits/rejected": 0.2354414463043213, "logps/chosen": -223.5961456298828, "logps/rejected": -218.0300750732422, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 0.2609451413154602, "rewards/margins": 0.03392324224114418, "rewards/rejected": 0.22702190279960632, "step": 1697 }, { "epoch": 0.9157341243090198, "grad_norm": 6.689701557159424, "learning_rate": 1.0713719210886928e-08, "logits/chosen": 0.5644553899765015, "logits/rejected": -1.1619868278503418, "logps/chosen": -239.55999755859375, "logps/rejected": -202.4461212158203, "loss": 0.6259, "rewards/accuracies": 0.75, "rewards/chosen": 0.3180192708969116, "rewards/margins": 0.15016257762908936, "rewards/rejected": 0.16785672307014465, "step": 1698 }, { "epoch": 0.9162734259134421, "grad_norm": 7.754012107849121, "learning_rate": 1.0577777907528618e-08, "logits/chosen": 0.5175084471702576, "logits/rejected": -0.21918544173240662, "logps/chosen": -259.4305114746094, "logps/rejected": -234.4466552734375, "loss": 0.6603, "rewards/accuracies": 0.75, "rewards/chosen": 0.2590949237346649, "rewards/margins": 0.07889127731323242, "rewards/rejected": 0.1802036464214325, "step": 1699 }, { "epoch": 0.9168127275178644, "grad_norm": 7.551769733428955, "learning_rate": 1.0442685925403344e-08, "logits/chosen": 0.1411183625459671, "logits/rejected": -0.48808950185775757, "logps/chosen": -238.60296630859375, "logps/rejected": -215.5308837890625, "loss": 0.7121, "rewards/accuracies": 0.375, "rewards/chosen": 0.28499871492385864, "rewards/margins": -0.015840522944927216, "rewards/rejected": 0.30083924531936646, "step": 1700 }, { "epoch": 0.9173520291222866, "grad_norm": 6.478913307189941, "learning_rate": 1.0308443743733546e-08, "logits/chosen": -0.15127283334732056, "logits/rejected": -0.6159822344779968, "logps/chosen": -168.4664306640625, "logps/rejected": -146.44679260253906, "loss": 0.6697, "rewards/accuracies": 0.625, "rewards/chosen": 0.2201709747314453, "rewards/margins": 0.07273055613040924, "rewards/rejected": 0.14744043350219727, "step": 1701 }, { "epoch": 0.9178913307267089, "grad_norm": 8.549936294555664, "learning_rate": 1.0175051838727023e-08, "logits/chosen": -0.299165278673172, "logits/rejected": 0.17544227838516235, "logps/chosen": -203.94976806640625, "logps/rejected": -223.49813842773438, "loss": 0.7274, "rewards/accuracies": 0.25, "rewards/chosen": 0.2133008986711502, "rewards/margins": -0.05766868591308594, "rewards/rejected": 0.27096956968307495, "step": 1702 }, { "epoch": 0.9184306323311312, "grad_norm": 7.547126770019531, "learning_rate": 1.0042510683575339e-08, "logits/chosen": -0.020045161247253418, "logits/rejected": 0.2098616361618042, "logps/chosen": -271.1405944824219, "logps/rejected": -292.40283203125, "loss": 0.6443, "rewards/accuracies": 0.75, "rewards/chosen": 0.2783249020576477, "rewards/margins": 0.12448691576719284, "rewards/rejected": 0.15383797883987427, "step": 1703 }, { "epoch": 0.9189699339355535, "grad_norm": 9.148500442504883, "learning_rate": 9.910820748452148e-09, "logits/chosen": -0.4546220004558563, "logits/rejected": -1.2249137163162231, "logps/chosen": -246.15028381347656, "logps/rejected": -218.86175537109375, "loss": 0.7296, "rewards/accuracies": 0.5, "rewards/chosen": 0.15377721190452576, "rewards/margins": -0.040895141661167145, "rewards/rejected": 0.1946723461151123, "step": 1704 }, { "epoch": 0.9195092355399758, "grad_norm": 6.858067989349365, "learning_rate": 9.779982500511457e-09, "logits/chosen": 0.33602210879325867, "logits/rejected": -1.1635669469833374, "logps/chosen": -268.7437744140625, "logps/rejected": -213.19467163085938, "loss": 0.6537, "rewards/accuracies": 0.5, "rewards/chosen": 0.3026731014251709, "rewards/margins": 0.0976533442735672, "rewards/rejected": 0.2050197720527649, "step": 1705 }, { "epoch": 0.920048537144398, "grad_norm": 7.099669933319092, "learning_rate": 9.649996403886085e-09, "logits/chosen": 0.12330657243728638, "logits/rejected": -0.4318162202835083, "logps/chosen": -330.720703125, "logps/rejected": -350.5711669921875, "loss": 0.5904, "rewards/accuracies": 0.75, "rewards/chosen": 0.2896374762058258, "rewards/margins": 0.2477341592311859, "rewards/rejected": 0.0419033020734787, "step": 1706 }, { "epoch": 0.9205878387488203, "grad_norm": 8.135424613952637, "learning_rate": 9.520862919685902e-09, "logits/chosen": 0.17962437868118286, "logits/rejected": -0.5014395713806152, "logps/chosen": -229.351806640625, "logps/rejected": -185.13906860351562, "loss": 0.6593, "rewards/accuracies": 0.625, "rewards/chosen": 0.17363396286964417, "rewards/margins": 0.09927073121070862, "rewards/rejected": 0.07436323910951614, "step": 1707 }, { "epoch": 0.9211271403532425, "grad_norm": 6.639937877655029, "learning_rate": 9.392582505996255e-09, "logits/chosen": 0.298675537109375, "logits/rejected": -1.875511884689331, "logps/chosen": -207.25070190429688, "logps/rejected": -142.20431518554688, "loss": 0.6106, "rewards/accuracies": 0.75, "rewards/chosen": 0.2611025869846344, "rewards/margins": 0.208812415599823, "rewards/rejected": 0.05229015275835991, "step": 1708 }, { "epoch": 0.9216664419576648, "grad_norm": 8.265810012817383, "learning_rate": 9.265155617876275e-09, "logits/chosen": 0.6425062417984009, "logits/rejected": 0.38317781686782837, "logps/chosen": -255.6146240234375, "logps/rejected": -308.3804931640625, "loss": 0.6485, "rewards/accuracies": 0.625, "rewards/chosen": 0.2996278703212738, "rewards/margins": 0.10300950706005096, "rewards/rejected": 0.19661837816238403, "step": 1709 }, { "epoch": 0.9222057435620871, "grad_norm": 6.348053932189941, "learning_rate": 9.138582707357428e-09, "logits/chosen": -0.2719285488128662, "logits/rejected": -1.2998332977294922, "logps/chosen": -154.51673889160156, "logps/rejected": -142.32931518554688, "loss": 0.6534, "rewards/accuracies": 0.625, "rewards/chosen": 0.21567068994045258, "rewards/margins": 0.10889711230993271, "rewards/rejected": 0.10677356272935867, "step": 1710 }, { "epoch": 0.9227450451665093, "grad_norm": 8.069062232971191, "learning_rate": 9.012864223441662e-09, "logits/chosen": -0.06594124436378479, "logits/rejected": -1.2206426858901978, "logps/chosen": -275.3605041503906, "logps/rejected": -186.937255859375, "loss": 0.6516, "rewards/accuracies": 0.75, "rewards/chosen": 0.21595898270606995, "rewards/margins": 0.12500372529029846, "rewards/rejected": 0.09095526486635208, "step": 1711 }, { "epoch": 0.9232843467709316, "grad_norm": 8.57202434539795, "learning_rate": 8.888000612100127e-09, "logits/chosen": -0.7226338386535645, "logits/rejected": -0.24714180827140808, "logps/chosen": -221.4221649169922, "logps/rejected": -263.614501953125, "loss": 0.7015, "rewards/accuracies": 0.375, "rewards/chosen": 0.253500759601593, "rewards/margins": -0.010214235633611679, "rewards/rejected": 0.263714998960495, "step": 1712 }, { "epoch": 0.9238236483753539, "grad_norm": 6.572484970092773, "learning_rate": 8.763992316271174e-09, "logits/chosen": 0.13287408649921417, "logits/rejected": 0.2053951770067215, "logps/chosen": -211.7611083984375, "logps/rejected": -246.77574157714844, "loss": 0.6688, "rewards/accuracies": 0.5, "rewards/chosen": 0.2836008071899414, "rewards/margins": 0.05624770373106003, "rewards/rejected": 0.22735312581062317, "step": 1713 }, { "epoch": 0.9243629499797762, "grad_norm": 8.269004821777344, "learning_rate": 8.640839775859221e-09, "logits/chosen": 0.7561063170433044, "logits/rejected": 0.5063572525978088, "logps/chosen": -303.4508972167969, "logps/rejected": -252.78240966796875, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.26809805631637573, "rewards/margins": 0.03783855959773064, "rewards/rejected": 0.23025952279567719, "step": 1714 }, { "epoch": 0.9249022515841985, "grad_norm": 7.697258472442627, "learning_rate": 8.518543427732949e-09, "logits/chosen": 0.9204349517822266, "logits/rejected": 0.5237825512886047, "logps/chosen": -286.11968994140625, "logps/rejected": -283.71966552734375, "loss": 0.6963, "rewards/accuracies": 0.625, "rewards/chosen": 0.2261168360710144, "rewards/margins": 0.007292071357369423, "rewards/rejected": 0.21882477402687073, "step": 1715 }, { "epoch": 0.9254415531886208, "grad_norm": 6.904294967651367, "learning_rate": 8.397103705723775e-09, "logits/chosen": 0.9637314081192017, "logits/rejected": 0.45414847135543823, "logps/chosen": -263.62481689453125, "logps/rejected": -184.152099609375, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": 0.2650860548019409, "rewards/margins": 0.03198728337883949, "rewards/rejected": 0.2330988049507141, "step": 1716 }, { "epoch": 0.925980854793043, "grad_norm": 6.983127593994141, "learning_rate": 8.276521040624345e-09, "logits/chosen": 0.17944476008415222, "logits/rejected": -0.8591233491897583, "logps/chosen": -231.5665283203125, "logps/rejected": -206.7794647216797, "loss": 0.6491, "rewards/accuracies": 0.625, "rewards/chosen": 0.2354438751935959, "rewards/margins": 0.10172271728515625, "rewards/rejected": 0.13372115790843964, "step": 1717 }, { "epoch": 0.9265201563974653, "grad_norm": 7.2200541496276855, "learning_rate": 8.156795860187027e-09, "logits/chosen": -0.25339746475219727, "logits/rejected": 0.06410843133926392, "logps/chosen": -283.722412109375, "logps/rejected": -238.4850311279297, "loss": 0.7435, "rewards/accuracies": 0.25, "rewards/chosen": 0.24272461235523224, "rewards/margins": -0.09186458587646484, "rewards/rejected": 0.33458924293518066, "step": 1718 }, { "epoch": 0.9270594580018876, "grad_norm": 6.4942307472229, "learning_rate": 8.037928589122306e-09, "logits/chosen": -0.9899293184280396, "logits/rejected": -1.5849840641021729, "logps/chosen": -255.3314666748047, "logps/rejected": -197.62991333007812, "loss": 0.6156, "rewards/accuracies": 0.75, "rewards/chosen": 0.3858183026313782, "rewards/margins": 0.17138901352882385, "rewards/rejected": 0.21442928910255432, "step": 1719 }, { "epoch": 0.9275987596063098, "grad_norm": 6.924929618835449, "learning_rate": 7.91991964909744e-09, "logits/chosen": 0.06388568878173828, "logits/rejected": -0.639651894569397, "logps/chosen": -392.716552734375, "logps/rejected": -259.60076904296875, "loss": 0.6818, "rewards/accuracies": 0.75, "rewards/chosen": 0.24309921264648438, "rewards/margins": 0.027201464399695396, "rewards/rejected": 0.21589776873588562, "step": 1720 }, { "epoch": 0.9281380612107321, "grad_norm": 7.5741095542907715, "learning_rate": 7.80276945873476e-09, "logits/chosen": 1.9278792142868042, "logits/rejected": 0.6471579670906067, "logps/chosen": -204.0267333984375, "logps/rejected": -218.04925537109375, "loss": 0.6875, "rewards/accuracies": 0.75, "rewards/chosen": 0.1878330260515213, "rewards/margins": 0.030009757727384567, "rewards/rejected": 0.15782327950000763, "step": 1721 }, { "epoch": 0.9286773628151543, "grad_norm": 8.07251262664795, "learning_rate": 7.686478433610339e-09, "logits/chosen": 0.36122459173202515, "logits/rejected": -0.7083293199539185, "logps/chosen": -309.14276123046875, "logps/rejected": -224.9419708251953, "loss": 0.6993, "rewards/accuracies": 0.5, "rewards/chosen": 0.3171825408935547, "rewards/margins": -0.005498314276337624, "rewards/rejected": 0.32268086075782776, "step": 1722 }, { "epoch": 0.9292166644195766, "grad_norm": 8.048133850097656, "learning_rate": 7.571046986252466e-09, "logits/chosen": 0.499074250459671, "logits/rejected": -0.7916439771652222, "logps/chosen": -195.4637451171875, "logps/rejected": -174.50146484375, "loss": 0.6514, "rewards/accuracies": 0.625, "rewards/chosen": 0.2336178719997406, "rewards/margins": 0.09071120619773865, "rewards/rejected": 0.14290666580200195, "step": 1723 }, { "epoch": 0.9297559660239989, "grad_norm": 8.723793983459473, "learning_rate": 7.45647552614015e-09, "logits/chosen": 0.036466971039772034, "logits/rejected": -0.0343351848423481, "logps/chosen": -244.00796508789062, "logps/rejected": -330.3148193359375, "loss": 0.7185, "rewards/accuracies": 0.5, "rewards/chosen": 0.2225264608860016, "rewards/margins": -0.023937124758958817, "rewards/rejected": 0.2464635819196701, "step": 1724 }, { "epoch": 0.9302952676284212, "grad_norm": 7.038191318511963, "learning_rate": 7.342764459701723e-09, "logits/chosen": 0.9625868201255798, "logits/rejected": -0.27279531955718994, "logps/chosen": -232.95230102539062, "logps/rejected": -152.72250366210938, "loss": 0.6481, "rewards/accuracies": 0.625, "rewards/chosen": 0.24788761138916016, "rewards/margins": 0.10087443143129349, "rewards/rejected": 0.14701318740844727, "step": 1725 }, { "epoch": 0.9308345692328435, "grad_norm": 9.469698905944824, "learning_rate": 7.22991419031338e-09, "logits/chosen": 0.736747682094574, "logits/rejected": 0.1434900164604187, "logps/chosen": -301.3958435058594, "logps/rejected": -250.92984008789062, "loss": 0.6935, "rewards/accuracies": 0.375, "rewards/chosen": 0.22192879021167755, "rewards/margins": 0.03888358920812607, "rewards/rejected": 0.18304520845413208, "step": 1726 }, { "epoch": 0.9313738708372657, "grad_norm": 7.535282135009766, "learning_rate": 7.1179251182977295e-09, "logits/chosen": -0.10755214095115662, "logits/rejected": 0.488264799118042, "logps/chosen": -216.42013549804688, "logps/rejected": -315.0540771484375, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": 0.21837255358695984, "rewards/margins": 0.02684326283633709, "rewards/rejected": 0.1915292739868164, "step": 1727 }, { "epoch": 0.931913172441688, "grad_norm": 7.276549339294434, "learning_rate": 7.006797640922435e-09, "logits/chosen": 0.7720540165901184, "logits/rejected": 0.7613895535469055, "logps/chosen": -206.24362182617188, "logps/rejected": -212.56553649902344, "loss": 0.7038, "rewards/accuracies": 0.375, "rewards/chosen": 0.25629597902297974, "rewards/margins": -0.012007806450128555, "rewards/rejected": 0.268303781747818, "step": 1728 }, { "epoch": 0.9324524740461103, "grad_norm": 8.525206565856934, "learning_rate": 6.896532152398632e-09, "logits/chosen": -0.17945259809494019, "logits/rejected": -0.877418577671051, "logps/chosen": -347.39593505859375, "logps/rejected": -383.07891845703125, "loss": 0.754, "rewards/accuracies": 0.375, "rewards/chosen": 0.19062185287475586, "rewards/margins": -0.10020313411951065, "rewards/rejected": 0.2908249795436859, "step": 1729 }, { "epoch": 0.9329917756505326, "grad_norm": 7.02874755859375, "learning_rate": 6.787129043879819e-09, "logits/chosen": 0.45522111654281616, "logits/rejected": -0.09960611164569855, "logps/chosen": -222.52056884765625, "logps/rejected": -237.81857299804688, "loss": 0.659, "rewards/accuracies": 0.375, "rewards/chosen": 0.3046405613422394, "rewards/margins": 0.09197330474853516, "rewards/rejected": 0.2126672863960266, "step": 1730 }, { "epoch": 0.9335310772549549, "grad_norm": 9.130603790283203, "learning_rate": 6.678588703460164e-09, "logits/chosen": -0.2327600121498108, "logits/rejected": -0.9455168843269348, "logps/chosen": -200.97589111328125, "logps/rejected": -178.60421752929688, "loss": 0.6329, "rewards/accuracies": 0.5, "rewards/chosen": 0.2804550230503082, "rewards/margins": 0.14032210409641266, "rewards/rejected": 0.14013290405273438, "step": 1731 }, { "epoch": 0.9340703788593772, "grad_norm": 6.71237325668335, "learning_rate": 6.570911516173367e-09, "logits/chosen": -1.093918800354004, "logits/rejected": -0.4141175448894501, "logps/chosen": -214.3740234375, "logps/rejected": -274.1138916015625, "loss": 0.7079, "rewards/accuracies": 0.375, "rewards/chosen": 0.20710572600364685, "rewards/margins": -0.012104131281375885, "rewards/rejected": 0.21920986473560333, "step": 1732 }, { "epoch": 0.9346096804637993, "grad_norm": 7.305292129516602, "learning_rate": 6.4640978639911585e-09, "logits/chosen": -0.07182708382606506, "logits/rejected": 0.8333231210708618, "logps/chosen": -244.9391632080078, "logps/rejected": -324.58306884765625, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": 0.22960461676120758, "rewards/margins": 0.09130911529064178, "rewards/rejected": 0.138295516371727, "step": 1733 }, { "epoch": 0.9351489820682216, "grad_norm": 8.767008781433105, "learning_rate": 6.358148125821999e-09, "logits/chosen": 0.30345994234085083, "logits/rejected": 0.3600462079048157, "logps/chosen": -239.5279541015625, "logps/rejected": -240.32220458984375, "loss": 0.7756, "rewards/accuracies": 0.25, "rewards/chosen": 0.12509441375732422, "rewards/margins": -0.13567373156547546, "rewards/rejected": 0.2607681155204773, "step": 1734 }, { "epoch": 0.9356882836726439, "grad_norm": 6.262093544006348, "learning_rate": 6.253062677509663e-09, "logits/chosen": 0.18317070603370667, "logits/rejected": 0.48650306463241577, "logps/chosen": -203.33135986328125, "logps/rejected": -247.9308624267578, "loss": 0.6456, "rewards/accuracies": 0.75, "rewards/chosen": 0.29008981585502625, "rewards/margins": 0.11799756437540054, "rewards/rejected": 0.1720922291278839, "step": 1735 }, { "epoch": 0.9362275852770662, "grad_norm": 9.546297073364258, "learning_rate": 6.148841891832068e-09, "logits/chosen": -0.630653977394104, "logits/rejected": -0.9545276761054993, "logps/chosen": -187.10540771484375, "logps/rejected": -192.8797607421875, "loss": 0.7415, "rewards/accuracies": 0.375, "rewards/chosen": 0.22075195610523224, "rewards/margins": -0.06860199570655823, "rewards/rejected": 0.28935396671295166, "step": 1736 }, { "epoch": 0.9367668868814885, "grad_norm": 6.9398908615112305, "learning_rate": 6.045486138499756e-09, "logits/chosen": -0.30611151456832886, "logits/rejected": -0.2511117160320282, "logps/chosen": -215.56533813476562, "logps/rejected": -305.99224853515625, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": 0.19118528068065643, "rewards/margins": 0.04832926392555237, "rewards/rejected": 0.14285601675510406, "step": 1737 }, { "epoch": 0.9373061884859107, "grad_norm": 7.650277614593506, "learning_rate": 5.942995784154692e-09, "logits/chosen": 1.0034596920013428, "logits/rejected": 0.018773719668388367, "logps/chosen": -318.8880615234375, "logps/rejected": -295.31561279296875, "loss": 0.6339, "rewards/accuracies": 0.75, "rewards/chosen": 0.38995587825775146, "rewards/margins": 0.12932395935058594, "rewards/rejected": 0.2606319487094879, "step": 1738 }, { "epoch": 0.937845490090333, "grad_norm": 8.215425491333008, "learning_rate": 5.841371192368938e-09, "logits/chosen": 0.8381428718566895, "logits/rejected": 0.04656469076871872, "logps/chosen": -301.69219970703125, "logps/rejected": -258.7289123535156, "loss": 0.673, "rewards/accuracies": 0.625, "rewards/chosen": 0.3305376172065735, "rewards/margins": 0.05592699348926544, "rewards/rejected": 0.27461060881614685, "step": 1739 }, { "epoch": 0.9383847916947553, "grad_norm": 6.611172676086426, "learning_rate": 5.740612723643401e-09, "logits/chosen": -0.0780872106552124, "logits/rejected": -1.040395975112915, "logps/chosen": -250.27313232421875, "logps/rejected": -168.63685607910156, "loss": 0.593, "rewards/accuracies": 0.75, "rewards/chosen": 0.2687101364135742, "rewards/margins": 0.2231667935848236, "rewards/rejected": 0.04554334282875061, "step": 1740 }, { "epoch": 0.9389240932991776, "grad_norm": 8.640463829040527, "learning_rate": 5.640720735406529e-09, "logits/chosen": -0.455407977104187, "logits/rejected": -0.15394721925258636, "logps/chosen": -187.78582763671875, "logps/rejected": -223.72491455078125, "loss": 0.749, "rewards/accuracies": 0.25, "rewards/chosen": 0.1676056981086731, "rewards/margins": -0.10104256123304367, "rewards/rejected": 0.268648236989975, "step": 1741 }, { "epoch": 0.9394633949035999, "grad_norm": 8.063871383666992, "learning_rate": 5.541695582012951e-09, "logits/chosen": 0.6376574635505676, "logits/rejected": -0.07003557682037354, "logps/chosen": -376.78662109375, "logps/rejected": -410.6734619140625, "loss": 0.6999, "rewards/accuracies": 0.5, "rewards/chosen": 0.2586832046508789, "rewards/margins": 0.017244352027773857, "rewards/rejected": 0.2414388656616211, "step": 1742 }, { "epoch": 0.9400026965080222, "grad_norm": 7.982825756072998, "learning_rate": 5.443537614742394e-09, "logits/chosen": -0.23101994395256042, "logits/rejected": -0.07198624312877655, "logps/chosen": -299.39501953125, "logps/rejected": -347.9587707519531, "loss": 0.6455, "rewards/accuracies": 0.75, "rewards/chosen": 0.28330403566360474, "rewards/margins": 0.1525995433330536, "rewards/rejected": 0.13070449233055115, "step": 1743 }, { "epoch": 0.9405419981124444, "grad_norm": 6.715437412261963, "learning_rate": 5.346247181798325e-09, "logits/chosen": 1.2059669494628906, "logits/rejected": -0.14871051907539368, "logps/chosen": -323.2347106933594, "logps/rejected": -217.48687744140625, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": 0.21511630713939667, "rewards/margins": 0.05460090562701225, "rewards/rejected": 0.16051539778709412, "step": 1744 }, { "epoch": 0.9410812997168666, "grad_norm": 7.871525287628174, "learning_rate": 5.249824628306726e-09, "logits/chosen": 0.7634117007255554, "logits/rejected": 0.1939275860786438, "logps/chosen": -272.1278381347656, "logps/rejected": -241.49310302734375, "loss": 0.7083, "rewards/accuracies": 0.5, "rewards/chosen": 0.24712544679641724, "rewards/margins": -0.011749550700187683, "rewards/rejected": 0.2588750123977661, "step": 1745 }, { "epoch": 0.9416206013212889, "grad_norm": 7.268445014953613, "learning_rate": 5.154270296314878e-09, "logits/chosen": 0.20140905678272247, "logits/rejected": -0.5154944658279419, "logps/chosen": -255.5149688720703, "logps/rejected": -248.75540161132812, "loss": 0.6671, "rewards/accuracies": 0.625, "rewards/chosen": 0.23190078139305115, "rewards/margins": 0.061409760266542435, "rewards/rejected": 0.170491024851799, "step": 1746 }, { "epoch": 0.9421599029257112, "grad_norm": 9.111614227294922, "learning_rate": 5.059584524790189e-09, "logits/chosen": -0.0332578644156456, "logits/rejected": 0.16348928213119507, "logps/chosen": -259.1986389160156, "logps/rejected": -260.4068603515625, "loss": 0.7316, "rewards/accuracies": 0.5, "rewards/chosen": 0.18534916639328003, "rewards/margins": -0.061551764607429504, "rewards/rejected": 0.24690094590187073, "step": 1747 }, { "epoch": 0.9426992045301334, "grad_norm": 6.981016159057617, "learning_rate": 4.965767649618868e-09, "logits/chosen": 0.05183400213718414, "logits/rejected": 0.030537903308868408, "logps/chosen": -235.14242553710938, "logps/rejected": -222.7081298828125, "loss": 0.715, "rewards/accuracies": 0.25, "rewards/chosen": 0.22699718177318573, "rewards/margins": -0.02674589678645134, "rewards/rejected": 0.25374308228492737, "step": 1748 }, { "epoch": 0.9432385061345557, "grad_norm": 7.814472198486328, "learning_rate": 4.8728200036049215e-09, "logits/chosen": -0.13388735055923462, "logits/rejected": -0.9723266959190369, "logps/chosen": -293.1058654785156, "logps/rejected": -194.72486877441406, "loss": 0.6299, "rewards/accuracies": 0.875, "rewards/chosen": 0.23662109673023224, "rewards/margins": 0.14483709633350372, "rewards/rejected": 0.09178400784730911, "step": 1749 }, { "epoch": 0.943777807738978, "grad_norm": 7.091527462005615, "learning_rate": 4.780741916468767e-09, "logits/chosen": 1.20114266872406, "logits/rejected": 0.3292785882949829, "logps/chosen": -220.29159545898438, "logps/rejected": -181.00595092773438, "loss": 0.6931, "rewards/accuracies": 0.625, "rewards/chosen": 0.2762397825717926, "rewards/margins": 0.009940151125192642, "rewards/rejected": 0.26629963517189026, "step": 1750 }, { "epoch": 0.9443171093434003, "grad_norm": 7.589290142059326, "learning_rate": 4.689533714846317e-09, "logits/chosen": 0.6290695071220398, "logits/rejected": -0.6494110822677612, "logps/chosen": -339.990966796875, "logps/rejected": -266.5962219238281, "loss": 0.6655, "rewards/accuracies": 0.625, "rewards/chosen": 0.3838985562324524, "rewards/margins": 0.0710349977016449, "rewards/rejected": 0.3128635585308075, "step": 1751 }, { "epoch": 0.9448564109478226, "grad_norm": 6.852378845214844, "learning_rate": 4.599195722287535e-09, "logits/chosen": 0.04840414226055145, "logits/rejected": -0.48588982224464417, "logps/chosen": -218.22802734375, "logps/rejected": -195.2518768310547, "loss": 0.6742, "rewards/accuracies": 0.5, "rewards/chosen": 0.21831902861595154, "rewards/margins": 0.053443145006895065, "rewards/rejected": 0.16487589478492737, "step": 1752 }, { "epoch": 0.9453957125522449, "grad_norm": 7.370027542114258, "learning_rate": 4.509728259255468e-09, "logits/chosen": -0.3762838840484619, "logits/rejected": -0.9957235455513, "logps/chosen": -261.3039245605469, "logps/rejected": -262.7789306640625, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": 0.1978204846382141, "rewards/margins": 0.04709884151816368, "rewards/rejected": 0.15072163939476013, "step": 1753 }, { "epoch": 0.9459350141566671, "grad_norm": 7.030991554260254, "learning_rate": 4.4211316431251035e-09, "logits/chosen": -0.027045726776123047, "logits/rejected": -0.42767030000686646, "logps/chosen": -197.30191040039062, "logps/rejected": -237.52700805664062, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": 0.23586368560791016, "rewards/margins": 0.03573455289006233, "rewards/rejected": 0.20012912154197693, "step": 1754 }, { "epoch": 0.9464743157610894, "grad_norm": 7.788833141326904, "learning_rate": 4.333406188182092e-09, "logits/chosen": 0.4175671935081482, "logits/rejected": 0.5001330375671387, "logps/chosen": -247.43276977539062, "logps/rejected": -210.16566467285156, "loss": 0.7066, "rewards/accuracies": 0.375, "rewards/chosen": 0.1506633758544922, "rewards/margins": -0.003418169915676117, "rewards/rejected": 0.1540815383195877, "step": 1755 }, { "epoch": 0.9470136173655117, "grad_norm": 7.247875690460205, "learning_rate": 4.246552205621895e-09, "logits/chosen": -0.14471906423568726, "logits/rejected": -1.1348931789398193, "logps/chosen": -271.12652587890625, "logps/rejected": -180.36407470703125, "loss": 0.6692, "rewards/accuracies": 0.375, "rewards/chosen": 0.28447428345680237, "rewards/margins": 0.06419448554515839, "rewards/rejected": 0.22027979791164398, "step": 1756 }, { "epoch": 0.947552918969934, "grad_norm": 5.664723873138428, "learning_rate": 4.160570003548414e-09, "logits/chosen": -0.46512049436569214, "logits/rejected": -0.4593580961227417, "logps/chosen": -153.348876953125, "logps/rejected": -140.7579803466797, "loss": 0.6633, "rewards/accuracies": 0.75, "rewards/chosen": 0.1631523221731186, "rewards/margins": 0.06863299012184143, "rewards/rejected": 0.09451933205127716, "step": 1757 }, { "epoch": 0.9480922205743562, "grad_norm": 8.19200611114502, "learning_rate": 4.075459886973082e-09, "logits/chosen": 0.9130032062530518, "logits/rejected": 0.6385451555252075, "logps/chosen": -246.22882080078125, "logps/rejected": -226.85440063476562, "loss": 0.6084, "rewards/accuracies": 0.75, "rewards/chosen": 0.2738878130912781, "rewards/margins": 0.20090247690677643, "rewards/rejected": 0.07298536598682404, "step": 1758 }, { "epoch": 0.9486315221787784, "grad_norm": 6.439020156860352, "learning_rate": 3.991222157813695e-09, "logits/chosen": -1.275365948677063, "logits/rejected": -1.2152273654937744, "logps/chosen": -229.17919921875, "logps/rejected": -221.793212890625, "loss": 0.6137, "rewards/accuracies": 0.75, "rewards/chosen": 0.3369029760360718, "rewards/margins": 0.18096381425857544, "rewards/rejected": 0.15593919157981873, "step": 1759 }, { "epoch": 0.9491708237832007, "grad_norm": 7.425931930541992, "learning_rate": 3.907857114893359e-09, "logits/chosen": -0.361501008272171, "logits/rejected": -0.8650451898574829, "logps/chosen": -239.24571228027344, "logps/rejected": -203.55917358398438, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": 0.16912221908569336, "rewards/margins": 0.032847896218299866, "rewards/rejected": 0.1362743377685547, "step": 1760 }, { "epoch": 0.949710125387623, "grad_norm": 6.089888095855713, "learning_rate": 3.8253650539394056e-09, "logits/chosen": 0.24856188893318176, "logits/rejected": 0.0253201425075531, "logps/chosen": -215.12149047851562, "logps/rejected": -185.20211791992188, "loss": 0.6588, "rewards/accuracies": 0.5, "rewards/chosen": 0.2157384753227234, "rewards/margins": 0.09229755401611328, "rewards/rejected": 0.12344092130661011, "step": 1761 }, { "epoch": 0.9502494269920453, "grad_norm": 6.73253059387207, "learning_rate": 3.743746267582421e-09, "logits/chosen": 0.031631916761398315, "logits/rejected": -0.44788819551467896, "logps/chosen": -201.9138641357422, "logps/rejected": -186.16744995117188, "loss": 0.7219, "rewards/accuracies": 0.375, "rewards/chosen": 0.19268321990966797, "rewards/margins": -0.02694663777947426, "rewards/rejected": 0.21962985396385193, "step": 1762 }, { "epoch": 0.9507887285964676, "grad_norm": 8.529867172241211, "learning_rate": 3.663001045355163e-09, "logits/chosen": -0.03374003618955612, "logits/rejected": 0.3676217794418335, "logps/chosen": -169.12210083007812, "logps/rejected": -214.31576538085938, "loss": 0.7463, "rewards/accuracies": 0.375, "rewards/chosen": 0.17974625527858734, "rewards/margins": -0.0749274268746376, "rewards/rejected": 0.25467365980148315, "step": 1763 }, { "epoch": 0.9513280302008899, "grad_norm": 8.911370277404785, "learning_rate": 3.5831296736914264e-09, "logits/chosen": 0.5950840711593628, "logits/rejected": 0.10471081733703613, "logps/chosen": -273.4225158691406, "logps/rejected": -286.8302917480469, "loss": 0.7392, "rewards/accuracies": 0.5, "rewards/chosen": 0.1847085952758789, "rewards/margins": -0.07074384391307831, "rewards/rejected": 0.2554524540901184, "step": 1764 }, { "epoch": 0.9518673318053121, "grad_norm": 6.766948223114014, "learning_rate": 3.5041324359252054e-09, "logits/chosen": 0.38468584418296814, "logits/rejected": -0.44892141222953796, "logps/chosen": -183.2099151611328, "logps/rejected": -174.566162109375, "loss": 0.6173, "rewards/accuracies": 0.75, "rewards/chosen": 0.27176475524902344, "rewards/margins": 0.16718779504299164, "rewards/rejected": 0.104576975107193, "step": 1765 }, { "epoch": 0.9524066334097344, "grad_norm": 8.72609806060791, "learning_rate": 3.4260096122896433e-09, "logits/chosen": -0.8897371292114258, "logits/rejected": 0.5730926990509033, "logps/chosen": -224.28472900390625, "logps/rejected": -310.09735107421875, "loss": 0.7409, "rewards/accuracies": 0.375, "rewards/chosen": 0.16596576571464539, "rewards/margins": -0.08348693698644638, "rewards/rejected": 0.24945268034934998, "step": 1766 }, { "epoch": 0.9529459350141567, "grad_norm": 6.9774489402771, "learning_rate": 3.3487614799159182e-09, "logits/chosen": 0.1964578628540039, "logits/rejected": 0.11841493099927902, "logps/chosen": -370.81884765625, "logps/rejected": -275.2999572753906, "loss": 0.6595, "rewards/accuracies": 0.5, "rewards/chosen": 0.340842068195343, "rewards/margins": 0.10142727196216583, "rewards/rejected": 0.23941479623317719, "step": 1767 }, { "epoch": 0.953485236618579, "grad_norm": 8.09975814819336, "learning_rate": 3.2723883128324135e-09, "logits/chosen": 0.9586758017539978, "logits/rejected": 0.3163352906703949, "logps/chosen": -265.6737060546875, "logps/rejected": -219.65228271484375, "loss": 0.6651, "rewards/accuracies": 0.625, "rewards/chosen": 0.319662481546402, "rewards/margins": 0.07178305834531784, "rewards/rejected": 0.24787941575050354, "step": 1768 }, { "epoch": 0.9540245382230013, "grad_norm": 6.677431106567383, "learning_rate": 3.196890381963635e-09, "logits/chosen": -0.35107898712158203, "logits/rejected": 0.36302122473716736, "logps/chosen": -148.91702270507812, "logps/rejected": -209.09103393554688, "loss": 0.743, "rewards/accuracies": 0.25, "rewards/chosen": 0.12345342338085175, "rewards/margins": -0.0887250006198883, "rewards/rejected": 0.21217842400074005, "step": 1769 }, { "epoch": 0.9545638398274234, "grad_norm": 8.10130786895752, "learning_rate": 3.1222679551293486e-09, "logits/chosen": 1.5479977130889893, "logits/rejected": 0.6267954707145691, "logps/chosen": -402.5186767578125, "logps/rejected": -268.73565673828125, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": 0.2173820585012436, "rewards/margins": 0.019651416689157486, "rewards/rejected": 0.1977306455373764, "step": 1770 }, { "epoch": 0.9551031414318457, "grad_norm": 6.743312358856201, "learning_rate": 3.048521297043527e-09, "logits/chosen": 0.08832705020904541, "logits/rejected": -0.732772707939148, "logps/chosen": -334.3332824707031, "logps/rejected": -210.12493896484375, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": 0.2879959046840668, "rewards/margins": 0.02872323803603649, "rewards/rejected": 0.2592726945877075, "step": 1771 }, { "epoch": 0.955642443036268, "grad_norm": 7.391458988189697, "learning_rate": 2.9756506693134898e-09, "logits/chosen": 0.5112782120704651, "logits/rejected": -0.9764875769615173, "logps/chosen": -251.59608459472656, "logps/rejected": -234.72218322753906, "loss": 0.6739, "rewards/accuracies": 0.625, "rewards/chosen": 0.24132147431373596, "rewards/margins": 0.06315651535987854, "rewards/rejected": 0.17816495895385742, "step": 1772 }, { "epoch": 0.9561817446406903, "grad_norm": 7.026484966278076, "learning_rate": 2.9036563304389027e-09, "logits/chosen": 0.3033958673477173, "logits/rejected": -1.196770429611206, "logps/chosen": -250.34312438964844, "logps/rejected": -224.31312561035156, "loss": 0.6409, "rewards/accuracies": 0.75, "rewards/chosen": 0.2954238951206207, "rewards/margins": 0.1108165830373764, "rewards/rejected": 0.18460732698440552, "step": 1773 }, { "epoch": 0.9567210462451126, "grad_norm": 6.3150410652160645, "learning_rate": 2.8325385358109468e-09, "logits/chosen": 0.25180745124816895, "logits/rejected": 0.2927658259868622, "logps/chosen": -204.2550811767578, "logps/rejected": -196.22802734375, "loss": 0.6375, "rewards/accuracies": 0.75, "rewards/chosen": 0.3305567502975464, "rewards/margins": 0.12115468084812164, "rewards/rejected": 0.20940208435058594, "step": 1774 }, { "epoch": 0.9572603478495348, "grad_norm": 9.933298110961914, "learning_rate": 2.762297537711372e-09, "logits/chosen": -0.5668280124664307, "logits/rejected": 0.39695408940315247, "logps/chosen": -174.54771423339844, "logps/rejected": -248.91397094726562, "loss": 0.8399, "rewards/accuracies": 0.25, "rewards/chosen": 0.09017782658338547, "rewards/margins": -0.2486536204814911, "rewards/rejected": 0.33883142471313477, "step": 1775 }, { "epoch": 0.9577996494539571, "grad_norm": 8.378133773803711, "learning_rate": 2.6929335853115297e-09, "logits/chosen": 0.16886717081069946, "logits/rejected": 1.3084981441497803, "logps/chosen": -213.16314697265625, "logps/rejected": -305.02618408203125, "loss": 0.7084, "rewards/accuracies": 0.25, "rewards/chosen": 0.24620142579078674, "rewards/margins": -0.01617249846458435, "rewards/rejected": 0.2623739242553711, "step": 1776 }, { "epoch": 0.9583389510583794, "grad_norm": 10.725503921508789, "learning_rate": 2.624446924671675e-09, "logits/chosen": -0.6128494739532471, "logits/rejected": -1.5002354383468628, "logps/chosen": -211.69964599609375, "logps/rejected": -201.44102478027344, "loss": 0.7881, "rewards/accuracies": 0.5, "rewards/chosen": 0.02928624302148819, "rewards/margins": -0.15438151359558105, "rewards/rejected": 0.18366774916648865, "step": 1777 }, { "epoch": 0.9588782526628017, "grad_norm": 8.68467903137207, "learning_rate": 2.556837798739886e-09, "logits/chosen": -1.1344592571258545, "logits/rejected": -0.7175685167312622, "logps/chosen": -218.83828735351562, "logps/rejected": -253.39642333984375, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": 0.25652313232421875, "rewards/margins": -0.004388721659779549, "rewards/rejected": 0.26091188192367554, "step": 1778 }, { "epoch": 0.959417554267224, "grad_norm": 9.731689453125, "learning_rate": 2.4901064473513146e-09, "logits/chosen": 0.21380925178527832, "logits/rejected": 0.9830800890922546, "logps/chosen": -240.7066650390625, "logps/rejected": -323.60675048828125, "loss": 0.6336, "rewards/accuracies": 0.75, "rewards/chosen": 0.2678081691265106, "rewards/margins": 0.14282655715942383, "rewards/rejected": 0.1249815970659256, "step": 1779 }, { "epoch": 0.9599568558716463, "grad_norm": 8.344714164733887, "learning_rate": 2.4242531072273255e-09, "logits/chosen": -0.107271209359169, "logits/rejected": -1.0996291637420654, "logps/chosen": -242.7609405517578, "logps/rejected": -199.20550537109375, "loss": 0.6341, "rewards/accuracies": 0.75, "rewards/chosen": 0.23111248016357422, "rewards/margins": 0.13402080535888672, "rewards/rejected": 0.0970916748046875, "step": 1780 }, { "epoch": 0.9604961574760685, "grad_norm": 5.799878120422363, "learning_rate": 2.359278011974636e-09, "logits/chosen": 1.5848459005355835, "logits/rejected": 0.33393579721450806, "logps/chosen": -262.5018615722656, "logps/rejected": -201.57989501953125, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": 0.27647894620895386, "rewards/margins": 0.07114563137292862, "rewards/rejected": 0.20533333718776703, "step": 1781 }, { "epoch": 0.9610354590804908, "grad_norm": 9.90775203704834, "learning_rate": 2.295181392084511e-09, "logits/chosen": -0.9086173176765442, "logits/rejected": -0.13198065757751465, "logps/chosen": -238.6376953125, "logps/rejected": -416.68682861328125, "loss": 0.7666, "rewards/accuracies": 0.25, "rewards/chosen": 0.1758657991886139, "rewards/margins": -0.1272074282169342, "rewards/rejected": 0.3030732274055481, "step": 1782 }, { "epoch": 0.961574760684913, "grad_norm": 7.380829334259033, "learning_rate": 2.231963474931875e-09, "logits/chosen": 0.8875344395637512, "logits/rejected": 0.21529781818389893, "logps/chosen": -247.2567901611328, "logps/rejected": -285.00311279296875, "loss": 0.6502, "rewards/accuracies": 0.75, "rewards/chosen": 0.2668933868408203, "rewards/margins": 0.09604444354772568, "rewards/rejected": 0.17084893584251404, "step": 1783 }, { "epoch": 0.9621140622893353, "grad_norm": 7.789770603179932, "learning_rate": 2.1696244847746737e-09, "logits/chosen": 0.5194294452667236, "logits/rejected": -0.015433818101882935, "logps/chosen": -268.6127014160156, "logps/rejected": -249.90957641601562, "loss": 0.6658, "rewards/accuracies": 0.625, "rewards/chosen": 0.29277661442756653, "rewards/margins": 0.08984022587537766, "rewards/rejected": 0.20293636620044708, "step": 1784 }, { "epoch": 0.9626533638937576, "grad_norm": 6.797243595123291, "learning_rate": 2.1081646427528466e-09, "logits/chosen": -0.350344181060791, "logits/rejected": 0.26168596744537354, "logps/chosen": -280.4071960449219, "logps/rejected": -299.0138854980469, "loss": 0.7431, "rewards/accuracies": 0.25, "rewards/chosen": 0.20096895098686218, "rewards/margins": -0.08192101120948792, "rewards/rejected": 0.2828899323940277, "step": 1785 }, { "epoch": 0.9631926654981798, "grad_norm": 8.963312149047852, "learning_rate": 2.047584166887717e-09, "logits/chosen": -0.5552816390991211, "logits/rejected": -0.09404593706130981, "logps/chosen": -172.7222442626953, "logps/rejected": -209.36569213867188, "loss": 0.7775, "rewards/accuracies": 0.375, "rewards/chosen": 0.2489611655473709, "rewards/margins": -0.13990142941474915, "rewards/rejected": 0.38886260986328125, "step": 1786 }, { "epoch": 0.9637319671026021, "grad_norm": 6.681914329528809, "learning_rate": 1.9878832720811867e-09, "logits/chosen": -0.055459052324295044, "logits/rejected": 0.042212337255477905, "logps/chosen": -186.99722290039062, "logps/rejected": -213.20333862304688, "loss": 0.6491, "rewards/accuracies": 0.625, "rewards/chosen": 0.2971828579902649, "rewards/margins": 0.11210900545120239, "rewards/rejected": 0.1850738525390625, "step": 1787 }, { "epoch": 0.9642712687070244, "grad_norm": 6.213136672973633, "learning_rate": 1.9290621701149313e-09, "logits/chosen": 0.36787283420562744, "logits/rejected": -1.1325933933258057, "logps/chosen": -271.464111328125, "logps/rejected": -246.44705200195312, "loss": 0.6539, "rewards/accuracies": 0.75, "rewards/chosen": 0.2679167687892914, "rewards/margins": 0.08561573922634125, "rewards/rejected": 0.18230104446411133, "step": 1788 }, { "epoch": 0.9648105703114467, "grad_norm": 7.102257251739502, "learning_rate": 1.8711210696496225e-09, "logits/chosen": -0.2581184208393097, "logits/rejected": -0.03493857383728027, "logps/chosen": -199.2948455810547, "logps/rejected": -206.3113555908203, "loss": 0.6961, "rewards/accuracies": 0.625, "rewards/chosen": 0.21257168054580688, "rewards/margins": 0.009471943601965904, "rewards/rejected": 0.20309972763061523, "step": 1789 }, { "epoch": 0.965349871915869, "grad_norm": 6.4505085945129395, "learning_rate": 1.8140601762242913e-09, "logits/chosen": -0.5742409229278564, "logits/rejected": -0.7715613842010498, "logps/chosen": -192.35812377929688, "logps/rejected": -233.98838806152344, "loss": 0.6414, "rewards/accuracies": 0.75, "rewards/chosen": 0.2623867392539978, "rewards/margins": 0.11107682436704636, "rewards/rejected": 0.15130990743637085, "step": 1790 }, { "epoch": 0.9658891735202912, "grad_norm": 7.594358921051025, "learning_rate": 1.7578796922554928e-09, "logits/chosen": 0.7575828433036804, "logits/rejected": -0.07502901554107666, "logps/chosen": -288.0997009277344, "logps/rejected": -231.42538452148438, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 0.32755613327026367, "rewards/margins": 0.25461453199386597, "rewards/rejected": 0.07294158637523651, "step": 1791 }, { "epoch": 0.9664284751247135, "grad_norm": 7.985551834106445, "learning_rate": 1.7025798170367255e-09, "logits/chosen": 0.2017316222190857, "logits/rejected": 0.29471826553344727, "logps/chosen": -250.54095458984375, "logps/rejected": -223.71780395507812, "loss": 0.7556, "rewards/accuracies": 0.5, "rewards/chosen": 0.11421652138233185, "rewards/margins": -0.10467539727687836, "rewards/rejected": 0.2188919186592102, "step": 1792 }, { "epoch": 0.9669677767291358, "grad_norm": 7.272802352905273, "learning_rate": 1.6481607467375147e-09, "logits/chosen": 1.4179731607437134, "logits/rejected": 1.0611286163330078, "logps/chosen": -203.3140411376953, "logps/rejected": -210.75076293945312, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": 0.24901153147220612, "rewards/margins": 0.030017230659723282, "rewards/rejected": 0.21899428963661194, "step": 1793 }, { "epoch": 0.9675070783335581, "grad_norm": 6.9433064460754395, "learning_rate": 1.59462267440294e-09, "logits/chosen": 0.052154187113046646, "logits/rejected": 1.22550630569458, "logps/chosen": -227.16905212402344, "logps/rejected": -306.86566162109375, "loss": 0.7144, "rewards/accuracies": 0.375, "rewards/chosen": 0.2541021406650543, "rewards/margins": -0.03314991295337677, "rewards/rejected": 0.2872520387172699, "step": 1794 }, { "epoch": 0.9680463799379803, "grad_norm": 8.559525489807129, "learning_rate": 1.5419657899527761e-09, "logits/chosen": 0.057430341839790344, "logits/rejected": 0.4329357147216797, "logps/chosen": -341.589599609375, "logps/rejected": -348.221435546875, "loss": 0.6628, "rewards/accuracies": 0.625, "rewards/chosen": 0.23531398177146912, "rewards/margins": 0.06689567863941193, "rewards/rejected": 0.16841831803321838, "step": 1795 }, { "epoch": 0.9685856815424025, "grad_norm": 7.999962329864502, "learning_rate": 1.490190280180964e-09, "logits/chosen": 0.1756598949432373, "logits/rejected": -0.3888784348964691, "logps/chosen": -226.14588928222656, "logps/rejected": -259.95831298828125, "loss": 0.7045, "rewards/accuracies": 0.625, "rewards/chosen": 0.29075756669044495, "rewards/margins": 0.0060541145503520966, "rewards/rejected": 0.28470346331596375, "step": 1796 }, { "epoch": 0.9691249831468248, "grad_norm": 8.576546669006348, "learning_rate": 1.4392963287547799e-09, "logits/chosen": 0.3904223144054413, "logits/rejected": 0.1377905011177063, "logps/chosen": -241.164794921875, "logps/rejected": -272.98431396484375, "loss": 0.636, "rewards/accuracies": 0.625, "rewards/chosen": 0.2437528669834137, "rewards/margins": 0.1307106912136078, "rewards/rejected": 0.11304216086864471, "step": 1797 }, { "epoch": 0.9696642847512471, "grad_norm": 7.465847015380859, "learning_rate": 1.3892841162143899e-09, "logits/chosen": -0.6927523612976074, "logits/rejected": -0.4693935811519623, "logps/chosen": -154.66952514648438, "logps/rejected": -167.67007446289062, "loss": 0.6549, "rewards/accuracies": 0.75, "rewards/chosen": 0.1948048621416092, "rewards/margins": 0.10971274971961975, "rewards/rejected": 0.08509211242198944, "step": 1798 }, { "epoch": 0.9702035863556694, "grad_norm": 17.001676559448242, "learning_rate": 1.340153819971962e-09, "logits/chosen": 0.8076131939888, "logits/rejected": -0.8724899888038635, "logps/chosen": -303.5433044433594, "logps/rejected": -224.7933349609375, "loss": 0.6038, "rewards/accuracies": 0.75, "rewards/chosen": 0.26435232162475586, "rewards/margins": 0.20038387179374695, "rewards/rejected": 0.0639684721827507, "step": 1799 }, { "epoch": 0.9707428879600917, "grad_norm": 6.814447402954102, "learning_rate": 1.2919056143113061e-09, "logits/chosen": 0.18335311114788055, "logits/rejected": -0.05557282269001007, "logps/chosen": -222.85797119140625, "logps/rejected": -206.0766143798828, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": 0.3108508288860321, "rewards/margins": 0.13058623671531677, "rewards/rejected": 0.18026457726955414, "step": 1800 }, { "epoch": 0.9707428879600917, "eval_logits/chosen": 1.3657488822937012, "eval_logits/rejected": 1.0935688018798828, "eval_logps/chosen": -248.85523986816406, "eval_logps/rejected": -233.97061157226562, "eval_loss": 0.6678106188774109, "eval_rewards/accuracies": 0.6105589866638184, "eval_rewards/chosen": 0.25962215662002563, "eval_rewards/margins": 0.07042733579874039, "eval_rewards/rejected": 0.18919481337070465, "eval_runtime": 836.7457, "eval_samples_per_second": 1.924, "eval_steps_per_second": 0.962, "step": 1800 } ], "logging_steps": 1, "max_steps": 1854, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }