{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 400, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 89.9968305873071, "learning_rate": 6.25e-08, "logits/chosen": -0.7388366460800171, "logits/rejected": -0.7827404141426086, "logps/chosen": -1.15103280544281, "logps/rejected": -1.2909390926361084, "loss": 1.2935, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.30206561088562, "rewards/margins": 0.2798125147819519, "rewards/rejected": -2.581878185272217, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 24.705919418070632, "learning_rate": 1.25e-07, "logits/chosen": -0.7937806844711304, "logits/rejected": -0.8651958703994751, "logps/chosen": -1.1529361009597778, "logps/rejected": -1.3611778020858765, "loss": 1.314, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3058722019195557, "rewards/margins": 0.41648340225219727, "rewards/rejected": -2.722355604171753, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 27.735520006717728, "learning_rate": 1.875e-07, "logits/chosen": -0.7491501569747925, "logits/rejected": -0.8338179588317871, "logps/chosen": -1.1712462902069092, "logps/rejected": -1.270825743675232, "loss": 1.2667, "rewards/accuracies": 0.5625, "rewards/chosen": -2.3424925804138184, "rewards/margins": 0.19915875792503357, "rewards/rejected": -2.541651487350464, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 22.322171681204715, "learning_rate": 2.5e-07, "logits/chosen": -0.7619983553886414, "logits/rejected": -0.9046538472175598, "logps/chosen": -1.1294901371002197, "logps/rejected": -1.2941240072250366, "loss": 1.2696, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2589802742004395, "rewards/margins": 0.32926779985427856, "rewards/rejected": -2.5882480144500732, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 12.849323230827375, "learning_rate": 3.125e-07, "logits/chosen": -0.772399365901947, "logits/rejected": -0.8519186973571777, "logps/chosen": -1.077214002609253, "logps/rejected": -1.2762653827667236, "loss": 1.2362, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.154428005218506, "rewards/margins": 0.39810293912887573, "rewards/rejected": -2.5525307655334473, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 84.84769866542291, "learning_rate": 3.75e-07, "logits/chosen": -0.7909184694290161, "logits/rejected": -0.8215691447257996, "logps/chosen": -1.059594988822937, "logps/rejected": -1.0990025997161865, "loss": 1.2897, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.119189977645874, "rewards/margins": 0.0788152664899826, "rewards/rejected": -2.198005199432373, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 12.477109087394112, "learning_rate": 4.3749999999999994e-07, "logits/chosen": -0.7678741216659546, "logits/rejected": -0.8405346870422363, "logps/chosen": -0.9820269346237183, "logps/rejected": -1.2532163858413696, "loss": 1.2497, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.9640538692474365, "rewards/margins": 0.5423787236213684, "rewards/rejected": -2.5064327716827393, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 10.85962784004132, "learning_rate": 5e-07, "logits/chosen": -0.7665027379989624, "logits/rejected": -0.8336607217788696, "logps/chosen": -0.9715523719787598, "logps/rejected": -1.1505324840545654, "loss": 1.2359, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9431047439575195, "rewards/margins": 0.35795995593070984, "rewards/rejected": -2.301064968109131, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 10.414385637292323, "learning_rate": 5.625e-07, "logits/chosen": -0.7420114874839783, "logits/rejected": -0.8339902758598328, "logps/chosen": -0.9872716665267944, "logps/rejected": -1.1155823469161987, "loss": 1.2267, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.9745433330535889, "rewards/margins": 0.2566211223602295, "rewards/rejected": -2.2311646938323975, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 9.651448839940226, "learning_rate": 5.999678242522831e-07, "logits/chosen": -0.7927948832511902, "logits/rejected": -0.8290635943412781, "logps/chosen": -0.9459100961685181, "logps/rejected": -1.2578647136688232, "loss": 1.2207, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8918201923370361, "rewards/margins": 0.6239093542098999, "rewards/rejected": -2.5157294273376465, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 43.90472722310407, "learning_rate": 5.996059263493219e-07, "logits/chosen": -0.7944079637527466, "logits/rejected": -0.9001775979995728, "logps/chosen": -1.072819471359253, "logps/rejected": -1.181773066520691, "loss": 1.2551, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.145638942718506, "rewards/margins": 0.21790704131126404, "rewards/rejected": -2.363546133041382, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 13.393066662370963, "learning_rate": 5.988423976115163e-07, "logits/chosen": -0.7826106548309326, "logits/rejected": -0.8369284868240356, "logps/chosen": -1.0628390312194824, "logps/rejected": -1.2253072261810303, "loss": 1.2246, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.125678062438965, "rewards/margins": 0.32493603229522705, "rewards/rejected": -2.4506144523620605, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 26.206483702491475, "learning_rate": 5.976782615723061e-07, "logits/chosen": -0.7975456714630127, "logits/rejected": -0.8562803268432617, "logps/chosen": -1.0680768489837646, "logps/rejected": -1.2204017639160156, "loss": 1.2268, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.1361536979675293, "rewards/margins": 0.3046496510505676, "rewards/rejected": -2.4408035278320312, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 13.41584537004533, "learning_rate": 5.961150787913738e-07, "logits/chosen": -0.8376196622848511, "logits/rejected": -0.9019572138786316, "logps/chosen": -1.0893644094467163, "logps/rejected": -1.2784545421600342, "loss": 1.1754, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1787288188934326, "rewards/margins": 0.37818047404289246, "rewards/rejected": -2.5569090843200684, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 32.22425187362688, "learning_rate": 5.941549447626671e-07, "logits/chosen": -0.804112434387207, "logits/rejected": -0.845563530921936, "logps/chosen": -1.0805425643920898, "logps/rejected": -1.3212538957595825, "loss": 1.209, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1610851287841797, "rewards/margins": 0.4814226031303406, "rewards/rejected": -2.642507791519165, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 8.981853488976475, "learning_rate": 5.918004871053251e-07, "logits/chosen": -0.7968226671218872, "logits/rejected": -0.8211067318916321, "logps/chosen": -1.026604413986206, "logps/rejected": -1.3631267547607422, "loss": 1.1624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.053208827972412, "rewards/margins": 0.673044741153717, "rewards/rejected": -2.7262535095214844, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 17.367470137588203, "learning_rate": 5.890548620412763e-07, "logits/chosen": -0.8126602172851562, "logits/rejected": -0.8794834017753601, "logps/chosen": -1.0674957036972046, "logps/rejected": -1.3523355722427368, "loss": 1.1625, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.134991407394409, "rewards/margins": 0.5696790814399719, "rewards/rejected": -2.7046711444854736, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 17.833322868673477, "learning_rate": 5.859217501642258e-07, "logits/chosen": -0.840762734413147, "logits/rejected": -0.9274584054946899, "logps/chosen": -1.1602346897125244, "logps/rejected": -1.5290915966033936, "loss": 1.1734, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.320469379425049, "rewards/margins": 0.7377143502235413, "rewards/rejected": -3.058183193206787, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 22.98307788140464, "learning_rate": 5.824053515057091e-07, "logits/chosen": -0.8092079162597656, "logits/rejected": -0.8328098058700562, "logps/chosen": -1.133385419845581, "logps/rejected": -1.4298288822174072, "loss": 1.1919, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.266770839691162, "rewards/margins": 0.5928869247436523, "rewards/rejected": -2.8596577644348145, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 11.026437481785171, "learning_rate": 5.785103799048218e-07, "logits/chosen": -0.8240598440170288, "logits/rejected": -0.8689464330673218, "logps/chosen": -1.147385835647583, "logps/rejected": -1.3535184860229492, "loss": 1.2131, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.294771671295166, "rewards/margins": 0.41226544976234436, "rewards/rejected": -2.7070369720458984, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 9.837343506686455, "learning_rate": 5.742420566891749e-07, "logits/chosen": -0.7966706156730652, "logits/rejected": -0.878908634185791, "logps/chosen": -1.1871858835220337, "logps/rejected": -1.4869831800460815, "loss": 1.1062, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.3743717670440674, "rewards/margins": 0.5995948314666748, "rewards/rejected": -2.973966360092163, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 19.01097451640794, "learning_rate": 5.696061036755478e-07, "logits/chosen": -0.7402995228767395, "logits/rejected": -0.8451690673828125, "logps/chosen": -1.0870535373687744, "logps/rejected": -1.3536127805709839, "loss": 1.1368, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.174107074737549, "rewards/margins": 0.5331184267997742, "rewards/rejected": -2.7072255611419678, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 89.427421788791, "learning_rate": 5.64608735499618e-07, "logits/chosen": -0.833459734916687, "logits/rejected": -0.829018235206604, "logps/chosen": -1.150940179824829, "logps/rejected": -1.287229061126709, "loss": 1.1596, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.301880359649658, "rewards/margins": 0.2725774943828583, "rewards/rejected": -2.574458122253418, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 31.745365051153907, "learning_rate": 5.592566512850545e-07, "logits/chosen": -0.79100501537323, "logits/rejected": -0.8663417100906372, "logps/chosen": -1.0571635961532593, "logps/rejected": -1.4087059497833252, "loss": 1.1752, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1143271923065186, "rewards/margins": 0.703084409236908, "rewards/rejected": -2.8174118995666504, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 14.496796822119729, "learning_rate": 5.535570256631384e-07, "logits/chosen": -0.798068642616272, "logits/rejected": -0.7694944143295288, "logps/chosen": -1.171478271484375, "logps/rejected": -1.5117442607879639, "loss": 1.1603, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.34295654296875, "rewards/margins": 0.6805320978164673, "rewards/rejected": -3.0234885215759277, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 11.15517991690276, "learning_rate": 5.475174991549528e-07, "logits/chosen": -0.7599740624427795, "logits/rejected": -0.8051120638847351, "logps/chosen": -1.1963175535202026, "logps/rejected": -1.5290193557739258, "loss": 1.1204, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3926351070404053, "rewards/margins": 0.6654035449028015, "rewards/rejected": -3.0580387115478516, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 13.030746243741968, "learning_rate": 5.411461679290317e-07, "logits/chosen": -0.7586075663566589, "logits/rejected": -0.7899220585823059, "logps/chosen": -1.0880517959594727, "logps/rejected": -1.4661823511123657, "loss": 1.1668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1761035919189453, "rewards/margins": 0.7562611103057861, "rewards/rejected": -2.9323647022247314, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 12.738817253337984, "learning_rate": 5.34451572948201e-07, "logits/chosen": -0.8128818273544312, "logits/rejected": -0.842110812664032, "logps/chosen": -1.2075114250183105, "logps/rejected": -1.4238183498382568, "loss": 1.2141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.415022850036621, "rewards/margins": 0.4326140284538269, "rewards/rejected": -2.8476366996765137, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 24.983190739092922, "learning_rate": 5.274426885201582e-07, "logits/chosen": -0.7843077778816223, "logits/rejected": -0.8767129182815552, "logps/chosen": -1.1461377143859863, "logps/rejected": -1.5009636878967285, "loss": 1.1207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2922754287719727, "rewards/margins": 0.7096518278121948, "rewards/rejected": -3.001927375793457, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 23.74860585722539, "learning_rate": 5.201289102671411e-07, "logits/chosen": -0.8561376333236694, "logits/rejected": -0.8589056134223938, "logps/chosen": -1.1982135772705078, "logps/rejected": -1.5201013088226318, "loss": 1.1476, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.3964271545410156, "rewards/margins": 0.6437759399414062, "rewards/rejected": -3.0402026176452637, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 15.03909875634319, "learning_rate": 5.12520042530811e-07, "logits/chosen": -0.7681445479393005, "logits/rejected": -0.8174452781677246, "logps/chosen": -1.2068870067596436, "logps/rejected": -1.6613304615020752, "loss": 1.1179, "rewards/accuracies": 0.65625, "rewards/chosen": -2.413774013519287, "rewards/margins": 0.908886730670929, "rewards/rejected": -3.3226609230041504, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 14.191169695059497, "learning_rate": 5.046262852292346e-07, "logits/chosen": -0.8029179573059082, "logits/rejected": -0.8746109008789062, "logps/chosen": -1.1898596286773682, "logps/rejected": -1.6815717220306396, "loss": 1.1138, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3797192573547363, "rewards/margins": 0.9834240674972534, "rewards/rejected": -3.3631434440612793, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 35.93680907186828, "learning_rate": 4.964582201835856e-07, "logits/chosen": -0.7598133087158203, "logits/rejected": -0.7828689813613892, "logps/chosen": -1.1410859823226929, "logps/rejected": -1.5104478597640991, "loss": 1.1132, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2821719646453857, "rewards/margins": 0.7387233972549438, "rewards/rejected": -3.0208957195281982, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 33.280459458949075, "learning_rate": 4.880267969328908e-07, "logits/chosen": -0.7489741444587708, "logits/rejected": -0.8511075973510742, "logps/chosen": -1.2344070672988892, "logps/rejected": -1.6722608804702759, "loss": 1.1051, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4688141345977783, "rewards/margins": 0.8757076263427734, "rewards/rejected": -3.3445217609405518, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 13.559524548726696, "learning_rate": 4.793433180558423e-07, "logits/chosen": -0.7471566796302795, "logits/rejected": -0.8381919860839844, "logps/chosen": -1.1587435007095337, "logps/rejected": -1.5522888898849487, "loss": 1.133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3174870014190674, "rewards/margins": 0.7870910167694092, "rewards/rejected": -3.1045777797698975, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 19.60609504538111, "learning_rate": 4.704194240193467e-07, "logits/chosen": -0.7779995203018188, "logits/rejected": -0.8208974599838257, "logps/chosen": -1.1914243698120117, "logps/rejected": -1.6478986740112305, "loss": 1.0991, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3828487396240234, "rewards/margins": 0.9129486083984375, "rewards/rejected": -3.295797348022461, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 15.986798312827595, "learning_rate": 4.6126707757412686e-07, "logits/chosen": -0.7536464333534241, "logits/rejected": -0.836445152759552, "logps/chosen": -1.18105149269104, "logps/rejected": -1.5753639936447144, "loss": 1.0801, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.36210298538208, "rewards/margins": 0.788625180721283, "rewards/rejected": -3.1507279872894287, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 11.085659412542848, "learning_rate": 4.5189854771829086e-07, "logits/chosen": -0.7779768705368042, "logits/rejected": -0.860378623008728, "logps/chosen": -1.174264907836914, "logps/rejected": -1.5782097578048706, "loss": 1.0897, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.348529815673828, "rewards/margins": 0.8078898191452026, "rewards/rejected": -3.156419515609741, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 19.478521042945726, "learning_rate": 4.4232639325036807e-07, "logits/chosen": -0.8138440251350403, "logits/rejected": -0.888975977897644, "logps/chosen": -1.1923892498016357, "logps/rejected": -1.6592342853546143, "loss": 1.1171, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3847784996032715, "rewards/margins": 0.933690071105957, "rewards/rejected": -3.3184685707092285, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 12.673420292445082, "learning_rate": 4.32563445933859e-07, "logits/chosen": -0.7443628311157227, "logits/rejected": -0.7802754044532776, "logps/chosen": -1.211715579032898, "logps/rejected": -1.5577033758163452, "loss": 1.0631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.423431158065796, "rewards/margins": 0.6919752955436707, "rewards/rejected": -3.1154067516326904, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 18.156036717162227, "learning_rate": 4.226227932958664e-07, "logits/chosen": -0.8596774935722351, "logits/rejected": -0.8864806294441223, "logps/chosen": -1.2197387218475342, "logps/rejected": -1.706209421157837, "loss": 1.0695, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.4394774436950684, "rewards/margins": 0.9729412794113159, "rewards/rejected": -3.412418842315674, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 18.614311057711063, "learning_rate": 4.1251776108286854e-07, "logits/chosen": -0.7632856965065002, "logits/rejected": -0.7707933187484741, "logps/chosen": -1.2796884775161743, "logps/rejected": -1.6428205966949463, "loss": 1.1264, "rewards/accuracies": 0.65625, "rewards/chosen": -2.5593769550323486, "rewards/margins": 0.7262641191482544, "rewards/rejected": -3.2856411933898926, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 19.070261616595026, "learning_rate": 4.022618953971514e-07, "logits/chosen": -0.7568240761756897, "logits/rejected": -0.8358641862869263, "logps/chosen": -1.308774709701538, "logps/rejected": -1.6738483905792236, "loss": 1.1102, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.617549419403076, "rewards/margins": 0.7301470041275024, "rewards/rejected": -3.3476967811584473, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 13.160800920164423, "learning_rate": 3.918689445378477e-07, "logits/chosen": -0.7660185098648071, "logits/rejected": -0.8393454551696777, "logps/chosen": -1.2900028228759766, "logps/rejected": -1.7106046676635742, "loss": 1.0429, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.580005645751953, "rewards/margins": 0.8412036895751953, "rewards/rejected": -3.4212093353271484, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 15.467772988868518, "learning_rate": 3.813528405709251e-07, "logits/chosen": -0.7320618629455566, "logits/rejected": -0.7756307125091553, "logps/chosen": -1.3943421840667725, "logps/rejected": -1.8419634103775024, "loss": 1.084, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.788684368133545, "rewards/margins": 0.8952423334121704, "rewards/rejected": -3.683926820755005, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 23.599162652169078, "learning_rate": 3.707276806528282e-07, "logits/chosen": -0.7983018159866333, "logits/rejected": -0.8536737561225891, "logps/chosen": -1.3397753238677979, "logps/rejected": -1.8982980251312256, "loss": 1.0107, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.6795506477355957, "rewards/margins": 1.1170451641082764, "rewards/rejected": -3.796596050262451, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 22.745006961113983, "learning_rate": 3.6000770813281334e-07, "logits/chosen": -0.7526620626449585, "logits/rejected": -0.7841376066207886, "logps/chosen": -1.3173251152038574, "logps/rejected": -1.6973741054534912, "loss": 1.096, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.634650230407715, "rewards/margins": 0.7600980401039124, "rewards/rejected": -3.3947482109069824, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 17.29631229132808, "learning_rate": 3.4920729345930654e-07, "logits/chosen": -0.8024924993515015, "logits/rejected": -0.8705514669418335, "logps/chosen": -1.3106586933135986, "logps/rejected": -1.8416321277618408, "loss": 1.0622, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6213173866271973, "rewards/margins": 1.0619468688964844, "rewards/rejected": -3.6832642555236816, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 15.697390709369445, "learning_rate": 3.383409149158814e-07, "logits/chosen": -0.8013178110122681, "logits/rejected": -0.8261008262634277, "logps/chosen": -1.2374125719070435, "logps/rejected": -1.8463026285171509, "loss": 1.0412, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.474825143814087, "rewards/margins": 1.2177798748016357, "rewards/rejected": -3.6926052570343018, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 38.568029652024805, "learning_rate": 3.2742313921268035e-07, "logits/chosen": -0.7440148591995239, "logits/rejected": -0.8371674418449402, "logps/chosen": -1.3792295455932617, "logps/rejected": -1.996372938156128, "loss": 1.0533, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.7584590911865234, "rewards/margins": 1.234286904335022, "rewards/rejected": -3.992745876312256, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 31.29600689027817, "learning_rate": 3.1646860195929825e-07, "logits/chosen": -0.798254132270813, "logits/rejected": -0.819698691368103, "logps/chosen": -1.4148808717727661, "logps/rejected": -1.9883480072021484, "loss": 1.1126, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.8297617435455322, "rewards/margins": 1.1469345092773438, "rewards/rejected": -3.976696014404297, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 21.255043892106038, "learning_rate": 3.054919880453032e-07, "logits/chosen": -0.8065778017044067, "logits/rejected": -0.8200203776359558, "logps/chosen": -1.3674335479736328, "logps/rejected": -1.8728046417236328, "loss": 1.0948, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7348670959472656, "rewards/margins": 1.0107421875, "rewards/rejected": -3.7456092834472656, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 15.283609874940026, "learning_rate": 2.9450801195469686e-07, "logits/chosen": -0.7686730027198792, "logits/rejected": -0.7811926603317261, "logps/chosen": -1.3809654712677002, "logps/rejected": -1.8307151794433594, "loss": 1.0502, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.7619309425354004, "rewards/margins": 0.8994992971420288, "rewards/rejected": -3.6614303588867188, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 34.69673151716839, "learning_rate": 2.835313980407017e-07, "logits/chosen": -0.8522397875785828, "logits/rejected": -0.8554953336715698, "logps/chosen": -1.4796664714813232, "logps/rejected": -1.868570327758789, "loss": 1.11, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.9593329429626465, "rewards/margins": 0.7778076529502869, "rewards/rejected": -3.737140655517578, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 12.490257980809535, "learning_rate": 2.7257686078731973e-07, "logits/chosen": -0.8593546748161316, "logits/rejected": -0.8926668167114258, "logps/chosen": -1.2937114238739014, "logps/rejected": -2.0442328453063965, "loss": 0.9612, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5874228477478027, "rewards/margins": 1.5010432004928589, "rewards/rejected": -4.088465690612793, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 17.66798289482467, "learning_rate": 2.6165908508411857e-07, "logits/chosen": -0.7889951467514038, "logits/rejected": -0.8469230532646179, "logps/chosen": -1.3164467811584473, "logps/rejected": -1.873552680015564, "loss": 1.0829, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6328935623168945, "rewards/margins": 1.114211916923523, "rewards/rejected": -3.747105360031128, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 25.798144103608532, "learning_rate": 2.5079270654069354e-07, "logits/chosen": -0.7999380230903625, "logits/rejected": -0.8465052843093872, "logps/chosen": -1.4005292654037476, "logps/rejected": -1.9563087224960327, "loss": 1.0559, "rewards/accuracies": 0.71875, "rewards/chosen": -2.801058530807495, "rewards/margins": 1.1115590333938599, "rewards/rejected": -3.9126174449920654, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 26.70646393830588, "learning_rate": 2.399922918671867e-07, "logits/chosen": -0.8188889622688293, "logits/rejected": -0.8326479196548462, "logps/chosen": -1.4042682647705078, "logps/rejected": -1.8107773065567017, "loss": 1.0877, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.8085365295410156, "rewards/margins": 0.8130179643630981, "rewards/rejected": -3.6215546131134033, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 23.426122701316096, "learning_rate": 2.2927231934717176e-07, "logits/chosen": -0.8667086362838745, "logits/rejected": -0.87919682264328, "logps/chosen": -1.4516851902008057, "logps/rejected": -1.7210047245025635, "loss": 1.0425, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.9033703804016113, "rewards/margins": 0.5386390686035156, "rewards/rejected": -3.442009449005127, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 26.456279591360094, "learning_rate": 2.1864715942907487e-07, "logits/chosen": -0.8121633529663086, "logits/rejected": -0.8183205723762512, "logps/chosen": -1.4428894519805908, "logps/rejected": -1.9755233526229858, "loss": 1.0841, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.8857789039611816, "rewards/margins": 1.0652679204940796, "rewards/rejected": -3.9510467052459717, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 31.787422608248555, "learning_rate": 2.081310554621522e-07, "logits/chosen": -0.812918484210968, "logits/rejected": -0.848720371723175, "logps/chosen": -1.3704057931900024, "logps/rejected": -1.7566410303115845, "loss": 1.0211, "rewards/accuracies": 0.6875, "rewards/chosen": -2.740811586380005, "rewards/margins": 0.7724703550338745, "rewards/rejected": -3.513282060623169, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 20.39803180345373, "learning_rate": 1.9773810460284862e-07, "logits/chosen": -0.7991079092025757, "logits/rejected": -0.8711285591125488, "logps/chosen": -1.4278900623321533, "logps/rejected": -2.05625581741333, "loss": 0.9925, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8557801246643066, "rewards/margins": 1.256731629371643, "rewards/rejected": -4.11251163482666, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 22.590110789535018, "learning_rate": 1.874822389171314e-07, "logits/chosen": -0.8574708700180054, "logits/rejected": -0.9009912610054016, "logps/chosen": -1.545143723487854, "logps/rejected": -2.0895230770111084, "loss": 1.0237, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.090287446975708, "rewards/margins": 1.088758945465088, "rewards/rejected": -4.179046154022217, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 19.0520960322845, "learning_rate": 1.7737720670413356e-07, "logits/chosen": -0.8097273111343384, "logits/rejected": -0.8335424661636353, "logps/chosen": -1.5219576358795166, "logps/rejected": -2.0950403213500977, "loss": 1.0412, "rewards/accuracies": 0.6875, "rewards/chosen": -3.043915271759033, "rewards/margins": 1.1461658477783203, "rewards/rejected": -4.190080642700195, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 28.978881064657845, "learning_rate": 1.6743655406614095e-07, "logits/chosen": -0.8851544260978699, "logits/rejected": -0.8812357187271118, "logps/chosen": -1.505824089050293, "logps/rejected": -2.034778118133545, "loss": 1.0881, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.011648178100586, "rewards/margins": 1.0579078197479248, "rewards/rejected": -4.06955623626709, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 23.915843277630973, "learning_rate": 1.5767360674963198e-07, "logits/chosen": -0.870714008808136, "logits/rejected": -0.8971943855285645, "logps/chosen": -1.3601343631744385, "logps/rejected": -2.0130364894866943, "loss": 1.0087, "rewards/accuracies": 0.75, "rewards/chosen": -2.720268726348877, "rewards/margins": 1.3058046102523804, "rewards/rejected": -4.026072978973389, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 26.514246744997322, "learning_rate": 1.4810145228170922e-07, "logits/chosen": -0.8225549459457397, "logits/rejected": -0.8689346313476562, "logps/chosen": -1.4374722242355347, "logps/rejected": -1.9102426767349243, "loss": 1.052, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.8749444484710693, "rewards/margins": 0.9455404281616211, "rewards/rejected": -3.8204853534698486, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 27.7004551617753, "learning_rate": 1.3873292242587306e-07, "logits/chosen": -0.8165398836135864, "logits/rejected": -0.9100580215454102, "logps/chosen": -1.461507797241211, "logps/rejected": -2.0511550903320312, "loss": 1.0709, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.923015594482422, "rewards/margins": 1.1792947053909302, "rewards/rejected": -4.1023101806640625, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 24.617595975995133, "learning_rate": 1.295805759806533e-07, "logits/chosen": -0.8566834330558777, "logits/rejected": -0.8978926539421082, "logps/chosen": -1.5079203844070435, "logps/rejected": -2.044774293899536, "loss": 1.0388, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.015840768814087, "rewards/margins": 1.073707938194275, "rewards/rejected": -4.089548587799072, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 22.995198881906134, "learning_rate": 1.2065668194415777e-07, "logits/chosen": -0.8893098831176758, "logits/rejected": -0.9465163946151733, "logps/chosen": -1.5923842191696167, "logps/rejected": -2.066089153289795, "loss": 0.9896, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.1847684383392334, "rewards/margins": 0.947409987449646, "rewards/rejected": -4.13217830657959, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 22.355221430364576, "learning_rate": 1.1197320306710923e-07, "logits/chosen": -0.8776585459709167, "logits/rejected": -0.9053448438644409, "logps/chosen": -1.5153396129608154, "logps/rejected": -2.0724828243255615, "loss": 1.0507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.030679225921631, "rewards/margins": 1.1142865419387817, "rewards/rejected": -4.144965648651123, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 23.090030368869293, "learning_rate": 1.035417798164145e-07, "logits/chosen": -0.8465662002563477, "logits/rejected": -0.9114416837692261, "logps/chosen": -1.5818672180175781, "logps/rejected": -2.124342441558838, "loss": 1.0082, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1637344360351562, "rewards/margins": 1.0849504470825195, "rewards/rejected": -4.248684883117676, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 23.156782358223225, "learning_rate": 9.537371477076535e-08, "logits/chosen": -0.8677560687065125, "logits/rejected": -0.9061796069145203, "logps/chosen": -1.5915837287902832, "logps/rejected": -2.287815570831299, "loss": 0.9867, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1831674575805664, "rewards/margins": 1.3924639225006104, "rewards/rejected": -4.575631141662598, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 20.328637763728924, "learning_rate": 8.747995746918898e-08, "logits/chosen": -0.8234347105026245, "logits/rejected": -0.8825669288635254, "logps/chosen": -1.5265567302703857, "logps/rejected": -2.1997315883636475, "loss": 0.9162, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0531134605407715, "rewards/margins": 1.3463497161865234, "rewards/rejected": -4.399463176727295, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 38.958615879066635, "learning_rate": 7.987108973285888e-08, "logits/chosen": -0.8697785139083862, "logits/rejected": -0.8908045887947083, "logps/chosen": -1.508302927017212, "logps/rejected": -2.1442337036132812, "loss": 1.0045, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.016605854034424, "rewards/margins": 1.2718614339828491, "rewards/rejected": -4.2884674072265625, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 40.64376807024019, "learning_rate": 7.255731147984174e-08, "logits/chosen": -0.8699348568916321, "logits/rejected": -0.9192712903022766, "logps/chosen": -1.5248959064483643, "logps/rejected": -2.057331085205078, "loss": 1.0402, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.0497918128967285, "rewards/margins": 1.064869999885559, "rewards/rejected": -4.114662170410156, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 31.79789174489367, "learning_rate": 6.554842705179898e-08, "logits/chosen": -0.8611375093460083, "logits/rejected": -0.8788291215896606, "logps/chosen": -1.4700887203216553, "logps/rejected": -2.0618722438812256, "loss": 1.0386, "rewards/accuracies": 0.75, "rewards/chosen": -2.9401774406433105, "rewards/margins": 1.183566927909851, "rewards/rejected": -4.123744487762451, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 27.699401276090363, "learning_rate": 5.885383207096832e-08, "logits/chosen": -0.8817920684814453, "logits/rejected": -0.9167042970657349, "logps/chosen": -1.5808578729629517, "logps/rejected": -2.0726354122161865, "loss": 1.0164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1617157459259033, "rewards/margins": 0.9835556149482727, "rewards/rejected": -4.145270824432373, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 22.291806094067294, "learning_rate": 5.2482500845047165e-08, "logits/chosen": -0.8046171069145203, "logits/rejected": -0.8632856607437134, "logps/chosen": -1.474746823310852, "logps/rejected": -2.074794292449951, "loss": 1.0014, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.949493646621704, "rewards/margins": 1.2000950574874878, "rewards/rejected": -4.149588584899902, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 32.14293789219742, "learning_rate": 4.644297433686162e-08, "logits/chosen": -0.8459577560424805, "logits/rejected": -0.8775212168693542, "logps/chosen": -1.5837218761444092, "logps/rejected": -2.0384469032287598, "loss": 1.0682, "rewards/accuracies": 0.65625, "rewards/chosen": -3.1674437522888184, "rewards/margins": 0.9094497561454773, "rewards/rejected": -4.0768938064575195, "step": 400 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -0.9671933650970459, "eval_logits/rejected": -0.9966414570808411, "eval_logps/chosen": -1.536142349243164, "eval_logps/rejected": -2.0912911891937256, "eval_loss": 0.9954066276550293, "eval_rewards/accuracies": 0.7279999852180481, "eval_rewards/chosen": -3.072284698486328, "eval_rewards/margins": 1.1102983951568604, "eval_rewards/rejected": -4.182582378387451, "eval_runtime": 45.9263, "eval_samples_per_second": 43.548, "eval_steps_per_second": 2.722, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 23.581025931041157, "learning_rate": 4.074334871494558e-08, "logits/chosen": -0.8318978548049927, "logits/rejected": -0.9007453918457031, "logps/chosen": -1.597597360610962, "logps/rejected": -2.2467799186706543, "loss": 0.9898, "rewards/accuracies": 0.71875, "rewards/chosen": -3.195194721221924, "rewards/margins": 1.2983646392822266, "rewards/rejected": -4.493559837341309, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 37.34203846776795, "learning_rate": 3.5391264500382e-08, "logits/chosen": -0.8569322824478149, "logits/rejected": -0.8944110870361328, "logps/chosen": -1.6689296960830688, "logps/rejected": -2.2536518573760986, "loss": 0.9821, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.3378593921661377, "rewards/margins": 1.16944420337677, "rewards/rejected": -4.507303714752197, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 25.448649440851888, "learning_rate": 3.0393896324452226e-08, "logits/chosen": -0.8548834919929504, "logits/rejected": -0.8898690938949585, "logps/chosen": -1.6892175674438477, "logps/rejected": -2.1383655071258545, "loss": 1.0282, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.3784351348876953, "rewards/margins": 0.8982963562011719, "rewards/rejected": -4.276731014251709, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 22.81456603203954, "learning_rate": 2.5757943310825026e-08, "logits/chosen": -0.8120086789131165, "logits/rejected": -0.8377026319503784, "logps/chosen": -1.5306228399276733, "logps/rejected": -2.244910478591919, "loss": 0.9802, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0612456798553467, "rewards/margins": 1.428574800491333, "rewards/rejected": -4.489820957183838, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 37.81119467654555, "learning_rate": 2.148962009517823e-08, "logits/chosen": -0.8621734380722046, "logits/rejected": -0.9295539855957031, "logps/chosen": -1.594923973083496, "logps/rejected": -2.202113389968872, "loss": 0.9772, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.189847946166992, "rewards/margins": 1.2143787145614624, "rewards/rejected": -4.404226779937744, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 23.35609170503276, "learning_rate": 1.759464849429082e-08, "logits/chosen": -0.8409427404403687, "logits/rejected": -0.8790140151977539, "logps/chosen": -1.6252171993255615, "logps/rejected": -2.1690993309020996, "loss": 0.9766, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.250434398651123, "rewards/margins": 1.087764024734497, "rewards/rejected": -4.338198661804199, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 42.92976213914578, "learning_rate": 1.4078249835774169e-08, "logits/chosen": -0.8287452459335327, "logits/rejected": -0.8296720385551453, "logps/chosen": -1.493123173713684, "logps/rejected": -2.055771827697754, "loss": 1.0029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.986246347427368, "rewards/margins": 1.1252974271774292, "rewards/rejected": -4.111543655395508, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 32.7360124305529, "learning_rate": 1.0945137958723705e-08, "logits/chosen": -0.8666203618049622, "logits/rejected": -0.9023343920707703, "logps/chosen": -1.6795040369033813, "logps/rejected": -2.055238962173462, "loss": 1.0619, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.3590080738067627, "rewards/margins": 0.7514694929122925, "rewards/rejected": -4.110477924346924, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 25.809975837885126, "learning_rate": 8.19951289467482e-09, "logits/chosen": -0.8226273655891418, "logits/rejected": -0.8915680646896362, "logps/chosen": -1.6063209772109985, "logps/rejected": -2.2188549041748047, "loss": 1.0036, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.212641954421997, "rewards/margins": 1.2250680923461914, "rewards/rejected": -4.437709808349609, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 27.907394126837357, "learning_rate": 5.84505523733293e-09, "logits/chosen": -0.8590003848075867, "logits/rejected": -0.9254142642021179, "logps/chosen": -1.5489723682403564, "logps/rejected": -2.138707160949707, "loss": 1.0026, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.097944736480713, "rewards/margins": 1.1794699430465698, "rewards/rejected": -4.277414321899414, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 26.194546776590737, "learning_rate": 3.8849212086261466e-09, "logits/chosen": -0.8426074981689453, "logits/rejected": -0.8449162244796753, "logps/chosen": -1.5749680995941162, "logps/rejected": -2.065624475479126, "loss": 1.0628, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.1499361991882324, "rewards/margins": 0.9813130497932434, "rewards/rejected": -4.131248950958252, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 28.647656191366522, "learning_rate": 2.3217384276938756e-09, "logits/chosen": -0.7687999606132507, "logits/rejected": -0.8947674036026001, "logps/chosen": -1.4748101234436035, "logps/rejected": -2.2467246055603027, "loss": 1.0081, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.949620246887207, "rewards/margins": 1.5438289642333984, "rewards/rejected": -4.4934492111206055, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 25.297804062883948, "learning_rate": 1.1576023884836472e-09, "logits/chosen": -0.8351796269416809, "logits/rejected": -0.8887630701065063, "logps/chosen": -1.5146936178207397, "logps/rejected": -2.2188751697540283, "loss": 0.9987, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0293872356414795, "rewards/margins": 1.4083633422851562, "rewards/rejected": -4.437750339508057, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 19.986270660762962, "learning_rate": 3.940736506780395e-10, "logits/chosen": -0.7743644118309021, "logits/rejected": -0.788620114326477, "logps/chosen": -1.4425890445709229, "logps/rejected": -2.27103853225708, "loss": 1.0166, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8851780891418457, "rewards/margins": 1.6568992137908936, "rewards/rejected": -4.54207706451416, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 26.273630707088135, "learning_rate": 3.2175747716822744e-11, "logits/chosen": -0.8468500971794128, "logits/rejected": -0.9172460436820984, "logps/chosen": -1.5344510078430176, "logps/rejected": -2.111969470977783, "loss": 0.9858, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.068902015686035, "rewards/margins": 1.1550369262695312, "rewards/rejected": -4.223938941955566, "step": 475 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 225, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }