diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7063 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9981298423724285, + "eval_steps": 400, + "global_step": 467, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021373230029388193, + "grad_norm": 16.01126099526898, + "learning_rate": 1.0638297872340425e-08, + "logits/chosen": 1.7974858283996582, + "logits/rejected": 1.927241563796997, + "logps/chosen": -269.252685546875, + "logps/rejected": -268.1457824707031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.004274646005877639, + "grad_norm": 16.887384715665494, + "learning_rate": 2.127659574468085e-08, + "logits/chosen": 1.7954446077346802, + "logits/rejected": 1.7747466564178467, + "logps/chosen": -377.0067138671875, + "logps/rejected": -384.9050598144531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.006411969008816457, + "grad_norm": 18.97306536839058, + "learning_rate": 3.191489361702127e-08, + "logits/chosen": 2.0545246601104736, + "logits/rejected": 2.030306577682495, + "logps/chosen": -355.0686950683594, + "logps/rejected": -359.64227294921875, + "loss": 0.6956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01744360849261284, + "rewards/margins": 0.021479416638612747, + "rewards/rejected": -0.004035805352032185, + "step": 3 + }, + { + "epoch": 0.008549292011755277, + "grad_norm": 17.345901334680676, + "learning_rate": 4.25531914893617e-08, + "logits/chosen": 2.01006817817688, + "logits/rejected": 1.9202210903167725, + "logps/chosen": -344.22540283203125, + "logps/rejected": -336.4920654296875, + "loss": 0.695, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.004112933296710253, + "rewards/margins": 0.008065867237746716, + "rewards/rejected": -0.003952931612730026, + "step": 4 + }, + { + "epoch": 0.010686615014694095, + "grad_norm": 16.581897782195426, + "learning_rate": 5.3191489361702123e-08, + "logits/chosen": 1.6817424297332764, + "logits/rejected": 1.723240613937378, + "logps/chosen": -459.6365966796875, + "logps/rejected": -435.55584716796875, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009510684758424759, + "rewards/margins": 0.012329434975981712, + "rewards/rejected": -0.021840117871761322, + "step": 5 + }, + { + "epoch": 0.012823938017632914, + "grad_norm": 16.750335274402644, + "learning_rate": 6.382978723404254e-08, + "logits/chosen": 1.8463941812515259, + "logits/rejected": 2.0205507278442383, + "logps/chosen": -318.8436584472656, + "logps/rejected": -333.49664306640625, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008932210505008698, + "rewards/margins": 0.020014524459838867, + "rewards/rejected": -0.028946734964847565, + "step": 6 + }, + { + "epoch": 0.014961261020571734, + "grad_norm": 16.740726706909392, + "learning_rate": 7.446808510638298e-08, + "logits/chosen": 1.9784932136535645, + "logits/rejected": 2.030113458633423, + "logps/chosen": -490.574951171875, + "logps/rejected": -506.27325439453125, + "loss": 0.6907, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.023572897538542747, + "rewards/margins": 0.0131209846585989, + "rewards/rejected": 0.010451912879943848, + "step": 7 + }, + { + "epoch": 0.017098584023510555, + "grad_norm": 19.90480338228391, + "learning_rate": 8.51063829787234e-08, + "logits/chosen": 2.1563167572021484, + "logits/rejected": 2.082562208175659, + "logps/chosen": -391.081298828125, + "logps/rejected": -380.1916809082031, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0003595345187932253, + "rewards/margins": -0.007014084607362747, + "rewards/rejected": 0.006654549390077591, + "step": 8 + }, + { + "epoch": 0.01923590702644937, + "grad_norm": 17.58226872418466, + "learning_rate": 9.574468085106382e-08, + "logits/chosen": 2.088273525238037, + "logits/rejected": 2.1584815979003906, + "logps/chosen": -398.4393615722656, + "logps/rejected": -407.91717529296875, + "loss": 0.6991, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.010095404461026192, + "rewards/margins": 0.026253772899508476, + "rewards/rejected": -0.03634917736053467, + "step": 9 + }, + { + "epoch": 0.02137323002938819, + "grad_norm": 15.285861983785653, + "learning_rate": 1.0638297872340425e-07, + "logits/chosen": 1.5849823951721191, + "logits/rejected": 1.6066241264343262, + "logps/chosen": -275.02215576171875, + "logps/rejected": -285.42071533203125, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0008988383924588561, + "rewards/margins": -0.001907038502395153, + "rewards/rejected": 0.002805877011269331, + "step": 10 + }, + { + "epoch": 0.02351055303232701, + "grad_norm": 17.931865230874028, + "learning_rate": 1.1702127659574468e-07, + "logits/chosen": 1.9127209186553955, + "logits/rejected": 1.9879655838012695, + "logps/chosen": -378.0723876953125, + "logps/rejected": -410.64752197265625, + "loss": 0.6937, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05186593905091286, + "rewards/margins": 0.0336148738861084, + "rewards/rejected": 0.01825106143951416, + "step": 11 + }, + { + "epoch": 0.02564787603526583, + "grad_norm": 15.757238933424347, + "learning_rate": 1.2765957446808508e-07, + "logits/chosen": 2.188413619995117, + "logits/rejected": 2.0202558040618896, + "logps/chosen": -393.43817138671875, + "logps/rejected": -369.0041198730469, + "loss": 0.6912, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.009139979258179665, + "rewards/margins": -0.023510374128818512, + "rewards/rejected": 0.014370394870638847, + "step": 12 + }, + { + "epoch": 0.027785199038204648, + "grad_norm": 20.954337431493947, + "learning_rate": 1.3829787234042553e-07, + "logits/chosen": 2.3227410316467285, + "logits/rejected": 2.255707025527954, + "logps/chosen": -425.1292724609375, + "logps/rejected": -398.930908203125, + "loss": 0.6925, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.016432786360383034, + "rewards/margins": 0.021908044815063477, + "rewards/rejected": -0.0054752579890191555, + "step": 13 + }, + { + "epoch": 0.029922522041143467, + "grad_norm": 17.575256149749116, + "learning_rate": 1.4893617021276595e-07, + "logits/chosen": 0.9818891286849976, + "logits/rejected": 1.0016374588012695, + "logps/chosen": -221.91647338867188, + "logps/rejected": -221.5160369873047, + "loss": 0.6911, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.01453709602355957, + "rewards/margins": -0.007119536399841309, + "rewards/rejected": 0.02165663242340088, + "step": 14 + }, + { + "epoch": 0.03205984504408229, + "grad_norm": 15.987335944794728, + "learning_rate": 1.5957446808510638e-07, + "logits/chosen": 1.8959016799926758, + "logits/rejected": 1.9433327913284302, + "logps/chosen": -296.5621032714844, + "logps/rejected": -310.80889892578125, + "loss": 0.6929, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.011275790631771088, + "rewards/margins": 0.00550649082288146, + "rewards/rejected": 0.00576929934322834, + "step": 15 + }, + { + "epoch": 0.03419716804702111, + "grad_norm": 18.43866679208328, + "learning_rate": 1.702127659574468e-07, + "logits/chosen": 1.4447689056396484, + "logits/rejected": 1.5044986009597778, + "logps/chosen": -256.1441345214844, + "logps/rejected": -300.1876525878906, + "loss": 0.697, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.009331870824098587, + "rewards/margins": -0.006017827428877354, + "rewards/rejected": 0.015349699184298515, + "step": 16 + }, + { + "epoch": 0.03633449104995993, + "grad_norm": 15.462024767466382, + "learning_rate": 1.8085106382978725e-07, + "logits/chosen": 2.2961068153381348, + "logits/rejected": 2.213308572769165, + "logps/chosen": -369.3326110839844, + "logps/rejected": -354.31683349609375, + "loss": 0.6885, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.015958024188876152, + "rewards/margins": 0.03008664958178997, + "rewards/rejected": -0.014128627255558968, + "step": 17 + }, + { + "epoch": 0.03847181405289874, + "grad_norm": 17.489044223273293, + "learning_rate": 1.9148936170212765e-07, + "logits/chosen": 1.8175828456878662, + "logits/rejected": 1.859854817390442, + "logps/chosen": -414.024169921875, + "logps/rejected": -413.8213195800781, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024924801662564278, + "rewards/margins": -0.003364275209605694, + "rewards/rejected": 0.028289081528782845, + "step": 18 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 17.114195544232533, + "learning_rate": 2.0212765957446807e-07, + "logits/chosen": 2.317970037460327, + "logits/rejected": 2.2373180389404297, + "logps/chosen": -442.814453125, + "logps/rejected": -414.55535888671875, + "loss": 0.6946, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.005524086765944958, + "rewards/margins": 0.01554189808666706, + "rewards/rejected": -0.021065985783934593, + "step": 19 + }, + { + "epoch": 0.04274646005877638, + "grad_norm": 16.890434285248393, + "learning_rate": 2.127659574468085e-07, + "logits/chosen": 1.76038658618927, + "logits/rejected": 1.7405922412872314, + "logps/chosen": -383.553955078125, + "logps/rejected": -391.88543701171875, + "loss": 0.694, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00688550528138876, + "rewards/margins": 0.004018557257950306, + "rewards/rejected": 0.0028669475577771664, + "step": 20 + }, + { + "epoch": 0.0448837830617152, + "grad_norm": 17.504320922484634, + "learning_rate": 2.2340425531914892e-07, + "logits/chosen": 1.5609700679779053, + "logits/rejected": 1.7058910131454468, + "logps/chosen": -323.5635681152344, + "logps/rejected": -348.6455078125, + "loss": 0.687, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.004861879162490368, + "rewards/margins": -0.000568365678191185, + "rewards/rejected": -0.004293511621654034, + "step": 21 + }, + { + "epoch": 0.04702110606465402, + "grad_norm": 18.494669357857983, + "learning_rate": 2.3404255319148937e-07, + "logits/chosen": 1.8989413976669312, + "logits/rejected": 1.9474773406982422, + "logps/chosen": -387.58453369140625, + "logps/rejected": -415.0438232421875, + "loss": 0.6875, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.021361876279115677, + "rewards/margins": -0.0013632788322865963, + "rewards/rejected": -0.019998596981167793, + "step": 22 + }, + { + "epoch": 0.04915842906759284, + "grad_norm": 16.13049724308586, + "learning_rate": 2.4468085106382976e-07, + "logits/chosen": 1.3913708925247192, + "logits/rejected": 1.5202478170394897, + "logps/chosen": -348.6595458984375, + "logps/rejected": -392.2963562011719, + "loss": 0.6932, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010332870297133923, + "rewards/margins": 0.025678444653749466, + "rewards/rejected": -0.015345573425292969, + "step": 23 + }, + { + "epoch": 0.05129575207053166, + "grad_norm": 17.143905423978246, + "learning_rate": 2.5531914893617016e-07, + "logits/chosen": 1.722089409828186, + "logits/rejected": 1.7591125965118408, + "logps/chosen": -420.39404296875, + "logps/rejected": -425.396484375, + "loss": 0.6909, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010764744132757187, + "rewards/margins": 0.019897134974598885, + "rewards/rejected": -0.009132390841841698, + "step": 24 + }, + { + "epoch": 0.053433075073470476, + "grad_norm": 16.877922335832068, + "learning_rate": 2.659574468085106e-07, + "logits/chosen": 2.259049415588379, + "logits/rejected": 2.423696756362915, + "logps/chosen": -292.7757568359375, + "logps/rejected": -319.800048828125, + "loss": 0.694, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01868593692779541, + "rewards/margins": 0.0022046808153390884, + "rewards/rejected": -0.0208906177431345, + "step": 25 + }, + { + "epoch": 0.055570398076409296, + "grad_norm": 17.29347939663918, + "learning_rate": 2.7659574468085106e-07, + "logits/chosen": 1.9354510307312012, + "logits/rejected": 1.9527926445007324, + "logps/chosen": -345.2893981933594, + "logps/rejected": -388.90911865234375, + "loss": 0.6942, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03965914249420166, + "rewards/margins": -0.005191686097532511, + "rewards/rejected": -0.034467458724975586, + "step": 26 + }, + { + "epoch": 0.057707721079348115, + "grad_norm": 17.646670785000858, + "learning_rate": 2.872340425531915e-07, + "logits/chosen": 1.8476933240890503, + "logits/rejected": 1.9482815265655518, + "logps/chosen": -333.46820068359375, + "logps/rejected": -358.3786315917969, + "loss": 0.6909, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.027103139087557793, + "rewards/margins": -0.0378154031932354, + "rewards/rejected": 0.010712266899645329, + "step": 27 + }, + { + "epoch": 0.059845044082286934, + "grad_norm": 16.761100735068744, + "learning_rate": 2.978723404255319e-07, + "logits/chosen": 1.7289808988571167, + "logits/rejected": 1.6542481184005737, + "logps/chosen": -434.14825439453125, + "logps/rejected": -440.4168395996094, + "loss": 0.6909, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008729076012969017, + "rewards/margins": -0.01662147231400013, + "rewards/rejected": 0.007892394438385963, + "step": 28 + }, + { + "epoch": 0.061982367085225754, + "grad_norm": 17.745794857594785, + "learning_rate": 3.085106382978723e-07, + "logits/chosen": 2.139738082885742, + "logits/rejected": 1.9200177192687988, + "logps/chosen": -337.0323181152344, + "logps/rejected": -322.8047790527344, + "loss": 0.6864, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.047266557812690735, + "rewards/margins": -0.006123709492385387, + "rewards/rejected": -0.04114284738898277, + "step": 29 + }, + { + "epoch": 0.06411969008816458, + "grad_norm": 17.58518177821876, + "learning_rate": 3.1914893617021275e-07, + "logits/chosen": 1.9211307764053345, + "logits/rejected": 2.0256621837615967, + "logps/chosen": -431.0246887207031, + "logps/rejected": -466.2309875488281, + "loss": 0.685, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03614044189453125, + "rewards/margins": -0.02504887804389, + "rewards/rejected": -0.0110915657132864, + "step": 30 + }, + { + "epoch": 0.06625701309110339, + "grad_norm": 16.58761166063222, + "learning_rate": 3.2978723404255315e-07, + "logits/chosen": 2.0386786460876465, + "logits/rejected": 2.175553798675537, + "logps/chosen": -430.54254150390625, + "logps/rejected": -434.08551025390625, + "loss": 0.6851, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.023386145010590553, + "rewards/margins": 0.005015873815864325, + "rewards/rejected": -0.028402019292116165, + "step": 31 + }, + { + "epoch": 0.06839433609404222, + "grad_norm": 19.112046530771966, + "learning_rate": 3.404255319148936e-07, + "logits/chosen": 1.7420753240585327, + "logits/rejected": 1.7955291271209717, + "logps/chosen": -357.76708984375, + "logps/rejected": -405.6940002441406, + "loss": 0.6898, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.013730335980653763, + "rewards/margins": -0.009128142148256302, + "rewards/rejected": -0.004602192901074886, + "step": 32 + }, + { + "epoch": 0.07053165909698103, + "grad_norm": 15.87955801875356, + "learning_rate": 3.5106382978723405e-07, + "logits/chosen": 2.003401756286621, + "logits/rejected": 1.9700802564620972, + "logps/chosen": -298.1646423339844, + "logps/rejected": -315.47320556640625, + "loss": 0.6847, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.049097828567028046, + "rewards/margins": -0.01259935088455677, + "rewards/rejected": -0.036498475819826126, + "step": 33 + }, + { + "epoch": 0.07266898209991986, + "grad_norm": 17.271526767661655, + "learning_rate": 3.617021276595745e-07, + "logits/chosen": 1.674816608428955, + "logits/rejected": 1.674377679824829, + "logps/chosen": -262.3334655761719, + "logps/rejected": -293.4676818847656, + "loss": 0.6908, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05956123024225235, + "rewards/margins": -0.0003459937870502472, + "rewards/rejected": -0.0592152364552021, + "step": 34 + }, + { + "epoch": 0.07480630510285867, + "grad_norm": 18.522791044208137, + "learning_rate": 3.7234042553191484e-07, + "logits/chosen": 2.0599467754364014, + "logits/rejected": 1.985105037689209, + "logps/chosen": -361.3081970214844, + "logps/rejected": -335.7303466796875, + "loss": 0.687, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04977235943078995, + "rewards/margins": 0.0031501527410000563, + "rewards/rejected": -0.052922509610652924, + "step": 35 + }, + { + "epoch": 0.07694362810579748, + "grad_norm": 16.058029701898118, + "learning_rate": 3.829787234042553e-07, + "logits/chosen": 2.2823598384857178, + "logits/rejected": 2.245945930480957, + "logps/chosen": -397.5772705078125, + "logps/rejected": -387.1483459472656, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03420672193169594, + "rewards/margins": 0.01202941033989191, + "rewards/rejected": -0.04623613506555557, + "step": 36 + }, + { + "epoch": 0.07908095110873631, + "grad_norm": 16.29611225935786, + "learning_rate": 3.9361702127659574e-07, + "logits/chosen": 1.3664941787719727, + "logits/rejected": 1.407928466796875, + "logps/chosen": -344.04736328125, + "logps/rejected": -343.9777526855469, + "loss": 0.687, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.006882575340569019, + "rewards/margins": 0.06044764071702957, + "rewards/rejected": -0.05356506258249283, + "step": 37 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 16.675563423812346, + "learning_rate": 4.0425531914893614e-07, + "logits/chosen": 1.9885263442993164, + "logits/rejected": 2.049900531768799, + "logps/chosen": -408.2422790527344, + "logps/rejected": -459.45330810546875, + "loss": 0.6822, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0520525798201561, + "rewards/margins": 0.03479123115539551, + "rewards/rejected": -0.0868438109755516, + "step": 38 + }, + { + "epoch": 0.08335559711461395, + "grad_norm": 15.90555800464618, + "learning_rate": 4.148936170212766e-07, + "logits/chosen": 2.1601202487945557, + "logits/rejected": 2.1942191123962402, + "logps/chosen": -406.88995361328125, + "logps/rejected": -424.9388122558594, + "loss": 0.6792, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11415410041809082, + "rewards/margins": -0.005976894870400429, + "rewards/rejected": -0.10817721486091614, + "step": 39 + }, + { + "epoch": 0.08549292011755276, + "grad_norm": 19.234842538679075, + "learning_rate": 4.25531914893617e-07, + "logits/chosen": 1.4985907077789307, + "logits/rejected": 1.512323021888733, + "logps/chosen": -295.0309143066406, + "logps/rejected": -282.7944641113281, + "loss": 0.6873, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.13608673214912415, + "rewards/margins": -0.009758353233337402, + "rewards/rejected": -0.12632837891578674, + "step": 40 + }, + { + "epoch": 0.08763024312049159, + "grad_norm": 18.1644372771909, + "learning_rate": 4.3617021276595744e-07, + "logits/chosen": 2.3623297214508057, + "logits/rejected": 2.420731544494629, + "logps/chosen": -341.2931823730469, + "logps/rejected": -369.0897216796875, + "loss": 0.6791, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09444458782672882, + "rewards/margins": 0.05246647074818611, + "rewards/rejected": -0.14691105484962463, + "step": 41 + }, + { + "epoch": 0.0897675661234304, + "grad_norm": 17.564831976861814, + "learning_rate": 4.4680851063829783e-07, + "logits/chosen": 2.216895580291748, + "logits/rejected": 2.060129165649414, + "logps/chosen": -464.18829345703125, + "logps/rejected": -397.678955078125, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11240722239017487, + "rewards/margins": 0.01255890354514122, + "rewards/rejected": -0.12496612221002579, + "step": 42 + }, + { + "epoch": 0.09190488912636922, + "grad_norm": 17.498276873674637, + "learning_rate": 4.574468085106383e-07, + "logits/chosen": 1.8796459436416626, + "logits/rejected": 1.8276338577270508, + "logps/chosen": -378.45849609375, + "logps/rejected": -342.8559265136719, + "loss": 0.6774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.118812195956707, + "rewards/margins": 0.056394241750240326, + "rewards/rejected": -0.17520645260810852, + "step": 43 + }, + { + "epoch": 0.09404221212930804, + "grad_norm": 16.939255095109502, + "learning_rate": 4.6808510638297873e-07, + "logits/chosen": 1.4337763786315918, + "logits/rejected": 1.4281963109970093, + "logps/chosen": -454.3580627441406, + "logps/rejected": -430.2334899902344, + "loss": 0.685, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.14937520027160645, + "rewards/margins": -0.016789698973298073, + "rewards/rejected": -0.13258551061153412, + "step": 44 + }, + { + "epoch": 0.09617953513224686, + "grad_norm": 18.086769935035708, + "learning_rate": 4.787234042553192e-07, + "logits/chosen": 2.0096044540405273, + "logits/rejected": 2.032930612564087, + "logps/chosen": -356.0893249511719, + "logps/rejected": -363.38531494140625, + "loss": 0.6826, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11098766326904297, + "rewards/margins": 0.031171085312962532, + "rewards/rejected": -0.14215874671936035, + "step": 45 + }, + { + "epoch": 0.09831685813518568, + "grad_norm": 16.60546022249817, + "learning_rate": 4.893617021276595e-07, + "logits/chosen": 2.2274961471557617, + "logits/rejected": 2.2103538513183594, + "logps/chosen": -450.325439453125, + "logps/rejected": -453.9230041503906, + "loss": 0.6907, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.20025189220905304, + "rewards/margins": -0.030434010550379753, + "rewards/rejected": -0.16981787979602814, + "step": 46 + }, + { + "epoch": 0.1004541811381245, + "grad_norm": 17.64513453243039, + "learning_rate": 5e-07, + "logits/chosen": 1.394955039024353, + "logits/rejected": 1.4171689748764038, + "logps/chosen": -271.34051513671875, + "logps/rejected": -280.0254211425781, + "loss": 0.6873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13984565436840057, + "rewards/margins": 0.09467978030443192, + "rewards/rejected": -0.23452544212341309, + "step": 47 + }, + { + "epoch": 0.10259150414106331, + "grad_norm": 19.298872476175823, + "learning_rate": 4.999930062653174e-07, + "logits/chosen": 1.808046579360962, + "logits/rejected": 1.7487595081329346, + "logps/chosen": -357.9656982421875, + "logps/rejected": -400.80303955078125, + "loss": 0.687, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23653987050056458, + "rewards/margins": -0.007849575020372868, + "rewards/rejected": -0.22869029641151428, + "step": 48 + }, + { + "epoch": 0.10472882714400214, + "grad_norm": 17.333339696928125, + "learning_rate": 4.999720254525684e-07, + "logits/chosen": 1.746777892112732, + "logits/rejected": 1.7006160020828247, + "logps/chosen": -421.19268798828125, + "logps/rejected": -411.2952880859375, + "loss": 0.6816, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20930640399456024, + "rewards/margins": 0.018274232745170593, + "rewards/rejected": -0.22758066654205322, + "step": 49 + }, + { + "epoch": 0.10686615014694095, + "grad_norm": 17.729943763732685, + "learning_rate": 4.999370587356267e-07, + "logits/chosen": 1.9540718793869019, + "logits/rejected": 1.9008055925369263, + "logps/chosen": -294.99676513671875, + "logps/rejected": -272.04925537109375, + "loss": 0.6631, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.23157571256160736, + "rewards/margins": 0.12571102380752563, + "rewards/rejected": -0.3572867512702942, + "step": 50 + }, + { + "epoch": 0.10900347314987978, + "grad_norm": 17.468777477407095, + "learning_rate": 4.998881080708758e-07, + "logits/chosen": 1.481137990951538, + "logits/rejected": 1.3699580430984497, + "logps/chosen": -289.7410888671875, + "logps/rejected": -315.044921875, + "loss": 0.683, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20251740515232086, + "rewards/margins": -0.0064203981310129166, + "rewards/rejected": -0.1960970163345337, + "step": 51 + }, + { + "epoch": 0.11114079615281859, + "grad_norm": 17.54256112367197, + "learning_rate": 4.998251761970996e-07, + "logits/chosen": 2.0398733615875244, + "logits/rejected": 2.158165216445923, + "logps/chosen": -387.5774841308594, + "logps/rejected": -413.7792053222656, + "loss": 0.6729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21002252399921417, + "rewards/margins": 0.059471115469932556, + "rewards/rejected": -0.26949363946914673, + "step": 52 + }, + { + "epoch": 0.11327811915575742, + "grad_norm": 18.17193642743448, + "learning_rate": 4.997482666353286e-07, + "logits/chosen": 1.551992654800415, + "logits/rejected": 1.6585577726364136, + "logps/chosen": -362.41253662109375, + "logps/rejected": -375.1070556640625, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28190845251083374, + "rewards/margins": 0.027381589636206627, + "rewards/rejected": -0.3092900216579437, + "step": 53 + }, + { + "epoch": 0.11541544215869623, + "grad_norm": 19.118405188522143, + "learning_rate": 4.996573836886434e-07, + "logits/chosen": 2.13954496383667, + "logits/rejected": 2.0068345069885254, + "logps/chosen": -494.3727722167969, + "logps/rejected": -441.12664794921875, + "loss": 0.6871, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.28698673844337463, + "rewards/margins": 0.03327002376317978, + "rewards/rejected": -0.3202567994594574, + "step": 54 + }, + { + "epoch": 0.11755276516163506, + "grad_norm": 18.80060244793496, + "learning_rate": 4.995525324419337e-07, + "logits/chosen": 2.386868715286255, + "logits/rejected": 2.2814548015594482, + "logps/chosen": -470.53875732421875, + "logps/rejected": -430.71636962890625, + "loss": 0.6747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23906640708446503, + "rewards/margins": 0.13759444653987885, + "rewards/rejected": -0.3766608238220215, + "step": 55 + }, + { + "epoch": 0.11969008816457387, + "grad_norm": 17.20090684503435, + "learning_rate": 4.99433718761614e-07, + "logits/chosen": 1.7399883270263672, + "logits/rejected": 1.6878349781036377, + "logps/chosen": -427.767333984375, + "logps/rejected": -383.4020690917969, + "loss": 0.6745, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28416523337364197, + "rewards/margins": 0.028113719075918198, + "rewards/rejected": -0.31227895617485046, + "step": 56 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 17.189006839270423, + "learning_rate": 4.993009492952949e-07, + "logits/chosen": 2.0051536560058594, + "logits/rejected": 1.9158482551574707, + "logps/chosen": -328.1854553222656, + "logps/rejected": -342.3035583496094, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3610785901546478, + "rewards/margins": 0.006724976003170013, + "rewards/rejected": -0.36780354380607605, + "step": 57 + }, + { + "epoch": 0.12396473417045151, + "grad_norm": 19.604523214040483, + "learning_rate": 4.991542314714122e-07, + "logits/chosen": 1.3874634504318237, + "logits/rejected": 1.4164166450500488, + "logps/chosen": -304.51922607421875, + "logps/rejected": -336.23944091796875, + "loss": 0.671, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30155548453330994, + "rewards/margins": 0.09271588921546936, + "rewards/rejected": -0.3942714035511017, + "step": 58 + }, + { + "epoch": 0.12610205717339032, + "grad_norm": 16.687200107986385, + "learning_rate": 4.989935734988097e-07, + "logits/chosen": 1.4796478748321533, + "logits/rejected": 1.3376965522766113, + "logps/chosen": -413.5907897949219, + "logps/rejected": -392.4472961425781, + "loss": 0.6652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24848215281963348, + "rewards/margins": 0.027541160583496094, + "rewards/rejected": -0.27602332830429077, + "step": 59 + }, + { + "epoch": 0.12823938017632916, + "grad_norm": 18.754855497046993, + "learning_rate": 4.988189843662815e-07, + "logits/chosen": 1.5387346744537354, + "logits/rejected": 1.6132254600524902, + "logps/chosen": -306.87213134765625, + "logps/rejected": -335.58721923828125, + "loss": 0.6743, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26606473326683044, + "rewards/margins": 0.02971341460943222, + "rewards/rejected": -0.29577815532684326, + "step": 60 + }, + { + "epoch": 0.13037670317926797, + "grad_norm": 18.463206919216464, + "learning_rate": 4.986304738420683e-07, + "logits/chosen": 1.944252848625183, + "logits/rejected": 1.896322250366211, + "logps/chosen": -341.7672424316406, + "logps/rejected": -363.7576904296875, + "loss": 0.6524, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2655528485774994, + "rewards/margins": 0.11380692571401596, + "rewards/rejected": -0.37935978174209595, + "step": 61 + }, + { + "epoch": 0.13251402618220678, + "grad_norm": 22.5265817847443, + "learning_rate": 4.984280524733107e-07, + "logits/chosen": 1.4530837535858154, + "logits/rejected": 1.3399604558944702, + "logps/chosen": -355.2033386230469, + "logps/rejected": -351.52618408203125, + "loss": 0.6853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4064197242259979, + "rewards/margins": 0.10837845504283905, + "rewards/rejected": -0.5147981643676758, + "step": 62 + }, + { + "epoch": 0.1346513491851456, + "grad_norm": 18.564495656237806, + "learning_rate": 4.982117315854593e-07, + "logits/chosen": 2.016234874725342, + "logits/rejected": 1.9874932765960693, + "logps/chosen": -416.2107849121094, + "logps/rejected": -402.76470947265625, + "loss": 0.6711, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4489130973815918, + "rewards/margins": 0.04024820774793625, + "rewards/rejected": -0.48916131258010864, + "step": 63 + }, + { + "epoch": 0.13678867218808444, + "grad_norm": 18.562107659954208, + "learning_rate": 4.979815232816416e-07, + "logits/chosen": 2.208319664001465, + "logits/rejected": 2.190627336502075, + "logps/chosen": -303.19500732421875, + "logps/rejected": -326.4004821777344, + "loss": 0.6603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2610405683517456, + "rewards/margins": 0.05577556788921356, + "rewards/rejected": -0.31681615114212036, + "step": 64 + }, + { + "epoch": 0.13892599519102325, + "grad_norm": 20.69361614885178, + "learning_rate": 4.977374404419837e-07, + "logits/chosen": 1.9246108531951904, + "logits/rejected": 1.8763636350631714, + "logps/chosen": -358.9330139160156, + "logps/rejected": -346.0728759765625, + "loss": 0.6662, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22426418960094452, + "rewards/margins": 0.1284552812576294, + "rewards/rejected": -0.3527194857597351, + "step": 65 + }, + { + "epoch": 0.14106331819396206, + "grad_norm": 17.2698240144223, + "learning_rate": 4.974794967228907e-07, + "logits/chosen": 1.483390212059021, + "logits/rejected": 1.375150203704834, + "logps/chosen": -315.0908508300781, + "logps/rejected": -309.7676696777344, + "loss": 0.6689, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3849695920944214, + "rewards/margins": 0.13179758191108704, + "rewards/rejected": -0.516767144203186, + "step": 66 + }, + { + "epoch": 0.14320064119690087, + "grad_norm": 18.912694680875394, + "learning_rate": 4.972077065562821e-07, + "logits/chosen": 1.6654436588287354, + "logits/rejected": 1.6484836339950562, + "logps/chosen": -415.6891174316406, + "logps/rejected": -483.3581848144531, + "loss": 0.6528, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3479224145412445, + "rewards/margins": 0.30767643451690674, + "rewards/rejected": -0.6555988192558289, + "step": 67 + }, + { + "epoch": 0.14533796419983971, + "grad_norm": 18.224972403748673, + "learning_rate": 4.969220851487844e-07, + "logits/chosen": 1.7228754758834839, + "logits/rejected": 1.616809606552124, + "logps/chosen": -387.8039245605469, + "logps/rejected": -381.3202209472656, + "loss": 0.6676, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.41254401206970215, + "rewards/margins": 0.05308759585022926, + "rewards/rejected": -0.4656316041946411, + "step": 68 + }, + { + "epoch": 0.14747528720277853, + "grad_norm": 16.86306787165477, + "learning_rate": 4.966226484808803e-07, + "logits/chosen": 1.2384394407272339, + "logits/rejected": 1.2866795063018799, + "logps/chosen": -314.2529602050781, + "logps/rejected": -329.0174865722656, + "loss": 0.6639, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4110863208770752, + "rewards/margins": 0.2178199589252472, + "rewards/rejected": -0.6289063096046448, + "step": 69 + }, + { + "epoch": 0.14961261020571734, + "grad_norm": 17.195577161087968, + "learning_rate": 4.963094133060148e-07, + "logits/chosen": 2.0873541831970215, + "logits/rejected": 1.998971700668335, + "logps/chosen": -389.9579162597656, + "logps/rejected": -384.6530456542969, + "loss": 0.6708, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.621479332447052, + "rewards/margins": 0.08416196703910828, + "rewards/rejected": -0.7056412696838379, + "step": 70 + }, + { + "epoch": 0.15174993320865615, + "grad_norm": 18.106861789518128, + "learning_rate": 4.959823971496574e-07, + "logits/chosen": 1.5237023830413818, + "logits/rejected": 1.6149706840515137, + "logps/chosen": -400.80560302734375, + "logps/rejected": -411.4190979003906, + "loss": 0.6632, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.38402536511421204, + "rewards/margins": 0.19899356365203857, + "rewards/rejected": -0.583018958568573, + "step": 71 + }, + { + "epoch": 0.15388725621159496, + "grad_norm": 18.8987848353103, + "learning_rate": 4.956416183083221e-07, + "logits/chosen": 2.0366392135620117, + "logits/rejected": 2.1463069915771484, + "logps/chosen": -336.7109375, + "logps/rejected": -391.3525695800781, + "loss": 0.6537, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3251726031303406, + "rewards/margins": 0.0683506578207016, + "rewards/rejected": -0.39352327585220337, + "step": 72 + }, + { + "epoch": 0.1560245792145338, + "grad_norm": 18.02342571848944, + "learning_rate": 4.952870958485431e-07, + "logits/chosen": 2.0702877044677734, + "logits/rejected": 1.999230146408081, + "logps/chosen": -474.9482727050781, + "logps/rejected": -470.8036804199219, + "loss": 0.6621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5637938976287842, + "rewards/margins": 0.12167691439390182, + "rewards/rejected": -0.6854707598686218, + "step": 73 + }, + { + "epoch": 0.15816190221747262, + "grad_norm": 17.122856393777678, + "learning_rate": 4.949188496058089e-07, + "logits/chosen": 2.1900906562805176, + "logits/rejected": 2.1530239582061768, + "logps/chosen": -365.1473388671875, + "logps/rejected": -355.15826416015625, + "loss": 0.6578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4727271795272827, + "rewards/margins": 0.044449158012866974, + "rewards/rejected": -0.5171763300895691, + "step": 74 + }, + { + "epoch": 0.16029922522041143, + "grad_norm": 18.705770867692838, + "learning_rate": 4.945369001834514e-07, + "logits/chosen": 1.970231533050537, + "logits/rejected": 1.8822038173675537, + "logps/chosen": -357.3649597167969, + "logps/rejected": -340.84735107421875, + "loss": 0.6279, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39545726776123047, + "rewards/margins": 0.2576879858970642, + "rewards/rejected": -0.6531451940536499, + "step": 75 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 18.463959365090876, + "learning_rate": 4.941412689514941e-07, + "logits/chosen": 1.9429740905761719, + "logits/rejected": 2.05521559715271, + "logps/chosen": -399.330078125, + "logps/rejected": -448.8360290527344, + "loss": 0.6457, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6837693452835083, + "rewards/margins": 0.19340023398399353, + "rewards/rejected": -0.8771694898605347, + "step": 76 + }, + { + "epoch": 0.16457387122628908, + "grad_norm": 17.748894790019765, + "learning_rate": 4.937319780454559e-07, + "logits/chosen": 1.7817144393920898, + "logits/rejected": 1.8134974241256714, + "logps/chosen": -360.9486999511719, + "logps/rejected": -348.0911560058594, + "loss": 0.6438, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.536231517791748, + "rewards/margins": 0.07527366280555725, + "rewards/rejected": -0.6115051507949829, + "step": 77 + }, + { + "epoch": 0.1667111942292279, + "grad_norm": 17.16271899190261, + "learning_rate": 4.933090503651128e-07, + "logits/chosen": 1.7767925262451172, + "logits/rejected": 1.779471755027771, + "logps/chosen": -400.331787109375, + "logps/rejected": -395.0623779296875, + "loss": 0.624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6241444945335388, + "rewards/margins": 0.17532269656658173, + "rewards/rejected": -0.7994672060012817, + "step": 78 + }, + { + "epoch": 0.1688485172321667, + "grad_norm": 18.630940755974848, + "learning_rate": 4.928725095732168e-07, + "logits/chosen": 1.643775224685669, + "logits/rejected": 1.786955714225769, + "logps/chosen": -355.02508544921875, + "logps/rejected": -399.382568359375, + "loss": 0.6417, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6717734336853027, + "rewards/margins": 0.11283117532730103, + "rewards/rejected": -0.7846046090126038, + "step": 79 + }, + { + "epoch": 0.17098584023510552, + "grad_norm": 18.925906729991304, + "learning_rate": 4.924223800941717e-07, + "logits/chosen": 2.1771671772003174, + "logits/rejected": 2.2715742588043213, + "logps/chosen": -449.5519714355469, + "logps/rejected": -464.3706970214844, + "loss": 0.654, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.555615246295929, + "rewards/margins": 0.14044499397277832, + "rewards/rejected": -0.6960601806640625, + "step": 80 + }, + { + "epoch": 0.17312316323804436, + "grad_norm": 18.40745960551381, + "learning_rate": 4.919586871126667e-07, + "logits/chosen": 1.9518824815750122, + "logits/rejected": 2.082866668701172, + "logps/chosen": -403.2046203613281, + "logps/rejected": -436.531494140625, + "loss": 0.6453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6706172227859497, + "rewards/margins": 0.15584391355514526, + "rewards/rejected": -0.8264610767364502, + "step": 81 + }, + { + "epoch": 0.17526048624098317, + "grad_norm": 17.801429215955096, + "learning_rate": 4.91481456572267e-07, + "logits/chosen": 1.9086239337921143, + "logits/rejected": 1.8316439390182495, + "logps/chosen": -399.4256591796875, + "logps/rejected": -387.94171142578125, + "loss": 0.6434, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8419517874717712, + "rewards/margins": 0.10202351212501526, + "rewards/rejected": -0.9439752101898193, + "step": 82 + }, + { + "epoch": 0.17739780924392198, + "grad_norm": 18.421125186391436, + "learning_rate": 4.909907151739633e-07, + "logits/chosen": 2.0825533866882324, + "logits/rejected": 2.0777454376220703, + "logps/chosen": -358.8985290527344, + "logps/rejected": -335.0311279296875, + "loss": 0.6416, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.886951208114624, + "rewards/margins": 0.06833362579345703, + "rewards/rejected": -0.9552848935127258, + "step": 83 + }, + { + "epoch": 0.1795351322468608, + "grad_norm": 17.625368142356823, + "learning_rate": 4.904864903746765e-07, + "logits/chosen": 2.0355796813964844, + "logits/rejected": 2.1017327308654785, + "logps/chosen": -406.8382568359375, + "logps/rejected": -409.212646484375, + "loss": 0.6074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6774367094039917, + "rewards/margins": 0.4327796995639801, + "rewards/rejected": -1.1102163791656494, + "step": 84 + }, + { + "epoch": 0.18167245524979964, + "grad_norm": 17.846014275986537, + "learning_rate": 4.899688103857222e-07, + "logits/chosen": 1.704572081565857, + "logits/rejected": 1.714634895324707, + "logps/chosen": -386.1976318359375, + "logps/rejected": -388.4258728027344, + "loss": 0.6381, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9329708814620972, + "rewards/margins": 0.04441451653838158, + "rewards/rejected": -0.977385401725769, + "step": 85 + }, + { + "epoch": 0.18380977825273845, + "grad_norm": 17.526539174440522, + "learning_rate": 4.894377041712326e-07, + "logits/chosen": 2.334737777709961, + "logits/rejected": 2.3338565826416016, + "logps/chosen": -421.5735778808594, + "logps/rejected": -438.7161865234375, + "loss": 0.6296, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7888815402984619, + "rewards/margins": 0.31321045756340027, + "rewards/rejected": -1.1020920276641846, + "step": 86 + }, + { + "epoch": 0.18594710125567726, + "grad_norm": 19.21020565741226, + "learning_rate": 4.888932014465352e-07, + "logits/chosen": 1.5965489149093628, + "logits/rejected": 1.6148681640625, + "logps/chosen": -372.6695556640625, + "logps/rejected": -392.4805908203125, + "loss": 0.6242, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7540562152862549, + "rewards/margins": 0.266312837600708, + "rewards/rejected": -1.020369052886963, + "step": 87 + }, + { + "epoch": 0.18808442425861607, + "grad_norm": 18.29361445908713, + "learning_rate": 4.883353326764906e-07, + "logits/chosen": 1.5290958881378174, + "logits/rejected": 1.6073287725448608, + "logps/chosen": -397.7436828613281, + "logps/rejected": -399.38385009765625, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8137935996055603, + "rewards/margins": 0.2867809236049652, + "rewards/rejected": -1.1005744934082031, + "step": 88 + }, + { + "epoch": 0.1902217472615549, + "grad_norm": 18.614979910689225, + "learning_rate": 4.877641290737883e-07, + "logits/chosen": 1.936212420463562, + "logits/rejected": 1.8444154262542725, + "logps/chosen": -424.4599609375, + "logps/rejected": -479.34808349609375, + "loss": 0.6174, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9368820190429688, + "rewards/margins": 0.2523537874221802, + "rewards/rejected": -1.1892356872558594, + "step": 89 + }, + { + "epoch": 0.19235907026449373, + "grad_norm": 18.218281107090114, + "learning_rate": 4.871796225971999e-07, + "logits/chosen": 1.7540048360824585, + "logits/rejected": 1.8083505630493164, + "logps/chosen": -352.07464599609375, + "logps/rejected": -390.17193603515625, + "loss": 0.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8687042593955994, + "rewards/margins": 0.04543251544237137, + "rewards/rejected": -0.9141367077827454, + "step": 90 + }, + { + "epoch": 0.19449639326743254, + "grad_norm": 17.42744405233579, + "learning_rate": 4.86581845949791e-07, + "logits/chosen": 1.7399728298187256, + "logits/rejected": 1.881831169128418, + "logps/chosen": -386.78070068359375, + "logps/rejected": -394.3675231933594, + "loss": 0.6278, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8296229839324951, + "rewards/margins": 0.33209383487701416, + "rewards/rejected": -1.1617166996002197, + "step": 91 + }, + { + "epoch": 0.19663371627037135, + "grad_norm": 18.157993962219635, + "learning_rate": 4.859708325770919e-07, + "logits/chosen": 1.8655225038528442, + "logits/rejected": 1.7539880275726318, + "logps/chosen": -425.1163635253906, + "logps/rejected": -450.327392578125, + "loss": 0.6352, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9147007465362549, + "rewards/margins": 0.19170109927654266, + "rewards/rejected": -1.1064016819000244, + "step": 92 + }, + { + "epoch": 0.1987710392733102, + "grad_norm": 18.59822363701581, + "learning_rate": 4.853466166652258e-07, + "logits/chosen": 1.9111980199813843, + "logits/rejected": 2.00334095954895, + "logps/chosen": -383.8531494140625, + "logps/rejected": -441.35626220703125, + "loss": 0.6473, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9924455881118774, + "rewards/margins": 0.08671611547470093, + "rewards/rejected": -1.0791617631912231, + "step": 93 + }, + { + "epoch": 0.200908362276249, + "grad_norm": 18.689539426987327, + "learning_rate": 4.847092331389964e-07, + "logits/chosen": 1.7213155031204224, + "logits/rejected": 1.7230504751205444, + "logps/chosen": -328.0395202636719, + "logps/rejected": -307.9144592285156, + "loss": 0.6289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7495980262756348, + "rewards/margins": 0.31506437063217163, + "rewards/rejected": -1.0646624565124512, + "step": 94 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 18.329788390189, + "learning_rate": 4.840587176599343e-07, + "logits/chosen": 1.8517365455627441, + "logits/rejected": 1.867042064666748, + "logps/chosen": -284.2342834472656, + "logps/rejected": -316.3347473144531, + "loss": 0.6473, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1318445205688477, + "rewards/margins": 0.22313019633293152, + "rewards/rejected": -1.3549748659133911, + "step": 95 + }, + { + "epoch": 0.20518300828212663, + "grad_norm": 20.003558783675533, + "learning_rate": 4.833951066243004e-07, + "logits/chosen": 1.9658561944961548, + "logits/rejected": 2.0219621658325195, + "logps/chosen": -385.1724853515625, + "logps/rejected": -374.846923828125, + "loss": 0.6522, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.280335783958435, + "rewards/margins": 0.1471562385559082, + "rewards/rejected": -1.4274920225143433, + "step": 96 + }, + { + "epoch": 0.20732033128506547, + "grad_norm": 19.439956275385367, + "learning_rate": 4.82718437161051e-07, + "logits/chosen": 1.449579119682312, + "logits/rejected": 1.475942850112915, + "logps/chosen": -424.9071044921875, + "logps/rejected": -432.41900634765625, + "loss": 0.6182, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.032875657081604, + "rewards/margins": 0.14200714230537415, + "rewards/rejected": -1.1748827695846558, + "step": 97 + }, + { + "epoch": 0.20945765428800428, + "grad_norm": 20.720426275007746, + "learning_rate": 4.820287471297597e-07, + "logits/chosen": 1.2382292747497559, + "logits/rejected": 1.3223166465759277, + "logps/chosen": -341.36773681640625, + "logps/rejected": -373.3889465332031, + "loss": 0.6139, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9406616687774658, + "rewards/margins": 0.30484455823898315, + "rewards/rejected": -1.2455062866210938, + "step": 98 + }, + { + "epoch": 0.2115949772909431, + "grad_norm": 19.167777501448665, + "learning_rate": 4.813260751184992e-07, + "logits/chosen": 1.4242631196975708, + "logits/rejected": 1.3070321083068848, + "logps/chosen": -382.6838073730469, + "logps/rejected": -369.42333984375, + "loss": 0.638, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2732874155044556, + "rewards/margins": 0.12794676423072815, + "rewards/rejected": -1.4012342691421509, + "step": 99 + }, + { + "epoch": 0.2137323002938819, + "grad_norm": 19.082592371106056, + "learning_rate": 4.806104604416823e-07, + "logits/chosen": 1.8438955545425415, + "logits/rejected": 1.8277944326400757, + "logps/chosen": -375.05303955078125, + "logps/rejected": -428.58056640625, + "loss": 0.6082, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.259270429611206, + "rewards/margins": 0.28902101516723633, + "rewards/rejected": -1.548291563987732, + "step": 100 + }, + { + "epoch": 0.21586962329682075, + "grad_norm": 19.40383100777513, + "learning_rate": 4.798819431378626e-07, + "logits/chosen": 1.5675817728042603, + "logits/rejected": 1.487393856048584, + "logps/chosen": -274.16741943359375, + "logps/rejected": -294.5373840332031, + "loss": 0.5981, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.184496521949768, + "rewards/margins": 0.1605721116065979, + "rewards/rejected": -1.3450688123703003, + "step": 101 + }, + { + "epoch": 0.21800694629975956, + "grad_norm": 19.035365387486454, + "learning_rate": 4.79140563967494e-07, + "logits/chosen": 1.378381371498108, + "logits/rejected": 1.5444639921188354, + "logps/chosen": -342.8183288574219, + "logps/rejected": -344.89239501953125, + "loss": 0.6363, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0111204385757446, + "rewards/margins": 0.26567333936691284, + "rewards/rejected": -1.2767938375473022, + "step": 102 + }, + { + "epoch": 0.22014426930269837, + "grad_norm": 22.460424893993004, + "learning_rate": 4.783863644106502e-07, + "logits/chosen": 1.3568328619003296, + "logits/rejected": 1.3307393789291382, + "logps/chosen": -235.84120178222656, + "logps/rejected": -239.3479766845703, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9905429482460022, + "rewards/margins": -0.051335543394088745, + "rewards/rejected": -0.9392074346542358, + "step": 103 + }, + { + "epoch": 0.22228159230563718, + "grad_norm": 20.610208302459373, + "learning_rate": 4.776193866647039e-07, + "logits/chosen": 1.1602747440338135, + "logits/rejected": 1.4320569038391113, + "logps/chosen": -399.7038269042969, + "logps/rejected": -418.18560791015625, + "loss": 0.6695, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4170256853103638, + "rewards/margins": 0.1951914280653, + "rewards/rejected": -1.6122169494628906, + "step": 104 + }, + { + "epoch": 0.224418915308576, + "grad_norm": 21.316293798129664, + "learning_rate": 4.768396736419662e-07, + "logits/chosen": 1.2325228452682495, + "logits/rejected": 1.2283470630645752, + "logps/chosen": -340.61248779296875, + "logps/rejected": -354.34228515625, + "loss": 0.6416, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9992186427116394, + "rewards/margins": 0.22192566096782684, + "rewards/rejected": -1.2211443185806274, + "step": 105 + }, + { + "epoch": 0.22655623831151483, + "grad_norm": 18.691286982248872, + "learning_rate": 4.7604726896728496e-07, + "logits/chosen": 2.0024757385253906, + "logits/rejected": 2.002178430557251, + "logps/chosen": -435.4734802246094, + "logps/rejected": -469.6261291503906, + "loss": 0.5673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7990100979804993, + "rewards/margins": 0.48767000436782837, + "rewards/rejected": -1.2866802215576172, + "step": 106 + }, + { + "epoch": 0.22869356131445365, + "grad_norm": 17.650688796730755, + "learning_rate": 4.752422169756047e-07, + "logits/chosen": 2.159564256668091, + "logits/rejected": 2.232543468475342, + "logps/chosen": -328.35565185546875, + "logps/rejected": -342.68707275390625, + "loss": 0.6139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8560774326324463, + "rewards/margins": 0.4257628917694092, + "rewards/rejected": -1.281840443611145, + "step": 107 + }, + { + "epoch": 0.23083088431739246, + "grad_norm": 19.554037566299336, + "learning_rate": 4.744245627094858e-07, + "logits/chosen": 1.77793288230896, + "logits/rejected": 1.717706322669983, + "logps/chosen": -468.2958679199219, + "logps/rejected": -462.658935546875, + "loss": 0.6413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9724741578102112, + "rewards/margins": 0.3415077030658722, + "rewards/rejected": -1.3139818906784058, + "step": 108 + }, + { + "epoch": 0.23296820732033127, + "grad_norm": 20.50700794956668, + "learning_rate": 4.735943519165842e-07, + "logits/chosen": 1.9187538623809814, + "logits/rejected": 2.0614094734191895, + "logps/chosen": -454.2608642578125, + "logps/rejected": -481.36431884765625, + "loss": 0.6428, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1436271667480469, + "rewards/margins": 0.13930538296699524, + "rewards/rejected": -1.2829326391220093, + "step": 109 + }, + { + "epoch": 0.2351055303232701, + "grad_norm": 18.272975924577455, + "learning_rate": 4.7275163104709194e-07, + "logits/chosen": 1.9010305404663086, + "logits/rejected": 1.7875394821166992, + "logps/chosen": -367.1365966796875, + "logps/rejected": -367.42431640625, + "loss": 0.6146, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9120699763298035, + "rewards/margins": 0.2043510228395462, + "rewards/rejected": -1.116420865058899, + "step": 110 + }, + { + "epoch": 0.23724285332620892, + "grad_norm": 20.295493464767308, + "learning_rate": 4.718964472511385e-07, + "logits/chosen": 1.4215465784072876, + "logits/rejected": 1.4408009052276611, + "logps/chosen": -361.8034973144531, + "logps/rejected": -382.62408447265625, + "loss": 0.6292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.853609561920166, + "rewards/margins": 0.46648335456848145, + "rewards/rejected": -1.3200929164886475, + "step": 111 + }, + { + "epoch": 0.23938017632914774, + "grad_norm": 18.121395683486867, + "learning_rate": 4.710288483761524e-07, + "logits/chosen": 1.0649147033691406, + "logits/rejected": 1.071373462677002, + "logps/chosen": -352.0085144042969, + "logps/rejected": -351.5292053222656, + "loss": 0.614, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.031156301498413, + "rewards/margins": 0.21290811896324158, + "rewards/rejected": -1.244064450263977, + "step": 112 + }, + { + "epoch": 0.24151749933208655, + "grad_norm": 20.254445315081554, + "learning_rate": 4.7014888296418447e-07, + "logits/chosen": 1.5161610841751099, + "logits/rejected": 1.550595760345459, + "logps/chosen": -460.7918701171875, + "logps/rejected": -468.8692321777344, + "loss": 0.6563, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1960896253585815, + "rewards/margins": 0.059063613414764404, + "rewards/rejected": -1.2551532983779907, + "step": 113 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 18.896077047246234, + "learning_rate": 4.692566002491916e-07, + "logits/chosen": 1.7081586122512817, + "logits/rejected": 1.6641770601272583, + "logps/chosen": -387.3111877441406, + "logps/rejected": -368.2967224121094, + "loss": 0.621, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8478341102600098, + "rewards/margins": -0.09017517417669296, + "rewards/rejected": -0.7576589584350586, + "step": 114 + }, + { + "epoch": 0.2457921453379642, + "grad_norm": 21.30499019299629, + "learning_rate": 4.683520501542824e-07, + "logits/chosen": 1.545853614807129, + "logits/rejected": 1.6058801412582397, + "logps/chosen": -452.3874206542969, + "logps/rejected": -474.2342834472656, + "loss": 0.6145, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0228480100631714, + "rewards/margins": -0.04428707808256149, + "rewards/rejected": -0.9785609245300293, + "step": 115 + }, + { + "epoch": 0.24792946834090301, + "grad_norm": 21.86187899194981, + "learning_rate": 4.6743528328892384e-07, + "logits/chosen": 1.2116038799285889, + "logits/rejected": 1.1613038778305054, + "logps/chosen": -325.0521240234375, + "logps/rejected": -308.12518310546875, + "loss": 0.6305, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3946088552474976, + "rewards/margins": -0.029741812497377396, + "rewards/rejected": -1.3648672103881836, + "step": 116 + }, + { + "epoch": 0.25006679134384185, + "grad_norm": 18.91701016098703, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": 1.4677801132202148, + "logits/rejected": 1.579450249671936, + "logps/chosen": -344.372314453125, + "logps/rejected": -327.50225830078125, + "loss": 0.5949, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8053675293922424, + "rewards/margins": 0.2620340585708618, + "rewards/rejected": -1.067401647567749, + "step": 117 + }, + { + "epoch": 0.25220411434678064, + "grad_norm": 20.012466373331762, + "learning_rate": 4.655653050994906e-07, + "logits/chosen": 1.6915593147277832, + "logits/rejected": 1.726331114768982, + "logps/chosen": -283.65875244140625, + "logps/rejected": -331.01483154296875, + "loss": 0.651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5659047365188599, + "rewards/margins": 0.2718469202518463, + "rewards/rejected": -0.8377516269683838, + "step": 118 + }, + { + "epoch": 0.2543414373497195, + "grad_norm": 17.44337077106633, + "learning_rate": 4.646121984004665e-07, + "logits/chosen": 1.8943997621536255, + "logits/rejected": 1.9792275428771973, + "logps/chosen": -344.978271484375, + "logps/rejected": -379.34527587890625, + "loss": 0.6436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8211143612861633, + "rewards/margins": 0.16521628201007843, + "rewards/rejected": -0.9863307476043701, + "step": 119 + }, + { + "epoch": 0.2564787603526583, + "grad_norm": 21.735001575170486, + "learning_rate": 4.636470841752404e-07, + "logits/chosen": 1.5497221946716309, + "logits/rejected": 1.5381598472595215, + "logps/chosen": -470.0634460449219, + "logps/rejected": -428.8343505859375, + "loss": 0.6117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9829262495040894, + "rewards/margins": 0.3946997821331024, + "rewards/rejected": -1.3776261806488037, + "step": 120 + }, + { + "epoch": 0.2586160833555971, + "grad_norm": 20.694912637280392, + "learning_rate": 4.626700164218349e-07, + "logits/chosen": 1.6372838020324707, + "logits/rejected": 1.5779719352722168, + "logps/chosen": -418.0284423828125, + "logps/rejected": -402.91796875, + "loss": 0.6965, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0081559419631958, + "rewards/margins": -0.04968453198671341, + "rewards/rejected": -0.9584714770317078, + "step": 121 + }, + { + "epoch": 0.26075340635853594, + "grad_norm": 17.553458790755446, + "learning_rate": 4.6168104980707103e-07, + "logits/chosen": 2.0725860595703125, + "logits/rejected": 2.064181089401245, + "logps/chosen": -377.4428405761719, + "logps/rejected": -404.63714599609375, + "loss": 0.6212, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6986384987831116, + "rewards/margins": 0.06995135545730591, + "rewards/rejected": -0.7685898542404175, + "step": 122 + }, + { + "epoch": 0.26289072936147473, + "grad_norm": 20.247623027581653, + "learning_rate": 4.606802396635098e-07, + "logits/chosen": 1.9462324380874634, + "logits/rejected": 1.8845677375793457, + "logps/chosen": -366.8088684082031, + "logps/rejected": -356.9654846191406, + "loss": 0.6748, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7235208749771118, + "rewards/margins": 0.027630850672721863, + "rewards/rejected": -0.7511517405509949, + "step": 123 + }, + { + "epoch": 0.26502805236441357, + "grad_norm": 19.849959723892198, + "learning_rate": 4.59667641986356e-07, + "logits/chosen": 1.8744176626205444, + "logits/rejected": 2.0228824615478516, + "logps/chosen": -398.7731018066406, + "logps/rejected": -424.6177978515625, + "loss": 0.6709, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.693671464920044, + "rewards/margins": 0.2790328860282898, + "rewards/rejected": -0.972704291343689, + "step": 124 + }, + { + "epoch": 0.2671653753673524, + "grad_norm": 17.32173219755259, + "learning_rate": 4.5864331343032565e-07, + "logits/chosen": 1.621677279472351, + "logits/rejected": 1.6809951066970825, + "logps/chosen": -452.27984619140625, + "logps/rejected": -462.4055480957031, + "loss": 0.6348, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9525185823440552, + "rewards/margins": 0.3178427219390869, + "rewards/rejected": -1.270361304283142, + "step": 125 + }, + { + "epoch": 0.2693026983702912, + "grad_norm": 16.88106478871165, + "learning_rate": 4.576073113064759e-07, + "logits/chosen": 2.357682943344116, + "logits/rejected": 2.2975120544433594, + "logps/chosen": -279.5217590332031, + "logps/rejected": -295.60540771484375, + "loss": 0.6292, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.47640547156333923, + "rewards/margins": 0.4183332920074463, + "rewards/rejected": -0.8947387933731079, + "step": 126 + }, + { + "epoch": 0.27144002137323003, + "grad_norm": 18.58434290114408, + "learning_rate": 4.565596935789987e-07, + "logits/chosen": 1.576242446899414, + "logits/rejected": 1.6992592811584473, + "logps/chosen": -439.7972106933594, + "logps/rejected": -498.26849365234375, + "loss": 0.6055, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9947447776794434, + "rewards/margins": 0.28498226404190063, + "rewards/rejected": -1.2797271013259888, + "step": 127 + }, + { + "epoch": 0.2735773443761689, + "grad_norm": 21.02105243570409, + "learning_rate": 4.555005188619775e-07, + "logits/chosen": 2.214823007583618, + "logits/rejected": 2.240490674972534, + "logps/chosen": -381.3008728027344, + "logps/rejected": -405.7254638671875, + "loss": 0.6539, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9661093354225159, + "rewards/margins": 0.04444655776023865, + "rewards/rejected": -1.0105559825897217, + "step": 128 + }, + { + "epoch": 0.27571466737910766, + "grad_norm": 17.756752976550278, + "learning_rate": 4.5442984641610784e-07, + "logits/chosen": 1.8476028442382812, + "logits/rejected": 1.8882935047149658, + "logps/chosen": -436.7222900390625, + "logps/rejected": -449.9324035644531, + "loss": 0.5981, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.788104772567749, + "rewards/margins": 0.29956191778182983, + "rewards/rejected": -1.0876667499542236, + "step": 129 + }, + { + "epoch": 0.2778519903820465, + "grad_norm": 18.161397454748034, + "learning_rate": 4.533477361453819e-07, + "logits/chosen": 1.1066253185272217, + "logits/rejected": 1.1686846017837524, + "logps/chosen": -305.81842041015625, + "logps/rejected": -307.0638122558594, + "loss": 0.5808, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8689908385276794, + "rewards/margins": 0.27876365184783936, + "rewards/rejected": -1.1477545499801636, + "step": 130 + }, + { + "epoch": 0.2799893133849853, + "grad_norm": 17.429213086565447, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": 1.4423407316207886, + "logits/rejected": 1.51125168800354, + "logps/chosen": -355.8777160644531, + "logps/rejected": -386.8418884277344, + "loss": 0.6476, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9852381348609924, + "rewards/margins": 0.3623862862586975, + "rewards/rejected": -1.3476243019104004, + "step": 131 + }, + { + "epoch": 0.2821266363879241, + "grad_norm": 16.871614858081898, + "learning_rate": 4.511494449416671e-07, + "logits/chosen": 1.5764886140823364, + "logits/rejected": 1.6248146295547485, + "logps/chosen": -409.7974548339844, + "logps/rejected": -408.63824462890625, + "loss": 0.6094, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9070343971252441, + "rewards/margins": 0.17678040266036987, + "rewards/rejected": -1.0838148593902588, + "step": 132 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 17.095310780630726, + "learning_rate": 4.500333870028016e-07, + "logits/chosen": 1.6591562032699585, + "logits/rejected": 1.5384818315505981, + "logps/chosen": -343.987060546875, + "logps/rejected": -385.374755859375, + "loss": 0.6259, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7391742467880249, + "rewards/margins": 0.5074938535690308, + "rewards/rejected": -1.2466681003570557, + "step": 133 + }, + { + "epoch": 0.28640128239380175, + "grad_norm": 19.656778173814363, + "learning_rate": 4.489061372204452e-07, + "logits/chosen": 1.0009726285934448, + "logits/rejected": 1.051519513130188, + "logps/chosen": -384.1603088378906, + "logps/rejected": -367.7815856933594, + "loss": 0.6539, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0506068468093872, + "rewards/margins": 0.08326918631792068, + "rewards/rejected": -1.1338759660720825, + "step": 134 + }, + { + "epoch": 0.2885386053967406, + "grad_norm": 19.01652787278696, + "learning_rate": 4.4776775866408533e-07, + "logits/chosen": 2.3810667991638184, + "logits/rejected": 2.3225905895233154, + "logps/chosen": -384.6607360839844, + "logps/rejected": -368.5036926269531, + "loss": 0.5726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6546584367752075, + "rewards/margins": 0.3204587697982788, + "rewards/rejected": -0.9751172065734863, + "step": 135 + }, + { + "epoch": 0.29067592839967943, + "grad_norm": 21.69866647932205, + "learning_rate": 4.4661831502586244e-07, + "logits/chosen": 1.681275486946106, + "logits/rejected": 1.7548019886016846, + "logps/chosen": -457.0005187988281, + "logps/rejected": -467.3171691894531, + "loss": 0.615, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7441829442977905, + "rewards/margins": 0.49308401346206665, + "rewards/rejected": -1.2372668981552124, + "step": 136 + }, + { + "epoch": 0.2928132514026182, + "grad_norm": 18.490983761159814, + "learning_rate": 4.4545787061700746e-07, + "logits/chosen": 1.8112283945083618, + "logits/rejected": 1.8490984439849854, + "logps/chosen": -429.985595703125, + "logps/rejected": -492.1372375488281, + "loss": 0.6052, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9648774862289429, + "rewards/margins": 0.31009066104888916, + "rewards/rejected": -1.274968147277832, + "step": 137 + }, + { + "epoch": 0.29495057440555705, + "grad_norm": 20.87958822150684, + "learning_rate": 4.442864903642427e-07, + "logits/chosen": 1.2387418746948242, + "logits/rejected": 1.3154277801513672, + "logps/chosen": -298.0567626953125, + "logps/rejected": -298.1575927734375, + "loss": 0.6371, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8304409384727478, + "rewards/margins": 0.3910561203956604, + "rewards/rejected": -1.2214970588684082, + "step": 138 + }, + { + "epoch": 0.29708789740849584, + "grad_norm": 16.835799139060907, + "learning_rate": 4.4310423980614986e-07, + "logits/chosen": 1.3767895698547363, + "logits/rejected": 1.4596718549728394, + "logps/chosen": -327.63934326171875, + "logps/rejected": -384.70782470703125, + "loss": 0.6162, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9396180510520935, + "rewards/margins": -0.1563718318939209, + "rewards/rejected": -0.7832461595535278, + "step": 139 + }, + { + "epoch": 0.2992252204114347, + "grad_norm": 19.34921651543505, + "learning_rate": 4.4191118508950277e-07, + "logits/chosen": 1.306443691253662, + "logits/rejected": 1.4678008556365967, + "logps/chosen": -356.93157958984375, + "logps/rejected": -380.83612060546875, + "loss": 0.6661, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7840622663497925, + "rewards/margins": -0.01293850876390934, + "rewards/rejected": -0.7711237668991089, + "step": 140 + }, + { + "epoch": 0.3013625434143735, + "grad_norm": 19.997863233531348, + "learning_rate": 4.407073929655666e-07, + "logits/chosen": 1.8293468952178955, + "logits/rejected": 1.9084713459014893, + "logps/chosen": -418.7998046875, + "logps/rejected": -456.46282958984375, + "loss": 0.626, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0191199779510498, + "rewards/margins": 0.17436285316944122, + "rewards/rejected": -1.1934828758239746, + "step": 141 + }, + { + "epoch": 0.3034998664173123, + "grad_norm": 17.11885511546179, + "learning_rate": 4.394929307863632e-07, + "logits/chosen": 1.5053921937942505, + "logits/rejected": 1.5208780765533447, + "logps/chosen": -437.318603515625, + "logps/rejected": -465.1886291503906, + "loss": 0.5784, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2199373245239258, + "rewards/margins": 0.392232745885849, + "rewards/rejected": -1.6121701002120972, + "step": 142 + }, + { + "epoch": 0.30563718942025114, + "grad_norm": 18.052198878210906, + "learning_rate": 4.3826786650090273e-07, + "logits/chosen": 1.749753475189209, + "logits/rejected": 1.900386929512024, + "logps/chosen": -450.9555969238281, + "logps/rejected": -499.1803894042969, + "loss": 0.6127, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1993368864059448, + "rewards/margins": 0.33942604064941406, + "rewards/rejected": -1.5387628078460693, + "step": 143 + }, + { + "epoch": 0.3077745124231899, + "grad_norm": 19.061405430342706, + "learning_rate": 4.370322686513817e-07, + "logits/chosen": 1.7149816751480103, + "logits/rejected": 1.6238107681274414, + "logps/chosen": -469.9088134765625, + "logps/rejected": -504.73382568359375, + "loss": 0.6283, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1103051900863647, + "rewards/margins": 0.37591469287872314, + "rewards/rejected": -1.486219882965088, + "step": 144 + }, + { + "epoch": 0.30991183542612877, + "grad_norm": 19.971830399651115, + "learning_rate": 4.357862063693485e-07, + "logits/chosen": 1.6376560926437378, + "logits/rejected": 1.64955735206604, + "logps/chosen": -450.31072998046875, + "logps/rejected": -495.2893371582031, + "loss": 0.6152, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3110533952713013, + "rewards/margins": 0.012708161026239395, + "rewards/rejected": -1.3237614631652832, + "step": 145 + }, + { + "epoch": 0.3120491584290676, + "grad_norm": 20.61129851426581, + "learning_rate": 4.345297493718352e-07, + "logits/chosen": 1.9281656742095947, + "logits/rejected": 1.8954627513885498, + "logps/chosen": -473.4234619140625, + "logps/rejected": -445.9279479980469, + "loss": 0.6213, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2771466970443726, + "rewards/margins": 0.2621864378452301, + "rewards/rejected": -1.5393332242965698, + "step": 146 + }, + { + "epoch": 0.3141864814320064, + "grad_norm": 18.097782796746507, + "learning_rate": 4.332629679574565e-07, + "logits/chosen": 1.952436923980713, + "logits/rejected": 2.018927574157715, + "logps/chosen": -394.45086669921875, + "logps/rejected": -399.55499267578125, + "loss": 0.5665, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1240993738174438, + "rewards/margins": 0.2864275574684143, + "rewards/rejected": -1.410526990890503, + "step": 147 + }, + { + "epoch": 0.31632380443494523, + "grad_norm": 20.183130534419625, + "learning_rate": 4.319859330024777e-07, + "logits/chosen": 2.2235820293426514, + "logits/rejected": 2.276655673980713, + "logps/chosen": -307.6512145996094, + "logps/rejected": -317.3448486328125, + "loss": 0.6333, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3463375568389893, + "rewards/margins": -0.0017591193318367004, + "rewards/rejected": -1.3445783853530884, + "step": 148 + }, + { + "epoch": 0.3184611274378841, + "grad_norm": 25.15607121291107, + "learning_rate": 4.3069871595684787e-07, + "logits/chosen": 1.5035438537597656, + "logits/rejected": 1.4331194162368774, + "logps/chosen": -327.4940490722656, + "logps/rejected": -331.21673583984375, + "loss": 0.6058, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3689939975738525, + "rewards/margins": -0.0226670503616333, + "rewards/rejected": -1.3463268280029297, + "step": 149 + }, + { + "epoch": 0.32059845044082286, + "grad_norm": 21.360613822616227, + "learning_rate": 4.294013888402029e-07, + "logits/chosen": 1.6609265804290771, + "logits/rejected": 1.6308729648590088, + "logps/chosen": -381.4766845703125, + "logps/rejected": -370.9427185058594, + "loss": 0.6779, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2506117820739746, + "rewards/margins": 0.20036408305168152, + "rewards/rejected": -1.4509758949279785, + "step": 150 + }, + { + "epoch": 0.3227357734437617, + "grad_norm": 20.441359698002614, + "learning_rate": 4.280940242378362e-07, + "logits/chosen": 1.9611876010894775, + "logits/rejected": 2.0835318565368652, + "logps/chosen": -370.7065124511719, + "logps/rejected": -373.98919677734375, + "loss": 0.6251, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1139070987701416, + "rewards/margins": 0.22739200294017792, + "rewards/rejected": -1.341299057006836, + "step": 151 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 18.79562489140208, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": 1.4962538480758667, + "logits/rejected": 1.5024197101593018, + "logps/chosen": -337.7670593261719, + "logps/rejected": -365.6881103515625, + "loss": 0.6088, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1060924530029297, + "rewards/margins": 0.7126469612121582, + "rewards/rejected": -1.818739652633667, + "step": 152 + }, + { + "epoch": 0.3270104194496393, + "grad_norm": 18.073724217070875, + "learning_rate": 4.254494757209979e-07, + "logits/chosen": 1.3349535465240479, + "logits/rejected": 1.4294294118881226, + "logps/chosen": -372.82452392578125, + "logps/rejected": -384.3750915527344, + "loss": 0.6056, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.492186427116394, + "rewards/margins": 0.22766265273094177, + "rewards/rejected": -1.7198491096496582, + "step": 153 + }, + { + "epoch": 0.32914774245257816, + "grad_norm": 18.58205194816464, + "learning_rate": 4.2411243976869173e-07, + "logits/chosen": 1.724280834197998, + "logits/rejected": 1.7333550453186035, + "logps/chosen": -360.4881591796875, + "logps/rejected": -359.3287658691406, + "loss": 0.6381, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1819064617156982, + "rewards/margins": 0.5323120355606079, + "rewards/rejected": -1.7142184972763062, + "step": 154 + }, + { + "epoch": 0.33128506545551695, + "grad_norm": 19.960829202955406, + "learning_rate": 4.227656622467162e-07, + "logits/chosen": 1.953383207321167, + "logits/rejected": 1.9463614225387573, + "logps/chosen": -409.99627685546875, + "logps/rejected": -437.938232421875, + "loss": 0.6533, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0230939388275146, + "rewards/margins": 0.16208747029304504, + "rewards/rejected": -1.1851812601089478, + "step": 155 + }, + { + "epoch": 0.3334223884584558, + "grad_norm": 18.62902600269558, + "learning_rate": 4.2140921850710855e-07, + "logits/chosen": 1.7730010747909546, + "logits/rejected": 1.8674681186676025, + "logps/chosen": -416.8703308105469, + "logps/rejected": -413.43560791015625, + "loss": 0.6169, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1183083057403564, + "rewards/margins": 0.2687716484069824, + "rewards/rejected": -1.3870799541473389, + "step": 156 + }, + { + "epoch": 0.3355597114613946, + "grad_norm": 18.56104964810593, + "learning_rate": 4.200431844427298e-07, + "logits/chosen": 1.5295546054840088, + "logits/rejected": 1.7054471969604492, + "logps/chosen": -362.6158752441406, + "logps/rejected": -384.364013671875, + "loss": 0.6365, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8897120952606201, + "rewards/margins": -0.04062645137310028, + "rewards/rejected": -1.849085807800293, + "step": 157 + }, + { + "epoch": 0.3376970344643334, + "grad_norm": 21.508842817831596, + "learning_rate": 4.186676364830186e-07, + "logits/chosen": 2.0236904621124268, + "logits/rejected": 1.945557713508606, + "logps/chosen": -344.7059020996094, + "logps/rejected": -316.12835693359375, + "loss": 0.6745, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.44972562789917, + "rewards/margins": 0.07076007127761841, + "rewards/rejected": -1.520485758781433, + "step": 158 + }, + { + "epoch": 0.33983435746727225, + "grad_norm": 19.368493910930727, + "learning_rate": 4.172826515897145e-07, + "logits/chosen": 2.0061938762664795, + "logits/rejected": 1.9779328107833862, + "logps/chosen": -355.9288330078125, + "logps/rejected": -391.3779602050781, + "loss": 0.6288, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.173938512802124, + "rewards/margins": 0.46023327112197876, + "rewards/rejected": -1.6341716051101685, + "step": 159 + }, + { + "epoch": 0.34197168047021104, + "grad_norm": 20.382801776648744, + "learning_rate": 4.158883072525528e-07, + "logits/chosen": 2.31357479095459, + "logits/rejected": 2.2882211208343506, + "logps/chosen": -408.6782531738281, + "logps/rejected": -441.49530029296875, + "loss": 0.6609, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3026232719421387, + "rewards/margins": 0.30984416604042053, + "rewards/rejected": -1.6124674081802368, + "step": 160 + }, + { + "epoch": 0.3441090034731499, + "grad_norm": 19.084492084822422, + "learning_rate": 4.1448468148492814e-07, + "logits/chosen": 1.8440988063812256, + "logits/rejected": 1.9733848571777344, + "logps/chosen": -435.4466552734375, + "logps/rejected": -475.33648681640625, + "loss": 0.6153, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3617444038391113, + "rewards/margins": 0.13640473783016205, + "rewards/rejected": -1.4981491565704346, + "step": 161 + }, + { + "epoch": 0.3462463264760887, + "grad_norm": 27.08324422105949, + "learning_rate": 4.130718528195303e-07, + "logits/chosen": 1.7824862003326416, + "logits/rejected": 1.6462738513946533, + "logps/chosen": -437.1048889160156, + "logps/rejected": -440.6667785644531, + "loss": 0.6982, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.561838150024414, + "rewards/margins": -0.11082294583320618, + "rewards/rejected": -1.4510152339935303, + "step": 162 + }, + { + "epoch": 0.3483836494790275, + "grad_norm": 19.814697570819504, + "learning_rate": 4.1164990030394985e-07, + "logits/chosen": 1.0093296766281128, + "logits/rejected": 1.0821220874786377, + "logps/chosen": -423.516357421875, + "logps/rejected": -420.7889404296875, + "loss": 0.6014, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1094772815704346, + "rewards/margins": 0.3603382408618927, + "rewards/rejected": -1.4698156118392944, + "step": 163 + }, + { + "epoch": 0.35052097248196634, + "grad_norm": 19.599198524357487, + "learning_rate": 4.10218903496256e-07, + "logits/chosen": 1.682007074356079, + "logits/rejected": 1.787623643875122, + "logps/chosen": -282.51171875, + "logps/rejected": -345.05877685546875, + "loss": 0.6625, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.205474615097046, + "rewards/margins": 0.271236389875412, + "rewards/rejected": -1.4767110347747803, + "step": 164 + }, + { + "epoch": 0.3526582954849052, + "grad_norm": 17.120301761319624, + "learning_rate": 4.087789424605447e-07, + "logits/chosen": 1.741114854812622, + "logits/rejected": 1.6152499914169312, + "logps/chosen": -417.688720703125, + "logps/rejected": -433.7536315917969, + "loss": 0.6162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1665997505187988, + "rewards/margins": 0.2989981770515442, + "rewards/rejected": -1.4655979871749878, + "step": 165 + }, + { + "epoch": 0.35479561848784397, + "grad_norm": 18.747521162934547, + "learning_rate": 4.0733009776245937e-07, + "logits/chosen": 1.4322879314422607, + "logits/rejected": 1.3907232284545898, + "logps/chosen": -347.32489013671875, + "logps/rejected": -360.52301025390625, + "loss": 0.6227, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.081840991973877, + "rewards/margins": 0.16741850972175598, + "rewards/rejected": -1.249259352684021, + "step": 166 + }, + { + "epoch": 0.3569329414907828, + "grad_norm": 18.0004093305282, + "learning_rate": 4.058724504646834e-07, + "logits/chosen": 2.36202335357666, + "logits/rejected": 2.4911999702453613, + "logps/chosen": -383.17584228515625, + "logps/rejected": -401.8243408203125, + "loss": 0.6195, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2278780937194824, + "rewards/margins": 0.20441044867038727, + "rewards/rejected": -1.4322885274887085, + "step": 167 + }, + { + "epoch": 0.3590702644937216, + "grad_norm": 17.80844319511304, + "learning_rate": 4.0440608212240445e-07, + "logits/chosen": 1.4843562841415405, + "logits/rejected": 1.5054625272750854, + "logps/chosen": -411.7197265625, + "logps/rejected": -443.1764221191406, + "loss": 0.6053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9291986227035522, + "rewards/margins": 0.41230058670043945, + "rewards/rejected": -1.3414990901947021, + "step": 168 + }, + { + "epoch": 0.36120758749666043, + "grad_norm": 18.374934776369027, + "learning_rate": 4.0293107477875156e-07, + "logits/chosen": 1.700923204421997, + "logits/rejected": 1.6426975727081299, + "logps/chosen": -396.6122741699219, + "logps/rejected": -397.70355224609375, + "loss": 0.6373, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2499213218688965, + "rewards/margins": 0.22793683409690857, + "rewards/rejected": -1.4778581857681274, + "step": 169 + }, + { + "epoch": 0.36334491049959927, + "grad_norm": 19.145060598159265, + "learning_rate": 4.0144751096020497e-07, + "logits/chosen": 1.7463726997375488, + "logits/rejected": 1.8051655292510986, + "logps/chosen": -353.2317810058594, + "logps/rejected": -378.6690979003906, + "loss": 0.6298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9795432686805725, + "rewards/margins": 0.40754634141921997, + "rewards/rejected": -1.387089729309082, + "step": 170 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 16.861158005133213, + "learning_rate": 3.9995547367197843e-07, + "logits/chosen": 1.6164271831512451, + "logits/rejected": 1.6135661602020264, + "logps/chosen": -327.3568115234375, + "logps/rejected": -337.6192321777344, + "loss": 0.5895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.874947190284729, + "rewards/margins": 0.4394197463989258, + "rewards/rejected": -1.3143669366836548, + "step": 171 + }, + { + "epoch": 0.3676195565054769, + "grad_norm": 17.777424855400742, + "learning_rate": 3.9845504639337535e-07, + "logits/chosen": 1.070286750793457, + "logits/rejected": 1.2197216749191284, + "logps/chosen": -231.44830322265625, + "logps/rejected": -272.8808898925781, + "loss": 0.57, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6078026294708252, + "rewards/margins": 0.5821976661682129, + "rewards/rejected": -1.190000295639038, + "step": 172 + }, + { + "epoch": 0.36975687950841574, + "grad_norm": 17.126819669486423, + "learning_rate": 3.9694631307311825e-07, + "logits/chosen": 2.135113477706909, + "logits/rejected": 2.2197318077087402, + "logps/chosen": -419.06134033203125, + "logps/rejected": -513.2752075195312, + "loss": 0.5982, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9003175497055054, + "rewards/margins": 0.7730761170387268, + "rewards/rejected": -1.673393726348877, + "step": 173 + }, + { + "epoch": 0.3718942025113545, + "grad_norm": 18.225988784634456, + "learning_rate": 3.954293581246514e-07, + "logits/chosen": 1.7381099462509155, + "logits/rejected": 1.8567441701889038, + "logps/chosen": -414.3870544433594, + "logps/rejected": -441.8538818359375, + "loss": 0.6366, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8376742601394653, + "rewards/margins": 0.05604010820388794, + "rewards/rejected": -0.8937143087387085, + "step": 174 + }, + { + "epoch": 0.37403152551429336, + "grad_norm": 19.941158048862572, + "learning_rate": 3.939042664214184e-07, + "logits/chosen": 1.4559072256088257, + "logits/rejected": 1.3386807441711426, + "logps/chosen": -338.5787658691406, + "logps/rejected": -348.70880126953125, + "loss": 0.612, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0519522428512573, + "rewards/margins": 0.3615396022796631, + "rewards/rejected": -1.41349196434021, + "step": 175 + }, + { + "epoch": 0.37616884851723215, + "grad_norm": 17.811647654827375, + "learning_rate": 3.92371123292113e-07, + "logits/chosen": 0.6934733986854553, + "logits/rejected": 0.7196266055107117, + "logps/chosen": -258.3397216796875, + "logps/rejected": -262.33905029296875, + "loss": 0.6108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8770631551742554, + "rewards/margins": 0.588469386100769, + "rewards/rejected": -1.465532660484314, + "step": 176 + }, + { + "epoch": 0.378306171520171, + "grad_norm": 19.009550272152616, + "learning_rate": 3.908300145159055e-07, + "logits/chosen": 2.3191661834716797, + "logits/rejected": 2.3956990242004395, + "logps/chosen": -498.1864318847656, + "logps/rejected": -484.289794921875, + "loss": 0.6321, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0074068307876587, + "rewards/margins": 0.2774062752723694, + "rewards/rejected": -1.2848131656646729, + "step": 177 + }, + { + "epoch": 0.3804434945231098, + "grad_norm": 20.426912799899327, + "learning_rate": 3.8928102631764304e-07, + "logits/chosen": 1.5165929794311523, + "logits/rejected": 1.4213718175888062, + "logps/chosen": -411.7895812988281, + "logps/rejected": -394.05938720703125, + "loss": 0.6124, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4399906396865845, + "rewards/margins": 0.20191159844398499, + "rewards/rejected": -1.6419024467468262, + "step": 178 + }, + { + "epoch": 0.3825808175260486, + "grad_norm": 18.22302533315433, + "learning_rate": 3.877242453630256e-07, + "logits/chosen": 1.3885517120361328, + "logits/rejected": 1.6490609645843506, + "logps/chosen": -260.5142822265625, + "logps/rejected": -311.5865173339844, + "loss": 0.5865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7588627338409424, + "rewards/margins": 0.8435803651809692, + "rewards/rejected": -1.6024430990219116, + "step": 179 + }, + { + "epoch": 0.38471814052898745, + "grad_norm": 18.319584297433117, + "learning_rate": 3.8615975875375676e-07, + "logits/chosen": 1.64811110496521, + "logits/rejected": 1.57973051071167, + "logps/chosen": -375.68115234375, + "logps/rejected": -375.6374206542969, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0208864212036133, + "rewards/margins": 0.451630562543869, + "rewards/rejected": -1.4725168943405151, + "step": 180 + }, + { + "epoch": 0.38685546353192624, + "grad_norm": 17.171371040882548, + "learning_rate": 3.8458765402267056e-07, + "logits/chosen": 2.0625722408294678, + "logits/rejected": 2.011898994445801, + "logps/chosen": -447.5124206542969, + "logps/rejected": -442.49383544921875, + "loss": 0.5831, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1310521364212036, + "rewards/margins": 0.3197172284126282, + "rewards/rejected": -1.4507691860198975, + "step": 181 + }, + { + "epoch": 0.3889927865348651, + "grad_norm": 18.254942031621926, + "learning_rate": 3.8300801912883414e-07, + "logits/chosen": 2.303025007247925, + "logits/rejected": 2.362499713897705, + "logps/chosen": -428.386962890625, + "logps/rejected": -428.35321044921875, + "loss": 0.6029, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0741493701934814, + "rewards/margins": 0.13146010041236877, + "rewards/rejected": -1.2056095600128174, + "step": 182 + }, + { + "epoch": 0.3911301095378039, + "grad_norm": 18.02286496058953, + "learning_rate": 3.8142094245262615e-07, + "logits/chosen": 1.539035439491272, + "logits/rejected": 1.5929516553878784, + "logps/chosen": -343.71429443359375, + "logps/rejected": -337.14923095703125, + "loss": 0.6112, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3707005977630615, + "rewards/margins": 0.03174225986003876, + "rewards/rejected": -1.4024429321289062, + "step": 183 + }, + { + "epoch": 0.3932674325407427, + "grad_norm": 19.325777233326907, + "learning_rate": 3.7982651279079227e-07, + "logits/chosen": 1.77326238155365, + "logits/rejected": 1.7490006685256958, + "logps/chosen": -453.261962890625, + "logps/rejected": -457.6907958984375, + "loss": 0.5617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9381343722343445, + "rewards/margins": 0.5582923889160156, + "rewards/rejected": -1.4964268207550049, + "step": 184 + }, + { + "epoch": 0.39540475554368154, + "grad_norm": 17.697347986359333, + "learning_rate": 3.7822481935147655e-07, + "logits/chosen": 1.5330384969711304, + "logits/rejected": 1.6347877979278564, + "logps/chosen": -455.7304992675781, + "logps/rejected": -506.4276123046875, + "loss": 0.5943, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9335508346557617, + "rewards/margins": 0.45107167959213257, + "rewards/rejected": -1.384622573852539, + "step": 185 + }, + { + "epoch": 0.3975420785466204, + "grad_norm": 17.07660005715345, + "learning_rate": 3.766159517492307e-07, + "logits/chosen": 1.6240900754928589, + "logits/rejected": 1.6119805574417114, + "logps/chosen": -381.19952392578125, + "logps/rejected": -398.58319091796875, + "loss": 0.5969, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0835504531860352, + "rewards/margins": 0.2535113990306854, + "rewards/rejected": -1.337061882019043, + "step": 186 + }, + { + "epoch": 0.39967940154955917, + "grad_norm": 21.101749897060266, + "learning_rate": 3.75e-07, + "logits/chosen": 0.8521130084991455, + "logits/rejected": 0.9815881252288818, + "logps/chosen": -459.6861877441406, + "logps/rejected": -452.48126220703125, + "loss": 0.678, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3385112285614014, + "rewards/margins": 0.12616467475891113, + "rewards/rejected": -1.464676022529602, + "step": 187 + }, + { + "epoch": 0.401816724552498, + "grad_norm": 16.950089126169885, + "learning_rate": 3.7337705451608667e-07, + "logits/chosen": 2.048741340637207, + "logits/rejected": 1.9400901794433594, + "logps/chosen": -363.7098388671875, + "logps/rejected": -378.2185363769531, + "loss": 0.6111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7874991297721863, + "rewards/margins": 0.5683628916740417, + "rewards/rejected": -1.355862021446228, + "step": 188 + }, + { + "epoch": 0.4039540475554368, + "grad_norm": 16.71431664815698, + "learning_rate": 3.717472061010918e-07, + "logits/chosen": 1.9219565391540527, + "logits/rejected": 1.8952841758728027, + "logps/chosen": -433.7643737792969, + "logps/rejected": -417.7620849609375, + "loss": 0.6077, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0933847427368164, + "rewards/margins": 0.05321376025676727, + "rewards/rejected": -1.1465984582901, + "step": 189 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 16.58728088417605, + "learning_rate": 3.7011054594483443e-07, + "logits/chosen": 1.4869471788406372, + "logits/rejected": 1.5433428287506104, + "logps/chosen": -349.62884521484375, + "logps/rejected": -359.94091796875, + "loss": 0.611, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8237789273262024, + "rewards/margins": 0.12867197394371033, + "rewards/rejected": -0.9524509906768799, + "step": 190 + }, + { + "epoch": 0.40822869356131447, + "grad_norm": 16.71557444491728, + "learning_rate": 3.6846716561824967e-07, + "logits/chosen": 1.945807933807373, + "logits/rejected": 1.9054489135742188, + "logps/chosen": -399.7051696777344, + "logps/rejected": -397.4898681640625, + "loss": 0.5702, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2767914533615112, + "rewards/margins": 0.09971967339515686, + "rewards/rejected": -1.3765110969543457, + "step": 191 + }, + { + "epoch": 0.41036601656425326, + "grad_norm": 17.940217471393204, + "learning_rate": 3.668171570682655e-07, + "logits/chosen": 1.712918758392334, + "logits/rejected": 1.7127172946929932, + "logps/chosen": -469.094970703125, + "logps/rejected": -432.9211120605469, + "loss": 0.6023, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2319235801696777, + "rewards/margins": 0.32265806198120117, + "rewards/rejected": -1.554581642150879, + "step": 192 + }, + { + "epoch": 0.4125033395671921, + "grad_norm": 19.798434486116633, + "learning_rate": 3.6516061261265805e-07, + "logits/chosen": 1.795310139656067, + "logits/rejected": 1.7313811779022217, + "logps/chosen": -429.7494812011719, + "logps/rejected": -461.983642578125, + "loss": 0.6181, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9297257661819458, + "rewards/margins": 0.4894411563873291, + "rewards/rejected": -1.419166922569275, + "step": 193 + }, + { + "epoch": 0.41464066257013094, + "grad_norm": 19.397752291933788, + "learning_rate": 3.634976249348867e-07, + "logits/chosen": 1.9277091026306152, + "logits/rejected": 1.7616665363311768, + "logps/chosen": -368.06817626953125, + "logps/rejected": -363.0306701660156, + "loss": 0.5866, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8755682706832886, + "rewards/margins": 0.64577317237854, + "rewards/rejected": -1.5213414430618286, + "step": 194 + }, + { + "epoch": 0.4167779855730697, + "grad_norm": 17.93101753591044, + "learning_rate": 3.618282870789081e-07, + "logits/chosen": 1.7866439819335938, + "logits/rejected": 1.8261215686798096, + "logps/chosen": -434.7184753417969, + "logps/rejected": -425.9129333496094, + "loss": 0.5672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1498773097991943, + "rewards/margins": 0.3212715983390808, + "rewards/rejected": -1.4711488485336304, + "step": 195 + }, + { + "epoch": 0.41891530857600856, + "grad_norm": 20.67237268539849, + "learning_rate": 3.601526924439709e-07, + "logits/chosen": 1.2071539163589478, + "logits/rejected": 1.148329257965088, + "logps/chosen": -399.2107238769531, + "logps/rejected": -418.5735778808594, + "loss": 0.6518, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2001055479049683, + "rewards/margins": -0.004660561680793762, + "rewards/rejected": -1.1954450607299805, + "step": 196 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 19.547002379951252, + "learning_rate": 3.584709347793895e-07, + "logits/chosen": 1.9946776628494263, + "logits/rejected": 2.094057559967041, + "logps/chosen": -338.82177734375, + "logps/rejected": -348.4697265625, + "loss": 0.6322, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9167635440826416, + "rewards/margins": 0.0018602609634399414, + "rewards/rejected": -0.9186238646507263, + "step": 197 + }, + { + "epoch": 0.4231899545818862, + "grad_norm": 17.58918480785892, + "learning_rate": 3.567831081792992e-07, + "logits/chosen": 1.9100654125213623, + "logits/rejected": 1.8705369234085083, + "logps/chosen": -388.90478515625, + "logps/rejected": -419.180908203125, + "loss": 0.6028, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0298148393630981, + "rewards/margins": 0.3086302876472473, + "rewards/rejected": -1.3384451866149902, + "step": 198 + }, + { + "epoch": 0.425327277584825, + "grad_norm": 19.436995082353043, + "learning_rate": 3.550893070773914e-07, + "logits/chosen": 1.6310675144195557, + "logits/rejected": 1.5973904132843018, + "logps/chosen": -369.18695068359375, + "logps/rejected": -372.208740234375, + "loss": 0.6379, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0363404750823975, + "rewards/margins": 0.19525709748268127, + "rewards/rejected": -1.2315975427627563, + "step": 199 + }, + { + "epoch": 0.4274646005877638, + "grad_norm": 18.684092669252145, + "learning_rate": 3.5338962624163016e-07, + "logits/chosen": 1.7700620889663696, + "logits/rejected": 1.70797598361969, + "logps/chosen": -411.388427734375, + "logps/rejected": -411.8304748535156, + "loss": 0.6157, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.03152596950531, + "rewards/margins": 0.24860131740570068, + "rewards/rejected": -1.2801272869110107, + "step": 200 + }, + { + "epoch": 0.42960192359070265, + "grad_norm": 20.121707150971464, + "learning_rate": 3.516841607689501e-07, + "logits/chosen": 1.967681884765625, + "logits/rejected": 2.080134153366089, + "logps/chosen": -450.67706298828125, + "logps/rejected": -449.7607727050781, + "loss": 0.5954, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.259590983390808, + "rewards/margins": 0.48261815309524536, + "rewards/rejected": -1.7422093152999878, + "step": 201 + }, + { + "epoch": 0.4317392465936415, + "grad_norm": 18.887931551669933, + "learning_rate": 3.499730060799352e-07, + "logits/chosen": 2.4707322120666504, + "logits/rejected": 2.550736427307129, + "logps/chosen": -446.833251953125, + "logps/rejected": -454.7132568359375, + "loss": 0.6176, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1572250127792358, + "rewards/margins": 0.007829396985471249, + "rewards/rejected": -1.1650543212890625, + "step": 202 + }, + { + "epoch": 0.4338765695965803, + "grad_norm": 18.15280641353194, + "learning_rate": 3.482562579134809e-07, + "logits/chosen": 1.8412984609603882, + "logits/rejected": 1.7161986827850342, + "logps/chosen": -391.0166015625, + "logps/rejected": -431.3764343261719, + "loss": 0.5922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2463746070861816, + "rewards/margins": 0.5344080924987793, + "rewards/rejected": -1.7807824611663818, + "step": 203 + }, + { + "epoch": 0.4360138925995191, + "grad_norm": 19.28039369733405, + "learning_rate": 3.465340123214365e-07, + "logits/chosen": 1.8423885107040405, + "logits/rejected": 1.8443387746810913, + "logps/chosen": -418.2002868652344, + "logps/rejected": -433.6718444824219, + "loss": 0.6318, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4090765714645386, + "rewards/margins": 0.5609220862388611, + "rewards/rejected": -1.969998836517334, + "step": 204 + }, + { + "epoch": 0.4381512156024579, + "grad_norm": 18.816447396480665, + "learning_rate": 3.448063656632321e-07, + "logits/chosen": 1.948996663093567, + "logits/rejected": 1.820084810256958, + "logps/chosen": -379.0099182128906, + "logps/rejected": -352.95184326171875, + "loss": 0.6106, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2226296663284302, + "rewards/margins": 0.21551814675331116, + "rewards/rejected": -1.438147783279419, + "step": 205 + }, + { + "epoch": 0.44028853860539674, + "grad_norm": 20.201506617359616, + "learning_rate": 3.430734146004863e-07, + "logits/chosen": 1.65437912940979, + "logits/rejected": 1.5461066961288452, + "logps/chosen": -402.8113708496094, + "logps/rejected": -382.8777160644531, + "loss": 0.6535, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.4631258249282837, + "rewards/margins": 0.11626932770013809, + "rewards/rejected": -1.579395055770874, + "step": 206 + }, + { + "epoch": 0.4424258616083356, + "grad_norm": 19.67653738402853, + "learning_rate": 3.413352560915988e-07, + "logits/chosen": 1.7494556903839111, + "logits/rejected": 1.737342357635498, + "logps/chosen": -361.5591125488281, + "logps/rejected": -357.0711364746094, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.302952766418457, + "rewards/margins": 0.4283585548400879, + "rewards/rejected": -1.7313114404678345, + "step": 207 + }, + { + "epoch": 0.44456318461127436, + "grad_norm": 20.51918848235735, + "learning_rate": 3.39591987386325e-07, + "logits/chosen": 2.0057222843170166, + "logits/rejected": 2.0365169048309326, + "logps/chosen": -418.554931640625, + "logps/rejected": -424.8497009277344, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3311376571655273, + "rewards/margins": 0.1549416184425354, + "rewards/rejected": -1.4860793352127075, + "step": 208 + }, + { + "epoch": 0.4467005076142132, + "grad_norm": 18.895094732056656, + "learning_rate": 3.378437060203357e-07, + "logits/chosen": 1.2798339128494263, + "logits/rejected": 1.163004755973816, + "logps/chosen": -344.80889892578125, + "logps/rejected": -377.0693664550781, + "loss": 0.6167, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.96147221326828, + "rewards/margins": 0.33394408226013184, + "rewards/rejected": -1.2954163551330566, + "step": 209 + }, + { + "epoch": 0.448837830617152, + "grad_norm": 18.922009197013033, + "learning_rate": 3.360905098097587e-07, + "logits/chosen": 1.6536269187927246, + "logits/rejected": 1.6847363710403442, + "logps/chosen": -446.45416259765625, + "logps/rejected": -452.1558837890625, + "loss": 0.602, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2614518404006958, + "rewards/margins": 0.22943194210529327, + "rewards/rejected": -1.4908838272094727, + "step": 210 + }, + { + "epoch": 0.45097515362009083, + "grad_norm": 21.4124521627508, + "learning_rate": 3.343324968457075e-07, + "logits/chosen": 1.5657013654708862, + "logits/rejected": 1.6874078512191772, + "logps/chosen": -344.92913818359375, + "logps/rejected": -384.62396240234375, + "loss": 0.6227, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2245018482208252, + "rewards/margins": 0.45444363355636597, + "rewards/rejected": -1.678945541381836, + "step": 211 + }, + { + "epoch": 0.45311247662302967, + "grad_norm": 26.23976086161117, + "learning_rate": 3.325697654887918e-07, + "logits/chosen": 1.4959322214126587, + "logits/rejected": 1.4617286920547485, + "logps/chosen": -397.129150390625, + "logps/rejected": -358.9854431152344, + "loss": 0.6527, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2049939632415771, + "rewards/margins": 0.24925091862678528, + "rewards/rejected": -1.45424485206604, + "step": 212 + }, + { + "epoch": 0.45524979962596845, + "grad_norm": 19.599613357584392, + "learning_rate": 3.30802414363615e-07, + "logits/chosen": 1.8930243253707886, + "logits/rejected": 1.8906832933425903, + "logps/chosen": -369.51617431640625, + "logps/rejected": -402.1026611328125, + "loss": 0.6202, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.204052209854126, + "rewards/margins": 0.41892537474632263, + "rewards/rejected": -1.6229774951934814, + "step": 213 + }, + { + "epoch": 0.4573871226289073, + "grad_norm": 19.22419030566061, + "learning_rate": 3.2903054235325613e-07, + "logits/chosen": 1.3914402723312378, + "logits/rejected": 1.4307550191879272, + "logps/chosen": -392.59539794921875, + "logps/rejected": -429.68450927734375, + "loss": 0.6409, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.376542329788208, + "rewards/margins": 0.47458475828170776, + "rewards/rejected": -1.851127028465271, + "step": 214 + }, + { + "epoch": 0.45952444563184613, + "grad_norm": 19.753436323643434, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": 1.3418879508972168, + "logits/rejected": 1.3356355428695679, + "logps/chosen": -376.60491943359375, + "logps/rejected": -379.2442321777344, + "loss": 0.611, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1197147369384766, + "rewards/margins": 0.5473043322563171, + "rewards/rejected": -1.6670188903808594, + "step": 215 + }, + { + "epoch": 0.4616617686347849, + "grad_norm": 16.73095945133393, + "learning_rate": 3.2547363246847546e-07, + "logits/chosen": 2.101710319519043, + "logits/rejected": 2.2036097049713135, + "logps/chosen": -342.0418395996094, + "logps/rejected": -385.2028503417969, + "loss": 0.5769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7874879240989685, + "rewards/margins": 0.38399648666381836, + "rewards/rejected": -1.171484351158142, + "step": 216 + }, + { + "epoch": 0.46379909163772376, + "grad_norm": 21.40612134241797, + "learning_rate": 3.2368879360272606e-07, + "logits/chosen": 1.7089003324508667, + "logits/rejected": 1.89894437789917, + "logps/chosen": -470.57440185546875, + "logps/rejected": -497.7328186035156, + "loss": 0.6595, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5549209117889404, + "rewards/margins": 0.23799550533294678, + "rewards/rejected": -1.7929165363311768, + "step": 217 + }, + { + "epoch": 0.46593641464066254, + "grad_norm": 18.401537502823185, + "learning_rate": 3.218998318580043e-07, + "logits/chosen": 1.7439593076705933, + "logits/rejected": 1.7078120708465576, + "logps/chosen": -417.99560546875, + "logps/rejected": -467.07464599609375, + "loss": 0.612, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0848441123962402, + "rewards/margins": 0.633072018623352, + "rewards/rejected": -1.7179162502288818, + "step": 218 + }, + { + "epoch": 0.4680737376436014, + "grad_norm": 19.854545849811785, + "learning_rate": 3.201068473265007e-07, + "logits/chosen": 1.8572185039520264, + "logits/rejected": 1.8451629877090454, + "logps/chosen": -476.4209899902344, + "logps/rejected": -464.2806701660156, + "loss": 0.6046, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.108113408088684, + "rewards/margins": 0.4227336347103119, + "rewards/rejected": -1.5308470726013184, + "step": 219 + }, + { + "epoch": 0.4702110606465402, + "grad_norm": 18.469882304782676, + "learning_rate": 3.1830994032548e-07, + "logits/chosen": 1.1646744012832642, + "logits/rejected": 1.2345802783966064, + "logps/chosen": -373.4507141113281, + "logps/rejected": -387.0456848144531, + "loss": 0.5847, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9688059091567993, + "rewards/margins": 0.19057507812976837, + "rewards/rejected": -2.1593809127807617, + "step": 220 + }, + { + "epoch": 0.472348383649479, + "grad_norm": 18.9867259815962, + "learning_rate": 3.1650921139166874e-07, + "logits/chosen": 2.185783624649048, + "logits/rejected": 2.1693177223205566, + "logps/chosen": -326.4842529296875, + "logps/rejected": -349.39556884765625, + "loss": 0.5825, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.598148226737976, + "rewards/margins": 0.4604637920856476, + "rewards/rejected": -2.058612108230591, + "step": 221 + }, + { + "epoch": 0.47448570665241785, + "grad_norm": 17.69369969243383, + "learning_rate": 3.147047612756302e-07, + "logits/chosen": 1.6809875965118408, + "logits/rejected": 1.5675013065338135, + "logps/chosen": -352.68463134765625, + "logps/rejected": -350.1728515625, + "loss": 0.5836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0454927682876587, + "rewards/margins": 0.3611335754394531, + "rewards/rejected": -1.4066263437271118, + "step": 222 + }, + { + "epoch": 0.4766230296553567, + "grad_norm": 18.720455518598285, + "learning_rate": 3.128966909361271e-07, + "logits/chosen": 1.5484097003936768, + "logits/rejected": 1.5819151401519775, + "logps/chosen": -358.4718017578125, + "logps/rejected": -339.5688781738281, + "loss": 0.6065, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3430771827697754, + "rewards/margins": -0.17120108008384705, + "rewards/rejected": -1.1718761920928955, + "step": 223 + }, + { + "epoch": 0.4787603526582955, + "grad_norm": 16.817886671501697, + "learning_rate": 3.110851015344735e-07, + "logits/chosen": 1.784898281097412, + "logits/rejected": 2.0283141136169434, + "logps/chosen": -401.959716796875, + "logps/rejected": -411.4044494628906, + "loss": 0.5935, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5906190872192383, + "rewards/margins": 0.40126991271972656, + "rewards/rejected": -1.991889238357544, + "step": 224 + }, + { + "epoch": 0.4808976756612343, + "grad_norm": 17.147517583264804, + "learning_rate": 3.0927009442887437e-07, + "logits/chosen": 1.4650673866271973, + "logits/rejected": 1.5623681545257568, + "logps/chosen": -391.3199768066406, + "logps/rejected": -397.706787109375, + "loss": 0.5909, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2298505306243896, + "rewards/margins": 0.3492811918258667, + "rewards/rejected": -1.5791317224502563, + "step": 225 + }, + { + "epoch": 0.4830349986641731, + "grad_norm": 18.38943701452515, + "learning_rate": 3.074517711687549e-07, + "logits/chosen": 2.138887882232666, + "logits/rejected": 2.0834007263183594, + "logps/chosen": -324.4191589355469, + "logps/rejected": -337.5340270996094, + "loss": 0.6122, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0309844017028809, + "rewards/margins": 0.10200852155685425, + "rewards/rejected": -1.1329929828643799, + "step": 226 + }, + { + "epoch": 0.48517232166711194, + "grad_norm": 18.020181570236574, + "learning_rate": 3.056302334890786e-07, + "logits/chosen": 1.6313700675964355, + "logits/rejected": 1.5899854898452759, + "logps/chosen": -368.62835693359375, + "logps/rejected": -400.20013427734375, + "loss": 0.571, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.055250644683838, + "rewards/margins": 0.3379458785057068, + "rewards/rejected": -1.3931964635849, + "step": 227 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 21.231287782547422, + "learning_rate": 3.038055833046555e-07, + "logits/chosen": 1.821986198425293, + "logits/rejected": 1.775019645690918, + "logps/chosen": -438.9476318359375, + "logps/rejected": -462.4380187988281, + "loss": 0.7017, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1709932088851929, + "rewards/margins": 0.017013883218169212, + "rewards/rejected": -1.1880072355270386, + "step": 228 + }, + { + "epoch": 0.48944696767298956, + "grad_norm": 18.242763714139798, + "learning_rate": 3.0197792270443976e-07, + "logits/chosen": 1.685359239578247, + "logits/rejected": 1.7110083103179932, + "logps/chosen": -382.34051513671875, + "logps/rejected": -361.8902893066406, + "loss": 0.5608, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1712117195129395, + "rewards/margins": 0.15365606546401978, + "rewards/rejected": -1.3248677253723145, + "step": 229 + }, + { + "epoch": 0.4915842906759284, + "grad_norm": 19.342657906988098, + "learning_rate": 3.001473539458182e-07, + "logits/chosen": 1.652226209640503, + "logits/rejected": 1.7933372259140015, + "logps/chosen": -351.8333435058594, + "logps/rejected": -345.5216369628906, + "loss": 0.5985, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9596707820892334, + "rewards/margins": 0.48877692222595215, + "rewards/rejected": -1.4484477043151855, + "step": 230 + }, + { + "epoch": 0.49372161367886724, + "grad_norm": 18.306301233512528, + "learning_rate": 2.983139794488883e-07, + "logits/chosen": 2.194626569747925, + "logits/rejected": 2.039341926574707, + "logps/chosen": -422.2347412109375, + "logps/rejected": -381.56793212890625, + "loss": 0.6316, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5122215747833252, + "rewards/margins": -0.1322540044784546, + "rewards/rejected": -1.379967451095581, + "step": 231 + }, + { + "epoch": 0.49585893668180603, + "grad_norm": 18.94320134699598, + "learning_rate": 2.964779017907287e-07, + "logits/chosen": 1.6469025611877441, + "logits/rejected": 1.6451257467269897, + "logps/chosen": -387.7391052246094, + "logps/rejected": -381.7521667480469, + "loss": 0.5867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2008886337280273, + "rewards/margins": 0.707405686378479, + "rewards/rejected": -1.9082942008972168, + "step": 232 + }, + { + "epoch": 0.49799625968474487, + "grad_norm": 17.646525560974858, + "learning_rate": 2.9463922369965915e-07, + "logits/chosen": 0.9604759812355042, + "logits/rejected": 1.081686019897461, + "logps/chosen": -320.6999816894531, + "logps/rejected": -338.20977783203125, + "loss": 0.5859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3322499990463257, + "rewards/margins": 0.4187338948249817, + "rewards/rejected": -1.7509838342666626, + "step": 233 + }, + { + "epoch": 0.5001335826876837, + "grad_norm": 18.940116172317182, + "learning_rate": 2.927980480494938e-07, + "logits/chosen": 1.430747389793396, + "logits/rejected": 1.602396011352539, + "logps/chosen": -286.286376953125, + "logps/rejected": -305.3640441894531, + "loss": 0.568, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2952995300292969, + "rewards/margins": 0.4015873074531555, + "rewards/rejected": -1.6968867778778076, + "step": 234 + }, + { + "epoch": 0.5022709056906225, + "grad_norm": 23.939013344426872, + "learning_rate": 2.909544778537844e-07, + "logits/chosen": 1.7608510255813599, + "logits/rejected": 1.8317222595214844, + "logps/chosen": -354.6390686035156, + "logps/rejected": -346.6441650390625, + "loss": 0.5761, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8426222205162048, + "rewards/margins": 0.4636692404747009, + "rewards/rejected": -1.3062914609909058, + "step": 235 + }, + { + "epoch": 0.5044082286935613, + "grad_norm": 17.859554625792914, + "learning_rate": 2.8910861626005773e-07, + "logits/chosen": 1.4780582189559937, + "logits/rejected": 1.4497883319854736, + "logps/chosen": -377.6356506347656, + "logps/rejected": -397.4453430175781, + "loss": 0.6219, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2822011709213257, + "rewards/margins": 0.2759856581687927, + "rewards/rejected": -1.5581867694854736, + "step": 236 + }, + { + "epoch": 0.5065455516965002, + "grad_norm": 15.921170465179717, + "learning_rate": 2.872605665440436e-07, + "logits/chosen": 2.2028841972351074, + "logits/rejected": 2.1681137084960938, + "logps/chosen": -500.1951599121094, + "logps/rejected": -484.7807312011719, + "loss": 0.5792, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0691642761230469, + "rewards/margins": 0.5099738836288452, + "rewards/rejected": -1.579138159751892, + "step": 237 + }, + { + "epoch": 0.508682874699439, + "grad_norm": 20.104152888553777, + "learning_rate": 2.8541043210389726e-07, + "logits/chosen": 1.4386096000671387, + "logits/rejected": 1.5074604749679565, + "logps/chosen": -327.89239501953125, + "logps/rejected": -340.6610412597656, + "loss": 0.6041, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4297116994857788, + "rewards/margins": -0.030831288546323776, + "rewards/rejected": -1.3988802433013916, + "step": 238 + }, + { + "epoch": 0.5108201977023777, + "grad_norm": 18.984989332345673, + "learning_rate": 2.8355831645441387e-07, + "logits/chosen": 1.4687011241912842, + "logits/rejected": 1.5416244268417358, + "logps/chosen": -298.0223693847656, + "logps/rejected": -331.571533203125, + "loss": 0.6282, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4456336498260498, + "rewards/margins": 0.42436861991882324, + "rewards/rejected": -1.8700025081634521, + "step": 239 + }, + { + "epoch": 0.5129575207053166, + "grad_norm": 19.616076404098113, + "learning_rate": 2.817043232212371e-07, + "logits/chosen": 1.4483119249343872, + "logits/rejected": 1.4856162071228027, + "logps/chosen": -362.3177795410156, + "logps/rejected": -404.8100280761719, + "loss": 0.6419, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.233857274055481, + "rewards/margins": 0.10059986263513565, + "rewards/rejected": -1.3344570398330688, + "step": 240 + }, + { + "epoch": 0.5150948437082554, + "grad_norm": 19.564465947045136, + "learning_rate": 2.7984855613506106e-07, + "logits/chosen": 1.8776293992996216, + "logits/rejected": 1.9832379817962646, + "logps/chosen": -344.974365234375, + "logps/rejected": -392.325927734375, + "loss": 0.6338, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1595617532730103, + "rewards/margins": 0.3295942544937134, + "rewards/rejected": -1.4891560077667236, + "step": 241 + }, + { + "epoch": 0.5172321667111942, + "grad_norm": 16.85077595027773, + "learning_rate": 2.7799111902582693e-07, + "logits/chosen": 1.8834673166275024, + "logits/rejected": 1.9119462966918945, + "logps/chosen": -342.9952392578125, + "logps/rejected": -360.51220703125, + "loss": 0.6157, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2162364721298218, + "rewards/margins": 0.1725614368915558, + "rewards/rejected": -1.3887979984283447, + "step": 242 + }, + { + "epoch": 0.5193694897141331, + "grad_norm": 18.949703224831786, + "learning_rate": 2.761321158169134e-07, + "logits/chosen": 2.179548978805542, + "logits/rejected": 2.157480478286743, + "logps/chosen": -390.4351501464844, + "logps/rejected": -449.8084411621094, + "loss": 0.6343, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.394233226776123, + "rewards/margins": 0.2236776202917099, + "rewards/rejected": -1.6179107427597046, + "step": 243 + }, + { + "epoch": 0.5215068127170719, + "grad_norm": 16.607869629392933, + "learning_rate": 2.74271650519322e-07, + "logits/chosen": 1.8933420181274414, + "logits/rejected": 1.9424501657485962, + "logps/chosen": -599.1654052734375, + "logps/rejected": -581.9706420898438, + "loss": 0.5837, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4606138467788696, + "rewards/margins": 0.1728818118572235, + "rewards/rejected": -1.6334956884384155, + "step": 244 + }, + { + "epoch": 0.5236441357200107, + "grad_norm": 16.905690523597688, + "learning_rate": 2.7240982722585837e-07, + "logits/chosen": 1.7946285009384155, + "logits/rejected": 1.8385579586029053, + "logps/chosen": -281.8657531738281, + "logps/rejected": -314.0006408691406, + "loss": 0.5875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2459651231765747, + "rewards/margins": 0.014865804463624954, + "rewards/rejected": -1.2608309984207153, + "step": 245 + }, + { + "epoch": 0.5257814587229495, + "grad_norm": 19.248918362668643, + "learning_rate": 2.705467501053076e-07, + "logits/chosen": 2.24014949798584, + "logits/rejected": 2.2482752799987793, + "logps/chosen": -412.81854248046875, + "logps/rejected": -382.4270324707031, + "loss": 0.6086, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1754889488220215, + "rewards/margins": 0.09349602460861206, + "rewards/rejected": -1.2689850330352783, + "step": 246 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 16.8906231683352, + "learning_rate": 2.6868252339660607e-07, + "logits/chosen": 1.3152052164077759, + "logits/rejected": 1.4119510650634766, + "logps/chosen": -329.6816101074219, + "logps/rejected": -337.0438537597656, + "loss": 0.579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5544246435165405, + "rewards/margins": 0.47526106238365173, + "rewards/rejected": -2.0296854972839355, + "step": 247 + }, + { + "epoch": 0.5300561047288271, + "grad_norm": 18.021321805955274, + "learning_rate": 2.6681725140300995e-07, + "logits/chosen": 1.5704407691955566, + "logits/rejected": 1.6143972873687744, + "logps/chosen": -369.656494140625, + "logps/rejected": -393.353515625, + "loss": 0.6217, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3475369215011597, + "rewards/margins": 0.3202340602874756, + "rewards/rejected": -1.6677708625793457, + "step": 248 + }, + { + "epoch": 0.5321934277317659, + "grad_norm": 19.509340419547396, + "learning_rate": 2.6495103848625854e-07, + "logits/chosen": 1.7228862047195435, + "logits/rejected": 1.735210657119751, + "logps/chosen": -366.4405822753906, + "logps/rejected": -398.99603271484375, + "loss": 0.6436, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3989753723144531, + "rewards/margins": 0.46870413422584534, + "rewards/rejected": -1.8676795959472656, + "step": 249 + }, + { + "epoch": 0.5343307507347048, + "grad_norm": 21.318546314009538, + "learning_rate": 2.63083989060736e-07, + "logits/chosen": 1.8195374011993408, + "logits/rejected": 1.8398746252059937, + "logps/chosen": -427.18402099609375, + "logps/rejected": -452.298828125, + "loss": 0.6685, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5230650901794434, + "rewards/margins": 0.3252180218696594, + "rewards/rejected": -1.848283052444458, + "step": 250 + }, + { + "epoch": 0.5364680737376436, + "grad_norm": 19.878831886338375, + "learning_rate": 2.6121620758762875e-07, + "logits/chosen": 1.4604578018188477, + "logits/rejected": 1.3828535079956055, + "logps/chosen": -400.2124938964844, + "logps/rejected": -367.87689208984375, + "loss": 0.5929, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4065700769424438, + "rewards/margins": 0.5055478811264038, + "rewards/rejected": -1.9121177196502686, + "step": 251 + }, + { + "epoch": 0.5386053967405824, + "grad_norm": 19.27873005266974, + "learning_rate": 2.593477985690815e-07, + "logits/chosen": 2.034696578979492, + "logits/rejected": 1.9050860404968262, + "logps/chosen": -502.56475830078125, + "logps/rejected": -473.90606689453125, + "loss": 0.5616, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5159586668014526, + "rewards/margins": 0.5788987278938293, + "rewards/rejected": -2.0948572158813477, + "step": 252 + }, + { + "epoch": 0.5407427197435213, + "grad_norm": 17.037711101859895, + "learning_rate": 2.574788665423496e-07, + "logits/chosen": 1.8234672546386719, + "logits/rejected": 1.8364436626434326, + "logps/chosen": -403.5150146484375, + "logps/rejected": -435.0159606933594, + "loss": 0.5847, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9063006043434143, + "rewards/margins": 0.604102611541748, + "rewards/rejected": -1.5104031562805176, + "step": 253 + }, + { + "epoch": 0.5428800427464601, + "grad_norm": 20.75619046985192, + "learning_rate": 2.5560951607395126e-07, + "logits/chosen": 1.916282296180725, + "logits/rejected": 1.836082935333252, + "logps/chosen": -396.24560546875, + "logps/rejected": -400.76007080078125, + "loss": 0.6306, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3977752923965454, + "rewards/margins": 0.23083746433258057, + "rewards/rejected": -1.628612756729126, + "step": 254 + }, + { + "epoch": 0.5450173657493989, + "grad_norm": 19.622882332492626, + "learning_rate": 2.537398517538159e-07, + "logits/chosen": 2.3518788814544678, + "logits/rejected": 2.373096227645874, + "logps/chosen": -384.93212890625, + "logps/rejected": -370.65478515625, + "loss": 0.6153, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1938955783843994, + "rewards/margins": 0.3775806725025177, + "rewards/rejected": -1.5714763402938843, + "step": 255 + }, + { + "epoch": 0.5471546887523377, + "grad_norm": 19.58386952453966, + "learning_rate": 2.518699781894332e-07, + "logits/chosen": 1.9337654113769531, + "logits/rejected": 1.907602071762085, + "logps/chosen": -416.7008056640625, + "logps/rejected": -424.2861633300781, + "loss": 0.5973, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4020006656646729, + "rewards/margins": 0.38494953513145447, + "rewards/rejected": -1.7869502305984497, + "step": 256 + }, + { + "epoch": 0.5492920117552765, + "grad_norm": 16.361122872177194, + "learning_rate": 2.5e-07, + "logits/chosen": 1.7576777935028076, + "logits/rejected": 1.6899727582931519, + "logps/chosen": -391.47491455078125, + "logps/rejected": -425.26531982421875, + "loss": 0.6003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7915077805519104, + "rewards/margins": 0.6424891948699951, + "rewards/rejected": -1.4339970350265503, + "step": 257 + }, + { + "epoch": 0.5514293347582153, + "grad_norm": 20.14867898121995, + "learning_rate": 2.4813002181056676e-07, + "logits/chosen": 1.8691372871398926, + "logits/rejected": 1.8315614461898804, + "logps/chosen": -336.1017761230469, + "logps/rejected": -355.53497314453125, + "loss": 0.6183, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5710742473602295, + "rewards/margins": 0.2784579396247864, + "rewards/rejected": -1.8495322465896606, + "step": 258 + }, + { + "epoch": 0.5535666577611541, + "grad_norm": 18.423158676910308, + "learning_rate": 2.4626014824618413e-07, + "logits/chosen": 1.8882901668548584, + "logits/rejected": 1.8200494050979614, + "logps/chosen": -396.83642578125, + "logps/rejected": -417.7898254394531, + "loss": 0.6253, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1084598302841187, + "rewards/margins": 0.2924001216888428, + "rewards/rejected": -1.4008599519729614, + "step": 259 + }, + { + "epoch": 0.555703980764093, + "grad_norm": 17.985218179528584, + "learning_rate": 2.4439048392604877e-07, + "logits/chosen": 1.7273188829421997, + "logits/rejected": 1.7321103811264038, + "logps/chosen": -319.5902099609375, + "logps/rejected": -343.101318359375, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.216766595840454, + "rewards/margins": 0.5042122602462769, + "rewards/rejected": -1.720978856086731, + "step": 260 + }, + { + "epoch": 0.5578413037670318, + "grad_norm": 18.28183834907717, + "learning_rate": 2.4252113345765043e-07, + "logits/chosen": 2.003500461578369, + "logits/rejected": 2.1276445388793945, + "logps/chosen": -425.6303405761719, + "logps/rejected": -465.6767578125, + "loss": 0.5733, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.439363718032837, + "rewards/margins": 0.22122399508953094, + "rewards/rejected": -1.6605877876281738, + "step": 261 + }, + { + "epoch": 0.5599786267699706, + "grad_norm": 18.461608241814286, + "learning_rate": 2.406522014309186e-07, + "logits/chosen": 1.563164234161377, + "logits/rejected": 1.5734854936599731, + "logps/chosen": -325.31903076171875, + "logps/rejected": -325.20819091796875, + "loss": 0.5877, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7867953777313232, + "rewards/margins": 0.3784661591053009, + "rewards/rejected": -1.1652615070343018, + "step": 262 + }, + { + "epoch": 0.5621159497729095, + "grad_norm": 17.283252492595803, + "learning_rate": 2.3878379241237134e-07, + "logits/chosen": 2.279524087905884, + "logits/rejected": 2.296083688735962, + "logps/chosen": -328.20220947265625, + "logps/rejected": -361.738037109375, + "loss": 0.5871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.240782618522644, + "rewards/margins": 0.6395030617713928, + "rewards/rejected": -1.8802857398986816, + "step": 263 + }, + { + "epoch": 0.5642532727758482, + "grad_norm": 16.930318409654138, + "learning_rate": 2.3691601093926402e-07, + "logits/chosen": 1.8105309009552002, + "logits/rejected": 1.8964433670043945, + "logps/chosen": -310.3644714355469, + "logps/rejected": -350.0364685058594, + "loss": 0.5622, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1182781457901, + "rewards/margins": 0.6728768348693848, + "rewards/rejected": -1.7911549806594849, + "step": 264 + }, + { + "epoch": 0.566390595778787, + "grad_norm": 18.154082784411116, + "learning_rate": 2.3504896151374144e-07, + "logits/chosen": 1.9459747076034546, + "logits/rejected": 1.9903091192245483, + "logps/chosen": -496.9175109863281, + "logps/rejected": -534.62255859375, + "loss": 0.5978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.114598274230957, + "rewards/margins": 0.49393293261528015, + "rewards/rejected": -1.6085312366485596, + "step": 265 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 17.886561891555203, + "learning_rate": 2.3318274859699008e-07, + "logits/chosen": 1.9951430559158325, + "logits/rejected": 2.013209819793701, + "logps/chosen": -431.7491455078125, + "logps/rejected": -472.21661376953125, + "loss": 0.5579, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0381141901016235, + "rewards/margins": 0.957762598991394, + "rewards/rejected": -1.995876669883728, + "step": 266 + }, + { + "epoch": 0.5706652417846647, + "grad_norm": 19.118226835611033, + "learning_rate": 2.3131747660339394e-07, + "logits/chosen": 1.1025474071502686, + "logits/rejected": 0.9203678965568542, + "logps/chosen": -452.95941162109375, + "logps/rejected": -400.9754943847656, + "loss": 0.6536, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3553560972213745, + "rewards/margins": 0.09373009204864502, + "rewards/rejected": -1.4490864276885986, + "step": 267 + }, + { + "epoch": 0.5728025647876035, + "grad_norm": 17.64914152601931, + "learning_rate": 2.2945324989469243e-07, + "logits/chosen": 1.9431540966033936, + "logits/rejected": 1.746927261352539, + "logps/chosen": -369.2441101074219, + "logps/rejected": -329.55145263671875, + "loss": 0.5729, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0598613023757935, + "rewards/margins": 0.18452677130699158, + "rewards/rejected": -1.2443879842758179, + "step": 268 + }, + { + "epoch": 0.5749398877905424, + "grad_norm": 16.484270905189813, + "learning_rate": 2.2759017277414164e-07, + "logits/chosen": 2.0279760360717773, + "logits/rejected": 1.9914069175720215, + "logps/chosen": -434.94366455078125, + "logps/rejected": -388.86077880859375, + "loss": 0.5838, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2059601545333862, + "rewards/margins": 0.1697658896446228, + "rewards/rejected": -1.3757259845733643, + "step": 269 + }, + { + "epoch": 0.5770772107934812, + "grad_norm": 22.56606378913711, + "learning_rate": 2.2572834948067795e-07, + "logits/chosen": 2.1591734886169434, + "logits/rejected": 2.223292112350464, + "logps/chosen": -468.49639892578125, + "logps/rejected": -476.2985534667969, + "loss": 0.6631, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7323036193847656, + "rewards/margins": 0.29446297883987427, + "rewards/rejected": -2.026766538619995, + "step": 270 + }, + { + "epoch": 0.57921453379642, + "grad_norm": 19.87333935608928, + "learning_rate": 2.2386788418308665e-07, + "logits/chosen": 1.343954086303711, + "logits/rejected": 1.1634771823883057, + "logps/chosen": -380.564453125, + "logps/rejected": -400.9522705078125, + "loss": 0.6052, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.206040859222412, + "rewards/margins": 0.5869691967964172, + "rewards/rejected": -1.7930101156234741, + "step": 271 + }, + { + "epoch": 0.5813518567993589, + "grad_norm": 18.1006643173551, + "learning_rate": 2.2200888097417302e-07, + "logits/chosen": 1.7346237897872925, + "logits/rejected": 1.5861271619796753, + "logps/chosen": -320.0159606933594, + "logps/rejected": -314.8853759765625, + "loss": 0.6227, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.077418565750122, + "rewards/margins": 0.36749520897865295, + "rewards/rejected": -1.4449137449264526, + "step": 272 + }, + { + "epoch": 0.5834891798022976, + "grad_norm": 19.024746475121724, + "learning_rate": 2.2015144386493895e-07, + "logits/chosen": 1.7071813344955444, + "logits/rejected": 1.628884196281433, + "logps/chosen": -341.3028564453125, + "logps/rejected": -353.91278076171875, + "loss": 0.5773, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3241965770721436, + "rewards/margins": 0.444732129573822, + "rewards/rejected": -1.7689287662506104, + "step": 273 + }, + { + "epoch": 0.5856265028052364, + "grad_norm": 18.34263207305998, + "learning_rate": 2.1829567677876297e-07, + "logits/chosen": 1.7893891334533691, + "logits/rejected": 1.8639053106307983, + "logps/chosen": -319.2881164550781, + "logps/rejected": -349.91851806640625, + "loss": 0.6001, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0994592905044556, + "rewards/margins": 0.519347071647644, + "rewards/rejected": -1.6188066005706787, + "step": 274 + }, + { + "epoch": 0.5877638258081752, + "grad_norm": 16.918446999240707, + "learning_rate": 2.164416835455862e-07, + "logits/chosen": 1.8019373416900635, + "logits/rejected": 1.6737080812454224, + "logps/chosen": -393.3109436035156, + "logps/rejected": -404.9291076660156, + "loss": 0.5719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1502411365509033, + "rewards/margins": 0.16453669965267181, + "rewards/rejected": -1.3147778511047363, + "step": 275 + }, + { + "epoch": 0.5899011488111141, + "grad_norm": 18.434871669590013, + "learning_rate": 2.1458956789610277e-07, + "logits/chosen": 1.8519237041473389, + "logits/rejected": 1.8247859477996826, + "logps/chosen": -298.4532165527344, + "logps/rejected": -277.18121337890625, + "loss": 0.5889, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2886124849319458, + "rewards/margins": 0.4535689353942871, + "rewards/rejected": -1.7421815395355225, + "step": 276 + }, + { + "epoch": 0.5920384718140529, + "grad_norm": 18.52956407873372, + "learning_rate": 2.1273943345595635e-07, + "logits/chosen": 1.8608410358428955, + "logits/rejected": 1.8228371143341064, + "logps/chosen": -433.77166748046875, + "logps/rejected": -428.9039001464844, + "loss": 0.6168, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.420507550239563, + "rewards/margins": 0.2784229815006256, + "rewards/rejected": -1.6989305019378662, + "step": 277 + }, + { + "epoch": 0.5941757948169917, + "grad_norm": 17.630697492069693, + "learning_rate": 2.1089138373994222e-07, + "logits/chosen": 1.5853919982910156, + "logits/rejected": 1.4946439266204834, + "logps/chosen": -352.94097900390625, + "logps/rejected": -350.8989562988281, + "loss": 0.6307, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.009688377380371, + "rewards/margins": 0.2674168348312378, + "rewards/rejected": -1.2771050930023193, + "step": 278 + }, + { + "epoch": 0.5963131178199306, + "grad_norm": 18.10952812595942, + "learning_rate": 2.0904552214621556e-07, + "logits/chosen": 1.433165192604065, + "logits/rejected": 1.5178760290145874, + "logps/chosen": -301.49359130859375, + "logps/rejected": -378.566162109375, + "loss": 0.5841, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9828816056251526, + "rewards/margins": 0.8563581109046936, + "rewards/rejected": -1.8392397165298462, + "step": 279 + }, + { + "epoch": 0.5984504408228694, + "grad_norm": 16.077356770517124, + "learning_rate": 2.072019519505062e-07, + "logits/chosen": 2.150374412536621, + "logits/rejected": 2.144287109375, + "logps/chosen": -378.2465515136719, + "logps/rejected": -414.12860107421875, + "loss": 0.5743, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7395087480545044, + "rewards/margins": 0.6563863158226013, + "rewards/rejected": -1.3958951234817505, + "step": 280 + }, + { + "epoch": 0.6005877638258081, + "grad_norm": 24.441481846041146, + "learning_rate": 2.0536077630034085e-07, + "logits/chosen": 1.7997475862503052, + "logits/rejected": 1.8200863599777222, + "logps/chosen": -391.2835693359375, + "logps/rejected": -403.364501953125, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2694519758224487, + "rewards/margins": 0.08388321101665497, + "rewards/rejected": -1.3533351421356201, + "step": 281 + }, + { + "epoch": 0.602725086828747, + "grad_norm": 17.8344144696276, + "learning_rate": 2.0352209820927135e-07, + "logits/chosen": 1.073244571685791, + "logits/rejected": 0.8806475400924683, + "logps/chosen": -315.171630859375, + "logps/rejected": -333.1543884277344, + "loss": 0.6089, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9738328456878662, + "rewards/margins": 0.7536094188690186, + "rewards/rejected": -1.7274422645568848, + "step": 282 + }, + { + "epoch": 0.6048624098316858, + "grad_norm": 18.10807913356027, + "learning_rate": 2.0168602055111173e-07, + "logits/chosen": 1.743872046470642, + "logits/rejected": 1.6714892387390137, + "logps/chosen": -403.4494323730469, + "logps/rejected": -429.84051513671875, + "loss": 0.5822, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9764223694801331, + "rewards/margins": 0.6022226214408875, + "rewards/rejected": -1.57864511013031, + "step": 283 + }, + { + "epoch": 0.6069997328346246, + "grad_norm": 16.26326383123038, + "learning_rate": 1.998526460541818e-07, + "logits/chosen": 1.78279709815979, + "logits/rejected": 1.7751100063323975, + "logps/chosen": -391.4514465332031, + "logps/rejected": -400.68658447265625, + "loss": 0.547, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.096341609954834, + "rewards/margins": 0.5708625912666321, + "rewards/rejected": -1.6672042608261108, + "step": 284 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 21.544359613974976, + "learning_rate": 1.980220772955602e-07, + "logits/chosen": 2.0660159587860107, + "logits/rejected": 2.0274107456207275, + "logps/chosen": -456.6308898925781, + "logps/rejected": -493.3993225097656, + "loss": 0.6296, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.374593734741211, + "rewards/margins": 0.18421491980552673, + "rewards/rejected": -1.55880868434906, + "step": 285 + }, + { + "epoch": 0.6112743788405023, + "grad_norm": 19.038810259481426, + "learning_rate": 1.961944166953445e-07, + "logits/chosen": 1.1080034971237183, + "logits/rejected": 1.2669103145599365, + "logps/chosen": -365.549072265625, + "logps/rejected": -351.73028564453125, + "loss": 0.62, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.421024203300476, + "rewards/margins": 0.5578509569168091, + "rewards/rejected": -1.9788750410079956, + "step": 286 + }, + { + "epoch": 0.6134117018434411, + "grad_norm": 20.136440706184228, + "learning_rate": 1.9436976651092142e-07, + "logits/chosen": 1.9399811029434204, + "logits/rejected": 1.8495838642120361, + "logps/chosen": -355.4549255371094, + "logps/rejected": -369.4902648925781, + "loss": 0.6159, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2287437915802002, + "rewards/margins": 0.48982152342796326, + "rewards/rejected": -1.7185652256011963, + "step": 287 + }, + { + "epoch": 0.6155490248463799, + "grad_norm": 17.789228307045246, + "learning_rate": 1.9254822883124517e-07, + "logits/chosen": 1.929220199584961, + "logits/rejected": 1.928113579750061, + "logps/chosen": -463.58563232421875, + "logps/rejected": -486.4768371582031, + "loss": 0.5905, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3614306449890137, + "rewards/margins": 0.24906916916370392, + "rewards/rejected": -1.6104998588562012, + "step": 288 + }, + { + "epoch": 0.6176863478493188, + "grad_norm": 17.924045280638552, + "learning_rate": 1.9072990557112564e-07, + "logits/chosen": 1.9938664436340332, + "logits/rejected": 2.048049211502075, + "logps/chosen": -314.8684997558594, + "logps/rejected": -366.38739013671875, + "loss": 0.5973, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2091230154037476, + "rewards/margins": 0.43797287344932556, + "rewards/rejected": -1.6470959186553955, + "step": 289 + }, + { + "epoch": 0.6198236708522575, + "grad_norm": 19.138114842281414, + "learning_rate": 1.8891489846552644e-07, + "logits/chosen": 1.8760986328125, + "logits/rejected": 1.8651152849197388, + "logps/chosen": -352.26373291015625, + "logps/rejected": -363.06231689453125, + "loss": 0.5813, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2897804975509644, + "rewards/margins": 0.037354469299316406, + "rewards/rejected": -1.3271349668502808, + "step": 290 + }, + { + "epoch": 0.6219609938551963, + "grad_norm": 18.141812527530227, + "learning_rate": 1.8710330906387286e-07, + "logits/chosen": 1.7009693384170532, + "logits/rejected": 1.7097214460372925, + "logps/chosen": -401.5587463378906, + "logps/rejected": -406.8404541015625, + "loss": 0.6046, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2753279209136963, + "rewards/margins": 0.20724107325077057, + "rewards/rejected": -1.4825689792633057, + "step": 291 + }, + { + "epoch": 0.6240983168581352, + "grad_norm": 22.82747902823447, + "learning_rate": 1.8529523872436977e-07, + "logits/chosen": 1.7425600290298462, + "logits/rejected": 1.490181803703308, + "logps/chosen": -419.9559631347656, + "logps/rejected": -406.5120849609375, + "loss": 0.6505, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4110760688781738, + "rewards/margins": 0.21470992267131805, + "rewards/rejected": -1.6257858276367188, + "step": 292 + }, + { + "epoch": 0.626235639861074, + "grad_norm": 17.909633658779036, + "learning_rate": 1.8349078860833124e-07, + "logits/chosen": 1.7675672769546509, + "logits/rejected": 1.8061796426773071, + "logps/chosen": -391.7986145019531, + "logps/rejected": -431.70172119140625, + "loss": 0.6072, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6551833152770996, + "rewards/margins": 0.3061232566833496, + "rewards/rejected": -0.9613065719604492, + "step": 293 + }, + { + "epoch": 0.6283729628640128, + "grad_norm": 17.64445331496705, + "learning_rate": 1.8169005967452e-07, + "logits/chosen": 0.9978383779525757, + "logits/rejected": 1.074844479560852, + "logps/chosen": -396.01104736328125, + "logps/rejected": -366.7948303222656, + "loss": 0.5871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1115831136703491, + "rewards/margins": 0.7094423174858093, + "rewards/rejected": -1.8210253715515137, + "step": 294 + }, + { + "epoch": 0.6305102858669517, + "grad_norm": 19.11480868885265, + "learning_rate": 1.7989315267349933e-07, + "logits/chosen": 1.2297173738479614, + "logits/rejected": 1.197448968887329, + "logps/chosen": -276.0805969238281, + "logps/rejected": -330.3874206542969, + "loss": 0.6201, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9019981026649475, + "rewards/margins": 0.9751827716827393, + "rewards/rejected": -1.8771809339523315, + "step": 295 + }, + { + "epoch": 0.6326476088698905, + "grad_norm": 18.788836039541163, + "learning_rate": 1.781001681419957e-07, + "logits/chosen": 1.2067197561264038, + "logits/rejected": 1.1462682485580444, + "logps/chosen": -435.5975646972656, + "logps/rejected": -410.3743896484375, + "loss": 0.6502, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8658223748207092, + "rewards/margins": 0.13561075925827026, + "rewards/rejected": -1.0014331340789795, + "step": 296 + }, + { + "epoch": 0.6347849318728293, + "grad_norm": 22.345954181375724, + "learning_rate": 1.763112063972739e-07, + "logits/chosen": 1.4078682661056519, + "logits/rejected": 1.5019978284835815, + "logps/chosen": -265.8661804199219, + "logps/rejected": -328.0052185058594, + "loss": 0.6665, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9942247867584229, + "rewards/margins": 0.46525779366493225, + "rewards/rejected": -1.4594827890396118, + "step": 297 + }, + { + "epoch": 0.6369222548757681, + "grad_norm": 17.95979388047236, + "learning_rate": 1.745263675315245e-07, + "logits/chosen": 1.9481912851333618, + "logits/rejected": 2.0087857246398926, + "logps/chosen": -399.0690002441406, + "logps/rejected": -425.6741943359375, + "loss": 0.6197, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0927371978759766, + "rewards/margins": 0.22326043248176575, + "rewards/rejected": -1.3159977197647095, + "step": 298 + }, + { + "epoch": 0.6390595778787069, + "grad_norm": 20.924697959345067, + "learning_rate": 1.7274575140626315e-07, + "logits/chosen": 1.2710243463516235, + "logits/rejected": 1.1798756122589111, + "logps/chosen": -346.3345031738281, + "logps/rejected": -352.3256530761719, + "loss": 0.6307, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.233717441558838, + "rewards/margins": 0.1365053653717041, + "rewards/rejected": -1.370222806930542, + "step": 299 + }, + { + "epoch": 0.6411969008816457, + "grad_norm": 17.692093613276146, + "learning_rate": 1.7096945764674398e-07, + "logits/chosen": 1.7905324697494507, + "logits/rejected": 1.7943121194839478, + "logps/chosen": -347.2984924316406, + "logps/rejected": -340.6080322265625, + "loss": 0.5765, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.07566499710083, + "rewards/margins": 0.10227301716804504, + "rewards/rejected": -1.1779381036758423, + "step": 300 + }, + { + "epoch": 0.6433342238845846, + "grad_norm": 19.852613137333368, + "learning_rate": 1.6919758563638502e-07, + "logits/chosen": 2.267047882080078, + "logits/rejected": 2.2171971797943115, + "logps/chosen": -509.4197998046875, + "logps/rejected": -506.5215759277344, + "loss": 0.6171, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9691479802131653, + "rewards/margins": 0.4192157983779907, + "rewards/rejected": -1.3883638381958008, + "step": 301 + }, + { + "epoch": 0.6454715468875234, + "grad_norm": 20.951925501886503, + "learning_rate": 1.674302345112083e-07, + "logits/chosen": 1.9694963693618774, + "logits/rejected": 1.918088674545288, + "logps/chosen": -429.9407653808594, + "logps/rejected": -431.4341735839844, + "loss": 0.6454, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0554986000061035, + "rewards/margins": 0.7666572332382202, + "rewards/rejected": -1.8221558332443237, + "step": 302 + }, + { + "epoch": 0.6476088698904622, + "grad_norm": 19.559839163696893, + "learning_rate": 1.656675031542925e-07, + "logits/chosen": 1.6839439868927002, + "logits/rejected": 1.764726161956787, + "logps/chosen": -374.66827392578125, + "logps/rejected": -428.7526550292969, + "loss": 0.5975, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1381664276123047, + "rewards/margins": 0.4832189977169037, + "rewards/rejected": -1.6213853359222412, + "step": 303 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 21.095559884110685, + "learning_rate": 1.6390949019024118e-07, + "logits/chosen": 1.3224655389785767, + "logits/rejected": 1.3726483583450317, + "logps/chosen": -343.5739440917969, + "logps/rejected": -349.3829040527344, + "loss": 0.6829, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.487676978111267, + "rewards/margins": 0.12136916816234589, + "rewards/rejected": -1.6090461015701294, + "step": 304 + }, + { + "epoch": 0.6518835158963399, + "grad_norm": 19.396572088599502, + "learning_rate": 1.621562939796643e-07, + "logits/chosen": 1.6602495908737183, + "logits/rejected": 1.7938483953475952, + "logps/chosen": -333.3067626953125, + "logps/rejected": -398.20355224609375, + "loss": 0.5991, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.591664731502533, + "rewards/margins": 0.5985820889472961, + "rewards/rejected": -1.190246820449829, + "step": 305 + }, + { + "epoch": 0.6540208388992786, + "grad_norm": 18.311690178787455, + "learning_rate": 1.6040801261367493e-07, + "logits/chosen": 1.9007511138916016, + "logits/rejected": 1.9012222290039062, + "logps/chosen": -379.4654846191406, + "logps/rejected": -393.036865234375, + "loss": 0.5811, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.008962631225586, + "rewards/margins": 0.23067611455917358, + "rewards/rejected": -1.2396388053894043, + "step": 306 + }, + { + "epoch": 0.6561581619022174, + "grad_norm": 21.266085139828498, + "learning_rate": 1.5866474390840124e-07, + "logits/chosen": 1.795674443244934, + "logits/rejected": 1.858997106552124, + "logps/chosen": -380.346435546875, + "logps/rejected": -384.34075927734375, + "loss": 0.6022, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.886805534362793, + "rewards/margins": 0.37954381108283997, + "rewards/rejected": -1.2663494348526, + "step": 307 + }, + { + "epoch": 0.6582954849051563, + "grad_norm": 17.461304449850505, + "learning_rate": 1.569265853995137e-07, + "logits/chosen": 1.9612005949020386, + "logits/rejected": 1.921550989151001, + "logps/chosen": -421.7749328613281, + "logps/rejected": -458.66717529296875, + "loss": 0.6178, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0546059608459473, + "rewards/margins": 0.7568830251693726, + "rewards/rejected": -1.8114891052246094, + "step": 308 + }, + { + "epoch": 0.6604328079080951, + "grad_norm": 19.082726960592346, + "learning_rate": 1.5519363433676791e-07, + "logits/chosen": 1.519544005393982, + "logits/rejected": 1.5249278545379639, + "logps/chosen": -421.0302429199219, + "logps/rejected": -439.40557861328125, + "loss": 0.5887, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3705350160598755, + "rewards/margins": 0.23214447498321533, + "rewards/rejected": -1.6026794910430908, + "step": 309 + }, + { + "epoch": 0.6625701309110339, + "grad_norm": 18.11403641529886, + "learning_rate": 1.5346598767856345e-07, + "logits/chosen": 1.587712049484253, + "logits/rejected": 1.6687382459640503, + "logps/chosen": -351.07464599609375, + "logps/rejected": -359.98126220703125, + "loss": 0.6091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9926980137825012, + "rewards/margins": 0.7598885297775269, + "rewards/rejected": -1.7525867223739624, + "step": 310 + }, + { + "epoch": 0.6647074539139728, + "grad_norm": 16.941104687169506, + "learning_rate": 1.517437420865191e-07, + "logits/chosen": 1.8143850564956665, + "logits/rejected": 1.6587419509887695, + "logps/chosen": -404.6020202636719, + "logps/rejected": -389.1875305175781, + "loss": 0.5913, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0236051082611084, + "rewards/margins": 0.31825628876686096, + "rewards/rejected": -1.3418614864349365, + "step": 311 + }, + { + "epoch": 0.6668447769169116, + "grad_norm": 18.994018231106203, + "learning_rate": 1.500269939200648e-07, + "logits/chosen": 1.5793185234069824, + "logits/rejected": 1.6349645853042603, + "logps/chosen": -373.96240234375, + "logps/rejected": -388.9745788574219, + "loss": 0.5628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9590240120887756, + "rewards/margins": 0.5476614832878113, + "rewards/rejected": -1.5066853761672974, + "step": 312 + }, + { + "epoch": 0.6689820999198504, + "grad_norm": 19.795530112372177, + "learning_rate": 1.4831583923104998e-07, + "logits/chosen": 1.6909123659133911, + "logits/rejected": 1.7797627449035645, + "logps/chosen": -416.3101806640625, + "logps/rejected": -438.67462158203125, + "loss": 0.6688, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9395960569381714, + "rewards/margins": 0.044527970254421234, + "rewards/rejected": -0.9841240644454956, + "step": 313 + }, + { + "epoch": 0.6711194229227893, + "grad_norm": 23.065919224015374, + "learning_rate": 1.4661037375836987e-07, + "logits/chosen": 1.376868486404419, + "logits/rejected": 1.4309742450714111, + "logps/chosen": -407.21051025390625, + "logps/rejected": -379.1553039550781, + "loss": 0.6451, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0327116250991821, + "rewards/margins": 0.013223947957158089, + "rewards/rejected": -1.0459357500076294, + "step": 314 + }, + { + "epoch": 0.673256745925728, + "grad_norm": 18.097806750182333, + "learning_rate": 1.4491069292260866e-07, + "logits/chosen": 1.9266318082809448, + "logits/rejected": 1.8753666877746582, + "logps/chosen": -430.4090576171875, + "logps/rejected": -414.37841796875, + "loss": 0.6214, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0124588012695312, + "rewards/margins": 0.7154525518417358, + "rewards/rejected": -1.727911353111267, + "step": 315 + }, + { + "epoch": 0.6753940689286668, + "grad_norm": 16.883704416038682, + "learning_rate": 1.432168918207009e-07, + "logits/chosen": 2.082824468612671, + "logits/rejected": 2.1169075965881348, + "logps/chosen": -444.53173828125, + "logps/rejected": -443.7054443359375, + "loss": 0.5946, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0104761123657227, + "rewards/margins": 0.373771071434021, + "rewards/rejected": -1.3842473030090332, + "step": 316 + }, + { + "epoch": 0.6775313919316056, + "grad_norm": 18.54452166971697, + "learning_rate": 1.4152906522061047e-07, + "logits/chosen": 1.2201721668243408, + "logits/rejected": 1.2176278829574585, + "logps/chosen": -415.645263671875, + "logps/rejected": -436.7527160644531, + "loss": 0.6087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.836451530456543, + "rewards/margins": 0.502465009689331, + "rewards/rejected": -1.338916540145874, + "step": 317 + }, + { + "epoch": 0.6796687149345445, + "grad_norm": 17.299138202298057, + "learning_rate": 1.3984730755602903e-07, + "logits/chosen": 1.1743218898773193, + "logits/rejected": 1.1369413137435913, + "logps/chosen": -279.7085266113281, + "logps/rejected": -293.5144958496094, + "loss": 0.5782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.750356137752533, + "rewards/margins": 0.5203050374984741, + "rewards/rejected": -1.2706611156463623, + "step": 318 + }, + { + "epoch": 0.6818060379374833, + "grad_norm": 19.398975454291527, + "learning_rate": 1.381717129210918e-07, + "logits/chosen": 2.5072126388549805, + "logits/rejected": 2.4705142974853516, + "logps/chosen": -370.4862060546875, + "logps/rejected": -377.8222351074219, + "loss": 0.5967, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3816720247268677, + "rewards/margins": 0.17874151468276978, + "rewards/rejected": -1.5604134798049927, + "step": 319 + }, + { + "epoch": 0.6839433609404221, + "grad_norm": 17.240016620199377, + "learning_rate": 1.365023750651133e-07, + "logits/chosen": 2.6305899620056152, + "logits/rejected": 2.5452349185943604, + "logps/chosen": -395.29217529296875, + "logps/rejected": -391.28607177734375, + "loss": 0.5865, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.043142318725586, + "rewards/margins": 0.10385166853666306, + "rewards/rejected": -1.1469941139221191, + "step": 320 + }, + { + "epoch": 0.686080683943361, + "grad_norm": 21.048682994153253, + "learning_rate": 1.3483938738734195e-07, + "logits/chosen": 1.2768375873565674, + "logits/rejected": 1.4707895517349243, + "logps/chosen": -333.0333251953125, + "logps/rejected": -352.16094970703125, + "loss": 0.6481, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7810248732566833, + "rewards/margins": 0.33725664019584656, + "rewards/rejected": -1.118281602859497, + "step": 321 + }, + { + "epoch": 0.6882180069462998, + "grad_norm": 17.394392159707856, + "learning_rate": 1.3318284293173449e-07, + "logits/chosen": 1.9551142454147339, + "logits/rejected": 1.876376986503601, + "logps/chosen": -409.2510986328125, + "logps/rejected": -405.74188232421875, + "loss": 0.5974, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8536494970321655, + "rewards/margins": 0.24086396396160126, + "rewards/rejected": -1.0945135354995728, + "step": 322 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 18.94853226611994, + "learning_rate": 1.3153283438175034e-07, + "logits/chosen": 1.839475154876709, + "logits/rejected": 1.858643889427185, + "logps/chosen": -378.91265869140625, + "logps/rejected": -390.3384704589844, + "loss": 0.6003, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.978223979473114, + "rewards/margins": 0.5790024399757385, + "rewards/rejected": -1.557226300239563, + "step": 323 + }, + { + "epoch": 0.6924926529521774, + "grad_norm": 17.111787232424657, + "learning_rate": 1.2988945405516565e-07, + "logits/chosen": 1.826279640197754, + "logits/rejected": 1.8660060167312622, + "logps/chosen": -330.77508544921875, + "logps/rejected": -358.8695068359375, + "loss": 0.5461, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9692739248275757, + "rewards/margins": 0.4413262605667114, + "rewards/rejected": -1.410600185394287, + "step": 324 + }, + { + "epoch": 0.6946299759551162, + "grad_norm": 25.561218495275238, + "learning_rate": 1.2825279389890818e-07, + "logits/chosen": 1.8373939990997314, + "logits/rejected": 1.7409933805465698, + "logps/chosen": -426.71917724609375, + "logps/rejected": -416.2432861328125, + "loss": 0.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2536462545394897, + "rewards/margins": 0.25466710329055786, + "rewards/rejected": -1.5083134174346924, + "step": 325 + }, + { + "epoch": 0.696767298958055, + "grad_norm": 17.8636661186785, + "learning_rate": 1.2662294548391328e-07, + "logits/chosen": 1.5317473411560059, + "logits/rejected": 1.3886245489120483, + "logps/chosen": -384.1850280761719, + "logps/rejected": -339.3521728515625, + "loss": 0.6232, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3272415399551392, + "rewards/margins": 0.08453375101089478, + "rewards/rejected": -1.4117752313613892, + "step": 326 + }, + { + "epoch": 0.6989046219609939, + "grad_norm": 17.144689153521533, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": 1.782150387763977, + "logits/rejected": 1.9099899530410767, + "logps/chosen": -380.603271484375, + "logps/rejected": -433.623779296875, + "loss": 0.5832, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9439319372177124, + "rewards/margins": 0.28429514169692993, + "rewards/rejected": -1.2282270193099976, + "step": 327 + }, + { + "epoch": 0.7010419449639327, + "grad_norm": 19.552962720871037, + "learning_rate": 1.2338404825076935e-07, + "logits/chosen": 1.2995223999023438, + "logits/rejected": 1.3280295133590698, + "logps/chosen": -324.47314453125, + "logps/rejected": -334.8476257324219, + "loss": 0.6088, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9188759326934814, + "rewards/margins": 0.35758963227272034, + "rewards/rejected": -1.2764655351638794, + "step": 328 + }, + { + "epoch": 0.7031792679668715, + "grad_norm": 16.96691219753261, + "learning_rate": 1.2177518064852348e-07, + "logits/chosen": 2.564326524734497, + "logits/rejected": 2.5473833084106445, + "logps/chosen": -552.3907470703125, + "logps/rejected": -558.2307739257812, + "loss": 0.565, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9720156192779541, + "rewards/margins": 0.3254457116127014, + "rewards/rejected": -1.2974613904953003, + "step": 329 + }, + { + "epoch": 0.7053165909698104, + "grad_norm": 17.13700860515758, + "learning_rate": 1.201734872092077e-07, + "logits/chosen": 1.8777260780334473, + "logits/rejected": 1.888671875, + "logps/chosen": -434.1102600097656, + "logps/rejected": -476.6391296386719, + "loss": 0.6071, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1475944519042969, + "rewards/margins": 0.6439616084098816, + "rewards/rejected": -1.7915560007095337, + "step": 330 + }, + { + "epoch": 0.7074539139727491, + "grad_norm": 16.56035938775486, + "learning_rate": 1.185790575473738e-07, + "logits/chosen": 2.2853150367736816, + "logits/rejected": 2.4159228801727295, + "logps/chosen": -415.8793029785156, + "logps/rejected": -461.54559326171875, + "loss": 0.5657, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1002264022827148, + "rewards/margins": 0.29029062390327454, + "rewards/rejected": -1.390516996383667, + "step": 331 + }, + { + "epoch": 0.7095912369756879, + "grad_norm": 21.13067642035339, + "learning_rate": 1.1699198087116588e-07, + "logits/chosen": 1.7377692461013794, + "logits/rejected": 1.709326982498169, + "logps/chosen": -392.8721008300781, + "logps/rejected": -427.005859375, + "loss": 0.626, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1958163976669312, + "rewards/margins": 0.2574688196182251, + "rewards/rejected": -1.4532852172851562, + "step": 332 + }, + { + "epoch": 0.7117285599786267, + "grad_norm": 16.821147592051055, + "learning_rate": 1.1541234597732947e-07, + "logits/chosen": 1.5543380975723267, + "logits/rejected": 1.5971977710723877, + "logps/chosen": -374.1974792480469, + "logps/rejected": -394.3440246582031, + "loss": 0.5213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9774885773658752, + "rewards/margins": 0.5341455936431885, + "rewards/rejected": -1.5116342306137085, + "step": 333 + }, + { + "epoch": 0.7138658829815656, + "grad_norm": 17.020478258816116, + "learning_rate": 1.1384024124624322e-07, + "logits/chosen": 1.4832370281219482, + "logits/rejected": 1.3855926990509033, + "logps/chosen": -402.4433288574219, + "logps/rejected": -406.5257568359375, + "loss": 0.5865, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5169297456741333, + "rewards/margins": 0.22409147024154663, + "rewards/rejected": -1.7410211563110352, + "step": 334 + }, + { + "epoch": 0.7160032059845044, + "grad_norm": 17.550708004075997, + "learning_rate": 1.1227575463697439e-07, + "logits/chosen": 1.2210332155227661, + "logits/rejected": 1.3429027795791626, + "logps/chosen": -275.6416015625, + "logps/rejected": -383.0740966796875, + "loss": 0.6042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2633944749832153, + "rewards/margins": 0.37785589694976807, + "rewards/rejected": -1.6412503719329834, + "step": 335 + }, + { + "epoch": 0.7181405289874432, + "grad_norm": 18.381082023596203, + "learning_rate": 1.1071897368235694e-07, + "logits/chosen": 1.761547327041626, + "logits/rejected": 1.9874017238616943, + "logps/chosen": -475.7955017089844, + "logps/rejected": -545.6195678710938, + "loss": 0.6306, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.518486738204956, + "rewards/margins": 0.3139525055885315, + "rewards/rejected": -1.8324391841888428, + "step": 336 + }, + { + "epoch": 0.7202778519903821, + "grad_norm": 18.478233920231197, + "learning_rate": 1.0916998548409447e-07, + "logits/chosen": 1.4830739498138428, + "logits/rejected": 1.531538486480713, + "logps/chosen": -257.1949157714844, + "logps/rejected": -257.0049133300781, + "loss": 0.5829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9545426368713379, + "rewards/margins": 0.5574601888656616, + "rewards/rejected": -1.512002944946289, + "step": 337 + }, + { + "epoch": 0.7224151749933209, + "grad_norm": 16.386183638471508, + "learning_rate": 1.0762887670788701e-07, + "logits/chosen": 1.5718142986297607, + "logits/rejected": 1.7238279581069946, + "logps/chosen": -427.6659240722656, + "logps/rejected": -471.98724365234375, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8210560083389282, + "rewards/margins": 0.35570234060287476, + "rewards/rejected": -1.1767584085464478, + "step": 338 + }, + { + "epoch": 0.7245524979962596, + "grad_norm": 17.362629994095062, + "learning_rate": 1.0609573357858165e-07, + "logits/chosen": 2.049461841583252, + "logits/rejected": 2.0207159519195557, + "logps/chosen": -350.08209228515625, + "logps/rejected": -366.1018981933594, + "loss": 0.5832, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9150630831718445, + "rewards/margins": 0.3026912808418274, + "rewards/rejected": -1.2177543640136719, + "step": 339 + }, + { + "epoch": 0.7266898209991985, + "grad_norm": 18.074963794700235, + "learning_rate": 1.0457064187534861e-07, + "logits/chosen": 1.8654290437698364, + "logits/rejected": 1.9139888286590576, + "logps/chosen": -436.3177490234375, + "logps/rejected": -468.21026611328125, + "loss": 0.6101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.125885009765625, + "rewards/margins": 0.6720912456512451, + "rewards/rejected": -1.7979762554168701, + "step": 340 + }, + { + "epoch": 0.7288271440021373, + "grad_norm": 15.616075952348003, + "learning_rate": 1.0305368692688174e-07, + "logits/chosen": 1.8928383588790894, + "logits/rejected": 1.7172499895095825, + "logps/chosen": -320.7323303222656, + "logps/rejected": -340.637939453125, + "loss": 0.5296, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9158480167388916, + "rewards/margins": 0.6651303768157959, + "rewards/rejected": -1.5809783935546875, + "step": 341 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 16.141766521096457, + "learning_rate": 1.0154495360662463e-07, + "logits/chosen": 1.8351894617080688, + "logits/rejected": 1.88357412815094, + "logps/chosen": -417.806884765625, + "logps/rejected": -424.0835266113281, + "loss": 0.5906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1564135551452637, + "rewards/margins": 0.2745019495487213, + "rewards/rejected": -1.4309154748916626, + "step": 342 + }, + { + "epoch": 0.733101790008015, + "grad_norm": 16.58837646437526, + "learning_rate": 1.0004452632802158e-07, + "logits/chosen": 2.133150100708008, + "logits/rejected": 2.064979076385498, + "logps/chosen": -376.74078369140625, + "logps/rejected": -378.5306091308594, + "loss": 0.5551, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7134183049201965, + "rewards/margins": 0.5067132711410522, + "rewards/rejected": -1.220131754875183, + "step": 343 + }, + { + "epoch": 0.7352391130109538, + "grad_norm": 16.39380243963404, + "learning_rate": 9.855248903979505e-08, + "logits/chosen": 2.0602328777313232, + "logits/rejected": 2.0420618057250977, + "logps/chosen": -302.2620849609375, + "logps/rejected": -306.7290344238281, + "loss": 0.5853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4054746925830841, + "rewards/margins": 0.44839826226234436, + "rewards/rejected": -0.8538729548454285, + "step": 344 + }, + { + "epoch": 0.7373764360138926, + "grad_norm": 19.731670469441074, + "learning_rate": 9.706892522124838e-08, + "logits/chosen": 1.8298213481903076, + "logits/rejected": 1.9004275798797607, + "logps/chosen": -327.8082275390625, + "logps/rejected": -365.7606506347656, + "loss": 0.6029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0073175430297852, + "rewards/margins": 0.22869540750980377, + "rewards/rejected": -1.2360128164291382, + "step": 345 + }, + { + "epoch": 0.7395137590168315, + "grad_norm": 19.011257992667222, + "learning_rate": 9.559391787759554e-08, + "logits/chosen": 1.3878638744354248, + "logits/rejected": 1.568437099456787, + "logps/chosen": -372.02978515625, + "logps/rejected": -407.9342041015625, + "loss": 0.6405, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1335147619247437, + "rewards/margins": 0.025209851562976837, + "rewards/rejected": -1.1587246656417847, + "step": 346 + }, + { + "epoch": 0.7416510820197703, + "grad_norm": 18.46695592828962, + "learning_rate": 9.412754953531663e-08, + "logits/chosen": 1.710093379020691, + "logits/rejected": 1.6472371816635132, + "logps/chosen": -297.39776611328125, + "logps/rejected": -267.59954833984375, + "loss": 0.5957, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2740660905838013, + "rewards/margins": 0.06755752861499786, + "rewards/rejected": -1.3416236639022827, + "step": 347 + }, + { + "epoch": 0.743788405022709, + "grad_norm": 16.87621305823869, + "learning_rate": 9.266990223754067e-08, + "logits/chosen": 1.8978588581085205, + "logits/rejected": 1.8024924993515015, + "logps/chosen": -327.6407775878906, + "logps/rejected": -362.8518981933594, + "loss": 0.625, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0301729440689087, + "rewards/margins": 0.3730047643184662, + "rewards/rejected": -1.4031778573989868, + "step": 348 + }, + { + "epoch": 0.7459257280256478, + "grad_norm": 19.589160930248937, + "learning_rate": 9.12210575394553e-08, + "logits/chosen": 1.2508031129837036, + "logits/rejected": 1.1444499492645264, + "logps/chosen": -450.5746154785156, + "logps/rejected": -492.925537109375, + "loss": 0.5814, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.165111780166626, + "rewards/margins": 0.5934293270111084, + "rewards/rejected": -1.758541226387024, + "step": 349 + }, + { + "epoch": 0.7480630510285867, + "grad_norm": 16.899125344870786, + "learning_rate": 8.978109650374396e-08, + "logits/chosen": 2.270671844482422, + "logits/rejected": 2.231257200241089, + "logps/chosen": -384.2763977050781, + "logps/rejected": -375.3595275878906, + "loss": 0.6009, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0201106071472168, + "rewards/margins": 0.1503101885318756, + "rewards/rejected": -1.1704206466674805, + "step": 350 + }, + { + "epoch": 0.7502003740315255, + "grad_norm": 16.195586617871964, + "learning_rate": 8.835009969605011e-08, + "logits/chosen": 1.483668565750122, + "logits/rejected": 1.482094645500183, + "logps/chosen": -343.3363037109375, + "logps/rejected": -386.342041015625, + "loss": 0.6039, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.092482566833496, + "rewards/margins": 0.4346364736557007, + "rewards/rejected": -1.5271189212799072, + "step": 351 + }, + { + "epoch": 0.7523376970344643, + "grad_norm": 21.26793469108042, + "learning_rate": 8.692814718046978e-08, + "logits/chosen": 1.866996169090271, + "logits/rejected": 1.7753551006317139, + "logps/chosen": -456.31988525390625, + "logps/rejected": -433.0694885253906, + "loss": 0.6028, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2143518924713135, + "rewards/margins": 0.2675904929637909, + "rewards/rejected": -1.4819422960281372, + "step": 352 + }, + { + "epoch": 0.7544750200374032, + "grad_norm": 18.427744526820025, + "learning_rate": 8.551531851507185e-08, + "logits/chosen": 1.7221219539642334, + "logits/rejected": 1.7266207933425903, + "logps/chosen": -395.39874267578125, + "logps/rejected": -417.5553894042969, + "loss": 0.6136, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.024322748184204, + "rewards/margins": 0.6090888381004333, + "rewards/rejected": -1.6334116458892822, + "step": 353 + }, + { + "epoch": 0.756612343040342, + "grad_norm": 20.305718374217165, + "learning_rate": 8.411169274744723e-08, + "logits/chosen": 1.9147306680679321, + "logits/rejected": 1.8940410614013672, + "logps/chosen": -376.7578430175781, + "logps/rejected": -394.53680419921875, + "loss": 0.6336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3257228136062622, + "rewards/margins": 0.3503354787826538, + "rewards/rejected": -1.676058292388916, + "step": 354 + }, + { + "epoch": 0.7587496660432808, + "grad_norm": 20.54054389713915, + "learning_rate": 8.271734841028552e-08, + "logits/chosen": 1.9527983665466309, + "logits/rejected": 2.0401647090911865, + "logps/chosen": -419.7406005859375, + "logps/rejected": -460.14605712890625, + "loss": 0.6028, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8988450169563293, + "rewards/margins": 0.2838803231716156, + "rewards/rejected": -1.182725429534912, + "step": 355 + }, + { + "epoch": 0.7608869890462197, + "grad_norm": 18.611126738741184, + "learning_rate": 8.133236351698142e-08, + "logits/chosen": 1.9735045433044434, + "logits/rejected": 2.096036911010742, + "logps/chosen": -400.4068603515625, + "logps/rejected": -418.3611755371094, + "loss": 0.5567, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8807581663131714, + "rewards/margins": 0.4934861361980438, + "rewards/rejected": -1.3742443323135376, + "step": 356 + }, + { + "epoch": 0.7630243120491584, + "grad_norm": 19.445978428810093, + "learning_rate": 7.99568155572701e-08, + "logits/chosen": 1.6883461475372314, + "logits/rejected": 1.5084795951843262, + "logps/chosen": -461.36859130859375, + "logps/rejected": -436.689453125, + "loss": 0.6174, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3846616744995117, + "rewards/margins": 0.6382476091384888, + "rewards/rejected": -2.022909164428711, + "step": 357 + }, + { + "epoch": 0.7651616350520972, + "grad_norm": 18.04686433261846, + "learning_rate": 7.859078149289144e-08, + "logits/chosen": 1.8313043117523193, + "logits/rejected": 1.745527744293213, + "logps/chosen": -395.2892761230469, + "logps/rejected": -412.5917663574219, + "loss": 0.5956, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.17001473903656, + "rewards/margins": 0.6119087338447571, + "rewards/rejected": -1.7819232940673828, + "step": 358 + }, + { + "epoch": 0.7672989580550361, + "grad_norm": 21.260482136795545, + "learning_rate": 7.723433775328384e-08, + "logits/chosen": 1.4848061800003052, + "logits/rejected": 1.4556670188903809, + "logps/chosen": -301.49517822265625, + "logps/rejected": -311.4028015136719, + "loss": 0.6213, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8877332806587219, + "rewards/margins": 0.2783505916595459, + "rewards/rejected": -1.166083812713623, + "step": 359 + }, + { + "epoch": 0.7694362810579749, + "grad_norm": 20.078171800579664, + "learning_rate": 7.588756023130833e-08, + "logits/chosen": 1.4349831342697144, + "logits/rejected": 1.5936346054077148, + "logps/chosen": -403.015380859375, + "logps/rejected": -404.7160339355469, + "loss": 0.5879, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9739541411399841, + "rewards/margins": 0.3197196125984192, + "rewards/rejected": -1.2936737537384033, + "step": 360 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 20.485624067682732, + "learning_rate": 7.455052427900213e-08, + "logits/chosen": 1.5565801858901978, + "logits/rejected": 1.6235904693603516, + "logps/chosen": -479.0734558105469, + "logps/rejected": -470.5107421875, + "loss": 0.6689, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.511535406112671, + "rewards/margins": 0.019254431128501892, + "rewards/rejected": -1.5307896137237549, + "step": 361 + }, + { + "epoch": 0.7737109270638525, + "grad_norm": 17.04191318961155, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": 1.7196496725082397, + "logits/rejected": 1.6668784618377686, + "logps/chosen": -298.79754638671875, + "logps/rejected": -327.07879638671875, + "loss": 0.6116, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.298088788986206, + "rewards/margins": 0.46125340461730957, + "rewards/rejected": -1.7593421936035156, + "step": 362 + }, + { + "epoch": 0.7758482500667914, + "grad_norm": 22.269060100698063, + "learning_rate": 7.190597576216384e-08, + "logits/chosen": 1.768049955368042, + "logits/rejected": 1.6878827810287476, + "logps/chosen": -308.0188903808594, + "logps/rejected": -323.5241394042969, + "loss": 0.6123, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6722269058227539, + "rewards/margins": 0.5267820954322815, + "rewards/rejected": -1.1990089416503906, + "step": 363 + }, + { + "epoch": 0.7779855730697302, + "grad_norm": 19.044197893042973, + "learning_rate": 7.059861115979701e-08, + "logits/chosen": 1.6526095867156982, + "logits/rejected": 1.7360644340515137, + "logps/chosen": -311.6914367675781, + "logps/rejected": -372.5617980957031, + "loss": 0.6656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48367545008659363, + "rewards/margins": 0.8194734454154968, + "rewards/rejected": -1.303148865699768, + "step": 364 + }, + { + "epoch": 0.7801228960726689, + "grad_norm": 18.813524672602487, + "learning_rate": 6.930128404315214e-08, + "logits/chosen": 1.5229746103286743, + "logits/rejected": 1.3987520933151245, + "logps/chosen": -377.3014221191406, + "logps/rejected": -419.2353515625, + "loss": 0.6065, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7370649576187134, + "rewards/margins": 0.3761173188686371, + "rewards/rejected": -2.1131820678710938, + "step": 365 + }, + { + "epoch": 0.7822602190756078, + "grad_norm": 17.713282448963408, + "learning_rate": 6.801406699752229e-08, + "logits/chosen": 1.6024835109710693, + "logits/rejected": 1.670932412147522, + "logps/chosen": -372.3557434082031, + "logps/rejected": -386.1446228027344, + "loss": 0.5762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2764437198638916, + "rewards/margins": 0.2568923830986023, + "rewards/rejected": -1.5333361625671387, + "step": 366 + }, + { + "epoch": 0.7843975420785466, + "grad_norm": 16.625075332510626, + "learning_rate": 6.673703204254347e-08, + "logits/chosen": 1.6393647193908691, + "logits/rejected": 1.6989482641220093, + "logps/chosen": -397.8586120605469, + "logps/rejected": -411.6268310546875, + "loss": 0.545, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.292647361755371, + "rewards/margins": 0.40703633427619934, + "rewards/rejected": -1.6996837854385376, + "step": 367 + }, + { + "epoch": 0.7865348650814854, + "grad_norm": 19.878935002956247, + "learning_rate": 6.547025062816486e-08, + "logits/chosen": 1.4603272676467896, + "logits/rejected": 1.3453279733657837, + "logps/chosen": -299.8364562988281, + "logps/rejected": -330.9093017578125, + "loss": 0.62, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6287568211555481, + "rewards/margins": 0.374176561832428, + "rewards/rejected": -1.002933382987976, + "step": 368 + }, + { + "epoch": 0.7886721880844243, + "grad_norm": 16.758828572680688, + "learning_rate": 6.42137936306514e-08, + "logits/chosen": 1.7640485763549805, + "logits/rejected": 1.8099982738494873, + "logps/chosen": -362.0057373046875, + "logps/rejected": -391.24224853515625, + "loss": 0.5984, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1697113513946533, + "rewards/margins": 0.047207579016685486, + "rewards/rejected": -1.2169188261032104, + "step": 369 + }, + { + "epoch": 0.7908095110873631, + "grad_norm": 21.845423639848487, + "learning_rate": 6.296773134861824e-08, + "logits/chosen": 1.6217718124389648, + "logits/rejected": 1.734086513519287, + "logps/chosen": -256.1837158203125, + "logps/rejected": -272.9673767089844, + "loss": 0.5846, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0282855033874512, + "rewards/margins": 0.41900819540023804, + "rewards/rejected": -1.447293758392334, + "step": 370 + }, + { + "epoch": 0.7929468340903019, + "grad_norm": 17.92986801930431, + "learning_rate": 6.173213349909728e-08, + "logits/chosen": 1.8200995922088623, + "logits/rejected": 1.7330509424209595, + "logps/chosen": -356.558837890625, + "logps/rejected": -348.9017028808594, + "loss": 0.6052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9982262849807739, + "rewards/margins": 0.14351393282413483, + "rewards/rejected": -1.1417402029037476, + "step": 371 + }, + { + "epoch": 0.7950841570932408, + "grad_norm": 17.081174619575314, + "learning_rate": 6.050706921363672e-08, + "logits/chosen": 1.8869644403457642, + "logits/rejected": 1.8416467905044556, + "logps/chosen": -354.5775146484375, + "logps/rejected": -388.51177978515625, + "loss": 0.6067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0341633558273315, + "rewards/margins": 0.3402591943740845, + "rewards/rejected": -1.3744226694107056, + "step": 372 + }, + { + "epoch": 0.7972214800961795, + "grad_norm": 17.662807478755507, + "learning_rate": 5.929260703443337e-08, + "logits/chosen": 1.3998608589172363, + "logits/rejected": 1.4698572158813477, + "logps/chosen": -449.04351806640625, + "logps/rejected": -449.3763122558594, + "loss": 0.585, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1849581003189087, + "rewards/margins": 0.4725034534931183, + "rewards/rejected": -1.6574615240097046, + "step": 373 + }, + { + "epoch": 0.7993588030991183, + "grad_norm": 17.725555922665713, + "learning_rate": 5.808881491049722e-08, + "logits/chosen": 1.6900920867919922, + "logits/rejected": 1.6598479747772217, + "logps/chosen": -412.2428894042969, + "logps/rejected": -422.5267333984375, + "loss": 0.5971, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0798150300979614, + "rewards/margins": 0.28746387362480164, + "rewards/rejected": -1.3672789335250854, + "step": 374 + }, + { + "epoch": 0.8014961261020572, + "grad_norm": 17.16991579530448, + "learning_rate": 5.6895760193850145e-08, + "logits/chosen": 1.4595896005630493, + "logits/rejected": 1.4326881170272827, + "logps/chosen": -408.4560852050781, + "logps/rejected": -405.8981018066406, + "loss": 0.6283, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.646263599395752, + "rewards/margins": 0.3666486144065857, + "rewards/rejected": -2.0129122734069824, + "step": 375 + }, + { + "epoch": 0.803633449104996, + "grad_norm": 17.93410549937266, + "learning_rate": 5.571350963575727e-08, + "logits/chosen": 1.743166208267212, + "logits/rejected": 1.6486046314239502, + "logps/chosen": -445.26312255859375, + "logps/rejected": -459.120849609375, + "loss": 0.6159, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.186528205871582, + "rewards/margins": 0.33439502120018005, + "rewards/rejected": -1.520923376083374, + "step": 376 + }, + { + "epoch": 0.8057707721079348, + "grad_norm": 16.9845646295775, + "learning_rate": 5.454212938299255e-08, + "logits/chosen": 1.9025753736495972, + "logits/rejected": 1.917330265045166, + "logps/chosen": -345.7820129394531, + "logps/rejected": -331.1052551269531, + "loss": 0.6072, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7959749102592468, + "rewards/margins": 0.298651784658432, + "rewards/rejected": -1.0946266651153564, + "step": 377 + }, + { + "epoch": 0.8079080951108736, + "grad_norm": 16.873032439456804, + "learning_rate": 5.338168497413756e-08, + "logits/chosen": 2.3049681186676025, + "logits/rejected": 2.2500970363616943, + "logps/chosen": -402.2143249511719, + "logps/rejected": -402.95166015625, + "loss": 0.537, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9636644124984741, + "rewards/margins": 0.5833888053894043, + "rewards/rejected": -1.5470532178878784, + "step": 378 + }, + { + "epoch": 0.8100454181138125, + "grad_norm": 16.325007461853215, + "learning_rate": 5.223224133591475e-08, + "logits/chosen": 1.597383737564087, + "logits/rejected": 1.609183669090271, + "logps/chosen": -288.2392578125, + "logps/rejected": -337.51092529296875, + "loss": 0.5317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7511001825332642, + "rewards/margins": 0.693979799747467, + "rewards/rejected": -1.445080041885376, + "step": 379 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 19.572806409831877, + "learning_rate": 5.109386277955477e-08, + "logits/chosen": 1.6565470695495605, + "logits/rejected": 1.6742223501205444, + "logps/chosen": -377.81939697265625, + "logps/rejected": -387.5241394042969, + "loss": 0.5908, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6612589359283447, + "rewards/margins": 0.5410428047180176, + "rewards/rejected": -1.2023016214370728, + "step": 380 + }, + { + "epoch": 0.81432006411969, + "grad_norm": 19.520950615429946, + "learning_rate": 4.996661299719845e-08, + "logits/chosen": 1.8140640258789062, + "logits/rejected": 1.7656865119934082, + "logps/chosen": -510.03521728515625, + "logps/rejected": -507.64166259765625, + "loss": 0.5909, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2178007364273071, + "rewards/margins": 0.18270380795001984, + "rewards/rejected": -1.4005045890808105, + "step": 381 + }, + { + "epoch": 0.8164573871226289, + "grad_norm": 16.127377849885388, + "learning_rate": 4.885055505833291e-08, + "logits/chosen": 1.644123911857605, + "logits/rejected": 1.6220022439956665, + "logps/chosen": -386.73663330078125, + "logps/rejected": -393.9167175292969, + "loss": 0.5988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1794971227645874, + "rewards/margins": 0.3469049632549286, + "rewards/rejected": -1.5264021158218384, + "step": 382 + }, + { + "epoch": 0.8185947101255677, + "grad_norm": 17.73846719380124, + "learning_rate": 4.774575140626316e-08, + "logits/chosen": 1.3664933443069458, + "logits/rejected": 1.4751458168029785, + "logps/chosen": -344.3662109375, + "logps/rejected": -390.3739013671875, + "loss": 0.5697, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9151713848114014, + "rewards/margins": 0.487021267414093, + "rewards/rejected": -1.40219247341156, + "step": 383 + }, + { + "epoch": 0.8207320331285065, + "grad_norm": 17.072732271001154, + "learning_rate": 4.6652263854618016e-08, + "logits/chosen": 1.934602975845337, + "logits/rejected": 2.0105841159820557, + "logps/chosen": -426.7675476074219, + "logps/rejected": -451.0812683105469, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2221730947494507, + "rewards/margins": 0.48440563678741455, + "rewards/rejected": -1.7065787315368652, + "step": 384 + }, + { + "epoch": 0.8228693561314454, + "grad_norm": 21.293620391850812, + "learning_rate": 4.557015358389216e-08, + "logits/chosen": 2.1146156787872314, + "logits/rejected": 2.1517491340637207, + "logps/chosen": -464.082763671875, + "logps/rejected": -470.5016784667969, + "loss": 0.6409, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3467998504638672, + "rewards/margins": 0.40472251176834106, + "rewards/rejected": -1.751522421836853, + "step": 385 + }, + { + "epoch": 0.8250066791343842, + "grad_norm": 17.833652532065038, + "learning_rate": 4.449948113802254e-08, + "logits/chosen": 1.807660698890686, + "logits/rejected": 1.7980822324752808, + "logps/chosen": -438.4178771972656, + "logps/rejected": -453.2962951660156, + "loss": 0.6096, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.104383111000061, + "rewards/margins": 0.7033094167709351, + "rewards/rejected": -1.807692527770996, + "step": 386 + }, + { + "epoch": 0.827144002137323, + "grad_norm": 19.571597961934124, + "learning_rate": 4.3440306421001324e-08, + "logits/chosen": 1.7510319948196411, + "logits/rejected": 1.6834566593170166, + "logps/chosen": -484.4024658203125, + "logps/rejected": -485.3985900878906, + "loss": 0.6339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0107614994049072, + "rewards/margins": 0.25170525908470154, + "rewards/rejected": -1.2624667882919312, + "step": 387 + }, + { + "epoch": 0.8292813251402619, + "grad_norm": 17.82974603873972, + "learning_rate": 4.2392688693524055e-08, + "logits/chosen": 1.404268741607666, + "logits/rejected": 1.3694736957550049, + "logps/chosen": -408.179443359375, + "logps/rejected": -406.07086181640625, + "loss": 0.5722, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0687264204025269, + "rewards/margins": 0.3735535144805908, + "rewards/rejected": -1.4422800540924072, + "step": 388 + }, + { + "epoch": 0.8314186481432007, + "grad_norm": 17.438473933793833, + "learning_rate": 4.1356686569674335e-08, + "logits/chosen": 1.3533443212509155, + "logits/rejected": 1.4366700649261475, + "logps/chosen": -380.77960205078125, + "logps/rejected": -405.69097900390625, + "loss": 0.6209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9108083844184875, + "rewards/margins": 0.4199364483356476, + "rewards/rejected": -1.330744981765747, + "step": 389 + }, + { + "epoch": 0.8335559711461394, + "grad_norm": 20.355804196982337, + "learning_rate": 4.0332358013644015e-08, + "logits/chosen": 1.4978055953979492, + "logits/rejected": 1.5026313066482544, + "logps/chosen": -390.0057373046875, + "logps/rejected": -386.77313232421875, + "loss": 0.5277, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7286229133605957, + "rewards/margins": 0.4630648195743561, + "rewards/rejected": -1.191687822341919, + "step": 390 + }, + { + "epoch": 0.8356932941490782, + "grad_norm": 18.45237542669808, + "learning_rate": 3.9319760336490205e-08, + "logits/chosen": 1.5355026721954346, + "logits/rejected": 1.543558120727539, + "logps/chosen": -339.8648986816406, + "logps/rejected": -330.8528747558594, + "loss": 0.5491, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5863010883331299, + "rewards/margins": 0.04436583071947098, + "rewards/rejected": -1.630666971206665, + "step": 391 + }, + { + "epoch": 0.8378306171520171, + "grad_norm": 16.374379157073367, + "learning_rate": 3.831895019292897e-08, + "logits/chosen": 1.8390171527862549, + "logits/rejected": 1.98207426071167, + "logps/chosen": -447.08380126953125, + "logps/rejected": -459.7073974609375, + "loss": 0.5677, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8593235015869141, + "rewards/margins": 0.2716907858848572, + "rewards/rejected": -1.131014347076416, + "step": 392 + }, + { + "epoch": 0.8399679401549559, + "grad_norm": 19.305861548140808, + "learning_rate": 3.732998357816514e-08, + "logits/chosen": 1.5250623226165771, + "logits/rejected": 1.6143085956573486, + "logps/chosen": -306.3028564453125, + "logps/rejected": -309.7184143066406, + "loss": 0.6167, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2037074565887451, + "rewards/margins": 0.11004292964935303, + "rewards/rejected": -1.3137503862380981, + "step": 393 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 20.03718277534217, + "learning_rate": 3.635291582475963e-08, + "logits/chosen": 1.9921854734420776, + "logits/rejected": 2.0196597576141357, + "logps/chosen": -310.9258728027344, + "logps/rejected": -345.7138366699219, + "loss": 0.6297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.169144868850708, + "rewards/margins": 0.5138453245162964, + "rewards/rejected": -1.682990312576294, + "step": 394 + }, + { + "epoch": 0.8442425861608336, + "grad_norm": 28.142649519687087, + "learning_rate": 3.538780159953347e-08, + "logits/chosen": 0.4998638331890106, + "logits/rejected": 0.8166034817695618, + "logps/chosen": -302.5617980957031, + "logps/rejected": -285.3108825683594, + "loss": 0.6362, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.474445104598999, + "rewards/margins": 0.5089064240455627, + "rewards/rejected": -1.9833515882492065, + "step": 395 + }, + { + "epoch": 0.8463799091637724, + "grad_norm": 16.81382076431638, + "learning_rate": 3.4434694900509345e-08, + "logits/chosen": 2.299107789993286, + "logits/rejected": 2.3635454177856445, + "logps/chosen": -444.2193603515625, + "logps/rejected": -458.1888732910156, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7250373363494873, + "rewards/margins": 0.566501259803772, + "rewards/rejected": -1.2915387153625488, + "step": 396 + }, + { + "epoch": 0.8485172321667112, + "grad_norm": 19.392060798939607, + "learning_rate": 3.349364905389032e-08, + "logits/chosen": 1.8515828847885132, + "logits/rejected": 1.804177165031433, + "logps/chosen": -409.74365234375, + "logps/rejected": -418.00323486328125, + "loss": 0.63, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2581006288528442, + "rewards/margins": 0.11751668900251389, + "rewards/rejected": -1.375617265701294, + "step": 397 + }, + { + "epoch": 0.85065455516965, + "grad_norm": 17.103729406158553, + "learning_rate": 3.256471671107616e-08, + "logits/chosen": 1.6766064167022705, + "logits/rejected": 1.737638235092163, + "logps/chosen": -428.44775390625, + "logps/rejected": -407.27215576171875, + "loss": 0.5907, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2123147249221802, + "rewards/margins": 0.46986332535743713, + "rewards/rejected": -1.6821781396865845, + "step": 398 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 19.577154726036273, + "learning_rate": 3.1647949845717585e-08, + "logits/chosen": 1.630979299545288, + "logits/rejected": 1.6724216938018799, + "logps/chosen": -348.2309265136719, + "logps/rejected": -371.38079833984375, + "loss": 0.5965, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.337809681892395, + "rewards/margins": 0.36785420775413513, + "rewards/rejected": -1.7056639194488525, + "step": 399 + }, + { + "epoch": 0.8549292011755276, + "grad_norm": 18.826846500494238, + "learning_rate": 3.074339975080836e-08, + "logits/chosen": 1.4629188776016235, + "logits/rejected": 1.4965126514434814, + "logps/chosen": -372.1953125, + "logps/rejected": -401.7340087890625, + "loss": 0.62, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5468955039978027, + "rewards/margins": 0.15613311529159546, + "rewards/rejected": -1.703028678894043, + "step": 400 + }, + { + "epoch": 0.8549292011755276, + "eval_logits/chosen": 1.6708999872207642, + "eval_logits/rejected": 1.6820895671844482, + "eval_logps/chosen": -403.0909729003906, + "eval_logps/rejected": -433.6640930175781, + "eval_loss": 0.61039799451828, + "eval_rewards/accuracies": 0.6975806355476379, + "eval_rewards/chosen": -1.0658750534057617, + "eval_rewards/margins": 0.3874683082103729, + "eval_rewards/rejected": -1.4533432722091675, + "eval_runtime": 88.4828, + "eval_samples_per_second": 22.162, + "eval_steps_per_second": 0.701, + "step": 400 + }, + { + "epoch": 0.8570665241784665, + "grad_norm": 19.7974029204823, + "learning_rate": 2.98511170358155e-08, + "logits/chosen": 1.7001169919967651, + "logits/rejected": 1.532285213470459, + "logps/chosen": -407.736083984375, + "logps/rejected": -362.34912109375, + "loss": 0.6207, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1236542463302612, + "rewards/margins": 0.419066846370697, + "rewards/rejected": -1.542720913887024, + "step": 401 + }, + { + "epoch": 0.8592038471814053, + "grad_norm": 21.14677537275842, + "learning_rate": 2.8971151623847584e-08, + "logits/chosen": 1.7446285486221313, + "logits/rejected": 1.785717487335205, + "logps/chosen": -397.1467590332031, + "logps/rejected": -435.0672302246094, + "loss": 0.6311, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2843698263168335, + "rewards/margins": 0.144765704870224, + "rewards/rejected": -1.4291354417800903, + "step": 402 + }, + { + "epoch": 0.8613411701843441, + "grad_norm": 18.56274549083658, + "learning_rate": 2.8103552748861475e-08, + "logits/chosen": 1.6394376754760742, + "logits/rejected": 1.714260220527649, + "logps/chosen": -428.9457092285156, + "logps/rejected": -443.63946533203125, + "loss": 0.6151, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0276083946228027, + "rewards/margins": 0.4798241853713989, + "rewards/rejected": -1.5074325799942017, + "step": 403 + }, + { + "epoch": 0.863478493187283, + "grad_norm": 18.223738666005126, + "learning_rate": 2.724836895290805e-08, + "logits/chosen": 2.072843074798584, + "logits/rejected": 2.2029428482055664, + "logps/chosen": -372.4205017089844, + "logps/rejected": -394.25372314453125, + "loss": 0.5865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1573227643966675, + "rewards/margins": 0.34818750619888306, + "rewards/rejected": -1.5055103302001953, + "step": 404 + }, + { + "epoch": 0.8656158161902218, + "grad_norm": 16.462464022149383, + "learning_rate": 2.6405648083415833e-08, + "logits/chosen": 1.7049862146377563, + "logits/rejected": 1.6428170204162598, + "logps/chosen": -384.18572998046875, + "logps/rejected": -400.8232421875, + "loss": 0.5885, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0246027708053589, + "rewards/margins": 0.5153809189796448, + "rewards/rejected": -1.5399837493896484, + "step": 405 + }, + { + "epoch": 0.8677531391931605, + "grad_norm": 17.711142089994855, + "learning_rate": 2.55754372905142e-08, + "logits/chosen": 1.3477802276611328, + "logits/rejected": 1.3108845949172974, + "logps/chosen": -384.1131591796875, + "logps/rejected": -423.6861267089844, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3002797365188599, + "rewards/margins": 0.2088843584060669, + "rewards/rejected": -1.5091642141342163, + "step": 406 + }, + { + "epoch": 0.8698904621960993, + "grad_norm": 18.762054039394492, + "learning_rate": 2.475778302439524e-08, + "logits/chosen": 1.4159724712371826, + "logits/rejected": 1.4871361255645752, + "logps/chosen": -470.30670166015625, + "logps/rejected": -511.0997314453125, + "loss": 0.6385, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4217987060546875, + "rewards/margins": 0.12027350813150406, + "rewards/rejected": -1.5420721769332886, + "step": 407 + }, + { + "epoch": 0.8720277851990382, + "grad_norm": 18.123731183123528, + "learning_rate": 2.3952731032714973e-08, + "logits/chosen": 2.404209613800049, + "logits/rejected": 2.4266862869262695, + "logps/chosen": -427.8636169433594, + "logps/rejected": -422.4261474609375, + "loss": 0.6259, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2890108823776245, + "rewards/margins": 0.378464937210083, + "rewards/rejected": -1.667475700378418, + "step": 408 + }, + { + "epoch": 0.874165108201977, + "grad_norm": 16.47161779285551, + "learning_rate": 2.3160326358033778e-08, + "logits/chosen": 1.366338849067688, + "logits/rejected": 1.3981362581253052, + "logps/chosen": -380.618408203125, + "logps/rejected": -390.28057861328125, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7875087261199951, + "rewards/margins": 0.5520244836807251, + "rewards/rejected": -1.3395333290100098, + "step": 409 + }, + { + "epoch": 0.8763024312049158, + "grad_norm": 17.122362330907116, + "learning_rate": 2.2380613335296033e-08, + "logits/chosen": 1.8973968029022217, + "logits/rejected": 1.8657612800598145, + "logps/chosen": -510.1373291015625, + "logps/rejected": -504.5380859375, + "loss": 0.5772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1060776710510254, + "rewards/margins": 0.7315502762794495, + "rewards/rejected": -1.8376280069351196, + "step": 410 + }, + { + "epoch": 0.8784397542078547, + "grad_norm": 18.921033730633166, + "learning_rate": 2.1613635589349756e-08, + "logits/chosen": 1.4683369398117065, + "logits/rejected": 1.5493369102478027, + "logps/chosen": -384.0714111328125, + "logps/rejected": -391.6690673828125, + "loss": 0.6087, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4163315296173096, + "rewards/margins": 0.286923348903656, + "rewards/rejected": -1.7032545804977417, + "step": 411 + }, + { + "epoch": 0.8805770772107935, + "grad_norm": 18.56812402329405, + "learning_rate": 2.085943603250595e-08, + "logits/chosen": 2.2680721282958984, + "logits/rejected": 2.329545259475708, + "logps/chosen": -436.45550537109375, + "logps/rejected": -465.65167236328125, + "loss": 0.6226, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.101943016052246, + "rewards/margins": 0.39092782139778137, + "rewards/rejected": -1.4928709268569946, + "step": 412 + }, + { + "epoch": 0.8827144002137323, + "grad_norm": 17.23421202374209, + "learning_rate": 2.0118056862137354e-08, + "logits/chosen": 1.721946120262146, + "logits/rejected": 1.654489517211914, + "logps/chosen": -283.10394287109375, + "logps/rejected": -316.85931396484375, + "loss": 0.5663, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0829007625579834, + "rewards/margins": 0.6879382729530334, + "rewards/rejected": -1.7708390951156616, + "step": 413 + }, + { + "epoch": 0.8848517232166712, + "grad_norm": 17.24040512239067, + "learning_rate": 1.938953955831771e-08, + "logits/chosen": 1.8397215604782104, + "logits/rejected": 1.8681014776229858, + "logps/chosen": -348.84930419921875, + "logps/rejected": -354.12872314453125, + "loss": 0.552, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1776130199432373, + "rewards/margins": 0.4248766303062439, + "rewards/rejected": -1.602489948272705, + "step": 414 + }, + { + "epoch": 0.88698904621961, + "grad_norm": 18.359788171946988, + "learning_rate": 1.8673924881500823e-08, + "logits/chosen": 1.946815848350525, + "logits/rejected": 1.872100591659546, + "logps/chosen": -431.5869140625, + "logps/rejected": -441.61566162109375, + "loss": 0.6415, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.366295576095581, + "rewards/margins": 0.09268444776535034, + "rewards/rejected": -1.4589800834655762, + "step": 415 + }, + { + "epoch": 0.8891263692225487, + "grad_norm": 19.161300804849038, + "learning_rate": 1.797125287024029e-08, + "logits/chosen": 1.4459537267684937, + "logits/rejected": 1.4782161712646484, + "logps/chosen": -321.7972412109375, + "logps/rejected": -357.9532775878906, + "loss": 0.5477, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3282475471496582, + "rewards/margins": 0.6495993733406067, + "rewards/rejected": -1.9778469800949097, + "step": 416 + }, + { + "epoch": 0.8912636922254876, + "grad_norm": 18.975245958523743, + "learning_rate": 1.7281562838948966e-08, + "logits/chosen": 1.9562761783599854, + "logits/rejected": 1.9078611135482788, + "logps/chosen": -411.2685241699219, + "logps/rejected": -417.2503967285156, + "loss": 0.6413, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3274304866790771, + "rewards/margins": 0.19440825283527374, + "rewards/rejected": -1.5218387842178345, + "step": 417 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 17.631994700003954, + "learning_rate": 1.6604893375699592e-08, + "logits/chosen": 1.4374586343765259, + "logits/rejected": 1.4749794006347656, + "logps/chosen": -467.3548583984375, + "logps/rejected": -471.19305419921875, + "loss": 0.5571, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3381353616714478, + "rewards/margins": 0.5423804521560669, + "rewards/rejected": -1.8805158138275146, + "step": 418 + }, + { + "epoch": 0.8955383382313652, + "grad_norm": 18.607931434550764, + "learning_rate": 1.5941282340065697e-08, + "logits/chosen": 1.6795357465744019, + "logits/rejected": 1.4486351013183594, + "logps/chosen": -454.1962890625, + "logps/rejected": -454.06036376953125, + "loss": 0.5847, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1021569967269897, + "rewards/margins": 0.6963544487953186, + "rewards/rejected": -1.7985113859176636, + "step": 419 + }, + { + "epoch": 0.897675661234304, + "grad_norm": 18.496258966475576, + "learning_rate": 1.5290766861003475e-08, + "logits/chosen": 1.4845163822174072, + "logits/rejected": 1.4758884906768799, + "logps/chosen": -339.92315673828125, + "logps/rejected": -377.06219482421875, + "loss": 0.6292, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4483891725540161, + "rewards/margins": 0.5280267000198364, + "rewards/rejected": -1.9764158725738525, + "step": 420 + }, + { + "epoch": 0.8998129842372429, + "grad_norm": 18.43461695598332, + "learning_rate": 1.4653383334774228e-08, + "logits/chosen": 1.790908694267273, + "logits/rejected": 1.859453797340393, + "logps/chosen": -349.16796875, + "logps/rejected": -384.42059326171875, + "loss": 0.5825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3110591173171997, + "rewards/margins": 0.5185103416442871, + "rewards/rejected": -1.8295693397521973, + "step": 421 + }, + { + "epoch": 0.9019503072401817, + "grad_norm": 17.81347366573304, + "learning_rate": 1.4029167422908105e-08, + "logits/chosen": 2.2679755687713623, + "logits/rejected": 2.147033214569092, + "logps/chosen": -546.00732421875, + "logps/rejected": -525.1266479492188, + "loss": 0.5897, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2079718112945557, + "rewards/margins": 0.28895166516304016, + "rewards/rejected": -1.4969233274459839, + "step": 422 + }, + { + "epoch": 0.9040876302431204, + "grad_norm": 18.982282659913977, + "learning_rate": 1.3418154050208936e-08, + "logits/chosen": 1.8013602495193481, + "logits/rejected": 1.6778424978256226, + "logps/chosen": -425.6045227050781, + "logps/rejected": -390.84942626953125, + "loss": 0.5949, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9693441390991211, + "rewards/margins": 0.3830145001411438, + "rewards/rejected": -1.3523586988449097, + "step": 423 + }, + { + "epoch": 0.9062249532460593, + "grad_norm": 17.345552565300007, + "learning_rate": 1.2820377402800064e-08, + "logits/chosen": 1.6084281206130981, + "logits/rejected": 1.5029420852661133, + "logps/chosen": -253.730712890625, + "logps/rejected": -260.1260681152344, + "loss": 0.6188, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9357234835624695, + "rewards/margins": 0.2174171507358551, + "rewards/rejected": -1.1531407833099365, + "step": 424 + }, + { + "epoch": 0.9083622762489981, + "grad_norm": 16.934150955880476, + "learning_rate": 1.2235870926211616e-08, + "logits/chosen": 2.1916890144348145, + "logits/rejected": 2.2400388717651367, + "logps/chosen": -285.0917663574219, + "logps/rejected": -322.8558349609375, + "loss": 0.5861, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.114344835281372, + "rewards/margins": 0.6048067808151245, + "rewards/rejected": -1.7191518545150757, + "step": 425 + }, + { + "epoch": 0.9104995992519369, + "grad_norm": 16.78887791389279, + "learning_rate": 1.1664667323509347e-08, + "logits/chosen": 1.4608023166656494, + "logits/rejected": 1.4370110034942627, + "logps/chosen": -377.3074645996094, + "logps/rejected": -393.9685363769531, + "loss": 0.5724, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.325226902961731, + "rewards/margins": 0.3096981942653656, + "rewards/rejected": -1.634925127029419, + "step": 426 + }, + { + "epoch": 0.9126369222548758, + "grad_norm": 16.418296977579498, + "learning_rate": 1.1106798553464802e-08, + "logits/chosen": 2.0858511924743652, + "logits/rejected": 2.105901002883911, + "logps/chosen": -367.344970703125, + "logps/rejected": -403.23944091796875, + "loss": 0.5597, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9577431678771973, + "rewards/margins": 0.36515700817108154, + "rewards/rejected": -1.3229001760482788, + "step": 427 + }, + { + "epoch": 0.9147742452578146, + "grad_norm": 19.245837329047824, + "learning_rate": 1.0562295828767387e-08, + "logits/chosen": 1.3938257694244385, + "logits/rejected": 1.4853570461273193, + "logps/chosen": -355.104248046875, + "logps/rejected": -412.22222900390625, + "loss": 0.5855, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0600641965866089, + "rewards/margins": 0.5212962627410889, + "rewards/rejected": -1.5813604593276978, + "step": 428 + }, + { + "epoch": 0.9169115682607534, + "grad_norm": 17.51716706952601, + "learning_rate": 1.0031189614277763e-08, + "logits/chosen": 1.5417289733886719, + "logits/rejected": 1.5729870796203613, + "logps/chosen": -321.3535461425781, + "logps/rejected": -309.21234130859375, + "loss": 0.5932, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.24485182762146, + "rewards/margins": 0.23819807171821594, + "rewards/rejected": -1.483049750328064, + "step": 429 + }, + { + "epoch": 0.9190488912636923, + "grad_norm": 15.052027680122107, + "learning_rate": 9.513509625323518e-09, + "logits/chosen": 1.9989356994628906, + "logits/rejected": 2.145040273666382, + "logps/chosen": -402.7864685058594, + "logps/rejected": -425.85296630859375, + "loss": 0.5434, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2284355163574219, + "rewards/margins": 0.4855433702468872, + "rewards/rejected": -1.713978886604309, + "step": 430 + }, + { + "epoch": 0.921186214266631, + "grad_norm": 19.85542406922844, + "learning_rate": 9.009284826036689e-09, + "logits/chosen": 1.4120731353759766, + "logits/rejected": 1.4285330772399902, + "logps/chosen": -397.44110107421875, + "logps/rejected": -441.92449951171875, + "loss": 0.5901, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.050657868385315, + "rewards/margins": 0.8108992576599121, + "rewards/rejected": -1.8615570068359375, + "step": 431 + }, + { + "epoch": 0.9233235372695698, + "grad_norm": 19.008344723565617, + "learning_rate": 8.518543427732949e-09, + "logits/chosen": 1.5640474557876587, + "logits/rejected": 1.5279781818389893, + "logps/chosen": -368.58758544921875, + "logps/rejected": -412.3382568359375, + "loss": 0.6518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3109221458435059, + "rewards/margins": 0.4531381130218506, + "rewards/rejected": -1.7640602588653564, + "step": 432 + }, + { + "epoch": 0.9254608602725087, + "grad_norm": 17.725519985685008, + "learning_rate": 8.041312887333396e-09, + "logits/chosen": 1.6748428344726562, + "logits/rejected": 1.7114133834838867, + "logps/chosen": -323.8331298828125, + "logps/rejected": -365.55615234375, + "loss": 0.6122, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0393937826156616, + "rewards/margins": 0.4052044153213501, + "rewards/rejected": -1.4445981979370117, + "step": 433 + }, + { + "epoch": 0.9275981832754475, + "grad_norm": 17.42277444922853, + "learning_rate": 7.577619905828281e-09, + "logits/chosen": 1.5716729164123535, + "logits/rejected": 1.6077600717544556, + "logps/chosen": -355.86785888671875, + "logps/rejected": -387.0648498535156, + "loss": 0.5723, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3303015232086182, + "rewards/margins": 0.4627699553966522, + "rewards/rejected": -1.7930715084075928, + "step": 434 + }, + { + "epoch": 0.9297355062783863, + "grad_norm": 17.161022534670806, + "learning_rate": 7.127490426783123e-09, + "logits/chosen": 1.7443987131118774, + "logits/rejected": 1.7588456869125366, + "logps/chosen": -477.2964172363281, + "logps/rejected": -463.51165771484375, + "loss": 0.6176, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5801703929901123, + "rewards/margins": 0.22587111592292786, + "rewards/rejected": -1.8060415983200073, + "step": 435 + }, + { + "epoch": 0.9318728292813251, + "grad_norm": 23.430770890245924, + "learning_rate": 6.6909496348871445e-09, + "logits/chosen": 1.8086642026901245, + "logits/rejected": 1.8920540809631348, + "logps/chosen": -347.9205322265625, + "logps/rejected": -364.6381530761719, + "loss": 0.6435, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6429501175880432, + "rewards/margins": 0.5737862586975098, + "rewards/rejected": -1.2167364358901978, + "step": 436 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 20.41363056479245, + "learning_rate": 6.268021954544095e-09, + "logits/chosen": 1.019057273864746, + "logits/rejected": 0.9966921806335449, + "logps/chosen": -283.1648864746094, + "logps/rejected": -311.0303039550781, + "loss": 0.651, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9427959322929382, + "rewards/margins": 0.709195077419281, + "rewards/rejected": -1.6519910097122192, + "step": 437 + }, + { + "epoch": 0.9361474752872028, + "grad_norm": 16.18819460648659, + "learning_rate": 5.858731048505927e-09, + "logits/chosen": 1.7406284809112549, + "logits/rejected": 1.8502442836761475, + "logps/chosen": -348.90777587890625, + "logps/rejected": -371.7511291503906, + "loss": 0.5833, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2307209968566895, + "rewards/margins": 0.5154100656509399, + "rewards/rejected": -1.7461309432983398, + "step": 438 + }, + { + "epoch": 0.9382847982901416, + "grad_norm": 19.014687420050503, + "learning_rate": 5.463099816548577e-09, + "logits/chosen": 1.6393160820007324, + "logits/rejected": 1.7532588243484497, + "logps/chosen": -459.3277282714844, + "logps/rejected": -491.35870361328125, + "loss": 0.555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3520638942718506, + "rewards/margins": 0.6467914581298828, + "rewards/rejected": -1.9988553524017334, + "step": 439 + }, + { + "epoch": 0.9404221212930804, + "grad_norm": 21.2096647419158, + "learning_rate": 5.08115039419113e-09, + "logits/chosen": 1.6956130266189575, + "logits/rejected": 1.79883873462677, + "logps/chosen": -414.0579528808594, + "logps/rejected": -428.4068298339844, + "loss": 0.5805, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1613951921463013, + "rewards/margins": 0.7041161060333252, + "rewards/rejected": -1.865511178970337, + "step": 440 + }, + { + "epoch": 0.9425594442960192, + "grad_norm": 17.17027730259822, + "learning_rate": 4.712904151456864e-09, + "logits/chosen": 1.5854616165161133, + "logits/rejected": 1.6955193281173706, + "logps/chosen": -409.63006591796875, + "logps/rejected": -437.7228088378906, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9472392797470093, + "rewards/margins": 0.22789564728736877, + "rewards/rejected": -1.1751350164413452, + "step": 441 + }, + { + "epoch": 0.944696767298958, + "grad_norm": 20.349000792886113, + "learning_rate": 4.358381691677931e-09, + "logits/chosen": 1.95444917678833, + "logits/rejected": 2.0044314861297607, + "logps/chosen": -442.7948303222656, + "logps/rejected": -469.05487060546875, + "loss": 0.6312, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7205852270126343, + "rewards/margins": 0.20369315147399902, + "rewards/rejected": -1.9242782592773438, + "step": 442 + }, + { + "epoch": 0.9468340903018969, + "grad_norm": 19.194373432794922, + "learning_rate": 4.0176028503425826e-09, + "logits/chosen": 0.7825450897216797, + "logits/rejected": 0.7275460362434387, + "logps/chosen": -357.76983642578125, + "logps/rejected": -377.4739074707031, + "loss": 0.6453, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2489838600158691, + "rewards/margins": -0.00024299323558807373, + "rewards/rejected": -1.2487409114837646, + "step": 443 + }, + { + "epoch": 0.9489714133048357, + "grad_norm": 18.315036023173974, + "learning_rate": 3.6905866939851983e-09, + "logits/chosen": 1.3465955257415771, + "logits/rejected": 1.50806725025177, + "logps/chosen": -312.5934753417969, + "logps/rejected": -344.8988952636719, + "loss": 0.5825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9831118583679199, + "rewards/margins": 0.4171569347381592, + "rewards/rejected": -1.4002689123153687, + "step": 444 + }, + { + "epoch": 0.9511087363077745, + "grad_norm": 20.014785567703672, + "learning_rate": 3.3773515191196646e-09, + "logits/chosen": 1.5976628065109253, + "logits/rejected": 1.6187068223953247, + "logps/chosen": -359.6556091308594, + "logps/rejected": -336.0928649902344, + "loss": 0.6265, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2695033550262451, + "rewards/margins": 0.45331743359565735, + "rewards/rejected": -1.72282075881958, + "step": 445 + }, + { + "epoch": 0.9532460593107134, + "grad_norm": 17.72727377899664, + "learning_rate": 3.077914851215585e-09, + "logits/chosen": 1.5538979768753052, + "logits/rejected": 1.7278094291687012, + "logps/chosen": -372.9303283691406, + "logps/rejected": -364.25457763671875, + "loss": 0.5963, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1903377771377563, + "rewards/margins": 0.4384117126464844, + "rewards/rejected": -1.6287494897842407, + "step": 446 + }, + { + "epoch": 0.9553833823136522, + "grad_norm": 16.23642380668402, + "learning_rate": 2.7922934437178692e-09, + "logits/chosen": 1.7799421548843384, + "logits/rejected": 1.7960634231567383, + "logps/chosen": -444.5595397949219, + "logps/rejected": -479.6990966796875, + "loss": 0.543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.126379132270813, + "rewards/margins": 0.5162345170974731, + "rewards/rejected": -1.6426136493682861, + "step": 447 + }, + { + "epoch": 0.957520705316591, + "grad_norm": 19.101807236171627, + "learning_rate": 2.5205032771092592e-09, + "logits/chosen": 1.3535457849502563, + "logits/rejected": 1.2006926536560059, + "logps/chosen": -330.3343505859375, + "logps/rejected": -374.99310302734375, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1633282899856567, + "rewards/margins": 0.4544922709465027, + "rewards/rejected": -1.6178205013275146, + "step": 448 + }, + { + "epoch": 0.9596580283195298, + "grad_norm": 16.56693609193232, + "learning_rate": 2.2625595580163247e-09, + "logits/chosen": 2.0447781085968018, + "logits/rejected": 1.8950867652893066, + "logps/chosen": -429.4393005371094, + "logps/rejected": -414.31817626953125, + "loss": 0.5752, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2402998208999634, + "rewards/margins": 0.6825169324874878, + "rewards/rejected": -1.9228168725967407, + "step": 449 + }, + { + "epoch": 0.9617953513224686, + "grad_norm": 16.980928002541575, + "learning_rate": 2.0184767183584474e-09, + "logits/chosen": 1.4854786396026611, + "logits/rejected": 1.4846787452697754, + "logps/chosen": -308.0284118652344, + "logps/rejected": -305.34515380859375, + "loss": 0.6133, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0690131187438965, + "rewards/margins": 0.16039828956127167, + "rewards/rejected": -1.2294113636016846, + "step": 450 + }, + { + "epoch": 0.9639326743254074, + "grad_norm": 18.056225979920594, + "learning_rate": 1.7882684145406612e-09, + "logits/chosen": 2.019209146499634, + "logits/rejected": 2.1171696186065674, + "logps/chosen": -350.12774658203125, + "logps/rejected": -378.6883850097656, + "loss": 0.5985, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2527142763137817, + "rewards/margins": 0.3824135661125183, + "rewards/rejected": -1.6351279020309448, + "step": 451 + }, + { + "epoch": 0.9660699973283462, + "grad_norm": 17.543262127192886, + "learning_rate": 1.5719475266893489e-09, + "logits/chosen": 1.4318145513534546, + "logits/rejected": 1.5697556734085083, + "logps/chosen": -303.3445129394531, + "logps/rejected": -348.2174377441406, + "loss": 0.5964, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.186130404472351, + "rewards/margins": 0.7902547121047974, + "rewards/rejected": -1.9763849973678589, + "step": 452 + }, + { + "epoch": 0.9682073203312851, + "grad_norm": 18.220000876315183, + "learning_rate": 1.3695261579316775e-09, + "logits/chosen": 1.6201151609420776, + "logits/rejected": 1.7068089246749878, + "logps/chosen": -370.4643859863281, + "logps/rejected": -408.29852294921875, + "loss": 0.6176, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4112707376480103, + "rewards/margins": 0.37669438123703003, + "rewards/rejected": -1.787965178489685, + "step": 453 + }, + { + "epoch": 0.9703446433342239, + "grad_norm": 16.929529214225397, + "learning_rate": 1.1810156337183908e-09, + "logits/chosen": 1.9085679054260254, + "logits/rejected": 2.029019832611084, + "logps/chosen": -444.6353454589844, + "logps/rejected": -482.870849609375, + "loss": 0.6378, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.495185375213623, + "rewards/margins": -0.083378866314888, + "rewards/rejected": -1.411806344985962, + "step": 454 + }, + { + "epoch": 0.9724819663371627, + "grad_norm": 19.141724017680925, + "learning_rate": 1.0064265011902328e-09, + "logits/chosen": 0.8631810545921326, + "logits/rejected": 0.833740234375, + "logps/chosen": -340.0003356933594, + "logps/rejected": -393.02728271484375, + "loss": 0.6086, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.579225778579712, + "rewards/margins": 0.29944878816604614, + "rewards/rejected": -1.8786746263504028, + "step": 455 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 16.719925817090374, + "learning_rate": 8.457685285878091e-10, + "logits/chosen": 2.0139966011047363, + "logits/rejected": 2.11411714553833, + "logps/chosen": -473.91241455078125, + "logps/rejected": -476.2071533203125, + "loss": 0.5955, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1062893867492676, + "rewards/margins": 0.22559532523155212, + "rewards/rejected": -1.331884741783142, + "step": 456 + }, + { + "epoch": 0.9767566123430403, + "grad_norm": 16.220077060781502, + "learning_rate": 6.990507047049676e-10, + "logits/chosen": 1.58895742893219, + "logits/rejected": 1.6351604461669922, + "logps/chosen": -302.9062805175781, + "logps/rejected": -351.12957763671875, + "loss": 0.5676, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6347337961196899, + "rewards/margins": 0.34598052501678467, + "rewards/rejected": -0.9807142019271851, + "step": 457 + }, + { + "epoch": 0.9788939353459791, + "grad_norm": 16.320270998571377, + "learning_rate": 5.662812383859794e-10, + "logits/chosen": 1.4099833965301514, + "logits/rejected": 1.4714851379394531, + "logps/chosen": -324.4279479980469, + "logps/rejected": -353.4306335449219, + "loss": 0.5799, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4264363050460815, + "rewards/margins": 0.448211669921875, + "rewards/rejected": -1.874647855758667, + "step": 458 + }, + { + "epoch": 0.981031258348918, + "grad_norm": 19.089311245869983, + "learning_rate": 4.4746755806621126e-10, + "logits/chosen": 1.4548715353012085, + "logits/rejected": 1.485275387763977, + "logps/chosen": -355.63311767578125, + "logps/rejected": -358.629150390625, + "loss": 0.6253, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3190171718597412, + "rewards/margins": 0.17262953519821167, + "rewards/rejected": -1.4916467666625977, + "step": 459 + }, + { + "epoch": 0.9831685813518568, + "grad_norm": 17.724546722072443, + "learning_rate": 3.4261631135654167e-10, + "logits/chosen": 1.7169846296310425, + "logits/rejected": 1.4584649801254272, + "logps/chosen": -417.5559997558594, + "logps/rejected": -402.72406005859375, + "loss": 0.6109, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3284595012664795, + "rewards/margins": 0.16694369912147522, + "rewards/rejected": -1.4954030513763428, + "step": 460 + }, + { + "epoch": 0.9853059043547956, + "grad_norm": 18.58153544828685, + "learning_rate": 2.5173336467135263e-10, + "logits/chosen": 1.4444609880447388, + "logits/rejected": 1.4017637968063354, + "logps/chosen": -395.44097900390625, + "logps/rejected": -413.9075622558594, + "loss": 0.5902, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2212958335876465, + "rewards/margins": 0.5377592444419861, + "rewards/rejected": -1.7590551376342773, + "step": 461 + }, + { + "epoch": 0.9874432273577345, + "grad_norm": 19.162318454777324, + "learning_rate": 1.7482380290034792e-10, + "logits/chosen": 0.9124627113342285, + "logits/rejected": 0.8698334097862244, + "logps/chosen": -333.6249694824219, + "logps/rejected": -346.1784973144531, + "loss": 0.6127, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4773962497711182, + "rewards/margins": 0.1805485039949417, + "rewards/rejected": -1.657944679260254, + "step": 462 + }, + { + "epoch": 0.9895805503606733, + "grad_norm": 19.724273693538393, + "learning_rate": 1.1189192912416933e-10, + "logits/chosen": 1.6409908533096313, + "logits/rejected": 1.5480575561523438, + "logps/chosen": -283.3055114746094, + "logps/rejected": -310.44561767578125, + "loss": 0.658, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3127912282943726, + "rewards/margins": 0.5362306237220764, + "rewards/rejected": -1.8490217924118042, + "step": 463 + }, + { + "epoch": 0.9917178733636121, + "grad_norm": 17.93307015830412, + "learning_rate": 6.294126437336733e-11, + "logits/chosen": 2.145725727081299, + "logits/rejected": 2.0747246742248535, + "logps/chosen": -436.8760986328125, + "logps/rejected": -406.91290283203125, + "loss": 0.6152, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6242163181304932, + "rewards/margins": 0.16452085971832275, + "rewards/rejected": -1.7887372970581055, + "step": 464 + }, + { + "epoch": 0.9938551963665508, + "grad_norm": 21.39798162878369, + "learning_rate": 2.797454743164174e-11, + "logits/chosen": 1.4774665832519531, + "logits/rejected": 1.4269077777862549, + "logps/chosen": -484.0561828613281, + "logps/rejected": -466.2383117675781, + "loss": 0.6466, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3485268354415894, + "rewards/margins": 0.09689469635486603, + "rewards/rejected": -1.4454214572906494, + "step": 465 + }, + { + "epoch": 0.9959925193694897, + "grad_norm": 18.82475158368635, + "learning_rate": 6.993734682547714e-12, + "logits/chosen": 1.9347305297851562, + "logits/rejected": 1.9315987825393677, + "logps/chosen": -536.5445556640625, + "logps/rejected": -522.9882202148438, + "loss": 0.6231, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4731837511062622, + "rewards/margins": -0.18822771310806274, + "rewards/rejected": -1.2849558591842651, + "step": 466 + }, + { + "epoch": 0.9981298423724285, + "grad_norm": 18.113547265416, + "learning_rate": 0.0, + "logits/chosen": 2.014084577560425, + "logits/rejected": 1.9462015628814697, + "logps/chosen": -489.33050537109375, + "logps/rejected": -409.4538879394531, + "loss": 0.633, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1971293687820435, + "rewards/margins": 0.26006725430488586, + "rewards/rejected": -1.457196593284607, + "step": 467 + }, + { + "epoch": 0.9981298423724285, + "step": 467, + "total_flos": 0.0, + "train_loss": 0.6216196353991996, + "train_runtime": 6954.5059, + "train_samples_per_second": 8.61, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1, + "max_steps": 467, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}