diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40617 @@ +{ + "best_metric": 0.436347097158432, + "best_model_checkpoint": "checkpoints/microsoft/Phi-3-mini-4k-instructm1-stack-ultrafeedback/checkpoint-18000", + "epoch": 1.7596597991055063, + "eval_steps": 9000, + "global_step": 54000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006517258515205579, + "grad_norm": 2.0002994537353516, + "learning_rate": 4.999076699145132e-05, + "logits/chosen": 3.2755870819091797, + "logits/rejected": 3.584838390350342, + "logps/chosen": -315.9823303222656, + "logps/rejected": -316.63995361328125, + "loss": 0.6965, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.008498706854879856, + "rewards/margins": -0.006061377935111523, + "rewards/rejected": -0.0024373289197683334, + "step": 20 + }, + { + "epoch": 0.0013034517030411157, + "grad_norm": 1.159406065940857, + "learning_rate": 4.997990462845288e-05, + "logits/chosen": 3.113154888153076, + "logits/rejected": 3.0421106815338135, + "logps/chosen": -325.1206970214844, + "logps/rejected": -283.7405090332031, + "loss": 0.6925, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008245082572102547, + "rewards/margins": 0.002093712566420436, + "rewards/rejected": -0.010338795371353626, + "step": 40 + }, + { + "epoch": 0.0019551775545616737, + "grad_norm": 3.130927085876465, + "learning_rate": 4.996904226545443e-05, + "logits/chosen": 3.023083448410034, + "logits/rejected": 3.0179762840270996, + "logps/chosen": -321.07830810546875, + "logps/rejected": -274.4349365234375, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020953591912984848, + "rewards/margins": 0.03214743733406067, + "rewards/rejected": -0.053101032972335815, + "step": 60 + }, + { + "epoch": 0.0026069034060822315, + "grad_norm": 1.4224945306777954, + "learning_rate": 4.995817990245598e-05, + "logits/chosen": 3.274817705154419, + "logits/rejected": 3.3151297569274902, + "logps/chosen": -335.99053955078125, + "logps/rejected": -250.5460205078125, + "loss": 0.6757, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.01326818484812975, + "rewards/margins": 0.08374638855457306, + "rewards/rejected": -0.07047822326421738, + "step": 80 + }, + { + "epoch": 0.0032586292576027892, + "grad_norm": 1.7415271997451782, + "learning_rate": 4.994731753945753e-05, + "logits/chosen": 3.335944652557373, + "logits/rejected": 3.254437208175659, + "logps/chosen": -332.8917541503906, + "logps/rejected": -274.4801025390625, + "loss": 0.6704, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.1129145622253418, + "rewards/margins": 0.09479150921106339, + "rewards/rejected": 0.01812305673956871, + "step": 100 + }, + { + "epoch": 0.003910355109123347, + "grad_norm": 2.4410550594329834, + "learning_rate": 4.993645517645909e-05, + "logits/chosen": 3.2900516986846924, + "logits/rejected": 3.3001999855041504, + "logps/chosen": -338.61279296875, + "logps/rejected": -270.4815368652344, + "loss": 0.6614, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14215609431266785, + "rewards/margins": 0.10030213743448257, + "rewards/rejected": 0.04185396805405617, + "step": 120 + }, + { + "epoch": 0.004562080960643905, + "grad_norm": 1.7546881437301636, + "learning_rate": 4.992559281346064e-05, + "logits/chosen": 3.3762047290802, + "logits/rejected": 3.3036887645721436, + "logps/chosen": -316.80792236328125, + "logps/rejected": -268.8638000488281, + "loss": 0.6421, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13752397894859314, + "rewards/margins": 0.14285337924957275, + "rewards/rejected": -0.0053294021636247635, + "step": 140 + }, + { + "epoch": 0.005213806812164463, + "grad_norm": 2.333815336227417, + "learning_rate": 4.991473045046219e-05, + "logits/chosen": 3.3221511840820312, + "logits/rejected": 3.396556854248047, + "logps/chosen": -305.49749755859375, + "logps/rejected": -272.806884765625, + "loss": 0.6117, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33509039878845215, + "rewards/margins": 0.35877615213394165, + "rewards/rejected": -0.023685742169618607, + "step": 160 + }, + { + "epoch": 0.005865532663685021, + "grad_norm": 1.7144825458526611, + "learning_rate": 4.990386808746375e-05, + "logits/chosen": 3.3403027057647705, + "logits/rejected": 3.3312041759490967, + "logps/chosen": -335.2523193359375, + "logps/rejected": -277.9786376953125, + "loss": 0.6527, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05352731794118881, + "rewards/margins": 0.2562181055545807, + "rewards/rejected": -0.20269076526165009, + "step": 180 + }, + { + "epoch": 0.0065172585152055785, + "grad_norm": 5.539780139923096, + "learning_rate": 4.98930057244653e-05, + "logits/chosen": 3.3477530479431152, + "logits/rejected": 3.4726786613464355, + "logps/chosen": -316.0703430175781, + "logps/rejected": -254.63796997070312, + "loss": 0.6558, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1296534687280655, + "rewards/margins": 0.1924997717142105, + "rewards/rejected": -0.06284630298614502, + "step": 200 + }, + { + "epoch": 0.007168984366726137, + "grad_norm": 2.4386396408081055, + "learning_rate": 4.988214336146686e-05, + "logits/chosen": 3.11942982673645, + "logits/rejected": 3.1790788173675537, + "logps/chosen": -295.380126953125, + "logps/rejected": -257.9931640625, + "loss": 0.6257, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3933296799659729, + "rewards/margins": 0.3269059658050537, + "rewards/rejected": 0.0664236918091774, + "step": 220 + }, + { + "epoch": 0.007820710218246695, + "grad_norm": 2.7915520668029785, + "learning_rate": 4.9871280998468415e-05, + "logits/chosen": 3.1915907859802246, + "logits/rejected": 3.397726535797119, + "logps/chosen": -306.9893493652344, + "logps/rejected": -259.20928955078125, + "loss": 0.6449, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40997734665870667, + "rewards/margins": 0.36211854219436646, + "rewards/rejected": 0.0478588342666626, + "step": 240 + }, + { + "epoch": 0.008472436069767253, + "grad_norm": 2.5831496715545654, + "learning_rate": 4.9860418635469966e-05, + "logits/chosen": 3.417271852493286, + "logits/rejected": 3.483586549758911, + "logps/chosen": -347.23345947265625, + "logps/rejected": -290.9099426269531, + "loss": 0.5804, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5220203995704651, + "rewards/margins": 0.3559853434562683, + "rewards/rejected": 0.16603508591651917, + "step": 260 + }, + { + "epoch": 0.00912416192128781, + "grad_norm": 2.444408416748047, + "learning_rate": 4.9849556272471516e-05, + "logits/chosen": 3.455443859100342, + "logits/rejected": 3.6179747581481934, + "logps/chosen": -324.1989440917969, + "logps/rejected": -291.21221923828125, + "loss": 0.7909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41506344079971313, + "rewards/margins": 0.20854708552360535, + "rewards/rejected": 0.206516295671463, + "step": 280 + }, + { + "epoch": 0.009775887772808368, + "grad_norm": 1.9054545164108276, + "learning_rate": 4.9839237027622995e-05, + "logits/chosen": 3.1389222145080566, + "logits/rejected": 3.2204978466033936, + "logps/chosen": -344.78424072265625, + "logps/rejected": -270.83697509765625, + "loss": 0.6713, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.03411946818232536, + "rewards/margins": 0.8329988718032837, + "rewards/rejected": -0.8671183586120605, + "step": 300 + }, + { + "epoch": 0.010427613624328926, + "grad_norm": 5.244300842285156, + "learning_rate": 4.9828374664624545e-05, + "logits/chosen": 2.9095842838287354, + "logits/rejected": 2.9244651794433594, + "logps/chosen": -262.78839111328125, + "logps/rejected": -241.27151489257812, + "loss": 0.6485, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.37290525436401367, + "rewards/margins": 0.4622132182121277, + "rewards/rejected": -0.8351184725761414, + "step": 320 + }, + { + "epoch": 0.011079339475849484, + "grad_norm": 2.204965353012085, + "learning_rate": 4.9817512301626096e-05, + "logits/chosen": 3.109475612640381, + "logits/rejected": 3.090216875076294, + "logps/chosen": -370.21624755859375, + "logps/rejected": -326.73455810546875, + "loss": 0.636, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.28643929958343506, + "rewards/margins": 0.41290345788002014, + "rewards/rejected": -0.6993427872657776, + "step": 340 + }, + { + "epoch": 0.011731065327370041, + "grad_norm": 3.0430994033813477, + "learning_rate": 4.9806649938627654e-05, + "logits/chosen": 3.3772213459014893, + "logits/rejected": 3.3231430053710938, + "logps/chosen": -346.76007080078125, + "logps/rejected": -280.5775146484375, + "loss": 0.548, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07680945843458176, + "rewards/margins": 0.6309676766395569, + "rewards/rejected": -0.5541582703590393, + "step": 360 + }, + { + "epoch": 0.0123827911788906, + "grad_norm": 1.9388673305511475, + "learning_rate": 4.9795787575629205e-05, + "logits/chosen": 2.6644036769866943, + "logits/rejected": 2.894009590148926, + "logps/chosen": -322.9000549316406, + "logps/rejected": -251.2526092529297, + "loss": 0.673, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5188241600990295, + "rewards/margins": 1.0635565519332886, + "rewards/rejected": -0.5447324514389038, + "step": 380 + }, + { + "epoch": 0.013034517030411157, + "grad_norm": 2.6208252906799316, + "learning_rate": 4.9784925212630755e-05, + "logits/chosen": 3.832808256149292, + "logits/rejected": 3.755136013031006, + "logps/chosen": -365.3720397949219, + "logps/rejected": -318.8487243652344, + "loss": 0.6082, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.29274991154670715, + "rewards/margins": 0.4208676815032959, + "rewards/rejected": -0.12811776995658875, + "step": 400 + }, + { + "epoch": 0.013686242881931715, + "grad_norm": 9.917403221130371, + "learning_rate": 4.977406284963231e-05, + "logits/chosen": 3.222848892211914, + "logits/rejected": 3.3471474647521973, + "logps/chosen": -296.58636474609375, + "logps/rejected": -283.06097412109375, + "loss": 0.7976, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.035634320229291916, + "rewards/margins": 0.11476624011993408, + "rewards/rejected": -0.1504005640745163, + "step": 420 + }, + { + "epoch": 0.014337968733452274, + "grad_norm": 1.2053836584091187, + "learning_rate": 4.9763200486633864e-05, + "logits/chosen": 3.034000873565674, + "logits/rejected": 3.1709632873535156, + "logps/chosen": -310.9400634765625, + "logps/rejected": -247.30337524414062, + "loss": 0.569, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.006636962294578552, + "rewards/margins": 0.5295768976211548, + "rewards/rejected": -0.5362138152122498, + "step": 440 + }, + { + "epoch": 0.014989694584972832, + "grad_norm": 4.7379255294799805, + "learning_rate": 4.9752338123635414e-05, + "logits/chosen": 3.292654514312744, + "logits/rejected": 3.4133739471435547, + "logps/chosen": -344.68780517578125, + "logps/rejected": -312.36810302734375, + "loss": 0.602, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.16535577178001404, + "rewards/margins": 0.5754185914993286, + "rewards/rejected": -0.7407742738723755, + "step": 460 + }, + { + "epoch": 0.01564142043649339, + "grad_norm": 1.9355920553207397, + "learning_rate": 4.974147576063697e-05, + "logits/chosen": 3.0965640544891357, + "logits/rejected": 3.1571037769317627, + "logps/chosen": -328.1559143066406, + "logps/rejected": -274.5924377441406, + "loss": 0.5565, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5526318550109863, + "rewards/margins": 0.7302650213241577, + "rewards/rejected": -1.282896876335144, + "step": 480 + }, + { + "epoch": 0.016293146288013947, + "grad_norm": 3.575183868408203, + "learning_rate": 4.973061339763852e-05, + "logits/chosen": 3.177799701690674, + "logits/rejected": 3.2371227741241455, + "logps/chosen": -341.98895263671875, + "logps/rejected": -276.9753723144531, + "loss": 0.5963, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5560494661331177, + "rewards/margins": 0.6572286486625671, + "rewards/rejected": -1.2132781744003296, + "step": 500 + }, + { + "epoch": 0.016944872139534505, + "grad_norm": 2.9191343784332275, + "learning_rate": 4.971975103464008e-05, + "logits/chosen": 3.4300758838653564, + "logits/rejected": 3.4523768424987793, + "logps/chosen": -370.9397888183594, + "logps/rejected": -301.653076171875, + "loss": 0.613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2855314314365387, + "rewards/margins": 0.6213425397872925, + "rewards/rejected": -0.9068740010261536, + "step": 520 + }, + { + "epoch": 0.017596597991055063, + "grad_norm": 2.945192575454712, + "learning_rate": 4.970888867164163e-05, + "logits/chosen": 3.748929977416992, + "logits/rejected": 3.6652514934539795, + "logps/chosen": -353.2174072265625, + "logps/rejected": -273.7146301269531, + "loss": 0.5801, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.31786778569221497, + "rewards/margins": 0.6783866286277771, + "rewards/rejected": -0.996254563331604, + "step": 540 + }, + { + "epoch": 0.01824832384257562, + "grad_norm": 3.147676467895508, + "learning_rate": 4.969802630864319e-05, + "logits/chosen": 3.2535336017608643, + "logits/rejected": 3.2287399768829346, + "logps/chosen": -317.5291442871094, + "logps/rejected": -293.724365234375, + "loss": 0.5046, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.39739280939102173, + "rewards/margins": 0.9210759401321411, + "rewards/rejected": -1.318468689918518, + "step": 560 + }, + { + "epoch": 0.01890004969409618, + "grad_norm": 1.69011390209198, + "learning_rate": 4.968716394564474e-05, + "logits/chosen": 2.8041045665740967, + "logits/rejected": 2.8577895164489746, + "logps/chosen": -321.950927734375, + "logps/rejected": -285.2073974609375, + "loss": 0.5449, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9357019662857056, + "rewards/margins": 1.0415992736816406, + "rewards/rejected": -1.9773012399673462, + "step": 580 + }, + { + "epoch": 0.019551775545616736, + "grad_norm": 2.714644193649292, + "learning_rate": 4.967630158264629e-05, + "logits/chosen": 2.9420933723449707, + "logits/rejected": 2.8859665393829346, + "logps/chosen": -347.3935546875, + "logps/rejected": -266.0564270019531, + "loss": 0.6044, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9895059466362, + "rewards/margins": 0.6867011785507202, + "rewards/rejected": -1.676207184791565, + "step": 600 + }, + { + "epoch": 0.020203501397137294, + "grad_norm": 2.470775604248047, + "learning_rate": 4.966543921964785e-05, + "logits/chosen": 3.1547980308532715, + "logits/rejected": 3.1254725456237793, + "logps/chosen": -355.8478088378906, + "logps/rejected": -301.76025390625, + "loss": 0.5885, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4044327735900879, + "rewards/margins": 1.2512335777282715, + "rewards/rejected": -1.6556663513183594, + "step": 620 + }, + { + "epoch": 0.020855227248657852, + "grad_norm": 3.602844476699829, + "learning_rate": 4.96545768566494e-05, + "logits/chosen": 3.248357057571411, + "logits/rejected": 3.2533936500549316, + "logps/chosen": -327.37750244140625, + "logps/rejected": -309.9153137207031, + "loss": 0.4649, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.34223777055740356, + "rewards/margins": 1.3297322988510132, + "rewards/rejected": -1.671970009803772, + "step": 640 + }, + { + "epoch": 0.02150695310017841, + "grad_norm": 3.605502128601074, + "learning_rate": 4.964371449365095e-05, + "logits/chosen": 3.543985366821289, + "logits/rejected": 3.6314215660095215, + "logps/chosen": -346.60980224609375, + "logps/rejected": -300.12542724609375, + "loss": 0.5184, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.726009726524353, + "rewards/margins": 1.101555585861206, + "rewards/rejected": -1.8275654315948486, + "step": 660 + }, + { + "epoch": 0.022158678951698967, + "grad_norm": 7.422413349151611, + "learning_rate": 4.96328521306525e-05, + "logits/chosen": 3.5343551635742188, + "logits/rejected": 3.5532424449920654, + "logps/chosen": -364.8016662597656, + "logps/rejected": -310.97540283203125, + "loss": 0.7598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5585274696350098, + "rewards/margins": 1.0025911331176758, + "rewards/rejected": -1.561118483543396, + "step": 680 + }, + { + "epoch": 0.022810404803219525, + "grad_norm": 2.490894079208374, + "learning_rate": 4.962198976765406e-05, + "logits/chosen": 3.3488402366638184, + "logits/rejected": 3.394294261932373, + "logps/chosen": -326.3158874511719, + "logps/rejected": -256.7493591308594, + "loss": 0.3884, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09988899528980255, + "rewards/margins": 1.5758745670318604, + "rewards/rejected": -1.675763726234436, + "step": 700 + }, + { + "epoch": 0.023462130654740083, + "grad_norm": 1.135406255722046, + "learning_rate": 4.961112740465561e-05, + "logits/chosen": 3.289991855621338, + "logits/rejected": 3.178989887237549, + "logps/chosen": -354.816162109375, + "logps/rejected": -287.60791015625, + "loss": 0.5847, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2795276641845703, + "rewards/margins": 1.972662329673767, + "rewards/rejected": -1.6931349039077759, + "step": 720 + }, + { + "epoch": 0.02411385650626064, + "grad_norm": 2.1922452449798584, + "learning_rate": 4.960026504165716e-05, + "logits/chosen": 3.290985107421875, + "logits/rejected": 3.438814640045166, + "logps/chosen": -353.9092102050781, + "logps/rejected": -317.64825439453125, + "loss": 0.6934, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5046671032905579, + "rewards/margins": 1.052455186843872, + "rewards/rejected": -1.5571222305297852, + "step": 740 + }, + { + "epoch": 0.0247655823577812, + "grad_norm": 2.2178659439086914, + "learning_rate": 4.958940267865872e-05, + "logits/chosen": 3.306546688079834, + "logits/rejected": 3.206681489944458, + "logps/chosen": -361.18951416015625, + "logps/rejected": -284.6773376464844, + "loss": 0.4979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8971925973892212, + "rewards/margins": 1.1848243474960327, + "rewards/rejected": -2.082016944885254, + "step": 760 + }, + { + "epoch": 0.025417308209301756, + "grad_norm": 1.7360730171203613, + "learning_rate": 4.9578540315660274e-05, + "logits/chosen": 3.0132853984832764, + "logits/rejected": 3.057724714279175, + "logps/chosen": -306.1099853515625, + "logps/rejected": -238.8798828125, + "loss": 0.5941, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5607724189758301, + "rewards/margins": 1.0321210622787476, + "rewards/rejected": -1.592893362045288, + "step": 780 + }, + { + "epoch": 0.026069034060822314, + "grad_norm": 19.743188858032227, + "learning_rate": 4.9568221070811746e-05, + "logits/chosen": 3.2059528827667236, + "logits/rejected": 3.3919575214385986, + "logps/chosen": -368.04595947265625, + "logps/rejected": -323.3333435058594, + "loss": 0.6823, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8780875205993652, + "rewards/margins": 1.1952801942825317, + "rewards/rejected": -2.0733675956726074, + "step": 800 + }, + { + "epoch": 0.02672075991234287, + "grad_norm": 1.5568467378616333, + "learning_rate": 4.9557358707813304e-05, + "logits/chosen": 2.9319424629211426, + "logits/rejected": 3.0428719520568848, + "logps/chosen": -319.97564697265625, + "logps/rejected": -266.247314453125, + "loss": 0.5286, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9834583401679993, + "rewards/margins": 1.3183648586273193, + "rewards/rejected": -2.301823139190674, + "step": 820 + }, + { + "epoch": 0.02737248576386343, + "grad_norm": 5.25149393081665, + "learning_rate": 4.9546496344814854e-05, + "logits/chosen": 2.9574294090270996, + "logits/rejected": 3.0268216133117676, + "logps/chosen": -376.7042236328125, + "logps/rejected": -274.9063720703125, + "loss": 0.5608, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3265694379806519, + "rewards/margins": 1.1978371143341064, + "rewards/rejected": -2.524406671524048, + "step": 840 + }, + { + "epoch": 0.028024211615383987, + "grad_norm": 3.7644338607788086, + "learning_rate": 4.953563398181641e-05, + "logits/chosen": 2.9962642192840576, + "logits/rejected": 3.0002036094665527, + "logps/chosen": -328.4303894042969, + "logps/rejected": -244.16091918945312, + "loss": 0.6595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4431251287460327, + "rewards/margins": 1.0763286352157593, + "rewards/rejected": -2.519453525543213, + "step": 860 + }, + { + "epoch": 0.02867593746690455, + "grad_norm": 2.8306970596313477, + "learning_rate": 4.952477161881796e-05, + "logits/chosen": 3.121464252471924, + "logits/rejected": 3.222446918487549, + "logps/chosen": -306.6799621582031, + "logps/rejected": -286.7738342285156, + "loss": 0.5097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7855662107467651, + "rewards/margins": 1.3137809038162231, + "rewards/rejected": -2.099346876144409, + "step": 880 + }, + { + "epoch": 0.029327663318425106, + "grad_norm": 1.758278489112854, + "learning_rate": 4.9513909255819513e-05, + "logits/chosen": 3.1308178901672363, + "logits/rejected": 3.1015286445617676, + "logps/chosen": -343.5506286621094, + "logps/rejected": -264.5284729003906, + "loss": 0.5248, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3233672082424164, + "rewards/margins": 1.198577880859375, + "rewards/rejected": -1.5219451189041138, + "step": 900 + }, + { + "epoch": 0.029979389169945664, + "grad_norm": 1.5988177061080933, + "learning_rate": 4.9503046892821064e-05, + "logits/chosen": 3.2178597450256348, + "logits/rejected": 3.281151294708252, + "logps/chosen": -297.4223327636719, + "logps/rejected": -272.41229248046875, + "loss": 0.6177, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.28259551525115967, + "rewards/margins": 1.089690923690796, + "rewards/rejected": -1.3722865581512451, + "step": 920 + }, + { + "epoch": 0.03063111502146622, + "grad_norm": 2.1860711574554443, + "learning_rate": 4.949218452982262e-05, + "logits/chosen": 3.225794553756714, + "logits/rejected": 3.1921653747558594, + "logps/chosen": -305.1083068847656, + "logps/rejected": -279.957275390625, + "loss": 0.5551, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5306088328361511, + "rewards/margins": 1.6453800201416016, + "rewards/rejected": -2.1759886741638184, + "step": 940 + }, + { + "epoch": 0.03128284087298678, + "grad_norm": 3.613379716873169, + "learning_rate": 4.948132216682417e-05, + "logits/chosen": 3.323183536529541, + "logits/rejected": 3.3981921672821045, + "logps/chosen": -339.24365234375, + "logps/rejected": -288.04791259765625, + "loss": 0.4959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6143248677253723, + "rewards/margins": 1.1638405323028564, + "rewards/rejected": -1.7781654596328735, + "step": 960 + }, + { + "epoch": 0.031934566724507334, + "grad_norm": 4.4491119384765625, + "learning_rate": 4.947045980382572e-05, + "logits/chosen": 3.08207631111145, + "logits/rejected": 3.337578296661377, + "logps/chosen": -355.8562927246094, + "logps/rejected": -318.37506103515625, + "loss": 0.5749, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4549214243888855, + "rewards/margins": 1.3577829599380493, + "rewards/rejected": -1.8127044439315796, + "step": 980 + }, + { + "epoch": 0.032586292576027895, + "grad_norm": 1.7130435705184937, + "learning_rate": 4.945959744082728e-05, + "logits/chosen": 3.4158108234405518, + "logits/rejected": 3.3476855754852295, + "logps/chosen": -365.210693359375, + "logps/rejected": -301.89337158203125, + "loss": 0.3169, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.21900419890880585, + "rewards/margins": 2.2676305770874023, + "rewards/rejected": -2.4866347312927246, + "step": 1000 + }, + { + "epoch": 0.03323801842754845, + "grad_norm": 2.121593475341797, + "learning_rate": 4.944873507782883e-05, + "logits/chosen": 2.622936725616455, + "logits/rejected": 2.710310220718384, + "logps/chosen": -318.5855407714844, + "logps/rejected": -311.7076110839844, + "loss": 0.6369, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.926166296005249, + "rewards/margins": 2.015831232070923, + "rewards/rejected": -3.94199800491333, + "step": 1020 + }, + { + "epoch": 0.03388974427906901, + "grad_norm": 3.659578800201416, + "learning_rate": 4.943787271483038e-05, + "logits/chosen": 3.230228900909424, + "logits/rejected": 3.197178363800049, + "logps/chosen": -368.3141784667969, + "logps/rejected": -295.503662109375, + "loss": 0.5154, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5222212076187134, + "rewards/margins": 1.689462423324585, + "rewards/rejected": -3.211683750152588, + "step": 1040 + }, + { + "epoch": 0.034541470130589565, + "grad_norm": 4.695761680603027, + "learning_rate": 4.942701035183194e-05, + "logits/chosen": 2.996948719024658, + "logits/rejected": 3.1099629402160645, + "logps/chosen": -335.01422119140625, + "logps/rejected": -305.25799560546875, + "loss": 0.6017, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2580616474151611, + "rewards/margins": 1.5926748514175415, + "rewards/rejected": -2.850736379623413, + "step": 1060 + }, + { + "epoch": 0.035193195982110126, + "grad_norm": 1.6678528785705566, + "learning_rate": 4.94161479888335e-05, + "logits/chosen": 3.186009168624878, + "logits/rejected": 3.254631519317627, + "logps/chosen": -337.8241271972656, + "logps/rejected": -289.04522705078125, + "loss": 0.7357, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42491278052330017, + "rewards/margins": 1.221513032913208, + "rewards/rejected": -1.6464258432388306, + "step": 1080 + }, + { + "epoch": 0.03584492183363068, + "grad_norm": 1.4018584489822388, + "learning_rate": 4.940528562583505e-05, + "logits/chosen": 3.721158504486084, + "logits/rejected": 3.6486403942108154, + "logps/chosen": -314.2821350097656, + "logps/rejected": -301.2421569824219, + "loss": 0.6017, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6530038714408875, + "rewards/margins": 1.1717398166656494, + "rewards/rejected": -1.824743628501892, + "step": 1100 + }, + { + "epoch": 0.03649664768515124, + "grad_norm": 3.454059362411499, + "learning_rate": 4.93944232628366e-05, + "logits/chosen": 3.314424991607666, + "logits/rejected": 3.2343153953552246, + "logps/chosen": -326.15936279296875, + "logps/rejected": -288.42645263671875, + "loss": 0.486, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2685048580169678, + "rewards/margins": 1.5519254207611084, + "rewards/rejected": -2.820430278778076, + "step": 1120 + }, + { + "epoch": 0.037148373536671796, + "grad_norm": 4.468503475189209, + "learning_rate": 4.938356089983816e-05, + "logits/chosen": 3.292987823486328, + "logits/rejected": 3.274381160736084, + "logps/chosen": -371.98248291015625, + "logps/rejected": -303.5506896972656, + "loss": 0.525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3482186794281006, + "rewards/margins": 1.8830010890960693, + "rewards/rejected": -3.231220245361328, + "step": 1140 + }, + { + "epoch": 0.03780009938819236, + "grad_norm": 1.8160191774368286, + "learning_rate": 4.937269853683971e-05, + "logits/chosen": 2.9222099781036377, + "logits/rejected": 2.916886568069458, + "logps/chosen": -343.80731201171875, + "logps/rejected": -327.94427490234375, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5637692213058472, + "rewards/margins": 1.8219455480575562, + "rewards/rejected": -3.3857147693634033, + "step": 1160 + }, + { + "epoch": 0.03845182523971292, + "grad_norm": 1.1571201086044312, + "learning_rate": 4.936183617384126e-05, + "logits/chosen": 3.130405902862549, + "logits/rejected": 3.126316547393799, + "logps/chosen": -349.72650146484375, + "logps/rejected": -325.358154296875, + "loss": 0.5331, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4702491760253906, + "rewards/margins": 1.66522216796875, + "rewards/rejected": -3.1354711055755615, + "step": 1180 + }, + { + "epoch": 0.03910355109123347, + "grad_norm": 3.459935426712036, + "learning_rate": 4.9350973810842816e-05, + "logits/chosen": 3.35313081741333, + "logits/rejected": 3.2494494915008545, + "logps/chosen": -306.07861328125, + "logps/rejected": -283.18719482421875, + "loss": 0.5395, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.678659439086914, + "rewards/margins": 1.514310598373413, + "rewards/rejected": -3.1929705142974854, + "step": 1200 + }, + { + "epoch": 0.039755276942754034, + "grad_norm": 9.030374526977539, + "learning_rate": 4.934011144784437e-05, + "logits/chosen": 2.9550273418426514, + "logits/rejected": 3.1418185234069824, + "logps/chosen": -336.17926025390625, + "logps/rejected": -300.88641357421875, + "loss": 0.5855, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4015058279037476, + "rewards/margins": 1.3468605279922485, + "rewards/rejected": -2.7483668327331543, + "step": 1220 + }, + { + "epoch": 0.04040700279427459, + "grad_norm": 3.681443929672241, + "learning_rate": 4.932924908484592e-05, + "logits/chosen": 2.9063830375671387, + "logits/rejected": 3.02773118019104, + "logps/chosen": -303.16241455078125, + "logps/rejected": -288.82464599609375, + "loss": 0.5353, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.766743540763855, + "rewards/margins": 1.5767686367034912, + "rewards/rejected": -2.3435122966766357, + "step": 1240 + }, + { + "epoch": 0.04105872864579515, + "grad_norm": 1.5037360191345215, + "learning_rate": 4.931838672184747e-05, + "logits/chosen": 2.899566411972046, + "logits/rejected": 3.0979952812194824, + "logps/chosen": -306.69696044921875, + "logps/rejected": -267.4485168457031, + "loss": 0.4241, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.43908628821372986, + "rewards/margins": 2.0851669311523438, + "rewards/rejected": -2.5242533683776855, + "step": 1260 + }, + { + "epoch": 0.041710454497315703, + "grad_norm": 8.037769317626953, + "learning_rate": 4.9307524358849026e-05, + "logits/chosen": 3.6885464191436768, + "logits/rejected": 3.644721269607544, + "logps/chosen": -309.16070556640625, + "logps/rejected": -291.6141052246094, + "loss": 0.6266, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.822218120098114, + "rewards/margins": 1.45439875125885, + "rewards/rejected": -2.2766165733337402, + "step": 1280 + }, + { + "epoch": 0.042362180348836265, + "grad_norm": 4.472288608551025, + "learning_rate": 4.9296661995850577e-05, + "logits/chosen": 3.221527576446533, + "logits/rejected": 3.3840911388397217, + "logps/chosen": -347.5926818847656, + "logps/rejected": -305.2088317871094, + "loss": 0.3873, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.33888328075408936, + "rewards/margins": 2.1571099758148193, + "rewards/rejected": -2.4959933757781982, + "step": 1300 + }, + { + "epoch": 0.04301390620035682, + "grad_norm": 1.9211620092391968, + "learning_rate": 4.9285799632852134e-05, + "logits/chosen": 3.3838467597961426, + "logits/rejected": 3.311892032623291, + "logps/chosen": -355.1278381347656, + "logps/rejected": -292.70037841796875, + "loss": 0.5109, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7594250440597534, + "rewards/margins": 1.9383275508880615, + "rewards/rejected": -2.6977524757385254, + "step": 1320 + }, + { + "epoch": 0.04366563205187738, + "grad_norm": 0.3854999840259552, + "learning_rate": 4.9274937269853685e-05, + "logits/chosen": 3.1019046306610107, + "logits/rejected": 3.3326351642608643, + "logps/chosen": -329.44482421875, + "logps/rejected": -295.9028015136719, + "loss": 0.7069, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2456196546554565, + "rewards/margins": 1.5836914777755737, + "rewards/rejected": -2.8293111324310303, + "step": 1340 + }, + { + "epoch": 0.044317357903397935, + "grad_norm": 1.782427430152893, + "learning_rate": 4.926407490685524e-05, + "logits/chosen": 3.09218168258667, + "logits/rejected": 3.1193976402282715, + "logps/chosen": -294.4489440917969, + "logps/rejected": -248.64376831054688, + "loss": 0.4268, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2912242412567139, + "rewards/margins": 2.091925621032715, + "rewards/rejected": -3.383150100708008, + "step": 1360 + }, + { + "epoch": 0.044969083754918496, + "grad_norm": 7.984747886657715, + "learning_rate": 4.925321254385679e-05, + "logits/chosen": 3.1405673027038574, + "logits/rejected": 3.286118268966675, + "logps/chosen": -349.6425476074219, + "logps/rejected": -283.6289367675781, + "loss": 0.6386, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9638798236846924, + "rewards/margins": 1.7706916332244873, + "rewards/rejected": -3.7345714569091797, + "step": 1380 + }, + { + "epoch": 0.04562080960643905, + "grad_norm": 4.720180988311768, + "learning_rate": 4.924235018085835e-05, + "logits/chosen": 3.2560157775878906, + "logits/rejected": 3.273289442062378, + "logps/chosen": -339.9998779296875, + "logps/rejected": -331.459716796875, + "loss": 0.37, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5887820720672607, + "rewards/margins": 2.537172555923462, + "rewards/rejected": -4.125954627990723, + "step": 1400 + }, + { + "epoch": 0.04627253545795961, + "grad_norm": 5.943798065185547, + "learning_rate": 4.92314878178599e-05, + "logits/chosen": 2.953237295150757, + "logits/rejected": 3.086613893508911, + "logps/chosen": -317.92059326171875, + "logps/rejected": -299.180908203125, + "loss": 0.5422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6511799097061157, + "rewards/margins": 1.965884804725647, + "rewards/rejected": -3.617064952850342, + "step": 1420 + }, + { + "epoch": 0.046924261309480166, + "grad_norm": 2.5939550399780273, + "learning_rate": 4.922062545486145e-05, + "logits/chosen": 3.182440996170044, + "logits/rejected": 3.1875510215759277, + "logps/chosen": -341.707763671875, + "logps/rejected": -294.01617431640625, + "loss": 0.6345, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9987379312515259, + "rewards/margins": 2.1863648891448975, + "rewards/rejected": -4.185103416442871, + "step": 1440 + }, + { + "epoch": 0.04757598716100073, + "grad_norm": 2.0249931812286377, + "learning_rate": 4.9209763091863e-05, + "logits/chosen": 3.1032910346984863, + "logits/rejected": 3.1730690002441406, + "logps/chosen": -308.22161865234375, + "logps/rejected": -282.42218017578125, + "loss": 0.7568, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5209877490997314, + "rewards/margins": 1.4384276866912842, + "rewards/rejected": -2.9594156742095947, + "step": 1460 + }, + { + "epoch": 0.04822771301252128, + "grad_norm": 3.8195812702178955, + "learning_rate": 4.919890072886456e-05, + "logits/chosen": 3.4346847534179688, + "logits/rejected": 3.678351640701294, + "logps/chosen": -383.71307373046875, + "logps/rejected": -312.18310546875, + "loss": 0.4238, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9416574239730835, + "rewards/margins": 2.5401346683502197, + "rewards/rejected": -3.4817919731140137, + "step": 1480 + }, + { + "epoch": 0.04887943886404184, + "grad_norm": 2.876647472381592, + "learning_rate": 4.918803836586611e-05, + "logits/chosen": 3.3587405681610107, + "logits/rejected": 3.396892547607422, + "logps/chosen": -332.4215087890625, + "logps/rejected": -278.9684143066406, + "loss": 0.4914, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3332586288452148, + "rewards/margins": 1.947119951248169, + "rewards/rejected": -3.280378818511963, + "step": 1500 + }, + { + "epoch": 0.0495311647155624, + "grad_norm": 4.031825065612793, + "learning_rate": 4.917717600286766e-05, + "logits/chosen": 3.0445380210876465, + "logits/rejected": 3.0374515056610107, + "logps/chosen": -356.8233337402344, + "logps/rejected": -313.815185546875, + "loss": 0.4942, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2921643257141113, + "rewards/margins": 2.484142780303955, + "rewards/rejected": -3.7763068675994873, + "step": 1520 + }, + { + "epoch": 0.05018289056708296, + "grad_norm": 1.530194878578186, + "learning_rate": 4.916631363986922e-05, + "logits/chosen": 3.154949188232422, + "logits/rejected": 3.2605679035186768, + "logps/chosen": -318.78704833984375, + "logps/rejected": -318.1283264160156, + "loss": 0.5111, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5655627250671387, + "rewards/margins": 2.212632894515991, + "rewards/rejected": -3.77819561958313, + "step": 1540 + }, + { + "epoch": 0.05083461641860351, + "grad_norm": 6.53126335144043, + "learning_rate": 4.915545127687077e-05, + "logits/chosen": 2.6729936599731445, + "logits/rejected": 2.5371644496917725, + "logps/chosen": -327.2698059082031, + "logps/rejected": -363.981201171875, + "loss": 0.424, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.115673303604126, + "rewards/margins": 2.8010082244873047, + "rewards/rejected": -3.9166812896728516, + "step": 1560 + }, + { + "epoch": 0.05148634227012407, + "grad_norm": 4.1405158042907715, + "learning_rate": 4.914458891387232e-05, + "logits/chosen": 2.861321449279785, + "logits/rejected": 3.0549893379211426, + "logps/chosen": -339.23895263671875, + "logps/rejected": -317.939453125, + "loss": 0.4671, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2968525886535645, + "rewards/margins": 1.6391313076019287, + "rewards/rejected": -2.9359841346740723, + "step": 1580 + }, + { + "epoch": 0.05213806812164463, + "grad_norm": 2.183073043823242, + "learning_rate": 4.913372655087388e-05, + "logits/chosen": 2.9399428367614746, + "logits/rejected": 3.0689430236816406, + "logps/chosen": -317.28704833984375, + "logps/rejected": -325.1346435546875, + "loss": 0.4512, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.653885841369629, + "rewards/margins": 2.318889617919922, + "rewards/rejected": -3.9727752208709717, + "step": 1600 + }, + { + "epoch": 0.05278979397316519, + "grad_norm": 2.1778676509857178, + "learning_rate": 4.9122864187875437e-05, + "logits/chosen": 2.834561586380005, + "logits/rejected": 2.7304847240448, + "logps/chosen": -327.08203125, + "logps/rejected": -331.9773864746094, + "loss": 0.4538, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.341362714767456, + "rewards/margins": 2.6388278007507324, + "rewards/rejected": -4.980190753936768, + "step": 1620 + }, + { + "epoch": 0.05344151982468574, + "grad_norm": 4.42081356048584, + "learning_rate": 4.911200182487699e-05, + "logits/chosen": 3.0355606079101562, + "logits/rejected": 3.015087127685547, + "logps/chosen": -388.0062255859375, + "logps/rejected": -329.47052001953125, + "loss": 0.748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.297586441040039, + "rewards/margins": 1.5863662958145142, + "rewards/rejected": -3.8839523792266846, + "step": 1640 + }, + { + "epoch": 0.054093245676206304, + "grad_norm": 2.0187039375305176, + "learning_rate": 4.910113946187854e-05, + "logits/chosen": 3.0302553176879883, + "logits/rejected": 3.069577217102051, + "logps/chosen": -316.7452697753906, + "logps/rejected": -306.7051086425781, + "loss": 0.4999, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4863325357437134, + "rewards/margins": 2.191751003265381, + "rewards/rejected": -3.678083896636963, + "step": 1660 + }, + { + "epoch": 0.05474497152772686, + "grad_norm": 4.29229736328125, + "learning_rate": 4.9090277098880096e-05, + "logits/chosen": 3.194532632827759, + "logits/rejected": 3.344210147857666, + "logps/chosen": -348.8492431640625, + "logps/rejected": -308.6143493652344, + "loss": 0.4682, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3414158821105957, + "rewards/margins": 2.0128674507141113, + "rewards/rejected": -3.354283571243286, + "step": 1680 + }, + { + "epoch": 0.05539669737924742, + "grad_norm": 2.8154351711273193, + "learning_rate": 4.9079414735881646e-05, + "logits/chosen": 3.328953266143799, + "logits/rejected": 3.249859571456909, + "logps/chosen": -318.03271484375, + "logps/rejected": -259.8677062988281, + "loss": 0.7002, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.43419599533081055, + "rewards/margins": 1.7350908517837524, + "rewards/rejected": -2.1692872047424316, + "step": 1700 + }, + { + "epoch": 0.056048423230767974, + "grad_norm": 5.724809169769287, + "learning_rate": 4.90685523728832e-05, + "logits/chosen": 2.9971394538879395, + "logits/rejected": 3.1168036460876465, + "logps/chosen": -325.5506286621094, + "logps/rejected": -302.1744689941406, + "loss": 0.675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7285394072532654, + "rewards/margins": 1.1602392196655273, + "rewards/rejected": -1.8887784481048584, + "step": 1720 + }, + { + "epoch": 0.056700149082288535, + "grad_norm": 3.582751512527466, + "learning_rate": 4.9057690009884755e-05, + "logits/chosen": 3.2904751300811768, + "logits/rejected": 3.4136478900909424, + "logps/chosen": -311.2755432128906, + "logps/rejected": -336.40814208984375, + "loss": 0.6343, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9921520352363586, + "rewards/margins": 1.8498016595840454, + "rewards/rejected": -2.8419535160064697, + "step": 1740 + }, + { + "epoch": 0.0573518749338091, + "grad_norm": 1.8242824077606201, + "learning_rate": 4.9046827646886306e-05, + "logits/chosen": 3.0093131065368652, + "logits/rejected": 3.165144681930542, + "logps/chosen": -281.39862060546875, + "logps/rejected": -329.52642822265625, + "loss": 0.4934, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.521057367324829, + "rewards/margins": 1.8175052404403687, + "rewards/rejected": -3.338562488555908, + "step": 1760 + }, + { + "epoch": 0.05800360078532965, + "grad_norm": 3.27577543258667, + "learning_rate": 4.9035965283887856e-05, + "logits/chosen": 2.8701319694519043, + "logits/rejected": 3.1115987300872803, + "logps/chosen": -345.18304443359375, + "logps/rejected": -378.97613525390625, + "loss": 0.4389, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0047900676727295, + "rewards/margins": 2.41137957572937, + "rewards/rejected": -4.416170120239258, + "step": 1780 + }, + { + "epoch": 0.05865532663685021, + "grad_norm": 5.905219078063965, + "learning_rate": 4.9025102920889414e-05, + "logits/chosen": 3.325164318084717, + "logits/rejected": 3.2965996265411377, + "logps/chosen": -352.53387451171875, + "logps/rejected": -293.7185363769531, + "loss": 0.4836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7356431484222412, + "rewards/margins": 1.546096920967102, + "rewards/rejected": -3.2817397117614746, + "step": 1800 + }, + { + "epoch": 0.059307052488370766, + "grad_norm": 1.0842186212539673, + "learning_rate": 4.9014240557890965e-05, + "logits/chosen": 2.701112985610962, + "logits/rejected": 2.6731133460998535, + "logps/chosen": -342.8392639160156, + "logps/rejected": -341.0564880371094, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0274157524108887, + "rewards/margins": 2.541079044342041, + "rewards/rejected": -4.56849479675293, + "step": 1820 + }, + { + "epoch": 0.05995877833989133, + "grad_norm": 2.721435070037842, + "learning_rate": 4.9003378194892515e-05, + "logits/chosen": 3.0578207969665527, + "logits/rejected": 3.03995418548584, + "logps/chosen": -373.294189453125, + "logps/rejected": -350.509033203125, + "loss": 0.5981, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9487797021865845, + "rewards/margins": 1.6428403854370117, + "rewards/rejected": -3.5916202068328857, + "step": 1840 + }, + { + "epoch": 0.06061050419141188, + "grad_norm": 3.81075382232666, + "learning_rate": 4.899251583189407e-05, + "logits/chosen": 3.1673595905303955, + "logits/rejected": 3.197908878326416, + "logps/chosen": -353.603271484375, + "logps/rejected": -283.707763671875, + "loss": 0.6767, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.025254964828491, + "rewards/margins": 1.6237194538116455, + "rewards/rejected": -3.648974657058716, + "step": 1860 + }, + { + "epoch": 0.06126223004293244, + "grad_norm": 3.324390172958374, + "learning_rate": 4.898165346889563e-05, + "logits/chosen": 3.114119291305542, + "logits/rejected": 3.203930616378784, + "logps/chosen": -358.25946044921875, + "logps/rejected": -307.8440246582031, + "loss": 0.5332, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5742974281311035, + "rewards/margins": 1.9004100561141968, + "rewards/rejected": -3.4747073650360107, + "step": 1880 + }, + { + "epoch": 0.061913955894453, + "grad_norm": 11.77093505859375, + "learning_rate": 4.897079110589718e-05, + "logits/chosen": 3.052638530731201, + "logits/rejected": 3.0841596126556396, + "logps/chosen": -304.6425476074219, + "logps/rejected": -330.8580627441406, + "loss": 0.8359, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1766719818115234, + "rewards/margins": 1.204751968383789, + "rewards/rejected": -3.3814244270324707, + "step": 1900 + }, + { + "epoch": 0.06256568174597356, + "grad_norm": 2.1730291843414307, + "learning_rate": 4.895992874289873e-05, + "logits/chosen": 2.96565842628479, + "logits/rejected": 3.073212146759033, + "logps/chosen": -329.25433349609375, + "logps/rejected": -324.4353332519531, + "loss": 0.5136, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.103126049041748, + "rewards/margins": 2.382038116455078, + "rewards/rejected": -4.485164165496826, + "step": 1920 + }, + { + "epoch": 0.06321740759749411, + "grad_norm": 1.4755703210830688, + "learning_rate": 4.894906637990029e-05, + "logits/chosen": 3.1562132835388184, + "logits/rejected": 3.310255527496338, + "logps/chosen": -358.6934814453125, + "logps/rejected": -340.08001708984375, + "loss": 0.5432, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1305965185165405, + "rewards/margins": 2.0588295459747314, + "rewards/rejected": -3.1894259452819824, + "step": 1940 + }, + { + "epoch": 0.06386913344901467, + "grad_norm": 0.701674222946167, + "learning_rate": 4.893820401690184e-05, + "logits/chosen": 3.1663904190063477, + "logits/rejected": 3.301480531692505, + "logps/chosen": -348.23968505859375, + "logps/rejected": -332.303466796875, + "loss": 0.4156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8067615032196045, + "rewards/margins": 2.2472434043884277, + "rewards/rejected": -4.054005146026611, + "step": 1960 + }, + { + "epoch": 0.06452085930053524, + "grad_norm": 0.7967122197151184, + "learning_rate": 4.892734165390339e-05, + "logits/chosen": 3.0498640537261963, + "logits/rejected": 3.175757646560669, + "logps/chosen": -369.11785888671875, + "logps/rejected": -319.9985656738281, + "loss": 0.7735, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.066406488418579, + "rewards/margins": 1.2265758514404297, + "rewards/rejected": -3.292982578277588, + "step": 1980 + }, + { + "epoch": 0.06517258515205579, + "grad_norm": 2.7733004093170166, + "learning_rate": 4.891647929090495e-05, + "logits/chosen": 3.1022884845733643, + "logits/rejected": 3.1928253173828125, + "logps/chosen": -319.0776672363281, + "logps/rejected": -338.71063232421875, + "loss": 0.3955, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5001776218414307, + "rewards/margins": 2.264112949371338, + "rewards/rejected": -3.7642905712127686, + "step": 2000 + }, + { + "epoch": 0.06582431100357634, + "grad_norm": 1.3460286855697632, + "learning_rate": 4.89056169279065e-05, + "logits/chosen": 3.106889247894287, + "logits/rejected": 3.04951810836792, + "logps/chosen": -338.2735595703125, + "logps/rejected": -331.1890563964844, + "loss": 0.3658, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3364770412445068, + "rewards/margins": 2.511465549468994, + "rewards/rejected": -3.84794282913208, + "step": 2020 + }, + { + "epoch": 0.0664760368550969, + "grad_norm": 9.135180473327637, + "learning_rate": 4.889475456490805e-05, + "logits/chosen": 3.1013216972351074, + "logits/rejected": 3.074261426925659, + "logps/chosen": -331.16021728515625, + "logps/rejected": -305.0299987792969, + "loss": 0.424, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.209039330482483, + "rewards/margins": 1.8724387884140015, + "rewards/rejected": -3.0814781188964844, + "step": 2040 + }, + { + "epoch": 0.06712776270661747, + "grad_norm": 1.425521969795227, + "learning_rate": 4.88838922019096e-05, + "logits/chosen": 3.113393783569336, + "logits/rejected": 3.0789787769317627, + "logps/chosen": -361.08306884765625, + "logps/rejected": -269.58905029296875, + "loss": 0.7281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8519785404205322, + "rewards/margins": 1.4942944049835205, + "rewards/rejected": -3.3462729454040527, + "step": 2060 + }, + { + "epoch": 0.06777948855813802, + "grad_norm": 5.439502239227295, + "learning_rate": 4.887302983891116e-05, + "logits/chosen": 3.442831516265869, + "logits/rejected": 3.534897565841675, + "logps/chosen": -384.8095703125, + "logps/rejected": -305.2535095214844, + "loss": 0.4502, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2760227918624878, + "rewards/margins": 2.284453868865967, + "rewards/rejected": -3.560476303100586, + "step": 2080 + }, + { + "epoch": 0.06843121440965858, + "grad_norm": 1.9145796298980713, + "learning_rate": 4.886216747591271e-05, + "logits/chosen": 3.0372393131256104, + "logits/rejected": 3.2808780670166016, + "logps/chosen": -318.7366638183594, + "logps/rejected": -295.4101257324219, + "loss": 0.6117, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4301011562347412, + "rewards/margins": 1.8984920978546143, + "rewards/rejected": -3.3285934925079346, + "step": 2100 + }, + { + "epoch": 0.06908294026117913, + "grad_norm": 4.712088108062744, + "learning_rate": 4.885130511291427e-05, + "logits/chosen": 3.2037253379821777, + "logits/rejected": 3.2749180793762207, + "logps/chosen": -334.9817810058594, + "logps/rejected": -313.22491455078125, + "loss": 0.544, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4875833988189697, + "rewards/margins": 1.9483098983764648, + "rewards/rejected": -3.4358935356140137, + "step": 2120 + }, + { + "epoch": 0.0697346661126997, + "grad_norm": 8.336044311523438, + "learning_rate": 4.884044274991582e-05, + "logits/chosen": 3.015939712524414, + "logits/rejected": 2.984008312225342, + "logps/chosen": -340.44097900390625, + "logps/rejected": -311.269287109375, + "loss": 0.4737, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5333013534545898, + "rewards/margins": 2.1228556632995605, + "rewards/rejected": -3.6561577320098877, + "step": 2140 + }, + { + "epoch": 0.07038639196422025, + "grad_norm": 3.108241558074951, + "learning_rate": 4.8829580386917375e-05, + "logits/chosen": 3.2511239051818848, + "logits/rejected": 3.299100160598755, + "logps/chosen": -375.90789794921875, + "logps/rejected": -289.52386474609375, + "loss": 0.5868, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5677201747894287, + "rewards/margins": 1.978894829750061, + "rewards/rejected": -3.5466148853302, + "step": 2160 + }, + { + "epoch": 0.0710381178157408, + "grad_norm": 5.390713691711426, + "learning_rate": 4.8818718023918926e-05, + "logits/chosen": 3.6352438926696777, + "logits/rejected": 3.5679352283477783, + "logps/chosen": -361.12481689453125, + "logps/rejected": -279.9466247558594, + "loss": 0.5633, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19115044176578522, + "rewards/margins": 2.147390127182007, + "rewards/rejected": -2.338540554046631, + "step": 2180 + }, + { + "epoch": 0.07168984366726136, + "grad_norm": 1.3955556154251099, + "learning_rate": 4.880785566092048e-05, + "logits/chosen": 3.1050782203674316, + "logits/rejected": 3.0858867168426514, + "logps/chosen": -346.4744873046875, + "logps/rejected": -291.1316833496094, + "loss": 0.4635, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9009138941764832, + "rewards/margins": 1.5523127317428589, + "rewards/rejected": -2.4532265663146973, + "step": 2200 + }, + { + "epoch": 0.07234156951878193, + "grad_norm": 0.16661906242370605, + "learning_rate": 4.8796993297922035e-05, + "logits/chosen": 3.588263750076294, + "logits/rejected": 3.724269390106201, + "logps/chosen": -360.5055847167969, + "logps/rejected": -293.8421325683594, + "loss": 0.4457, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6160874366760254, + "rewards/margins": 2.2922258377075195, + "rewards/rejected": -2.908313274383545, + "step": 2220 + }, + { + "epoch": 0.07299329537030248, + "grad_norm": 1.7254592180252075, + "learning_rate": 4.8786130934923585e-05, + "logits/chosen": 3.1013903617858887, + "logits/rejected": 3.347790479660034, + "logps/chosen": -323.84942626953125, + "logps/rejected": -272.2254333496094, + "loss": 0.6309, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3946813344955444, + "rewards/margins": 1.6265146732330322, + "rewards/rejected": -3.021195888519287, + "step": 2240 + }, + { + "epoch": 0.07364502122182304, + "grad_norm": 6.019381046295166, + "learning_rate": 4.8775268571925136e-05, + "logits/chosen": 3.3908767700195312, + "logits/rejected": 3.4177958965301514, + "logps/chosen": -324.94366455078125, + "logps/rejected": -308.05511474609375, + "loss": 0.4808, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8651292324066162, + "rewards/margins": 2.036652088165283, + "rewards/rejected": -3.9017810821533203, + "step": 2260 + }, + { + "epoch": 0.07429674707334359, + "grad_norm": 6.563716888427734, + "learning_rate": 4.8764406208926694e-05, + "logits/chosen": 3.4610416889190674, + "logits/rejected": 3.4134421348571777, + "logps/chosen": -330.375244140625, + "logps/rejected": -305.14886474609375, + "loss": 0.4623, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7848215103149414, + "rewards/margins": 2.2472290992736816, + "rewards/rejected": -4.032050132751465, + "step": 2280 + }, + { + "epoch": 0.07494847292486416, + "grad_norm": 1.5396589040756226, + "learning_rate": 4.8753543845928244e-05, + "logits/chosen": 3.545539140701294, + "logits/rejected": 3.7033028602600098, + "logps/chosen": -360.1042785644531, + "logps/rejected": -305.9522399902344, + "loss": 0.6486, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6453876495361328, + "rewards/margins": 1.808876633644104, + "rewards/rejected": -3.4542641639709473, + "step": 2300 + }, + { + "epoch": 0.07560019877638471, + "grad_norm": 1.9114954471588135, + "learning_rate": 4.8742681482929795e-05, + "logits/chosen": 3.488121747970581, + "logits/rejected": 3.4122447967529297, + "logps/chosen": -377.0107727050781, + "logps/rejected": -324.42724609375, + "loss": 0.4323, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3318827152252197, + "rewards/margins": 2.275813341140747, + "rewards/rejected": -3.607696056365967, + "step": 2320 + }, + { + "epoch": 0.07625192462790527, + "grad_norm": 2.02285099029541, + "learning_rate": 4.873181911993135e-05, + "logits/chosen": 3.1647136211395264, + "logits/rejected": 3.141711711883545, + "logps/chosen": -330.5382080078125, + "logps/rejected": -345.6282958984375, + "loss": 0.5125, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4608603715896606, + "rewards/margins": 2.144697666168213, + "rewards/rejected": -3.605557918548584, + "step": 2340 + }, + { + "epoch": 0.07690365047942584, + "grad_norm": 1.0665384531021118, + "learning_rate": 4.8720956756932904e-05, + "logits/chosen": 3.2348361015319824, + "logits/rejected": 3.3157265186309814, + "logps/chosen": -389.5257873535156, + "logps/rejected": -383.21429443359375, + "loss": 0.4575, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5710957050323486, + "rewards/margins": 2.7289435863494873, + "rewards/rejected": -4.300039768218994, + "step": 2360 + }, + { + "epoch": 0.07755537633094639, + "grad_norm": 3.416515350341797, + "learning_rate": 4.871009439393446e-05, + "logits/chosen": 3.011584997177124, + "logits/rejected": 3.196425199508667, + "logps/chosen": -369.1874084472656, + "logps/rejected": -329.36151123046875, + "loss": 0.6924, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6406739950180054, + "rewards/margins": 1.6562957763671875, + "rewards/rejected": -3.2969698905944824, + "step": 2380 + }, + { + "epoch": 0.07820710218246694, + "grad_norm": 2.6627047061920166, + "learning_rate": 4.869923203093601e-05, + "logits/chosen": 3.307826519012451, + "logits/rejected": 3.1792867183685303, + "logps/chosen": -394.25775146484375, + "logps/rejected": -337.26104736328125, + "loss": 0.4678, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5748776197433472, + "rewards/margins": 2.3989832401275635, + "rewards/rejected": -3.9738609790802, + "step": 2400 + }, + { + "epoch": 0.0788588280339875, + "grad_norm": 4.475430488586426, + "learning_rate": 4.868836966793757e-05, + "logits/chosen": 3.3728833198547363, + "logits/rejected": 3.364027738571167, + "logps/chosen": -347.3896789550781, + "logps/rejected": -289.74603271484375, + "loss": 0.5862, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.154302716255188, + "rewards/margins": 1.7762022018432617, + "rewards/rejected": -2.9305050373077393, + "step": 2420 + }, + { + "epoch": 0.07951055388550807, + "grad_norm": 2.419133186340332, + "learning_rate": 4.867750730493912e-05, + "logits/chosen": 3.398907423019409, + "logits/rejected": 3.4923527240753174, + "logps/chosen": -330.61761474609375, + "logps/rejected": -330.3555603027344, + "loss": 0.5911, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9902742505073547, + "rewards/margins": 1.9495359659194946, + "rewards/rejected": -2.939810276031494, + "step": 2440 + }, + { + "epoch": 0.08016227973702862, + "grad_norm": 0.28605857491493225, + "learning_rate": 4.866664494194067e-05, + "logits/chosen": 3.0459954738616943, + "logits/rejected": 3.134272813796997, + "logps/chosen": -331.1294250488281, + "logps/rejected": -300.8978271484375, + "loss": 0.3241, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3474743366241455, + "rewards/margins": 2.3251054286956787, + "rewards/rejected": -3.6725800037384033, + "step": 2460 + }, + { + "epoch": 0.08081400558854918, + "grad_norm": 2.3713274002075195, + "learning_rate": 4.865578257894223e-05, + "logits/chosen": 2.940674304962158, + "logits/rejected": 2.9065871238708496, + "logps/chosen": -312.2759094238281, + "logps/rejected": -334.91473388671875, + "loss": 0.4687, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8916456699371338, + "rewards/margins": 2.368234872817993, + "rewards/rejected": -4.259881019592285, + "step": 2480 + }, + { + "epoch": 0.08146573144006973, + "grad_norm": 2.6014657020568848, + "learning_rate": 4.864492021594378e-05, + "logits/chosen": 3.072540283203125, + "logits/rejected": 3.2436866760253906, + "logps/chosen": -336.00665283203125, + "logps/rejected": -285.5826110839844, + "loss": 0.7345, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1243340969085693, + "rewards/margins": 1.3461533784866333, + "rewards/rejected": -3.470487594604492, + "step": 2500 + }, + { + "epoch": 0.0821174572915903, + "grad_norm": 2.032809019088745, + "learning_rate": 4.863405785294533e-05, + "logits/chosen": 3.1597533226013184, + "logits/rejected": 3.358090877532959, + "logps/chosen": -324.52313232421875, + "logps/rejected": -325.60235595703125, + "loss": 0.4379, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5257233381271362, + "rewards/margins": 2.0351815223693848, + "rewards/rejected": -3.5609049797058105, + "step": 2520 + }, + { + "epoch": 0.08276918314311085, + "grad_norm": 3.3991565704345703, + "learning_rate": 4.862319548994689e-05, + "logits/chosen": 3.407768726348877, + "logits/rejected": 3.355952024459839, + "logps/chosen": -355.8735656738281, + "logps/rejected": -318.25927734375, + "loss": 0.5782, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2884528636932373, + "rewards/margins": 1.7077703475952148, + "rewards/rejected": -2.996222972869873, + "step": 2540 + }, + { + "epoch": 0.08342090899463141, + "grad_norm": 3.449995517730713, + "learning_rate": 4.861233312694844e-05, + "logits/chosen": 3.4200196266174316, + "logits/rejected": 3.5554909706115723, + "logps/chosen": -357.2406311035156, + "logps/rejected": -301.2788391113281, + "loss": 0.585, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.353020429611206, + "rewards/margins": 1.68710458278656, + "rewards/rejected": -3.0401253700256348, + "step": 2560 + }, + { + "epoch": 0.08407263484615196, + "grad_norm": 0.7194994688034058, + "learning_rate": 4.860147076394999e-05, + "logits/chosen": 3.428574323654175, + "logits/rejected": 3.3621513843536377, + "logps/chosen": -375.3366394042969, + "logps/rejected": -332.4507751464844, + "loss": 0.4782, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8310451507568359, + "rewards/margins": 2.3753461837768555, + "rewards/rejected": -3.2063910961151123, + "step": 2580 + }, + { + "epoch": 0.08472436069767253, + "grad_norm": 4.624852180480957, + "learning_rate": 4.859060840095154e-05, + "logits/chosen": 3.117311477661133, + "logits/rejected": 3.1907880306243896, + "logps/chosen": -321.0003662109375, + "logps/rejected": -279.2815856933594, + "loss": 0.5828, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5245931148529053, + "rewards/margins": 1.919537901878357, + "rewards/rejected": -3.4441311359405518, + "step": 2600 + }, + { + "epoch": 0.08537608654919308, + "grad_norm": 3.3032710552215576, + "learning_rate": 4.85797460379531e-05, + "logits/chosen": 2.9112682342529297, + "logits/rejected": 2.9053637981414795, + "logps/chosen": -343.2389831542969, + "logps/rejected": -325.9366149902344, + "loss": 0.4255, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.172179698944092, + "rewards/margins": 2.503065586090088, + "rewards/rejected": -4.675245761871338, + "step": 2620 + }, + { + "epoch": 0.08602781240071364, + "grad_norm": 2.3447608947753906, + "learning_rate": 4.856888367495465e-05, + "logits/chosen": 2.9468905925750732, + "logits/rejected": 3.1004958152770996, + "logps/chosen": -368.4523620605469, + "logps/rejected": -314.30706787109375, + "loss": 0.4487, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.014706611633301, + "rewards/margins": 2.600196361541748, + "rewards/rejected": -4.614902973175049, + "step": 2640 + }, + { + "epoch": 0.08667953825223419, + "grad_norm": 13.312652587890625, + "learning_rate": 4.8558021311956206e-05, + "logits/chosen": 3.263922929763794, + "logits/rejected": 3.314683198928833, + "logps/chosen": -354.1935119628906, + "logps/rejected": -316.7143859863281, + "loss": 0.5895, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1838791370391846, + "rewards/margins": 1.8780847787857056, + "rewards/rejected": -4.0619635581970215, + "step": 2660 + }, + { + "epoch": 0.08733126410375476, + "grad_norm": 3.4340410232543945, + "learning_rate": 4.8547158948957764e-05, + "logits/chosen": 2.6895620822906494, + "logits/rejected": 2.9712119102478027, + "logps/chosen": -363.28460693359375, + "logps/rejected": -324.77349853515625, + "loss": 0.4706, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0496747493743896, + "rewards/margins": 2.4808828830718994, + "rewards/rejected": -4.530557632446289, + "step": 2680 + }, + { + "epoch": 0.08798298995527531, + "grad_norm": 1.98042631149292, + "learning_rate": 4.8536296585959314e-05, + "logits/chosen": 2.782830238342285, + "logits/rejected": 2.8208088874816895, + "logps/chosen": -290.26336669921875, + "logps/rejected": -270.7415466308594, + "loss": 0.5596, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0367517471313477, + "rewards/margins": 1.9297367334365845, + "rewards/rejected": -3.9664885997772217, + "step": 2700 + }, + { + "epoch": 0.08863471580679587, + "grad_norm": 0.6068879961967468, + "learning_rate": 4.8525434222960865e-05, + "logits/chosen": 2.916003704071045, + "logits/rejected": 3.0652108192443848, + "logps/chosen": -336.24334716796875, + "logps/rejected": -300.14581298828125, + "loss": 0.3562, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.142127513885498, + "rewards/margins": 2.811161994934082, + "rewards/rejected": -4.953289985656738, + "step": 2720 + }, + { + "epoch": 0.08928644165831642, + "grad_norm": 2.9305684566497803, + "learning_rate": 4.851457185996242e-05, + "logits/chosen": 3.1195149421691895, + "logits/rejected": 3.2765884399414062, + "logps/chosen": -349.86865234375, + "logps/rejected": -290.37139892578125, + "loss": 0.4844, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7392257452011108, + "rewards/margins": 2.188805341720581, + "rewards/rejected": -3.9280307292938232, + "step": 2740 + }, + { + "epoch": 0.08993816750983699, + "grad_norm": 6.131348609924316, + "learning_rate": 4.8503709496963973e-05, + "logits/chosen": 3.2465717792510986, + "logits/rejected": 3.217587947845459, + "logps/chosen": -356.74603271484375, + "logps/rejected": -348.3101501464844, + "loss": 0.5169, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5891307592391968, + "rewards/margins": 2.063149929046631, + "rewards/rejected": -3.652280807495117, + "step": 2760 + }, + { + "epoch": 0.09058989336135755, + "grad_norm": 2.8475828170776367, + "learning_rate": 4.8492847133965524e-05, + "logits/chosen": 3.0181539058685303, + "logits/rejected": 3.1401472091674805, + "logps/chosen": -356.98095703125, + "logps/rejected": -311.0254211425781, + "loss": 0.5218, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0779497623443604, + "rewards/margins": 2.209205150604248, + "rewards/rejected": -4.2871551513671875, + "step": 2780 + }, + { + "epoch": 0.0912416192128781, + "grad_norm": 4.560398101806641, + "learning_rate": 4.8481984770967075e-05, + "logits/chosen": 3.3359527587890625, + "logits/rejected": 3.372161865234375, + "logps/chosen": -348.257080078125, + "logps/rejected": -333.60003662109375, + "loss": 0.6312, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1572248935699463, + "rewards/margins": 1.5507323741912842, + "rewards/rejected": -3.7079575061798096, + "step": 2800 + }, + { + "epoch": 0.09189334506439865, + "grad_norm": 5.598357200622559, + "learning_rate": 4.847112240796863e-05, + "logits/chosen": 3.147787094116211, + "logits/rejected": 3.2743096351623535, + "logps/chosen": -346.3968505859375, + "logps/rejected": -299.1639404296875, + "loss": 0.3855, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.956291675567627, + "rewards/margins": 3.6548469066619873, + "rewards/rejected": -4.611138820648193, + "step": 2820 + }, + { + "epoch": 0.09254507091591922, + "grad_norm": 1.688124179840088, + "learning_rate": 4.8460260044970183e-05, + "logits/chosen": 3.173776149749756, + "logits/rejected": 3.381490707397461, + "logps/chosen": -339.4457092285156, + "logps/rejected": -290.26983642578125, + "loss": 0.4439, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.69302237033844, + "rewards/margins": 2.02378511428833, + "rewards/rejected": -3.7168076038360596, + "step": 2840 + }, + { + "epoch": 0.09319679676743978, + "grad_norm": 3.7235593795776367, + "learning_rate": 4.844994080012166e-05, + "logits/chosen": 3.1705305576324463, + "logits/rejected": 3.2367827892303467, + "logps/chosen": -379.390869140625, + "logps/rejected": -348.94537353515625, + "loss": 0.5727, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5147287845611572, + "rewards/margins": 1.6875797510147095, + "rewards/rejected": -3.2023086547851562, + "step": 2860 + }, + { + "epoch": 0.09384852261896033, + "grad_norm": 0.8401127457618713, + "learning_rate": 4.843907843712321e-05, + "logits/chosen": 2.9626004695892334, + "logits/rejected": 3.0554299354553223, + "logps/chosen": -326.7702941894531, + "logps/rejected": -296.3485107421875, + "loss": 0.7423, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5124372243881226, + "rewards/margins": 1.370335340499878, + "rewards/rejected": -2.88277268409729, + "step": 2880 + }, + { + "epoch": 0.09450024847048089, + "grad_norm": 2.985961437225342, + "learning_rate": 4.842821607412476e-05, + "logits/chosen": 3.1667888164520264, + "logits/rejected": 3.282496929168701, + "logps/chosen": -318.4917297363281, + "logps/rejected": -251.64047241210938, + "loss": 0.4939, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0254353284835815, + "rewards/margins": 1.904370903968811, + "rewards/rejected": -2.9298062324523926, + "step": 2900 + }, + { + "epoch": 0.09515197432200145, + "grad_norm": 4.9020209312438965, + "learning_rate": 4.841735371112632e-05, + "logits/chosen": 3.331024169921875, + "logits/rejected": 3.4837279319763184, + "logps/chosen": -328.1170349121094, + "logps/rejected": -316.92919921875, + "loss": 0.607, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8203598260879517, + "rewards/margins": 1.8698279857635498, + "rewards/rejected": -2.690187454223633, + "step": 2920 + }, + { + "epoch": 0.09580370017352201, + "grad_norm": 1.1818922758102417, + "learning_rate": 4.840649134812787e-05, + "logits/chosen": 3.2830185890197754, + "logits/rejected": 3.460902452468872, + "logps/chosen": -327.47088623046875, + "logps/rejected": -294.6483459472656, + "loss": 0.4088, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5904995203018188, + "rewards/margins": 2.402766704559326, + "rewards/rejected": -3.9932658672332764, + "step": 2940 + }, + { + "epoch": 0.09645542602504256, + "grad_norm": 1.0706948041915894, + "learning_rate": 4.839562898512943e-05, + "logits/chosen": 3.292011260986328, + "logits/rejected": 3.39190936088562, + "logps/chosen": -347.59539794921875, + "logps/rejected": -330.92657470703125, + "loss": 0.5233, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2272733449935913, + "rewards/margins": 2.0948541164398193, + "rewards/rejected": -3.3221278190612793, + "step": 2960 + }, + { + "epoch": 0.09710715187656313, + "grad_norm": 1.3578648567199707, + "learning_rate": 4.838476662213098e-05, + "logits/chosen": 3.510169267654419, + "logits/rejected": 3.6238322257995605, + "logps/chosen": -383.2890319824219, + "logps/rejected": -331.7569274902344, + "loss": 0.5489, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.249558687210083, + "rewards/margins": 2.0173704624176025, + "rewards/rejected": -3.2669291496276855, + "step": 2980 + }, + { + "epoch": 0.09775887772808368, + "grad_norm": 1.446892499923706, + "learning_rate": 4.837390425913254e-05, + "logits/chosen": 3.182013511657715, + "logits/rejected": 3.2702174186706543, + "logps/chosen": -380.23388671875, + "logps/rejected": -317.84039306640625, + "loss": 0.3644, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7618297338485718, + "rewards/margins": 2.645143508911133, + "rewards/rejected": -4.406973361968994, + "step": 3000 + }, + { + "epoch": 0.09841060357960424, + "grad_norm": 1.628352403640747, + "learning_rate": 4.836304189613409e-05, + "logits/chosen": 3.4415671825408936, + "logits/rejected": 3.564343214035034, + "logps/chosen": -344.9812927246094, + "logps/rejected": -309.3968505859375, + "loss": 0.3714, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2570264339447021, + "rewards/margins": 2.580779790878296, + "rewards/rejected": -3.837805986404419, + "step": 3020 + }, + { + "epoch": 0.0990623294311248, + "grad_norm": 6.392394542694092, + "learning_rate": 4.835217953313564e-05, + "logits/chosen": 2.938063144683838, + "logits/rejected": 3.104886054992676, + "logps/chosen": -303.8746643066406, + "logps/rejected": -287.5484619140625, + "loss": 0.5205, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3517080545425415, + "rewards/margins": 2.0206799507141113, + "rewards/rejected": -3.372387647628784, + "step": 3040 + }, + { + "epoch": 0.09971405528264536, + "grad_norm": 1.1390193700790405, + "learning_rate": 4.83413171701372e-05, + "logits/chosen": 2.974391222000122, + "logits/rejected": 3.0723843574523926, + "logps/chosen": -335.21917724609375, + "logps/rejected": -341.2181396484375, + "loss": 0.6479, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5093357563018799, + "rewards/margins": 1.6925106048583984, + "rewards/rejected": -3.2018463611602783, + "step": 3060 + }, + { + "epoch": 0.10036578113416592, + "grad_norm": 2.213254451751709, + "learning_rate": 4.833045480713875e-05, + "logits/chosen": 3.205638885498047, + "logits/rejected": 3.375814437866211, + "logps/chosen": -344.04876708984375, + "logps/rejected": -306.48919677734375, + "loss": 0.4592, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7299909591674805, + "rewards/margins": 2.2531847953796387, + "rewards/rejected": -3.983175754547119, + "step": 3080 + }, + { + "epoch": 0.10101750698568647, + "grad_norm": 0.8037653565406799, + "learning_rate": 4.83195924441403e-05, + "logits/chosen": 3.5719268321990967, + "logits/rejected": 3.6199612617492676, + "logps/chosen": -325.3515930175781, + "logps/rejected": -284.2156677246094, + "loss": 0.614, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4651761054992676, + "rewards/margins": 1.81514573097229, + "rewards/rejected": -3.2803218364715576, + "step": 3100 + }, + { + "epoch": 0.10166923283720702, + "grad_norm": 0.4418405592441559, + "learning_rate": 4.8308730081141856e-05, + "logits/chosen": 3.411686420440674, + "logits/rejected": 3.361645221710205, + "logps/chosen": -376.738037109375, + "logps/rejected": -294.10968017578125, + "loss": 0.4726, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2032114267349243, + "rewards/margins": 2.5117015838623047, + "rewards/rejected": -3.7149131298065186, + "step": 3120 + }, + { + "epoch": 0.10232095868872759, + "grad_norm": 3.8723270893096924, + "learning_rate": 4.8297867718143407e-05, + "logits/chosen": 3.00282621383667, + "logits/rejected": 3.2661385536193848, + "logps/chosen": -320.58709716796875, + "logps/rejected": -282.6607971191406, + "loss": 0.5779, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.549626111984253, + "rewards/margins": 1.8850734233856201, + "rewards/rejected": -3.434699296951294, + "step": 3140 + }, + { + "epoch": 0.10297268454024815, + "grad_norm": 2.0280823707580566, + "learning_rate": 4.828700535514496e-05, + "logits/chosen": 3.7110908031463623, + "logits/rejected": 3.765380859375, + "logps/chosen": -413.7069396972656, + "logps/rejected": -344.6554870605469, + "loss": 0.4435, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6571415066719055, + "rewards/margins": 2.478834867477417, + "rewards/rejected": -3.135976552963257, + "step": 3160 + }, + { + "epoch": 0.1036244103917687, + "grad_norm": 1.5350008010864258, + "learning_rate": 4.827614299214651e-05, + "logits/chosen": 3.1751132011413574, + "logits/rejected": 3.261382579803467, + "logps/chosen": -316.1951904296875, + "logps/rejected": -331.7400817871094, + "loss": 0.4791, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0221152305603027, + "rewards/margins": 2.2312042713165283, + "rewards/rejected": -4.25331974029541, + "step": 3180 + }, + { + "epoch": 0.10427613624328926, + "grad_norm": 0.17169839143753052, + "learning_rate": 4.8265280629148066e-05, + "logits/chosen": 3.2595343589782715, + "logits/rejected": 3.2882485389709473, + "logps/chosen": -353.89398193359375, + "logps/rejected": -303.7634582519531, + "loss": 0.5073, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2738564014434814, + "rewards/margins": 2.1649222373962402, + "rewards/rejected": -3.4387786388397217, + "step": 3200 + }, + { + "epoch": 0.10492786209480982, + "grad_norm": 1.2004095315933228, + "learning_rate": 4.825441826614962e-05, + "logits/chosen": 3.074195384979248, + "logits/rejected": 3.2111659049987793, + "logps/chosen": -312.779296875, + "logps/rejected": -279.2161865234375, + "loss": 0.4786, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.543473720550537, + "rewards/margins": 1.9868192672729492, + "rewards/rejected": -4.530292987823486, + "step": 3220 + }, + { + "epoch": 0.10557958794633038, + "grad_norm": 4.684961795806885, + "learning_rate": 4.8243555903151174e-05, + "logits/chosen": 3.3574326038360596, + "logits/rejected": 3.3850765228271484, + "logps/chosen": -349.60894775390625, + "logps/rejected": -325.5541076660156, + "loss": 0.6637, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6759449243545532, + "rewards/margins": 1.6936219930648804, + "rewards/rejected": -3.3695666790008545, + "step": 3240 + }, + { + "epoch": 0.10623131379785093, + "grad_norm": 1.774433970451355, + "learning_rate": 4.823269354015273e-05, + "logits/chosen": 3.2585575580596924, + "logits/rejected": 3.4998035430908203, + "logps/chosen": -300.41070556640625, + "logps/rejected": -260.4970703125, + "loss": 0.5544, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7477691173553467, + "rewards/margins": 1.9228003025054932, + "rewards/rejected": -3.670569658279419, + "step": 3260 + }, + { + "epoch": 0.10688303964937149, + "grad_norm": 2.423067331314087, + "learning_rate": 4.822183117715428e-05, + "logits/chosen": 3.2747395038604736, + "logits/rejected": 3.4553685188293457, + "logps/chosen": -301.4560241699219, + "logps/rejected": -328.68896484375, + "loss": 0.4334, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.384781837463379, + "rewards/margins": 2.2276735305786133, + "rewards/rejected": -3.6124558448791504, + "step": 3280 + }, + { + "epoch": 0.10753476550089205, + "grad_norm": 7.942028522491455, + "learning_rate": 4.821096881415583e-05, + "logits/chosen": 3.1592416763305664, + "logits/rejected": 3.2232470512390137, + "logps/chosen": -319.8425598144531, + "logps/rejected": -306.49371337890625, + "loss": 0.677, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.3819332122802734, + "rewards/margins": 1.6273428201675415, + "rewards/rejected": -4.009275913238525, + "step": 3300 + }, + { + "epoch": 0.10818649135241261, + "grad_norm": 1.8428938388824463, + "learning_rate": 4.820010645115739e-05, + "logits/chosen": 3.0052857398986816, + "logits/rejected": 3.192760467529297, + "logps/chosen": -310.99420166015625, + "logps/rejected": -276.99639892578125, + "loss": 0.5464, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1939033269882202, + "rewards/margins": 1.7446696758270264, + "rewards/rejected": -2.938573122024536, + "step": 3320 + }, + { + "epoch": 0.10883821720393316, + "grad_norm": 1.0301592350006104, + "learning_rate": 4.818924408815894e-05, + "logits/chosen": 3.402620315551758, + "logits/rejected": 3.597104549407959, + "logps/chosen": -363.9002685546875, + "logps/rejected": -291.1602478027344, + "loss": 0.5711, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.367156744003296, + "rewards/margins": 1.6767299175262451, + "rewards/rejected": -3.043886661529541, + "step": 3340 + }, + { + "epoch": 0.10948994305545372, + "grad_norm": 2.9117982387542725, + "learning_rate": 4.817838172516049e-05, + "logits/chosen": 3.481297731399536, + "logits/rejected": 3.482236385345459, + "logps/chosen": -314.7165222167969, + "logps/rejected": -301.0784606933594, + "loss": 0.4822, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8351952433586121, + "rewards/margins": 1.9844099283218384, + "rewards/rejected": -2.8196051120758057, + "step": 3360 + }, + { + "epoch": 0.11014166890697429, + "grad_norm": 6.15998649597168, + "learning_rate": 4.816751936216204e-05, + "logits/chosen": 3.2260537147521973, + "logits/rejected": 3.33510160446167, + "logps/chosen": -317.2128601074219, + "logps/rejected": -280.23046875, + "loss": 0.565, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6067514419555664, + "rewards/margins": 2.3346667289733887, + "rewards/rejected": -3.941417694091797, + "step": 3380 + }, + { + "epoch": 0.11079339475849484, + "grad_norm": 2.7245569229125977, + "learning_rate": 4.81566569991636e-05, + "logits/chosen": 3.0314221382141113, + "logits/rejected": 3.0793070793151855, + "logps/chosen": -345.80029296875, + "logps/rejected": -321.6756896972656, + "loss": 0.5991, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3254845142364502, + "rewards/margins": 2.4774346351623535, + "rewards/rejected": -3.8029189109802246, + "step": 3400 + }, + { + "epoch": 0.1114451206100154, + "grad_norm": 1.471034288406372, + "learning_rate": 4.814579463616515e-05, + "logits/chosen": 3.1111223697662354, + "logits/rejected": 3.3610599040985107, + "logps/chosen": -332.94610595703125, + "logps/rejected": -319.8433837890625, + "loss": 0.4586, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2840871810913086, + "rewards/margins": 1.5985676050186157, + "rewards/rejected": -2.882655143737793, + "step": 3420 + }, + { + "epoch": 0.11209684646153595, + "grad_norm": 3.4354004859924316, + "learning_rate": 4.81349322731667e-05, + "logits/chosen": 3.016026496887207, + "logits/rejected": 3.1886117458343506, + "logps/chosen": -354.13787841796875, + "logps/rejected": -280.9300537109375, + "loss": 0.4036, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1787219047546387, + "rewards/margins": 2.0955543518066406, + "rewards/rejected": -3.2742760181427, + "step": 3440 + }, + { + "epoch": 0.11274857231305652, + "grad_norm": 3.4572436809539795, + "learning_rate": 4.812406991016826e-05, + "logits/chosen": 3.537639617919922, + "logits/rejected": 3.534397840499878, + "logps/chosen": -351.0204162597656, + "logps/rejected": -310.41326904296875, + "loss": 0.5746, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4632055759429932, + "rewards/margins": 1.6462090015411377, + "rewards/rejected": -3.1094143390655518, + "step": 3460 + }, + { + "epoch": 0.11340029816457707, + "grad_norm": 2.173419713973999, + "learning_rate": 4.811320754716982e-05, + "logits/chosen": 3.1719956398010254, + "logits/rejected": 3.3853707313537598, + "logps/chosen": -315.4241943359375, + "logps/rejected": -303.2987365722656, + "loss": 0.453, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3639192581176758, + "rewards/margins": 1.794708013534546, + "rewards/rejected": -3.158627510070801, + "step": 3480 + }, + { + "epoch": 0.11405202401609763, + "grad_norm": 1.4126843214035034, + "learning_rate": 4.810234518417137e-05, + "logits/chosen": 3.1402816772460938, + "logits/rejected": 3.303173780441284, + "logps/chosen": -331.085693359375, + "logps/rejected": -276.90399169921875, + "loss": 0.4651, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5797648429870605, + "rewards/margins": 1.8597230911254883, + "rewards/rejected": -3.439487934112549, + "step": 3500 + }, + { + "epoch": 0.1147037498676182, + "grad_norm": 3.3025062084198, + "learning_rate": 4.8091482821172926e-05, + "logits/chosen": 3.272444486618042, + "logits/rejected": 3.237774610519409, + "logps/chosen": -330.67041015625, + "logps/rejected": -273.96197509765625, + "loss": 0.5522, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0467075109481812, + "rewards/margins": 1.9589941501617432, + "rewards/rejected": -3.0057015419006348, + "step": 3520 + }, + { + "epoch": 0.11535547571913875, + "grad_norm": 3.4790730476379395, + "learning_rate": 4.8080620458174476e-05, + "logits/chosen": 3.6164257526397705, + "logits/rejected": 3.715759754180908, + "logps/chosen": -339.82098388671875, + "logps/rejected": -334.42608642578125, + "loss": 0.6125, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6630821228027344, + "rewards/margins": 1.5571156740188599, + "rewards/rejected": -3.2201976776123047, + "step": 3540 + }, + { + "epoch": 0.1160072015706593, + "grad_norm": 3.443769693374634, + "learning_rate": 4.806975809517603e-05, + "logits/chosen": 3.061797618865967, + "logits/rejected": 3.204420566558838, + "logps/chosen": -327.5578918457031, + "logps/rejected": -319.2113037109375, + "loss": 0.5224, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7603095769882202, + "rewards/margins": 2.2424628734588623, + "rewards/rejected": -4.002772808074951, + "step": 3560 + }, + { + "epoch": 0.11665892742217986, + "grad_norm": 1.0495328903198242, + "learning_rate": 4.805889573217758e-05, + "logits/chosen": 3.2929840087890625, + "logits/rejected": 3.5496819019317627, + "logps/chosen": -330.95501708984375, + "logps/rejected": -359.1007080078125, + "loss": 0.5442, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8670425415039062, + "rewards/margins": 2.3852100372314453, + "rewards/rejected": -4.252252101898193, + "step": 3580 + }, + { + "epoch": 0.11731065327370042, + "grad_norm": 5.633178234100342, + "learning_rate": 4.8048033369179136e-05, + "logits/chosen": 2.9205307960510254, + "logits/rejected": 2.874577522277832, + "logps/chosen": -338.70574951171875, + "logps/rejected": -286.7315673828125, + "loss": 0.8449, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5782811641693115, + "rewards/margins": 1.3945848941802979, + "rewards/rejected": -3.9728660583496094, + "step": 3600 + }, + { + "epoch": 0.11796237912522098, + "grad_norm": 4.952943325042725, + "learning_rate": 4.8037171006180686e-05, + "logits/chosen": 3.2113425731658936, + "logits/rejected": 3.271812915802002, + "logps/chosen": -369.41387939453125, + "logps/rejected": -344.3109436035156, + "loss": 0.5715, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3972742557525635, + "rewards/margins": 2.334925413131714, + "rewards/rejected": -4.732199668884277, + "step": 3620 + }, + { + "epoch": 0.11861410497674153, + "grad_norm": 0.2830749750137329, + "learning_rate": 4.802630864318224e-05, + "logits/chosen": 3.546623945236206, + "logits/rejected": 3.531198501586914, + "logps/chosen": -369.1507873535156, + "logps/rejected": -325.29791259765625, + "loss": 0.4383, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2360032796859741, + "rewards/margins": 2.7361464500427246, + "rewards/rejected": -3.972149610519409, + "step": 3640 + }, + { + "epoch": 0.11926583082826209, + "grad_norm": 1.7173436880111694, + "learning_rate": 4.8015446280183795e-05, + "logits/chosen": 3.2853798866271973, + "logits/rejected": 3.430323839187622, + "logps/chosen": -336.56121826171875, + "logps/rejected": -303.7469482421875, + "loss": 0.4016, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5165762901306152, + "rewards/margins": 2.236255168914795, + "rewards/rejected": -3.7528319358825684, + "step": 3660 + }, + { + "epoch": 0.11991755667978266, + "grad_norm": 1.0306845903396606, + "learning_rate": 4.8004583917185345e-05, + "logits/chosen": 3.170335292816162, + "logits/rejected": 3.3276195526123047, + "logps/chosen": -344.1324768066406, + "logps/rejected": -348.3489074707031, + "loss": 0.373, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.115529775619507, + "rewards/margins": 2.9815051555633545, + "rewards/rejected": -5.0970354080200195, + "step": 3680 + }, + { + "epoch": 0.12056928253130321, + "grad_norm": 1.3324962854385376, + "learning_rate": 4.7993721554186896e-05, + "logits/chosen": 3.114670515060425, + "logits/rejected": 3.118375301361084, + "logps/chosen": -345.19757080078125, + "logps/rejected": -308.4264221191406, + "loss": 0.5253, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.107150077819824, + "rewards/margins": 2.032174825668335, + "rewards/rejected": -4.139325141906738, + "step": 3700 + }, + { + "epoch": 0.12122100838282376, + "grad_norm": 4.476604461669922, + "learning_rate": 4.7982859191188454e-05, + "logits/chosen": 3.3885390758514404, + "logits/rejected": 3.572317123413086, + "logps/chosen": -348.1287536621094, + "logps/rejected": -355.77008056640625, + "loss": 0.5912, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.38696026802063, + "rewards/margins": 2.2507660388946533, + "rewards/rejected": -4.637725830078125, + "step": 3720 + }, + { + "epoch": 0.12187273423434432, + "grad_norm": 2.973952293395996, + "learning_rate": 4.7971996828190005e-05, + "logits/chosen": 3.232344150543213, + "logits/rejected": 3.368131637573242, + "logps/chosen": -334.1014404296875, + "logps/rejected": -319.7297058105469, + "loss": 0.5294, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.154893159866333, + "rewards/margins": 2.1440391540527344, + "rewards/rejected": -4.298932075500488, + "step": 3740 + }, + { + "epoch": 0.12252446008586489, + "grad_norm": 2.5072684288024902, + "learning_rate": 4.796113446519156e-05, + "logits/chosen": 2.951312780380249, + "logits/rejected": 3.319657802581787, + "logps/chosen": -301.41168212890625, + "logps/rejected": -294.91973876953125, + "loss": 0.546, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4087029695510864, + "rewards/margins": 2.091248035430908, + "rewards/rejected": -3.499950408935547, + "step": 3760 + }, + { + "epoch": 0.12317618593738544, + "grad_norm": 3.7616419792175293, + "learning_rate": 4.795027210219311e-05, + "logits/chosen": 3.341683864593506, + "logits/rejected": 3.4284210205078125, + "logps/chosen": -398.31878662109375, + "logps/rejected": -318.09027099609375, + "loss": 0.3863, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9214717745780945, + "rewards/margins": 2.547614336013794, + "rewards/rejected": -3.469086170196533, + "step": 3780 + }, + { + "epoch": 0.123827911788906, + "grad_norm": 1.2143784761428833, + "learning_rate": 4.793940973919467e-05, + "logits/chosen": 3.4040026664733887, + "logits/rejected": 3.3499724864959717, + "logps/chosen": -368.40887451171875, + "logps/rejected": -332.9649963378906, + "loss": 0.3764, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9779273867607117, + "rewards/margins": 1.8920389413833618, + "rewards/rejected": -2.8699660301208496, + "step": 3800 + }, + { + "epoch": 0.12447963764042655, + "grad_norm": 0.7619727253913879, + "learning_rate": 4.792854737619622e-05, + "logits/chosen": 3.101006031036377, + "logits/rejected": 3.3345744609832764, + "logps/chosen": -343.05047607421875, + "logps/rejected": -296.72882080078125, + "loss": 0.4942, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3425414562225342, + "rewards/margins": 1.9104019403457642, + "rewards/rejected": -3.252943515777588, + "step": 3820 + }, + { + "epoch": 0.12513136349194712, + "grad_norm": 2.110708713531494, + "learning_rate": 4.791768501319777e-05, + "logits/chosen": 3.4913489818573, + "logits/rejected": 3.6159377098083496, + "logps/chosen": -345.2345275878906, + "logps/rejected": -318.19488525390625, + "loss": 0.641, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3445169925689697, + "rewards/margins": 1.706154227256775, + "rewards/rejected": -3.050671100616455, + "step": 3840 + }, + { + "epoch": 0.12578308934346766, + "grad_norm": 2.0314948558807373, + "learning_rate": 4.790682265019933e-05, + "logits/chosen": 3.166126251220703, + "logits/rejected": 3.211013078689575, + "logps/chosen": -336.8634338378906, + "logps/rejected": -291.47442626953125, + "loss": 0.5271, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5655171871185303, + "rewards/margins": 2.3539721965789795, + "rewards/rejected": -3.919489622116089, + "step": 3860 + }, + { + "epoch": 0.12643481519498823, + "grad_norm": 2.896772861480713, + "learning_rate": 4.789596028720088e-05, + "logits/chosen": 3.2256062030792236, + "logits/rejected": 3.4148707389831543, + "logps/chosen": -349.36968994140625, + "logps/rejected": -336.10699462890625, + "loss": 0.543, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9220046997070312, + "rewards/margins": 1.7819019556045532, + "rewards/rejected": -3.703907012939453, + "step": 3880 + }, + { + "epoch": 0.1270865410465088, + "grad_norm": 4.063043594360352, + "learning_rate": 4.788509792420243e-05, + "logits/chosen": 3.00451397895813, + "logits/rejected": 3.223066806793213, + "logps/chosen": -326.4573974609375, + "logps/rejected": -281.1050109863281, + "loss": 0.5162, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3519790172576904, + "rewards/margins": 1.9176855087280273, + "rewards/rejected": -3.2696642875671387, + "step": 3900 + }, + { + "epoch": 0.12773826689802933, + "grad_norm": 1.3207942247390747, + "learning_rate": 4.787423556120398e-05, + "logits/chosen": 3.3245787620544434, + "logits/rejected": 3.3240458965301514, + "logps/chosen": -327.05035400390625, + "logps/rejected": -275.00128173828125, + "loss": 0.4688, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9735370874404907, + "rewards/margins": 2.0403201580047607, + "rewards/rejected": -4.013857364654541, + "step": 3920 + }, + { + "epoch": 0.1283899927495499, + "grad_norm": 1.9018841981887817, + "learning_rate": 4.786337319820554e-05, + "logits/chosen": 3.587249755859375, + "logits/rejected": 3.5885086059570312, + "logps/chosen": -367.9815368652344, + "logps/rejected": -306.0716857910156, + "loss": 0.5118, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.51593816280365, + "rewards/margins": 2.5325253009796143, + "rewards/rejected": -4.048462867736816, + "step": 3940 + }, + { + "epoch": 0.12904171860107047, + "grad_norm": 1.635209321975708, + "learning_rate": 4.785251083520709e-05, + "logits/chosen": 3.262660503387451, + "logits/rejected": 3.2750353813171387, + "logps/chosen": -324.1220703125, + "logps/rejected": -308.80364990234375, + "loss": 0.4549, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1993035078048706, + "rewards/margins": 2.2897047996520996, + "rewards/rejected": -3.4890084266662598, + "step": 3960 + }, + { + "epoch": 0.129693444452591, + "grad_norm": 2.3711977005004883, + "learning_rate": 4.784164847220864e-05, + "logits/chosen": 3.5401031970977783, + "logits/rejected": 3.499389171600342, + "logps/chosen": -387.13140869140625, + "logps/rejected": -314.45941162109375, + "loss": 0.5528, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.008472442626953, + "rewards/margins": 2.117974042892456, + "rewards/rejected": -4.12644624710083, + "step": 3980 + }, + { + "epoch": 0.13034517030411158, + "grad_norm": 2.716196298599243, + "learning_rate": 4.78307861092102e-05, + "logits/chosen": 3.542637348175049, + "logits/rejected": 3.714996337890625, + "logps/chosen": -329.84735107421875, + "logps/rejected": -304.624267578125, + "loss": 0.2785, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8744034767150879, + "rewards/margins": 2.747865676879883, + "rewards/rejected": -3.6222686767578125, + "step": 4000 + }, + { + "epoch": 0.13099689615563212, + "grad_norm": 6.002039432525635, + "learning_rate": 4.7819923746211756e-05, + "logits/chosen": 3.418450117111206, + "logits/rejected": 3.5375773906707764, + "logps/chosen": -333.166748046875, + "logps/rejected": -323.7908630371094, + "loss": 0.5052, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5081346035003662, + "rewards/margins": 2.4540512561798096, + "rewards/rejected": -3.9621856212615967, + "step": 4020 + }, + { + "epoch": 0.1316486220071527, + "grad_norm": 1.2203742265701294, + "learning_rate": 4.780906138321331e-05, + "logits/chosen": 3.4568779468536377, + "logits/rejected": 3.5832176208496094, + "logps/chosen": -323.633056640625, + "logps/rejected": -328.6131591796875, + "loss": 0.4624, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3115978240966797, + "rewards/margins": 2.457318067550659, + "rewards/rejected": -3.768916368484497, + "step": 4040 + }, + { + "epoch": 0.13230034785867326, + "grad_norm": 4.328817367553711, + "learning_rate": 4.7798199020214865e-05, + "logits/chosen": 3.0114283561706543, + "logits/rejected": 3.1767141819000244, + "logps/chosen": -321.50152587890625, + "logps/rejected": -271.7521057128906, + "loss": 0.4785, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8810756206512451, + "rewards/margins": 1.786128282546997, + "rewards/rejected": -3.667203903198242, + "step": 4060 + }, + { + "epoch": 0.1329520737101938, + "grad_norm": 6.199656009674072, + "learning_rate": 4.7787336657216415e-05, + "logits/chosen": 3.3070030212402344, + "logits/rejected": 3.337674617767334, + "logps/chosen": -364.4031982421875, + "logps/rejected": -298.0321960449219, + "loss": 0.5094, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7024831771850586, + "rewards/margins": 1.8847261667251587, + "rewards/rejected": -3.5872092247009277, + "step": 4080 + }, + { + "epoch": 0.13360379956171436, + "grad_norm": 4.303965091705322, + "learning_rate": 4.7776474294217966e-05, + "logits/chosen": 3.360316753387451, + "logits/rejected": 3.3354098796844482, + "logps/chosen": -337.97857666015625, + "logps/rejected": -307.7684631347656, + "loss": 0.5299, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3390792608261108, + "rewards/margins": 2.2833104133605957, + "rewards/rejected": -3.622390031814575, + "step": 4100 + }, + { + "epoch": 0.13425552541323493, + "grad_norm": 2.48771071434021, + "learning_rate": 4.776561193121952e-05, + "logits/chosen": 3.3443126678466797, + "logits/rejected": 3.4574427604675293, + "logps/chosen": -354.29791259765625, + "logps/rejected": -325.16265869140625, + "loss": 0.3446, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7422090768814087, + "rewards/margins": 2.9824318885803223, + "rewards/rejected": -4.724640846252441, + "step": 4120 + }, + { + "epoch": 0.13490725126475547, + "grad_norm": 0.6384651064872742, + "learning_rate": 4.7754749568221075e-05, + "logits/chosen": 2.964951992034912, + "logits/rejected": 3.0969996452331543, + "logps/chosen": -295.624267578125, + "logps/rejected": -308.13165283203125, + "loss": 0.4578, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.575047492980957, + "rewards/margins": 2.797207832336426, + "rewards/rejected": -4.372255325317383, + "step": 4140 + }, + { + "epoch": 0.13555897711627604, + "grad_norm": 7.4914774894714355, + "learning_rate": 4.7743887205222625e-05, + "logits/chosen": 3.2801098823547363, + "logits/rejected": 3.3422157764434814, + "logps/chosen": -345.1646423339844, + "logps/rejected": -315.3554992675781, + "loss": 0.3435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0162193775177, + "rewards/margins": 2.779615879058838, + "rewards/rejected": -4.795835018157959, + "step": 4160 + }, + { + "epoch": 0.1362107029677966, + "grad_norm": 0.12124518305063248, + "learning_rate": 4.7733024842224176e-05, + "logits/chosen": 3.267251491546631, + "logits/rejected": 3.4055659770965576, + "logps/chosen": -344.1845703125, + "logps/rejected": -339.99127197265625, + "loss": 0.5147, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.907819151878357, + "rewards/margins": 2.550783634185791, + "rewards/rejected": -4.4586029052734375, + "step": 4180 + }, + { + "epoch": 0.13686242881931715, + "grad_norm": 3.973163366317749, + "learning_rate": 4.7722162479225734e-05, + "logits/chosen": 3.392414093017578, + "logits/rejected": 3.4588558673858643, + "logps/chosen": -310.72418212890625, + "logps/rejected": -260.3303527832031, + "loss": 0.7144, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.18868088722229, + "rewards/margins": 1.88055419921875, + "rewards/rejected": -4.069235324859619, + "step": 4200 + }, + { + "epoch": 0.13751415467083772, + "grad_norm": 1.0785205364227295, + "learning_rate": 4.7711300116227284e-05, + "logits/chosen": 3.370027542114258, + "logits/rejected": 3.56152606010437, + "logps/chosen": -359.5828552246094, + "logps/rejected": -328.8059997558594, + "loss": 0.3708, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.224390983581543, + "rewards/margins": 2.4530386924743652, + "rewards/rejected": -4.677429676055908, + "step": 4220 + }, + { + "epoch": 0.13816588052235826, + "grad_norm": 4.192521095275879, + "learning_rate": 4.7700437753228835e-05, + "logits/chosen": 3.3111908435821533, + "logits/rejected": 3.363724946975708, + "logps/chosen": -366.167236328125, + "logps/rejected": -389.9554748535156, + "loss": 0.3106, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.2957208156585693, + "rewards/margins": 2.668612003326416, + "rewards/rejected": -4.9643330574035645, + "step": 4240 + }, + { + "epoch": 0.13881760637387883, + "grad_norm": 2.6119003295898438, + "learning_rate": 4.768957539023039e-05, + "logits/chosen": 3.1809980869293213, + "logits/rejected": 3.326423168182373, + "logps/chosen": -362.0994873046875, + "logps/rejected": -311.4002685546875, + "loss": 0.4476, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.260061502456665, + "rewards/margins": 2.3892154693603516, + "rewards/rejected": -4.649277210235596, + "step": 4260 + }, + { + "epoch": 0.1394693322253994, + "grad_norm": 0.2667711675167084, + "learning_rate": 4.767871302723195e-05, + "logits/chosen": 3.060974597930908, + "logits/rejected": 3.241163969039917, + "logps/chosen": -335.7603454589844, + "logps/rejected": -296.862548828125, + "loss": 0.3792, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5045039653778076, + "rewards/margins": 2.502964735031128, + "rewards/rejected": -4.007468223571777, + "step": 4280 + }, + { + "epoch": 0.14012105807691994, + "grad_norm": 3.0214033126831055, + "learning_rate": 4.76678506642335e-05, + "logits/chosen": 3.4456305503845215, + "logits/rejected": 3.6104111671447754, + "logps/chosen": -347.200927734375, + "logps/rejected": -343.3063049316406, + "loss": 0.8219, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9312336444854736, + "rewards/margins": 2.1532235145568848, + "rewards/rejected": -4.084456920623779, + "step": 4300 + }, + { + "epoch": 0.1407727839284405, + "grad_norm": 1.0856289863586426, + "learning_rate": 4.765698830123505e-05, + "logits/chosen": 3.3786911964416504, + "logits/rejected": 3.564383029937744, + "logps/chosen": -361.21527099609375, + "logps/rejected": -292.3622741699219, + "loss": 0.4409, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8548787832260132, + "rewards/margins": 2.452040195465088, + "rewards/rejected": -3.3069190979003906, + "step": 4320 + }, + { + "epoch": 0.14142450977996107, + "grad_norm": 3.044638156890869, + "learning_rate": 4.764612593823661e-05, + "logits/chosen": 3.4658894538879395, + "logits/rejected": 3.6022956371307373, + "logps/chosen": -302.7248840332031, + "logps/rejected": -292.3055419921875, + "loss": 0.7505, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9712207317352295, + "rewards/margins": 1.42770516872406, + "rewards/rejected": -3.39892578125, + "step": 4340 + }, + { + "epoch": 0.1420762356314816, + "grad_norm": 1.2167586088180542, + "learning_rate": 4.763526357523816e-05, + "logits/chosen": 3.5733418464660645, + "logits/rejected": 3.6132216453552246, + "logps/chosen": -346.92889404296875, + "logps/rejected": -316.3614807128906, + "loss": 0.4046, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7447795867919922, + "rewards/margins": 2.1305763721466064, + "rewards/rejected": -3.8753559589385986, + "step": 4360 + }, + { + "epoch": 0.14272796148300218, + "grad_norm": 3.168600559234619, + "learning_rate": 4.762440121223971e-05, + "logits/chosen": 3.615260362625122, + "logits/rejected": 3.7598109245300293, + "logps/chosen": -348.7086181640625, + "logps/rejected": -356.2722473144531, + "loss": 0.5356, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6584972143173218, + "rewards/margins": 2.345463275909424, + "rewards/rejected": -4.003960609436035, + "step": 4380 + }, + { + "epoch": 0.14337968733452272, + "grad_norm": 3.1880242824554443, + "learning_rate": 4.761353884924127e-05, + "logits/chosen": 3.502600908279419, + "logits/rejected": 3.653825283050537, + "logps/chosen": -363.8152770996094, + "logps/rejected": -333.09503173828125, + "loss": 0.5357, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7713476419448853, + "rewards/margins": 2.077867031097412, + "rewards/rejected": -3.849215269088745, + "step": 4400 + }, + { + "epoch": 0.1440314131860433, + "grad_norm": 2.494497776031494, + "learning_rate": 4.760267648624282e-05, + "logits/chosen": 3.4260547161102295, + "logits/rejected": 3.5526058673858643, + "logps/chosen": -346.73883056640625, + "logps/rejected": -357.4334411621094, + "loss": 0.3648, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.600815773010254, + "rewards/margins": 2.9171605110168457, + "rewards/rejected": -4.5179762840271, + "step": 4420 + }, + { + "epoch": 0.14468313903756386, + "grad_norm": 0.5524844527244568, + "learning_rate": 4.759181412324437e-05, + "logits/chosen": 3.3572700023651123, + "logits/rejected": 3.656176805496216, + "logps/chosen": -342.6084289550781, + "logps/rejected": -349.8631286621094, + "loss": 0.5123, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.990971326828003, + "rewards/margins": 2.815192937850952, + "rewards/rejected": -4.806164741516113, + "step": 4440 + }, + { + "epoch": 0.1453348648890844, + "grad_norm": 0.35118892788887024, + "learning_rate": 4.758095176024593e-05, + "logits/chosen": 3.4468300342559814, + "logits/rejected": 3.6358039379119873, + "logps/chosen": -359.23455810546875, + "logps/rejected": -314.895263671875, + "loss": 0.5549, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4694300889968872, + "rewards/margins": 2.7624599933624268, + "rewards/rejected": -4.2318902015686035, + "step": 4460 + }, + { + "epoch": 0.14598659074060497, + "grad_norm": 1.2932639122009277, + "learning_rate": 4.757008939724748e-05, + "logits/chosen": 3.514385938644409, + "logits/rejected": 3.5754036903381348, + "logps/chosen": -328.79254150390625, + "logps/rejected": -284.5199279785156, + "loss": 0.51, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4083623886108398, + "rewards/margins": 2.0073330402374268, + "rewards/rejected": -3.4156951904296875, + "step": 4480 + }, + { + "epoch": 0.14663831659212553, + "grad_norm": 4.287021636962891, + "learning_rate": 4.755922703424903e-05, + "logits/chosen": 3.3937172889709473, + "logits/rejected": 3.3192219734191895, + "logps/chosen": -321.12164306640625, + "logps/rejected": -313.1876525878906, + "loss": 0.5551, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1540181636810303, + "rewards/margins": 2.068850517272949, + "rewards/rejected": -3.2228686809539795, + "step": 4500 + }, + { + "epoch": 0.14729004244364607, + "grad_norm": 3.9672701358795166, + "learning_rate": 4.754836467125059e-05, + "logits/chosen": 3.565959930419922, + "logits/rejected": 3.7843680381774902, + "logps/chosen": -357.4993896484375, + "logps/rejected": -301.7267150878906, + "loss": 0.4033, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2553943395614624, + "rewards/margins": 2.065965175628662, + "rewards/rejected": -3.321359634399414, + "step": 4520 + }, + { + "epoch": 0.14794176829516664, + "grad_norm": 1.9612826108932495, + "learning_rate": 4.753750230825214e-05, + "logits/chosen": 3.6454977989196777, + "logits/rejected": 3.9177799224853516, + "logps/chosen": -334.1510314941406, + "logps/rejected": -296.9614562988281, + "loss": 0.392, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.763628363609314, + "rewards/margins": 2.119750738143921, + "rewards/rejected": -3.8833796977996826, + "step": 4540 + }, + { + "epoch": 0.14859349414668718, + "grad_norm": 5.295777797698975, + "learning_rate": 4.7526639945253695e-05, + "logits/chosen": 3.436094284057617, + "logits/rejected": 3.5731844902038574, + "logps/chosen": -310.9961853027344, + "logps/rejected": -296.6043395996094, + "loss": 0.4413, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7362120151519775, + "rewards/margins": 2.526987314224243, + "rewards/rejected": -4.2631988525390625, + "step": 4560 + }, + { + "epoch": 0.14924521999820775, + "grad_norm": 2.410281181335449, + "learning_rate": 4.7515777582255246e-05, + "logits/chosen": 3.103964328765869, + "logits/rejected": 3.1214470863342285, + "logps/chosen": -352.2610778808594, + "logps/rejected": -335.65496826171875, + "loss": 0.4096, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4951176643371582, + "rewards/margins": 2.7274696826934814, + "rewards/rejected": -4.222587585449219, + "step": 4580 + }, + { + "epoch": 0.14989694584972832, + "grad_norm": 5.7111897468566895, + "learning_rate": 4.7504915219256804e-05, + "logits/chosen": 3.268876314163208, + "logits/rejected": 3.289720058441162, + "logps/chosen": -326.1383361816406, + "logps/rejected": -290.9326477050781, + "loss": 0.5193, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0489602088928223, + "rewards/margins": 2.147092819213867, + "rewards/rejected": -4.196052551269531, + "step": 4600 + }, + { + "epoch": 0.15054867170124886, + "grad_norm": 12.647480964660645, + "learning_rate": 4.7494052856258354e-05, + "logits/chosen": 3.4842395782470703, + "logits/rejected": 3.6077091693878174, + "logps/chosen": -313.7911071777344, + "logps/rejected": -312.915771484375, + "loss": 0.429, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5693511962890625, + "rewards/margins": 2.7161622047424316, + "rewards/rejected": -4.285513877868652, + "step": 4620 + }, + { + "epoch": 0.15120039755276943, + "grad_norm": 1.3191006183624268, + "learning_rate": 4.7483190493259905e-05, + "logits/chosen": 3.2355446815490723, + "logits/rejected": 3.475215196609497, + "logps/chosen": -361.35723876953125, + "logps/rejected": -334.56365966796875, + "loss": 0.3345, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3857884407043457, + "rewards/margins": 2.7593047618865967, + "rewards/rejected": -4.145092964172363, + "step": 4640 + }, + { + "epoch": 0.15185212340429, + "grad_norm": 3.244960069656372, + "learning_rate": 4.747232813026146e-05, + "logits/chosen": 3.6138694286346436, + "logits/rejected": 3.7367217540740967, + "logps/chosen": -322.3809814453125, + "logps/rejected": -299.03973388671875, + "loss": 0.5805, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.078828811645508, + "rewards/margins": 2.6037044525146484, + "rewards/rejected": -4.682532787322998, + "step": 4660 + }, + { + "epoch": 0.15250384925581054, + "grad_norm": 0.7272012233734131, + "learning_rate": 4.7461465767263013e-05, + "logits/chosen": 3.6508584022521973, + "logits/rejected": 3.6634299755096436, + "logps/chosen": -384.0274658203125, + "logps/rejected": -286.86383056640625, + "loss": 0.5374, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1805899143218994, + "rewards/margins": 2.343367338180542, + "rewards/rejected": -3.5239574909210205, + "step": 4680 + }, + { + "epoch": 0.1531555751073311, + "grad_norm": 1.9652293920516968, + "learning_rate": 4.7450603404264564e-05, + "logits/chosen": 3.311790943145752, + "logits/rejected": 3.4001667499542236, + "logps/chosen": -365.1358947753906, + "logps/rejected": -340.8925476074219, + "loss": 0.5088, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.010852336883545, + "rewards/margins": 2.523430347442627, + "rewards/rejected": -4.534282207489014, + "step": 4700 + }, + { + "epoch": 0.15380730095885167, + "grad_norm": 1.8673332929611206, + "learning_rate": 4.7439741041266115e-05, + "logits/chosen": 3.226254940032959, + "logits/rejected": 3.5225861072540283, + "logps/chosen": -371.91119384765625, + "logps/rejected": -331.07733154296875, + "loss": 0.3113, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.8575215339660645, + "rewards/margins": 2.963975667953491, + "rewards/rejected": -5.821497440338135, + "step": 4720 + }, + { + "epoch": 0.1544590268103722, + "grad_norm": 2.197981834411621, + "learning_rate": 4.742887867826767e-05, + "logits/chosen": 3.297686815261841, + "logits/rejected": 3.4040896892547607, + "logps/chosen": -343.0133361816406, + "logps/rejected": -365.48858642578125, + "loss": 0.4736, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4017493724823, + "rewards/margins": 2.7940502166748047, + "rewards/rejected": -5.195799350738525, + "step": 4740 + }, + { + "epoch": 0.15511075266189278, + "grad_norm": 1.253828525543213, + "learning_rate": 4.741801631526922e-05, + "logits/chosen": 3.4029972553253174, + "logits/rejected": 3.5767483711242676, + "logps/chosen": -368.8125915527344, + "logps/rejected": -287.0077209472656, + "loss": 0.462, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.853061318397522, + "rewards/margins": 2.1573092937469482, + "rewards/rejected": -4.01037073135376, + "step": 4760 + }, + { + "epoch": 0.15576247851341332, + "grad_norm": 0.6835604906082153, + "learning_rate": 4.7407153952270774e-05, + "logits/chosen": 3.282827377319336, + "logits/rejected": 3.5503299236297607, + "logps/chosen": -341.30706787109375, + "logps/rejected": -281.1317443847656, + "loss": 0.4591, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2409061193466187, + "rewards/margins": 2.7806262969970703, + "rewards/rejected": -4.0215325355529785, + "step": 4780 + }, + { + "epoch": 0.1564142043649339, + "grad_norm": 3.6613593101501465, + "learning_rate": 4.739629158927233e-05, + "logits/chosen": 3.527968645095825, + "logits/rejected": 3.5939126014709473, + "logps/chosen": -331.69775390625, + "logps/rejected": -323.77142333984375, + "loss": 0.5365, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1499850749969482, + "rewards/margins": 2.8520288467407227, + "rewards/rejected": -4.002013683319092, + "step": 4800 + }, + { + "epoch": 0.15706593021645446, + "grad_norm": 4.060462474822998, + "learning_rate": 4.738542922627389e-05, + "logits/chosen": 3.8000004291534424, + "logits/rejected": 3.7871506214141846, + "logps/chosen": -400.53924560546875, + "logps/rejected": -337.87518310546875, + "loss": 0.3814, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1674377918243408, + "rewards/margins": 2.7285850048065186, + "rewards/rejected": -3.8960227966308594, + "step": 4820 + }, + { + "epoch": 0.157717656067975, + "grad_norm": 2.9386818408966064, + "learning_rate": 4.737456686327544e-05, + "logits/chosen": 3.5052897930145264, + "logits/rejected": 3.684677839279175, + "logps/chosen": -340.1436767578125, + "logps/rejected": -298.93475341796875, + "loss": 0.6985, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3739690780639648, + "rewards/margins": 2.6354479789733887, + "rewards/rejected": -4.0094170570373535, + "step": 4840 + }, + { + "epoch": 0.15836938191949557, + "grad_norm": 6.718563556671143, + "learning_rate": 4.7363704500277e-05, + "logits/chosen": 3.4750618934631348, + "logits/rejected": 3.506892681121826, + "logps/chosen": -353.84967041015625, + "logps/rejected": -300.3255310058594, + "loss": 0.7518, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1600862741470337, + "rewards/margins": 1.797690749168396, + "rewards/rejected": -2.9577770233154297, + "step": 4860 + }, + { + "epoch": 0.15902110777101613, + "grad_norm": 0.7971829771995544, + "learning_rate": 4.735284213727855e-05, + "logits/chosen": 3.5100746154785156, + "logits/rejected": 3.629595994949341, + "logps/chosen": -336.2204284667969, + "logps/rejected": -316.00201416015625, + "loss": 0.3656, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1943188905715942, + "rewards/margins": 2.5588669776916504, + "rewards/rejected": -3.753185749053955, + "step": 4880 + }, + { + "epoch": 0.15967283362253668, + "grad_norm": 1.1719496250152588, + "learning_rate": 4.73419797742801e-05, + "logits/chosen": 3.4142539501190186, + "logits/rejected": 3.6324546337127686, + "logps/chosen": -323.6809387207031, + "logps/rejected": -283.2771301269531, + "loss": 0.5671, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8230235576629639, + "rewards/margins": 2.354316234588623, + "rewards/rejected": -4.177340030670166, + "step": 4900 + }, + { + "epoch": 0.16032455947405724, + "grad_norm": 0.9148614406585693, + "learning_rate": 4.733111741128165e-05, + "logits/chosen": 3.8769335746765137, + "logits/rejected": 3.899446964263916, + "logps/chosen": -405.61236572265625, + "logps/rejected": -358.24334716796875, + "loss": 0.5748, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1102511882781982, + "rewards/margins": 1.8782352209091187, + "rewards/rejected": -3.9884867668151855, + "step": 4920 + }, + { + "epoch": 0.16097628532557778, + "grad_norm": 2.3127052783966064, + "learning_rate": 4.732025504828321e-05, + "logits/chosen": 3.4955108165740967, + "logits/rejected": 3.558689832687378, + "logps/chosen": -400.2259521484375, + "logps/rejected": -362.9164123535156, + "loss": 0.3634, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3189480304718018, + "rewards/margins": 3.3217625617980957, + "rewards/rejected": -4.640710353851318, + "step": 4940 + }, + { + "epoch": 0.16162801117709835, + "grad_norm": 2.0803282260894775, + "learning_rate": 4.730993580343468e-05, + "logits/chosen": 3.534506320953369, + "logits/rejected": 3.4916186332702637, + "logps/chosen": -396.37890625, + "logps/rejected": -323.8974609375, + "loss": 0.4965, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4711406230926514, + "rewards/margins": 2.6693577766418457, + "rewards/rejected": -5.140498638153076, + "step": 4960 + }, + { + "epoch": 0.16227973702861892, + "grad_norm": 0.5718753337860107, + "learning_rate": 4.7299073440436237e-05, + "logits/chosen": 3.3772799968719482, + "logits/rejected": 3.604301929473877, + "logps/chosen": -355.55908203125, + "logps/rejected": -336.38714599609375, + "loss": 0.391, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.826690435409546, + "rewards/margins": 2.907283306121826, + "rewards/rejected": -4.733973503112793, + "step": 4980 + }, + { + "epoch": 0.16293146288013946, + "grad_norm": 4.678829669952393, + "learning_rate": 4.728821107743779e-05, + "logits/chosen": 3.5858116149902344, + "logits/rejected": 3.6318275928497314, + "logps/chosen": -330.5090026855469, + "logps/rejected": -312.6830139160156, + "loss": 0.6578, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.248481273651123, + "rewards/margins": 2.318497896194458, + "rewards/rejected": -4.56697940826416, + "step": 5000 + }, + { + "epoch": 0.16358318873166003, + "grad_norm": 1.4147217273712158, + "learning_rate": 4.727734871443934e-05, + "logits/chosen": 3.737791061401367, + "logits/rejected": 3.6978466510772705, + "logps/chosen": -350.11474609375, + "logps/rejected": -314.62042236328125, + "loss": 0.5406, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7438011169433594, + "rewards/margins": 1.8743202686309814, + "rewards/rejected": -3.61812162399292, + "step": 5020 + }, + { + "epoch": 0.1642349145831806, + "grad_norm": 2.3030271530151367, + "learning_rate": 4.7266486351440896e-05, + "logits/chosen": 3.4465396404266357, + "logits/rejected": 3.516087293624878, + "logps/chosen": -309.60784912109375, + "logps/rejected": -287.16363525390625, + "loss": 0.5494, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.424285650253296, + "rewards/margins": 2.1837069988250732, + "rewards/rejected": -3.607992649078369, + "step": 5040 + }, + { + "epoch": 0.16488664043470114, + "grad_norm": 0.7656645178794861, + "learning_rate": 4.7255623988442447e-05, + "logits/chosen": 3.742762804031372, + "logits/rejected": 3.74025297164917, + "logps/chosen": -350.9559326171875, + "logps/rejected": -307.59051513671875, + "loss": 0.5269, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.224869966506958, + "rewards/margins": 2.18868088722229, + "rewards/rejected": -3.413550853729248, + "step": 5060 + }, + { + "epoch": 0.1655383662862217, + "grad_norm": 1.3085827827453613, + "learning_rate": 4.7244761625444e-05, + "logits/chosen": 3.6092166900634766, + "logits/rejected": 3.660219669342041, + "logps/chosen": -324.1283264160156, + "logps/rejected": -284.4068908691406, + "loss": 0.4667, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2671531438827515, + "rewards/margins": 2.248518466949463, + "rewards/rejected": -3.515671491622925, + "step": 5080 + }, + { + "epoch": 0.16619009213774225, + "grad_norm": 0.8875260949134827, + "learning_rate": 4.7233899262445555e-05, + "logits/chosen": 3.4097206592559814, + "logits/rejected": 3.6128458976745605, + "logps/chosen": -346.1413879394531, + "logps/rejected": -330.1009826660156, + "loss": 0.5011, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4083194732666016, + "rewards/margins": 2.4472718238830566, + "rewards/rejected": -3.855591297149658, + "step": 5100 + }, + { + "epoch": 0.16684181798926281, + "grad_norm": 2.7533199787139893, + "learning_rate": 4.722303689944711e-05, + "logits/chosen": 3.562926769256592, + "logits/rejected": 3.678743362426758, + "logps/chosen": -390.9175720214844, + "logps/rejected": -310.7811279296875, + "loss": 0.4578, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0717194080352783, + "rewards/margins": 2.5268943309783936, + "rewards/rejected": -3.598613739013672, + "step": 5120 + }, + { + "epoch": 0.16749354384078338, + "grad_norm": 1.0500285625457764, + "learning_rate": 4.721217453644866e-05, + "logits/chosen": 3.827747344970703, + "logits/rejected": 3.8059520721435547, + "logps/chosen": -353.490234375, + "logps/rejected": -323.1229553222656, + "loss": 0.487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.39247792959213257, + "rewards/margins": 2.443358898162842, + "rewards/rejected": -2.8358371257781982, + "step": 5140 + }, + { + "epoch": 0.16814526969230392, + "grad_norm": 1.7318121194839478, + "learning_rate": 4.7201312173450214e-05, + "logits/chosen": 3.487880229949951, + "logits/rejected": 3.6833527088165283, + "logps/chosen": -315.68341064453125, + "logps/rejected": -288.0245361328125, + "loss": 0.5141, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.705915093421936, + "rewards/margins": 1.9357540607452393, + "rewards/rejected": -3.6416690349578857, + "step": 5160 + }, + { + "epoch": 0.1687969955438245, + "grad_norm": 2.155742645263672, + "learning_rate": 4.719044981045177e-05, + "logits/chosen": 3.6366372108459473, + "logits/rejected": 3.8329670429229736, + "logps/chosen": -352.75830078125, + "logps/rejected": -310.93658447265625, + "loss": 0.366, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.854993462562561, + "rewards/margins": 2.1963324546813965, + "rewards/rejected": -4.051326274871826, + "step": 5180 + }, + { + "epoch": 0.16944872139534506, + "grad_norm": 1.116894245147705, + "learning_rate": 4.717958744745332e-05, + "logits/chosen": 3.2915146350860596, + "logits/rejected": 3.6893773078918457, + "logps/chosen": -270.92230224609375, + "logps/rejected": -276.31488037109375, + "loss": 0.6489, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3582803010940552, + "rewards/margins": 1.6101640462875366, + "rewards/rejected": -2.968444347381592, + "step": 5200 + }, + { + "epoch": 0.1701004472468656, + "grad_norm": 6.220220565795898, + "learning_rate": 4.716872508445487e-05, + "logits/chosen": 3.573777437210083, + "logits/rejected": 3.634875535964966, + "logps/chosen": -284.02947998046875, + "logps/rejected": -332.0590515136719, + "loss": 0.5036, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2843925952911377, + "rewards/margins": 2.425690174102783, + "rewards/rejected": -3.710082530975342, + "step": 5220 + }, + { + "epoch": 0.17075217309838617, + "grad_norm": 1.8436062335968018, + "learning_rate": 4.715786272145643e-05, + "logits/chosen": 3.6475188732147217, + "logits/rejected": 3.552889347076416, + "logps/chosen": -338.7034606933594, + "logps/rejected": -333.5899658203125, + "loss": 0.4121, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6694459915161133, + "rewards/margins": 2.0125374794006348, + "rewards/rejected": -3.6819839477539062, + "step": 5240 + }, + { + "epoch": 0.17140389894990674, + "grad_norm": 2.015883445739746, + "learning_rate": 4.714700035845798e-05, + "logits/chosen": 3.634687900543213, + "logits/rejected": 3.7060647010803223, + "logps/chosen": -368.21435546875, + "logps/rejected": -331.8502502441406, + "loss": 0.4334, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.409508466720581, + "rewards/margins": 2.772191047668457, + "rewards/rejected": -4.181699275970459, + "step": 5260 + }, + { + "epoch": 0.17205562480142728, + "grad_norm": 1.8030304908752441, + "learning_rate": 4.713613799545953e-05, + "logits/chosen": 3.451073169708252, + "logits/rejected": 3.9232094287872314, + "logps/chosen": -369.710205078125, + "logps/rejected": -353.3988342285156, + "loss": 0.7002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.391030192375183, + "rewards/margins": 2.188201427459717, + "rewards/rejected": -3.5792312622070312, + "step": 5280 + }, + { + "epoch": 0.17270735065294784, + "grad_norm": 4.6170806884765625, + "learning_rate": 4.712527563246108e-05, + "logits/chosen": 3.6994247436523438, + "logits/rejected": 3.68678617477417, + "logps/chosen": -348.517578125, + "logps/rejected": -320.3282775878906, + "loss": 0.6154, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2519631385803223, + "rewards/margins": 1.768721580505371, + "rewards/rejected": -3.0206844806671143, + "step": 5300 + }, + { + "epoch": 0.17335907650446838, + "grad_norm": 19.175779342651367, + "learning_rate": 4.711441326946264e-05, + "logits/chosen": 3.806964159011841, + "logits/rejected": 3.7850654125213623, + "logps/chosen": -351.84954833984375, + "logps/rejected": -301.35955810546875, + "loss": 0.6314, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2456471920013428, + "rewards/margins": 2.2858922481536865, + "rewards/rejected": -3.5315394401550293, + "step": 5320 + }, + { + "epoch": 0.17401080235598895, + "grad_norm": 0.5026331543922424, + "learning_rate": 4.710355090646419e-05, + "logits/chosen": 3.657337188720703, + "logits/rejected": 3.820235013961792, + "logps/chosen": -349.28863525390625, + "logps/rejected": -325.1624450683594, + "loss": 0.3354, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.3934723734855652, + "rewards/margins": 3.114938497543335, + "rewards/rejected": -3.508410692214966, + "step": 5340 + }, + { + "epoch": 0.17466252820750952, + "grad_norm": 3.5576894283294678, + "learning_rate": 4.709268854346575e-05, + "logits/chosen": 3.493574619293213, + "logits/rejected": 3.566129684448242, + "logps/chosen": -348.3834533691406, + "logps/rejected": -291.85687255859375, + "loss": 0.4022, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7688047885894775, + "rewards/margins": 2.5862698554992676, + "rewards/rejected": -3.355074644088745, + "step": 5360 + }, + { + "epoch": 0.17531425405903006, + "grad_norm": 2.575194835662842, + "learning_rate": 4.7081826180467306e-05, + "logits/chosen": 3.165733814239502, + "logits/rejected": 3.400475025177002, + "logps/chosen": -311.8915710449219, + "logps/rejected": -291.60888671875, + "loss": 0.5074, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.789229393005371, + "rewards/margins": 2.0810611248016357, + "rewards/rejected": -3.870290756225586, + "step": 5380 + }, + { + "epoch": 0.17596597991055063, + "grad_norm": 2.1731951236724854, + "learning_rate": 4.707096381746886e-05, + "logits/chosen": 3.544299364089966, + "logits/rejected": 3.5879616737365723, + "logps/chosen": -326.457763671875, + "logps/rejected": -280.8424377441406, + "loss": 0.5416, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3534607887268066, + "rewards/margins": 2.467705249786377, + "rewards/rejected": -3.8211655616760254, + "step": 5400 + }, + { + "epoch": 0.1766177057620712, + "grad_norm": 1.9869508743286133, + "learning_rate": 4.706010145447041e-05, + "logits/chosen": 3.6536712646484375, + "logits/rejected": 3.693833112716675, + "logps/chosen": -314.8085021972656, + "logps/rejected": -278.16400146484375, + "loss": 0.6962, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.771869957447052, + "rewards/margins": 1.9867461919784546, + "rewards/rejected": -2.7586164474487305, + "step": 5420 + }, + { + "epoch": 0.17726943161359174, + "grad_norm": 8.1611967086792, + "learning_rate": 4.704923909147196e-05, + "logits/chosen": 3.313530683517456, + "logits/rejected": 3.4334232807159424, + "logps/chosen": -288.97637939453125, + "logps/rejected": -286.569580078125, + "loss": 0.4623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3179030418395996, + "rewards/margins": 2.083909511566162, + "rewards/rejected": -2.4018125534057617, + "step": 5440 + }, + { + "epoch": 0.1779211574651123, + "grad_norm": 4.128625869750977, + "learning_rate": 4.7038376728473516e-05, + "logits/chosen": 3.558436632156372, + "logits/rejected": 3.7638697624206543, + "logps/chosen": -359.1125183105469, + "logps/rejected": -298.04901123046875, + "loss": 0.3745, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5986610651016235, + "rewards/margins": 2.855217456817627, + "rewards/rejected": -3.4538779258728027, + "step": 5460 + }, + { + "epoch": 0.17857288331663285, + "grad_norm": 1.5871541500091553, + "learning_rate": 4.702751436547507e-05, + "logits/chosen": 3.702300548553467, + "logits/rejected": 3.890240430831909, + "logps/chosen": -328.75726318359375, + "logps/rejected": -295.2781677246094, + "loss": 0.5315, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3033716678619385, + "rewards/margins": 2.2845568656921387, + "rewards/rejected": -3.5879287719726562, + "step": 5480 + }, + { + "epoch": 0.17922460916815341, + "grad_norm": 1.5397450923919678, + "learning_rate": 4.701665200247662e-05, + "logits/chosen": 3.833491563796997, + "logits/rejected": 3.8786838054656982, + "logps/chosen": -358.40380859375, + "logps/rejected": -323.46044921875, + "loss": 0.4675, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4439425468444824, + "rewards/margins": 1.92971932888031, + "rewards/rejected": -3.373662233352661, + "step": 5500 + }, + { + "epoch": 0.17987633501967398, + "grad_norm": 5.230326175689697, + "learning_rate": 4.7005789639478176e-05, + "logits/chosen": 3.5030856132507324, + "logits/rejected": 3.509751081466675, + "logps/chosen": -298.0101013183594, + "logps/rejected": -291.63165283203125, + "loss": 0.433, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.316401720046997, + "rewards/margins": 2.913241147994995, + "rewards/rejected": -4.229642868041992, + "step": 5520 + }, + { + "epoch": 0.18052806087119452, + "grad_norm": 2.2781941890716553, + "learning_rate": 4.6994927276479726e-05, + "logits/chosen": 3.4931302070617676, + "logits/rejected": 3.513667583465576, + "logps/chosen": -299.0015563964844, + "logps/rejected": -279.6262512207031, + "loss": 0.4013, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1074607372283936, + "rewards/margins": 2.7512729167938232, + "rewards/rejected": -3.858733654022217, + "step": 5540 + }, + { + "epoch": 0.1811797867227151, + "grad_norm": 4.5516180992126465, + "learning_rate": 4.698406491348128e-05, + "logits/chosen": 3.466127872467041, + "logits/rejected": 3.516695022583008, + "logps/chosen": -368.66900634765625, + "logps/rejected": -287.02154541015625, + "loss": 0.5365, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4711592197418213, + "rewards/margins": 1.6528011560440063, + "rewards/rejected": -3.123960256576538, + "step": 5560 + }, + { + "epoch": 0.18183151257423566, + "grad_norm": 3.4705872535705566, + "learning_rate": 4.6973202550482835e-05, + "logits/chosen": 3.663477659225464, + "logits/rejected": 3.9322803020477295, + "logps/chosen": -373.6400146484375, + "logps/rejected": -316.28759765625, + "loss": 0.5249, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9967460632324219, + "rewards/margins": 2.093200445175171, + "rewards/rejected": -4.089946746826172, + "step": 5580 + }, + { + "epoch": 0.1824832384257562, + "grad_norm": 0.6377744078636169, + "learning_rate": 4.6962340187484385e-05, + "logits/chosen": 3.165027141571045, + "logits/rejected": 3.3546390533447266, + "logps/chosen": -326.4638366699219, + "logps/rejected": -284.87042236328125, + "loss": 0.5193, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.93448007106781, + "rewards/margins": 2.0271244049072266, + "rewards/rejected": -3.961604356765747, + "step": 5600 + }, + { + "epoch": 0.18313496427727677, + "grad_norm": 1.1638128757476807, + "learning_rate": 4.695147782448594e-05, + "logits/chosen": 3.5821926593780518, + "logits/rejected": 3.931241512298584, + "logps/chosen": -353.90142822265625, + "logps/rejected": -303.30535888671875, + "loss": 0.3198, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0160374641418457, + "rewards/margins": 3.0694942474365234, + "rewards/rejected": -5.085530757904053, + "step": 5620 + }, + { + "epoch": 0.1837866901287973, + "grad_norm": 6.628153324127197, + "learning_rate": 4.6940615461487494e-05, + "logits/chosen": 3.456394910812378, + "logits/rejected": 3.5844879150390625, + "logps/chosen": -348.9818420410156, + "logps/rejected": -336.05084228515625, + "loss": 0.5257, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4770076274871826, + "rewards/margins": 2.204353094100952, + "rewards/rejected": -4.681361198425293, + "step": 5640 + }, + { + "epoch": 0.18443841598031788, + "grad_norm": 2.6601240634918213, + "learning_rate": 4.692975309848905e-05, + "logits/chosen": 3.230968475341797, + "logits/rejected": 3.5895614624023438, + "logps/chosen": -342.92523193359375, + "logps/rejected": -294.948486328125, + "loss": 0.4761, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6213047504425049, + "rewards/margins": 3.02498197555542, + "rewards/rejected": -4.646286964416504, + "step": 5660 + }, + { + "epoch": 0.18509014183183845, + "grad_norm": 8.756712913513184, + "learning_rate": 4.69188907354906e-05, + "logits/chosen": 3.9027304649353027, + "logits/rejected": 3.907907009124756, + "logps/chosen": -369.1119384765625, + "logps/rejected": -345.2076110839844, + "loss": 0.5806, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6911191940307617, + "rewards/margins": 2.4061801433563232, + "rewards/rejected": -4.097299098968506, + "step": 5680 + }, + { + "epoch": 0.18574186768335899, + "grad_norm": 2.071997880935669, + "learning_rate": 4.690802837249215e-05, + "logits/chosen": 3.4886107444763184, + "logits/rejected": 3.957197904586792, + "logps/chosen": -357.86309814453125, + "logps/rejected": -308.02178955078125, + "loss": 0.3156, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8505979776382446, + "rewards/margins": 3.2321696281433105, + "rewards/rejected": -5.082768440246582, + "step": 5700 + }, + { + "epoch": 0.18639359353487955, + "grad_norm": 1.015213131904602, + "learning_rate": 4.689716600949371e-05, + "logits/chosen": 3.5112786293029785, + "logits/rejected": 3.5079567432403564, + "logps/chosen": -327.0605163574219, + "logps/rejected": -320.18572998046875, + "loss": 0.6903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.315589189529419, + "rewards/margins": 2.0794425010681152, + "rewards/rejected": -4.395031929016113, + "step": 5720 + }, + { + "epoch": 0.18704531938640012, + "grad_norm": 1.7321304082870483, + "learning_rate": 4.688630364649526e-05, + "logits/chosen": 4.171082496643066, + "logits/rejected": 4.152224540710449, + "logps/chosen": -395.56536865234375, + "logps/rejected": -359.6132507324219, + "loss": 0.5598, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7843596935272217, + "rewards/margins": 2.8140997886657715, + "rewards/rejected": -4.5984601974487305, + "step": 5740 + }, + { + "epoch": 0.18769704523792066, + "grad_norm": 1.7455456256866455, + "learning_rate": 4.687544128349681e-05, + "logits/chosen": 3.526329755783081, + "logits/rejected": 3.930001735687256, + "logps/chosen": -367.3045654296875, + "logps/rejected": -339.8309631347656, + "loss": 0.3358, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8178980350494385, + "rewards/margins": 2.9561169147491455, + "rewards/rejected": -4.774014949798584, + "step": 5760 + }, + { + "epoch": 0.18834877108944123, + "grad_norm": 0.996735692024231, + "learning_rate": 4.686457892049837e-05, + "logits/chosen": 3.842910051345825, + "logits/rejected": 4.018715858459473, + "logps/chosen": -368.3810119628906, + "logps/rejected": -301.64642333984375, + "loss": 0.2943, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.5482032299041748, + "rewards/margins": 2.218906879425049, + "rewards/rejected": -3.7671101093292236, + "step": 5780 + }, + { + "epoch": 0.18900049694096177, + "grad_norm": 0.6791709661483765, + "learning_rate": 4.685371655749992e-05, + "logits/chosen": 3.4569015502929688, + "logits/rejected": 3.711632490158081, + "logps/chosen": -337.74676513671875, + "logps/rejected": -299.8768615722656, + "loss": 0.4819, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3538451194763184, + "rewards/margins": 2.759681463241577, + "rewards/rejected": -5.113526344299316, + "step": 5800 + }, + { + "epoch": 0.18965222279248234, + "grad_norm": 5.887211322784424, + "learning_rate": 4.684285419450147e-05, + "logits/chosen": 3.5027689933776855, + "logits/rejected": 3.7025063037872314, + "logps/chosen": -368.55535888671875, + "logps/rejected": -307.2552185058594, + "loss": 0.4976, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.410339832305908, + "rewards/margins": 2.8048412799835205, + "rewards/rejected": -5.215180397033691, + "step": 5820 + }, + { + "epoch": 0.1903039486440029, + "grad_norm": 13.894855499267578, + "learning_rate": 4.683199183150302e-05, + "logits/chosen": 3.4989190101623535, + "logits/rejected": 3.7466652393341064, + "logps/chosen": -358.686767578125, + "logps/rejected": -335.6480407714844, + "loss": 0.6317, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9733558893203735, + "rewards/margins": 2.481604814529419, + "rewards/rejected": -4.454960346221924, + "step": 5840 + }, + { + "epoch": 0.19095567449552345, + "grad_norm": 3.842109203338623, + "learning_rate": 4.682112946850458e-05, + "logits/chosen": 3.5060620307922363, + "logits/rejected": 3.588191270828247, + "logps/chosen": -344.8022155761719, + "logps/rejected": -302.8489074707031, + "loss": 0.4217, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.126774311065674, + "rewards/margins": 2.43442964553833, + "rewards/rejected": -4.561203956604004, + "step": 5860 + }, + { + "epoch": 0.19160740034704402, + "grad_norm": 3.5816848278045654, + "learning_rate": 4.681026710550614e-05, + "logits/chosen": 3.3838067054748535, + "logits/rejected": 3.5459773540496826, + "logps/chosen": -393.52325439453125, + "logps/rejected": -343.1235046386719, + "loss": 0.5488, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.586900472640991, + "rewards/margins": 2.976794719696045, + "rewards/rejected": -5.563694953918457, + "step": 5880 + }, + { + "epoch": 0.19225912619856458, + "grad_norm": 0.7327704429626465, + "learning_rate": 4.679940474250769e-05, + "logits/chosen": 3.1076719760894775, + "logits/rejected": 3.271662950515747, + "logps/chosen": -339.50396728515625, + "logps/rejected": -321.8166198730469, + "loss": 0.4073, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.182889461517334, + "rewards/margins": 2.7461600303649902, + "rewards/rejected": -4.929049015045166, + "step": 5900 + }, + { + "epoch": 0.19291085205008512, + "grad_norm": 1.142184853553772, + "learning_rate": 4.6788542379509245e-05, + "logits/chosen": 3.327144145965576, + "logits/rejected": 3.5890731811523438, + "logps/chosen": -324.1766357421875, + "logps/rejected": -349.38677978515625, + "loss": 0.6174, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.154491901397705, + "rewards/margins": 2.927022933959961, + "rewards/rejected": -5.081514835357666, + "step": 5920 + }, + { + "epoch": 0.1935625779016057, + "grad_norm": 3.0630624294281006, + "learning_rate": 4.6777680016510796e-05, + "logits/chosen": 3.4882121086120605, + "logits/rejected": 3.7116858959198, + "logps/chosen": -349.42340087890625, + "logps/rejected": -307.98052978515625, + "loss": 0.3565, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9085508584976196, + "rewards/margins": 3.2927870750427246, + "rewards/rejected": -5.201337814331055, + "step": 5940 + }, + { + "epoch": 0.19421430375312626, + "grad_norm": 4.250226974487305, + "learning_rate": 4.676681765351235e-05, + "logits/chosen": 3.4180328845977783, + "logits/rejected": 3.583298921585083, + "logps/chosen": -347.0159606933594, + "logps/rejected": -332.6099548339844, + "loss": 0.4181, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.982488989830017, + "rewards/margins": 2.8290278911590576, + "rewards/rejected": -4.811516761779785, + "step": 5960 + }, + { + "epoch": 0.1948660296046468, + "grad_norm": 2.0794126987457275, + "learning_rate": 4.6755955290513905e-05, + "logits/chosen": 3.453929901123047, + "logits/rejected": 3.6709091663360596, + "logps/chosen": -330.0201416015625, + "logps/rejected": -283.38946533203125, + "loss": 0.7013, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6071704626083374, + "rewards/margins": 2.1129257678985596, + "rewards/rejected": -3.7200961112976074, + "step": 5980 + }, + { + "epoch": 0.19551775545616737, + "grad_norm": 6.254043102264404, + "learning_rate": 4.6745092927515455e-05, + "logits/chosen": 3.43961763381958, + "logits/rejected": 3.3649070262908936, + "logps/chosen": -366.82257080078125, + "logps/rejected": -331.8112487792969, + "loss": 0.7296, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.344441294670105, + "rewards/margins": 2.2095234394073486, + "rewards/rejected": -3.5539650917053223, + "step": 6000 + }, + { + "epoch": 0.1961694813076879, + "grad_norm": 3.250141143798828, + "learning_rate": 4.6734230564517006e-05, + "logits/chosen": 3.5607478618621826, + "logits/rejected": 3.608389377593994, + "logps/chosen": -335.95635986328125, + "logps/rejected": -317.46435546875, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7449615001678467, + "rewards/margins": 2.506192684173584, + "rewards/rejected": -4.251153945922852, + "step": 6020 + }, + { + "epoch": 0.19682120715920848, + "grad_norm": 0.07679598033428192, + "learning_rate": 4.6723911319668484e-05, + "logits/chosen": 3.1892852783203125, + "logits/rejected": 3.4019618034362793, + "logps/chosen": -376.31292724609375, + "logps/rejected": -288.9933166503906, + "loss": 0.6679, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4369163513183594, + "rewards/margins": 2.275944471359253, + "rewards/rejected": -4.712861061096191, + "step": 6040 + }, + { + "epoch": 0.19747293301072905, + "grad_norm": 5.413981914520264, + "learning_rate": 4.6713048956670035e-05, + "logits/chosen": 3.294839859008789, + "logits/rejected": 3.53503680229187, + "logps/chosen": -303.00689697265625, + "logps/rejected": -318.872314453125, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.654367446899414, + "rewards/margins": 2.312758445739746, + "rewards/rejected": -3.967125654220581, + "step": 6060 + }, + { + "epoch": 0.1981246588622496, + "grad_norm": 2.999889612197876, + "learning_rate": 4.6702186593671586e-05, + "logits/chosen": 3.4188461303710938, + "logits/rejected": 3.444279432296753, + "logps/chosen": -289.41436767578125, + "logps/rejected": -293.06365966796875, + "loss": 0.4321, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.511444330215454, + "rewards/margins": 2.4168150424957275, + "rewards/rejected": -3.9282593727111816, + "step": 6080 + }, + { + "epoch": 0.19877638471377015, + "grad_norm": 0.08109070360660553, + "learning_rate": 4.6691324230673144e-05, + "logits/chosen": 3.5833544731140137, + "logits/rejected": 3.845127820968628, + "logps/chosen": -385.04052734375, + "logps/rejected": -305.4220275878906, + "loss": 0.3944, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2226040363311768, + "rewards/margins": 2.533242702484131, + "rewards/rejected": -3.7558467388153076, + "step": 6100 + }, + { + "epoch": 0.19942811056529072, + "grad_norm": 4.231566905975342, + "learning_rate": 4.6680461867674694e-05, + "logits/chosen": 3.6180214881896973, + "logits/rejected": 3.718907117843628, + "logps/chosen": -375.3612060546875, + "logps/rejected": -347.5115051269531, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8226817846298218, + "rewards/margins": 2.553952693939209, + "rewards/rejected": -4.37663459777832, + "step": 6120 + }, + { + "epoch": 0.20007983641681126, + "grad_norm": 0.30216073989868164, + "learning_rate": 4.6669599504676245e-05, + "logits/chosen": 3.342012405395508, + "logits/rejected": 3.676413059234619, + "logps/chosen": -359.3817138671875, + "logps/rejected": -356.55029296875, + "loss": 0.4078, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3056720495224, + "rewards/margins": 2.995800733566284, + "rewards/rejected": -4.3014726638793945, + "step": 6140 + }, + { + "epoch": 0.20073156226833183, + "grad_norm": 0.24324840307235718, + "learning_rate": 4.66587371416778e-05, + "logits/chosen": 3.570645809173584, + "logits/rejected": 3.811176300048828, + "logps/chosen": -358.187255859375, + "logps/rejected": -334.0565490722656, + "loss": 0.5952, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0988671779632568, + "rewards/margins": 2.9728410243988037, + "rewards/rejected": -4.071707725524902, + "step": 6160 + }, + { + "epoch": 0.20138328811985237, + "grad_norm": 1.9733905792236328, + "learning_rate": 4.664787477867936e-05, + "logits/chosen": 3.0946781635284424, + "logits/rejected": 3.343344211578369, + "logps/chosen": -350.3486022949219, + "logps/rejected": -341.5438537597656, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6459274291992188, + "rewards/margins": 3.322221279144287, + "rewards/rejected": -4.968148708343506, + "step": 6180 + }, + { + "epoch": 0.20203501397137294, + "grad_norm": 4.022158145904541, + "learning_rate": 4.663701241568091e-05, + "logits/chosen": 3.2156715393066406, + "logits/rejected": 3.1972765922546387, + "logps/chosen": -331.72406005859375, + "logps/rejected": -324.0400390625, + "loss": 0.7004, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1242191791534424, + "rewards/margins": 1.7647565603256226, + "rewards/rejected": -3.8889758586883545, + "step": 6200 + }, + { + "epoch": 0.2026867398228935, + "grad_norm": 6.844908237457275, + "learning_rate": 4.662615005268246e-05, + "logits/chosen": 3.2649600505828857, + "logits/rejected": 3.379526138305664, + "logps/chosen": -353.7003479003906, + "logps/rejected": -328.0352478027344, + "loss": 0.7541, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6650333404541016, + "rewards/margins": 2.3437373638153076, + "rewards/rejected": -4.008770942687988, + "step": 6220 + }, + { + "epoch": 0.20333846567441405, + "grad_norm": 3.4752519130706787, + "learning_rate": 4.661528768968402e-05, + "logits/chosen": 3.8226447105407715, + "logits/rejected": 3.9657795429229736, + "logps/chosen": -340.33856201171875, + "logps/rejected": -348.92181396484375, + "loss": 0.5565, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.291527271270752, + "rewards/margins": 1.7659733295440674, + "rewards/rejected": -4.05750036239624, + "step": 6240 + }, + { + "epoch": 0.20399019152593462, + "grad_norm": 0.6714630126953125, + "learning_rate": 4.660442532668557e-05, + "logits/chosen": 3.0156455039978027, + "logits/rejected": 3.191556215286255, + "logps/chosen": -343.52392578125, + "logps/rejected": -355.1387023925781, + "loss": 0.2488, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5121517181396484, + "rewards/margins": 3.085843563079834, + "rewards/rejected": -5.597994804382324, + "step": 6260 + }, + { + "epoch": 0.20464191737745518, + "grad_norm": 1.7837220430374146, + "learning_rate": 4.659356296368712e-05, + "logits/chosen": 3.2594058513641357, + "logits/rejected": 3.3115696907043457, + "logps/chosen": -328.467041015625, + "logps/rejected": -337.278076171875, + "loss": 0.5439, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4585013389587402, + "rewards/margins": 2.1386778354644775, + "rewards/rejected": -4.597178936004639, + "step": 6280 + }, + { + "epoch": 0.20529364322897573, + "grad_norm": 7.595023155212402, + "learning_rate": 4.658270060068868e-05, + "logits/chosen": 2.917785167694092, + "logits/rejected": 3.213441848754883, + "logps/chosen": -343.1892395019531, + "logps/rejected": -313.39501953125, + "loss": 0.618, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.797722816467285, + "rewards/margins": 2.3131020069122314, + "rewards/rejected": -5.110825538635254, + "step": 6300 + }, + { + "epoch": 0.2059453690804963, + "grad_norm": 1.2614461183547974, + "learning_rate": 4.657183823769023e-05, + "logits/chosen": 3.4923293590545654, + "logits/rejected": 3.8054141998291016, + "logps/chosen": -399.0619201660156, + "logps/rejected": -320.5560302734375, + "loss": 0.388, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.198599338531494, + "rewards/margins": 3.0568246841430664, + "rewards/rejected": -5.2554240226745605, + "step": 6320 + }, + { + "epoch": 0.20659709493201683, + "grad_norm": 4.8218584060668945, + "learning_rate": 4.656097587469178e-05, + "logits/chosen": 3.4186298847198486, + "logits/rejected": 3.495574951171875, + "logps/chosen": -382.30975341796875, + "logps/rejected": -331.51239013671875, + "loss": 0.6081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0115936994552612, + "rewards/margins": 2.1737277507781982, + "rewards/rejected": -3.185321569442749, + "step": 6340 + }, + { + "epoch": 0.2072488207835374, + "grad_norm": 1.5155842304229736, + "learning_rate": 4.655011351169334e-05, + "logits/chosen": 3.4559054374694824, + "logits/rejected": 3.4911720752716064, + "logps/chosen": -339.0375061035156, + "logps/rejected": -322.96832275390625, + "loss": 0.3423, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6195392608642578, + "rewards/margins": 3.114957094192505, + "rewards/rejected": -4.734496116638184, + "step": 6360 + }, + { + "epoch": 0.20790054663505797, + "grad_norm": 0.23815421760082245, + "learning_rate": 4.653925114869489e-05, + "logits/chosen": 3.4505867958068848, + "logits/rejected": 3.6310131549835205, + "logps/chosen": -346.6502990722656, + "logps/rejected": -304.3711853027344, + "loss": 0.5661, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.2659332752227783, + "rewards/margins": 2.4369044303894043, + "rewards/rejected": -4.702837944030762, + "step": 6380 + }, + { + "epoch": 0.2085522724865785, + "grad_norm": 1.847636342048645, + "learning_rate": 4.652838878569644e-05, + "logits/chosen": 3.2408416271209717, + "logits/rejected": 3.347982883453369, + "logps/chosen": -308.5107116699219, + "logps/rejected": -295.0482177734375, + "loss": 0.6918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5825796127319336, + "rewards/margins": 1.7386404275894165, + "rewards/rejected": -3.3212196826934814, + "step": 6400 + }, + { + "epoch": 0.20920399833809908, + "grad_norm": 3.88140869140625, + "learning_rate": 4.6517526422698e-05, + "logits/chosen": 3.276541233062744, + "logits/rejected": 3.5167598724365234, + "logps/chosen": -305.205078125, + "logps/rejected": -313.50390625, + "loss": 0.5075, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.785125494003296, + "rewards/margins": 2.4774956703186035, + "rewards/rejected": -4.2626214027404785, + "step": 6420 + }, + { + "epoch": 0.20985572418961965, + "grad_norm": 1.168385624885559, + "learning_rate": 4.650666405969955e-05, + "logits/chosen": 3.573392391204834, + "logits/rejected": 3.7348666191101074, + "logps/chosen": -341.9404296875, + "logps/rejected": -347.1395263671875, + "loss": 0.4409, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7179005146026611, + "rewards/margins": 2.6451706886291504, + "rewards/rejected": -4.363070487976074, + "step": 6440 + }, + { + "epoch": 0.2105074500411402, + "grad_norm": 5.365046977996826, + "learning_rate": 4.6495801696701105e-05, + "logits/chosen": 3.474072217941284, + "logits/rejected": 3.667064666748047, + "logps/chosen": -311.7642517089844, + "logps/rejected": -344.50738525390625, + "loss": 0.6179, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0987329483032227, + "rewards/margins": 1.5940545797348022, + "rewards/rejected": -3.6927876472473145, + "step": 6460 + }, + { + "epoch": 0.21115917589266076, + "grad_norm": 5.2790703773498535, + "learning_rate": 4.6484939333702656e-05, + "logits/chosen": 2.931389331817627, + "logits/rejected": 3.338090419769287, + "logps/chosen": -353.5017395019531, + "logps/rejected": -283.74658203125, + "loss": 0.5891, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9342353343963623, + "rewards/margins": 1.7897732257843018, + "rewards/rejected": -3.724008083343506, + "step": 6480 + }, + { + "epoch": 0.21181090174418132, + "grad_norm": 4.6681036949157715, + "learning_rate": 4.6474076970704213e-05, + "logits/chosen": 3.6184184551239014, + "logits/rejected": 3.568645477294922, + "logps/chosen": -338.5634765625, + "logps/rejected": -310.4351806640625, + "loss": 0.5575, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5784974098205566, + "rewards/margins": 1.8896812200546265, + "rewards/rejected": -3.4681785106658936, + "step": 6500 + }, + { + "epoch": 0.21246262759570186, + "grad_norm": 7.066208362579346, + "learning_rate": 4.6463214607705764e-05, + "logits/chosen": 3.5715649127960205, + "logits/rejected": 3.7327492237091064, + "logps/chosen": -338.9093322753906, + "logps/rejected": -321.8078308105469, + "loss": 0.4531, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9502541422843933, + "rewards/margins": 2.2268319129943848, + "rewards/rejected": -3.177086353302002, + "step": 6520 + }, + { + "epoch": 0.21311435344722243, + "grad_norm": 3.3950393199920654, + "learning_rate": 4.6452352244707315e-05, + "logits/chosen": 3.374112606048584, + "logits/rejected": 3.6248257160186768, + "logps/chosen": -294.24139404296875, + "logps/rejected": -287.15618896484375, + "loss": 0.4904, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4371627569198608, + "rewards/margins": 2.4741952419281006, + "rewards/rejected": -3.911357879638672, + "step": 6540 + }, + { + "epoch": 0.21376607929874297, + "grad_norm": 2.101522922515869, + "learning_rate": 4.644148988170887e-05, + "logits/chosen": 3.7048652172088623, + "logits/rejected": 3.6845717430114746, + "logps/chosen": -384.09564208984375, + "logps/rejected": -317.9385986328125, + "loss": 0.5437, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1663105487823486, + "rewards/margins": 2.9265971183776855, + "rewards/rejected": -4.092907905578613, + "step": 6560 + }, + { + "epoch": 0.21441780515026354, + "grad_norm": 0.6265264749526978, + "learning_rate": 4.643062751871042e-05, + "logits/chosen": 3.3694748878479004, + "logits/rejected": 3.7487289905548096, + "logps/chosen": -309.2713928222656, + "logps/rejected": -291.19097900390625, + "loss": 0.2763, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.053981065750122, + "rewards/margins": 2.9550719261169434, + "rewards/rejected": -4.0090532302856445, + "step": 6580 + }, + { + "epoch": 0.2150695310017841, + "grad_norm": 1.0859607458114624, + "learning_rate": 4.6419765155711974e-05, + "logits/chosen": 3.668827772140503, + "logits/rejected": 3.656851291656494, + "logps/chosen": -335.83148193359375, + "logps/rejected": -290.5600280761719, + "loss": 0.4736, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7883634567260742, + "rewards/margins": 2.1081461906433105, + "rewards/rejected": -3.896509885787964, + "step": 6600 + }, + { + "epoch": 0.21572125685330465, + "grad_norm": 0.26269012689590454, + "learning_rate": 4.6408902792713525e-05, + "logits/chosen": 3.110307216644287, + "logits/rejected": 3.415423631668091, + "logps/chosen": -309.0030212402344, + "logps/rejected": -326.8612365722656, + "loss": 0.41, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9108155965805054, + "rewards/margins": 2.7672057151794434, + "rewards/rejected": -4.678021430969238, + "step": 6620 + }, + { + "epoch": 0.21637298270482522, + "grad_norm": 6.003291606903076, + "learning_rate": 4.639804042971508e-05, + "logits/chosen": 3.423872709274292, + "logits/rejected": 3.7599990367889404, + "logps/chosen": -379.4681701660156, + "logps/rejected": -334.5045166015625, + "loss": 0.6381, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.736877918243408, + "rewards/margins": 1.7699811458587646, + "rewards/rejected": -4.506859302520752, + "step": 6640 + }, + { + "epoch": 0.21702470855634579, + "grad_norm": 1.1267787218093872, + "learning_rate": 4.638717806671663e-05, + "logits/chosen": 3.4773926734924316, + "logits/rejected": 3.6991333961486816, + "logps/chosen": -378.8360900878906, + "logps/rejected": -351.48529052734375, + "loss": 0.3199, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6305303573608398, + "rewards/margins": 3.5002987384796143, + "rewards/rejected": -5.130828857421875, + "step": 6660 + }, + { + "epoch": 0.21767643440786633, + "grad_norm": 0.9962124824523926, + "learning_rate": 4.6376315703718184e-05, + "logits/chosen": 3.3770267963409424, + "logits/rejected": 3.6286988258361816, + "logps/chosen": -359.919677734375, + "logps/rejected": -290.53070068359375, + "loss": 0.4357, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5160324573516846, + "rewards/margins": 2.67061710357666, + "rewards/rejected": -4.186649322509766, + "step": 6680 + }, + { + "epoch": 0.2183281602593869, + "grad_norm": 2.1258835792541504, + "learning_rate": 4.636545334071974e-05, + "logits/chosen": 3.305239200592041, + "logits/rejected": 3.5006496906280518, + "logps/chosen": -339.476318359375, + "logps/rejected": -304.9292297363281, + "loss": 0.4342, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9991919994354248, + "rewards/margins": 2.682781457901001, + "rewards/rejected": -4.681973457336426, + "step": 6700 + }, + { + "epoch": 0.21897988611090743, + "grad_norm": 3.474386215209961, + "learning_rate": 4.63545909777213e-05, + "logits/chosen": 3.5344460010528564, + "logits/rejected": 3.7267112731933594, + "logps/chosen": -350.30755615234375, + "logps/rejected": -314.6372985839844, + "loss": 0.5747, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8826255798339844, + "rewards/margins": 2.5044455528259277, + "rewards/rejected": -4.38707160949707, + "step": 6720 + }, + { + "epoch": 0.219631611962428, + "grad_norm": 1.7861144542694092, + "learning_rate": 4.634372861472285e-05, + "logits/chosen": 3.3038907051086426, + "logits/rejected": 3.656831741333008, + "logps/chosen": -364.0299377441406, + "logps/rejected": -328.18798828125, + "loss": 0.5348, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.21191668510437, + "rewards/margins": 2.7404186725616455, + "rewards/rejected": -4.952335834503174, + "step": 6740 + }, + { + "epoch": 0.22028333781394857, + "grad_norm": 4.455032825469971, + "learning_rate": 4.633286625172441e-05, + "logits/chosen": 3.3950672149658203, + "logits/rejected": 3.408137559890747, + "logps/chosen": -320.22955322265625, + "logps/rejected": -266.9364929199219, + "loss": 0.5672, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1402525901794434, + "rewards/margins": 2.1336874961853027, + "rewards/rejected": -4.273940086364746, + "step": 6760 + }, + { + "epoch": 0.2209350636654691, + "grad_norm": 4.626438617706299, + "learning_rate": 4.632200388872596e-05, + "logits/chosen": 3.610586166381836, + "logits/rejected": 3.759542465209961, + "logps/chosen": -339.3816223144531, + "logps/rejected": -328.90283203125, + "loss": 0.7157, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8043941259384155, + "rewards/margins": 2.041891574859619, + "rewards/rejected": -3.846285581588745, + "step": 6780 + }, + { + "epoch": 0.22158678951698968, + "grad_norm": 1.4383779764175415, + "learning_rate": 4.631114152572751e-05, + "logits/chosen": 3.577111005783081, + "logits/rejected": 3.9096922874450684, + "logps/chosen": -359.3970031738281, + "logps/rejected": -327.56610107421875, + "loss": 0.4305, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6364867687225342, + "rewards/margins": 3.136775493621826, + "rewards/rejected": -4.7732625007629395, + "step": 6800 + }, + { + "epoch": 0.22223851536851025, + "grad_norm": 2.4448916912078857, + "learning_rate": 4.630027916272906e-05, + "logits/chosen": 3.8721587657928467, + "logits/rejected": 4.032068729400635, + "logps/chosen": -327.9419860839844, + "logps/rejected": -272.7976989746094, + "loss": 0.5003, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0443711280822754, + "rewards/margins": 1.8680394887924194, + "rewards/rejected": -3.912410259246826, + "step": 6820 + }, + { + "epoch": 0.2228902412200308, + "grad_norm": 1.2206708192825317, + "learning_rate": 4.628941679973062e-05, + "logits/chosen": 3.4509129524230957, + "logits/rejected": 3.6855950355529785, + "logps/chosen": -314.1901550292969, + "logps/rejected": -312.09033203125, + "loss": 0.3854, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9086520671844482, + "rewards/margins": 2.133798837661743, + "rewards/rejected": -4.042450904846191, + "step": 6840 + }, + { + "epoch": 0.22354196707155136, + "grad_norm": 12.6375150680542, + "learning_rate": 4.627855443673217e-05, + "logits/chosen": 3.4048144817352295, + "logits/rejected": 3.474454402923584, + "logps/chosen": -315.5140075683594, + "logps/rejected": -296.5184326171875, + "loss": 0.5032, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1005702018737793, + "rewards/margins": 2.759335994720459, + "rewards/rejected": -4.859906196594238, + "step": 6860 + }, + { + "epoch": 0.2241936929230719, + "grad_norm": 0.5887628793716431, + "learning_rate": 4.626769207373372e-05, + "logits/chosen": 3.7363979816436768, + "logits/rejected": 3.867562770843506, + "logps/chosen": -350.6956481933594, + "logps/rejected": -316.8213195800781, + "loss": 0.4485, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.081648349761963, + "rewards/margins": 2.921959400177002, + "rewards/rejected": -5.003607749938965, + "step": 6880 + }, + { + "epoch": 0.22484541877459246, + "grad_norm": 0.4405195713043213, + "learning_rate": 4.6256829710735277e-05, + "logits/chosen": 3.802405834197998, + "logits/rejected": 3.8027591705322266, + "logps/chosen": -339.93463134765625, + "logps/rejected": -338.4143371582031, + "loss": 0.6867, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5816476345062256, + "rewards/margins": 3.1293652057647705, + "rewards/rejected": -4.711012840270996, + "step": 6900 + }, + { + "epoch": 0.22549714462611303, + "grad_norm": 2.973099946975708, + "learning_rate": 4.624596734773683e-05, + "logits/chosen": 3.414546251296997, + "logits/rejected": 3.6687839031219482, + "logps/chosen": -372.89300537109375, + "logps/rejected": -332.7206115722656, + "loss": 0.383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9208002090454102, + "rewards/margins": 3.256321430206299, + "rewards/rejected": -5.177121639251709, + "step": 6920 + }, + { + "epoch": 0.22614887047763357, + "grad_norm": 3.4487149715423584, + "learning_rate": 4.623510498473838e-05, + "logits/chosen": 3.480703353881836, + "logits/rejected": 3.7613863945007324, + "logps/chosen": -343.41680908203125, + "logps/rejected": -310.2889099121094, + "loss": 0.3601, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3758387565612793, + "rewards/margins": 2.815464973449707, + "rewards/rejected": -5.191303730010986, + "step": 6940 + }, + { + "epoch": 0.22680059632915414, + "grad_norm": 1.9976993799209595, + "learning_rate": 4.6224242621739936e-05, + "logits/chosen": 3.3000316619873047, + "logits/rejected": 3.582746982574463, + "logps/chosen": -341.98248291015625, + "logps/rejected": -289.5679016113281, + "loss": 0.4619, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1984336376190186, + "rewards/margins": 2.3801207542419434, + "rewards/rejected": -4.578554630279541, + "step": 6960 + }, + { + "epoch": 0.2274523221806747, + "grad_norm": 1.5136576890945435, + "learning_rate": 4.621338025874149e-05, + "logits/chosen": 3.8198256492614746, + "logits/rejected": 3.913135528564453, + "logps/chosen": -334.28521728515625, + "logps/rejected": -312.28369140625, + "loss": 0.4778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7951034307479858, + "rewards/margins": 2.3488106727600098, + "rewards/rejected": -4.143914699554443, + "step": 6980 + }, + { + "epoch": 0.22810404803219525, + "grad_norm": 2.748046636581421, + "learning_rate": 4.6202517895743044e-05, + "logits/chosen": 3.980778217315674, + "logits/rejected": 4.1859564781188965, + "logps/chosen": -367.1857604980469, + "logps/rejected": -303.0081481933594, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7592999935150146, + "rewards/margins": 2.38822865486145, + "rewards/rejected": -4.147528648376465, + "step": 7000 + }, + { + "epoch": 0.22875577388371582, + "grad_norm": 4.375133991241455, + "learning_rate": 4.6191655532744595e-05, + "logits/chosen": 3.7458202838897705, + "logits/rejected": 3.897620439529419, + "logps/chosen": -322.93475341796875, + "logps/rejected": -309.7158203125, + "loss": 0.5111, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6184402704238892, + "rewards/margins": 2.3304121494293213, + "rewards/rejected": -3.9488525390625, + "step": 7020 + }, + { + "epoch": 0.2294074997352364, + "grad_norm": 1.217957615852356, + "learning_rate": 4.618079316974615e-05, + "logits/chosen": 3.898789882659912, + "logits/rejected": 4.118379592895508, + "logps/chosen": -341.1346435546875, + "logps/rejected": -336.44903564453125, + "loss": 0.4665, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1981360912322998, + "rewards/margins": 2.88437819480896, + "rewards/rejected": -4.082514762878418, + "step": 7040 + }, + { + "epoch": 0.23005922558675693, + "grad_norm": 0.41102924942970276, + "learning_rate": 4.61699308067477e-05, + "logits/chosen": 3.5559489727020264, + "logits/rejected": 3.6482772827148438, + "logps/chosen": -364.3196105957031, + "logps/rejected": -320.2249755859375, + "loss": 0.6126, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5821192264556885, + "rewards/margins": 2.215193510055542, + "rewards/rejected": -4.7973127365112305, + "step": 7060 + }, + { + "epoch": 0.2307109514382775, + "grad_norm": 1.062085747718811, + "learning_rate": 4.6159068443749254e-05, + "logits/chosen": 3.77632212638855, + "logits/rejected": 3.9466605186462402, + "logps/chosen": -368.7576599121094, + "logps/rejected": -362.85333251953125, + "loss": 0.432, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7457668781280518, + "rewards/margins": 2.595243215560913, + "rewards/rejected": -4.341010093688965, + "step": 7080 + }, + { + "epoch": 0.23136267728979804, + "grad_norm": 0.3698817193508148, + "learning_rate": 4.614820608075081e-05, + "logits/chosen": 3.657015323638916, + "logits/rejected": 3.7845795154571533, + "logps/chosen": -295.7823791503906, + "logps/rejected": -272.67767333984375, + "loss": 0.3503, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9788202047348022, + "rewards/margins": 2.4200568199157715, + "rewards/rejected": -4.3988776206970215, + "step": 7100 + }, + { + "epoch": 0.2320144031413186, + "grad_norm": 6.167891025543213, + "learning_rate": 4.613734371775236e-05, + "logits/chosen": 3.7035491466522217, + "logits/rejected": 3.885845899581909, + "logps/chosen": -336.18609619140625, + "logps/rejected": -313.49664306640625, + "loss": 0.5555, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6715846061706543, + "rewards/margins": 2.5500340461730957, + "rewards/rejected": -5.221619129180908, + "step": 7120 + }, + { + "epoch": 0.23266612899283917, + "grad_norm": 0.6740853190422058, + "learning_rate": 4.612648135475391e-05, + "logits/chosen": 3.415647029876709, + "logits/rejected": 3.7964701652526855, + "logps/chosen": -315.0133361816406, + "logps/rejected": -277.61712646484375, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0055928230285645, + "rewards/margins": 2.5475564002990723, + "rewards/rejected": -4.553149223327637, + "step": 7140 + }, + { + "epoch": 0.2333178548443597, + "grad_norm": 0.4174670875072479, + "learning_rate": 4.6115618991755464e-05, + "logits/chosen": 3.6379446983337402, + "logits/rejected": 3.8815836906433105, + "logps/chosen": -292.4369812011719, + "logps/rejected": -307.8317565917969, + "loss": 0.6251, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7652181386947632, + "rewards/margins": 2.4273014068603516, + "rewards/rejected": -4.192519187927246, + "step": 7160 + }, + { + "epoch": 0.23396958069588028, + "grad_norm": 4.088688373565674, + "learning_rate": 4.610475662875702e-05, + "logits/chosen": 3.307158946990967, + "logits/rejected": 3.4803848266601562, + "logps/chosen": -301.1815185546875, + "logps/rejected": -277.0004577636719, + "loss": 0.5733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8937864303588867, + "rewards/margins": 2.4520649909973145, + "rewards/rejected": -4.345850944519043, + "step": 7180 + }, + { + "epoch": 0.23462130654740085, + "grad_norm": 6.110348224639893, + "learning_rate": 4.609389426575857e-05, + "logits/chosen": 3.5764987468719482, + "logits/rejected": 3.722522020339966, + "logps/chosen": -356.3382873535156, + "logps/rejected": -335.2117919921875, + "loss": 0.408, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.942025899887085, + "rewards/margins": 3.1936163902282715, + "rewards/rejected": -5.135642051696777, + "step": 7200 + }, + { + "epoch": 0.2352730323989214, + "grad_norm": 1.4761159420013428, + "learning_rate": 4.608303190276013e-05, + "logits/chosen": 4.111649513244629, + "logits/rejected": 4.2119364738464355, + "logps/chosen": -335.9525451660156, + "logps/rejected": -321.06231689453125, + "loss": 0.712, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6226913928985596, + "rewards/margins": 2.390084981918335, + "rewards/rejected": -4.012776851654053, + "step": 7220 + }, + { + "epoch": 0.23592475825044196, + "grad_norm": 1.6253777742385864, + "learning_rate": 4.607216953976168e-05, + "logits/chosen": 3.4541351795196533, + "logits/rejected": 3.7399086952209473, + "logps/chosen": -378.624267578125, + "logps/rejected": -350.97198486328125, + "loss": 0.3629, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6946409344673157, + "rewards/margins": 3.8346638679504395, + "rewards/rejected": -4.5293049812316895, + "step": 7240 + }, + { + "epoch": 0.2365764841019625, + "grad_norm": 2.342067241668701, + "learning_rate": 4.606130717676324e-05, + "logits/chosen": 3.632755994796753, + "logits/rejected": 3.815974473953247, + "logps/chosen": -325.284912109375, + "logps/rejected": -289.16766357421875, + "loss": 0.3657, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4695847034454346, + "rewards/margins": 2.96116304397583, + "rewards/rejected": -4.4307475090026855, + "step": 7260 + }, + { + "epoch": 0.23722820995348307, + "grad_norm": 1.1278965473175049, + "learning_rate": 4.605044481376479e-05, + "logits/chosen": 3.562525987625122, + "logits/rejected": 3.690410614013672, + "logps/chosen": -361.3858642578125, + "logps/rejected": -323.337890625, + "loss": 0.5894, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.944286584854126, + "rewards/margins": 2.4753599166870117, + "rewards/rejected": -4.419646263122559, + "step": 7280 + }, + { + "epoch": 0.23787993580500363, + "grad_norm": 2.324904203414917, + "learning_rate": 4.6039582450766346e-05, + "logits/chosen": 3.9005637168884277, + "logits/rejected": 4.070004463195801, + "logps/chosen": -320.884033203125, + "logps/rejected": -301.79473876953125, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5896464586257935, + "rewards/margins": 2.4265336990356445, + "rewards/rejected": -4.016180038452148, + "step": 7300 + }, + { + "epoch": 0.23853166165652417, + "grad_norm": 0.8741568326950073, + "learning_rate": 4.60287200877679e-05, + "logits/chosen": 3.8448078632354736, + "logits/rejected": 4.061445713043213, + "logps/chosen": -365.6895751953125, + "logps/rejected": -312.10552978515625, + "loss": 0.4672, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2762391567230225, + "rewards/margins": 3.290440082550049, + "rewards/rejected": -4.56667947769165, + "step": 7320 + }, + { + "epoch": 0.23918338750804474, + "grad_norm": 6.030093193054199, + "learning_rate": 4.601785772476945e-05, + "logits/chosen": 3.505066394805908, + "logits/rejected": 3.6756725311279297, + "logps/chosen": -335.28607177734375, + "logps/rejected": -320.0855712890625, + "loss": 0.5466, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9514968395233154, + "rewards/margins": 2.3406808376312256, + "rewards/rejected": -4.292177677154541, + "step": 7340 + }, + { + "epoch": 0.2398351133595653, + "grad_norm": 2.7029757499694824, + "learning_rate": 4.6006995361771e-05, + "logits/chosen": 3.409583568572998, + "logits/rejected": 3.6702144145965576, + "logps/chosen": -348.9549255371094, + "logps/rejected": -280.5594177246094, + "loss": 0.4411, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.895016074180603, + "rewards/margins": 2.687828540802002, + "rewards/rejected": -4.5828447341918945, + "step": 7360 + }, + { + "epoch": 0.24048683921108585, + "grad_norm": 4.099144458770752, + "learning_rate": 4.5996132998772556e-05, + "logits/chosen": 3.6711883544921875, + "logits/rejected": 3.770430326461792, + "logps/chosen": -363.2633361816406, + "logps/rejected": -328.7405700683594, + "loss": 0.698, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.342578411102295, + "rewards/margins": 1.815718412399292, + "rewards/rejected": -4.158297061920166, + "step": 7380 + }, + { + "epoch": 0.24113856506260642, + "grad_norm": 1.8583545684814453, + "learning_rate": 4.598527063577411e-05, + "logits/chosen": 3.4390056133270264, + "logits/rejected": 3.6807026863098145, + "logps/chosen": -331.4953308105469, + "logps/rejected": -286.8245849609375, + "loss": 0.5468, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0147757530212402, + "rewards/margins": 2.2971444129943848, + "rewards/rejected": -4.311920642852783, + "step": 7400 + }, + { + "epoch": 0.24179029091412696, + "grad_norm": 0.9934867024421692, + "learning_rate": 4.597440827277566e-05, + "logits/chosen": 3.740018367767334, + "logits/rejected": 3.990025281906128, + "logps/chosen": -360.6219787597656, + "logps/rejected": -332.1273498535156, + "loss": 0.3816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9686582684516907, + "rewards/margins": 2.569246768951416, + "rewards/rejected": -3.537905216217041, + "step": 7420 + }, + { + "epoch": 0.24244201676564753, + "grad_norm": 3.837608814239502, + "learning_rate": 4.5963545909777215e-05, + "logits/chosen": 3.5790069103240967, + "logits/rejected": 3.657224178314209, + "logps/chosen": -287.72271728515625, + "logps/rejected": -269.9502258300781, + "loss": 0.4817, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3959131240844727, + "rewards/margins": 1.8183956146240234, + "rewards/rejected": -3.214308977127075, + "step": 7440 + }, + { + "epoch": 0.2430937426171681, + "grad_norm": 1.9964206218719482, + "learning_rate": 4.5952683546778766e-05, + "logits/chosen": 3.583888530731201, + "logits/rejected": 3.6451077461242676, + "logps/chosen": -318.23834228515625, + "logps/rejected": -305.1846618652344, + "loss": 0.4983, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7217432260513306, + "rewards/margins": 2.8873298168182373, + "rewards/rejected": -4.609072685241699, + "step": 7460 + }, + { + "epoch": 0.24374546846868864, + "grad_norm": 1.4597169160842896, + "learning_rate": 4.594182118378032e-05, + "logits/chosen": 3.9602909088134766, + "logits/rejected": 4.059917449951172, + "logps/chosen": -392.6819763183594, + "logps/rejected": -344.379150390625, + "loss": 0.5297, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6569194793701172, + "rewards/margins": 2.022489070892334, + "rewards/rejected": -3.679408550262451, + "step": 7480 + }, + { + "epoch": 0.2443971943202092, + "grad_norm": 4.644417762756348, + "learning_rate": 4.5930958820781875e-05, + "logits/chosen": 3.3442344665527344, + "logits/rejected": 3.485394239425659, + "logps/chosen": -307.04022216796875, + "logps/rejected": -290.28302001953125, + "loss": 0.4137, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9621391296386719, + "rewards/margins": 2.4338104724884033, + "rewards/rejected": -4.395949840545654, + "step": 7500 + }, + { + "epoch": 0.24504892017172977, + "grad_norm": 3.122983694076538, + "learning_rate": 4.592009645778343e-05, + "logits/chosen": 3.879906177520752, + "logits/rejected": 4.214956760406494, + "logps/chosen": -363.38275146484375, + "logps/rejected": -292.69073486328125, + "loss": 0.4318, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.25090491771698, + "rewards/margins": 2.0087525844573975, + "rewards/rejected": -3.259657621383667, + "step": 7520 + }, + { + "epoch": 0.2457006460232503, + "grad_norm": 0.7832626700401306, + "learning_rate": 4.590923409478498e-05, + "logits/chosen": 3.7729830741882324, + "logits/rejected": 3.947854518890381, + "logps/chosen": -304.92669677734375, + "logps/rejected": -276.7923889160156, + "loss": 0.4283, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5090574026107788, + "rewards/margins": 2.3783743381500244, + "rewards/rejected": -3.8874316215515137, + "step": 7540 + }, + { + "epoch": 0.24635237187477088, + "grad_norm": 6.282310962677002, + "learning_rate": 4.5898371731786534e-05, + "logits/chosen": 3.8117995262145996, + "logits/rejected": 3.927264451980591, + "logps/chosen": -358.2835388183594, + "logps/rejected": -339.55950927734375, + "loss": 0.6715, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1297061443328857, + "rewards/margins": 1.4458723068237305, + "rewards/rejected": -3.575578212738037, + "step": 7560 + }, + { + "epoch": 0.24700409772629142, + "grad_norm": 6.002357006072998, + "learning_rate": 4.588750936878809e-05, + "logits/chosen": 3.705458402633667, + "logits/rejected": 4.028184413909912, + "logps/chosen": -364.69781494140625, + "logps/rejected": -306.9528503417969, + "loss": 0.4887, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.432345986366272, + "rewards/margins": 2.7893483638763428, + "rewards/rejected": -4.221694469451904, + "step": 7580 + }, + { + "epoch": 0.247655823577812, + "grad_norm": 1.2960846424102783, + "learning_rate": 4.587664700578964e-05, + "logits/chosen": 3.962355136871338, + "logits/rejected": 4.164594650268555, + "logps/chosen": -350.2738037109375, + "logps/rejected": -330.018310546875, + "loss": 0.5345, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5056688785552979, + "rewards/margins": 2.2930617332458496, + "rewards/rejected": -3.7987303733825684, + "step": 7600 + }, + { + "epoch": 0.24830754942933256, + "grad_norm": 4.241558074951172, + "learning_rate": 4.586578464279119e-05, + "logits/chosen": 3.794102430343628, + "logits/rejected": 3.984903335571289, + "logps/chosen": -367.6976623535156, + "logps/rejected": -304.5484313964844, + "loss": 0.628, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.070064067840576, + "rewards/margins": 2.060732126235962, + "rewards/rejected": -4.130795955657959, + "step": 7620 + }, + { + "epoch": 0.2489592752808531, + "grad_norm": 1.6428180932998657, + "learning_rate": 4.585492227979275e-05, + "logits/chosen": 3.495692014694214, + "logits/rejected": 3.7818751335144043, + "logps/chosen": -290.75701904296875, + "logps/rejected": -275.5481872558594, + "loss": 0.4576, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.309760570526123, + "rewards/margins": 2.446864604949951, + "rewards/rejected": -3.756624937057495, + "step": 7640 + }, + { + "epoch": 0.24961100113237367, + "grad_norm": 0.10162808746099472, + "learning_rate": 4.58440599167943e-05, + "logits/chosen": 3.8225350379943848, + "logits/rejected": 3.915911912918091, + "logps/chosen": -341.7566223144531, + "logps/rejected": -295.78399658203125, + "loss": 0.5244, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7211663722991943, + "rewards/margins": 2.087892770767212, + "rewards/rejected": -3.8090596199035645, + "step": 7660 + }, + { + "epoch": 0.25026272698389423, + "grad_norm": 5.2941436767578125, + "learning_rate": 4.583319755379585e-05, + "logits/chosen": 3.5942955017089844, + "logits/rejected": 3.7607314586639404, + "logps/chosen": -316.3130798339844, + "logps/rejected": -304.8733825683594, + "loss": 0.5521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8739286661148071, + "rewards/margins": 2.1416828632354736, + "rewards/rejected": -4.015611171722412, + "step": 7680 + }, + { + "epoch": 0.2509144528354148, + "grad_norm": 2.6548147201538086, + "learning_rate": 4.582233519079741e-05, + "logits/chosen": 3.7470669746398926, + "logits/rejected": 3.7777161598205566, + "logps/chosen": -381.12115478515625, + "logps/rejected": -318.80889892578125, + "loss": 0.6223, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0086731910705566, + "rewards/margins": 2.5685558319091797, + "rewards/rejected": -4.577229022979736, + "step": 7700 + }, + { + "epoch": 0.2515661786869353, + "grad_norm": 0.7568743824958801, + "learning_rate": 4.581147282779896e-05, + "logits/chosen": 3.5537338256835938, + "logits/rejected": 3.6563353538513184, + "logps/chosen": -341.163818359375, + "logps/rejected": -327.12884521484375, + "loss": 0.3198, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.228510618209839, + "rewards/margins": 2.5078487396240234, + "rewards/rejected": -4.736359596252441, + "step": 7720 + }, + { + "epoch": 0.2522179045384559, + "grad_norm": 1.3719455003738403, + "learning_rate": 4.580061046480051e-05, + "logits/chosen": 3.561342239379883, + "logits/rejected": 3.7695865631103516, + "logps/chosen": -351.1151123046875, + "logps/rejected": -360.4558410644531, + "loss": 0.4899, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.394183397293091, + "rewards/margins": 2.6011314392089844, + "rewards/rejected": -4.995314598083496, + "step": 7740 + }, + { + "epoch": 0.25286963038997645, + "grad_norm": 0.232838973402977, + "learning_rate": 4.578974810180207e-05, + "logits/chosen": 3.60133695602417, + "logits/rejected": 3.8834846019744873, + "logps/chosen": -373.47210693359375, + "logps/rejected": -285.590576171875, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.435650110244751, + "rewards/margins": 2.213888168334961, + "rewards/rejected": -4.649538040161133, + "step": 7760 + }, + { + "epoch": 0.253521356241497, + "grad_norm": 2.9175667762756348, + "learning_rate": 4.5778885738803626e-05, + "logits/chosen": 3.6456618309020996, + "logits/rejected": 4.060736179351807, + "logps/chosen": -372.8311462402344, + "logps/rejected": -291.7864685058594, + "loss": 0.3734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7616039514541626, + "rewards/margins": 2.592731475830078, + "rewards/rejected": -4.354334831237793, + "step": 7780 + }, + { + "epoch": 0.2541730820930176, + "grad_norm": 4.2479352951049805, + "learning_rate": 4.576802337580518e-05, + "logits/chosen": 3.8790221214294434, + "logits/rejected": 3.8225929737091064, + "logps/chosen": -396.4140625, + "logps/rejected": -334.92694091796875, + "loss": 0.5772, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5697338581085205, + "rewards/margins": 2.4624202251434326, + "rewards/rejected": -4.032154083251953, + "step": 7800 + }, + { + "epoch": 0.25482480794453816, + "grad_norm": 5.600734233856201, + "learning_rate": 4.575716101280673e-05, + "logits/chosen": 3.985719680786133, + "logits/rejected": 4.225005149841309, + "logps/chosen": -410.18505859375, + "logps/rejected": -328.334228515625, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1587328910827637, + "rewards/margins": 3.020434856414795, + "rewards/rejected": -5.179167747497559, + "step": 7820 + }, + { + "epoch": 0.25547653379605867, + "grad_norm": 1.8539007902145386, + "learning_rate": 4.5746298649808285e-05, + "logits/chosen": 3.4649765491485596, + "logits/rejected": 3.860546112060547, + "logps/chosen": -367.4858093261719, + "logps/rejected": -329.30865478515625, + "loss": 0.4088, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.007667064666748, + "rewards/margins": 2.65683650970459, + "rewards/rejected": -4.664503574371338, + "step": 7840 + }, + { + "epoch": 0.25612825964757924, + "grad_norm": 5.927215099334717, + "learning_rate": 4.5735436286809836e-05, + "logits/chosen": 3.600048780441284, + "logits/rejected": 3.8663506507873535, + "logps/chosen": -305.7245178222656, + "logps/rejected": -270.620849609375, + "loss": 0.5715, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.126664638519287, + "rewards/margins": 1.9322086572647095, + "rewards/rejected": -4.058873176574707, + "step": 7860 + }, + { + "epoch": 0.2567799854990998, + "grad_norm": 2.03275728225708, + "learning_rate": 4.572457392381139e-05, + "logits/chosen": 3.4992775917053223, + "logits/rejected": 3.841207504272461, + "logps/chosen": -314.7581481933594, + "logps/rejected": -316.23614501953125, + "loss": 0.4427, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.590017318725586, + "rewards/margins": 2.2539680004119873, + "rewards/rejected": -3.8439857959747314, + "step": 7880 + }, + { + "epoch": 0.2574317113506204, + "grad_norm": 2.3436341285705566, + "learning_rate": 4.5713711560812944e-05, + "logits/chosen": 3.5910868644714355, + "logits/rejected": 3.793337345123291, + "logps/chosen": -360.6662902832031, + "logps/rejected": -332.24017333984375, + "loss": 0.3528, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1610643863677979, + "rewards/margins": 3.758239269256592, + "rewards/rejected": -4.919303894042969, + "step": 7900 + }, + { + "epoch": 0.25808343720214094, + "grad_norm": 2.914149284362793, + "learning_rate": 4.5702849197814495e-05, + "logits/chosen": 3.5919547080993652, + "logits/rejected": 3.930455446243286, + "logps/chosen": -355.8979187011719, + "logps/rejected": -283.7596130371094, + "loss": 0.5695, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.190129280090332, + "rewards/margins": 2.2786483764648438, + "rewards/rejected": -4.468777656555176, + "step": 7920 + }, + { + "epoch": 0.25873516305366145, + "grad_norm": 11.327168464660645, + "learning_rate": 4.5691986834816046e-05, + "logits/chosen": 3.187425136566162, + "logits/rejected": 3.5983550548553467, + "logps/chosen": -295.9476013183594, + "logps/rejected": -254.76541137695312, + "loss": 0.4834, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0969616174697876, + "rewards/margins": 3.0076732635498047, + "rewards/rejected": -4.104634761810303, + "step": 7940 + }, + { + "epoch": 0.259386888905182, + "grad_norm": 5.085422992706299, + "learning_rate": 4.56811244718176e-05, + "logits/chosen": 3.904247283935547, + "logits/rejected": 4.065102577209473, + "logps/chosen": -335.67572021484375, + "logps/rejected": -322.0950927734375, + "loss": 0.4921, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8900789022445679, + "rewards/margins": 2.3909127712249756, + "rewards/rejected": -4.280991554260254, + "step": 7960 + }, + { + "epoch": 0.2600386147567026, + "grad_norm": 3.609825372695923, + "learning_rate": 4.5670262108819154e-05, + "logits/chosen": 3.608402729034424, + "logits/rejected": 3.943101406097412, + "logps/chosen": -343.68817138671875, + "logps/rejected": -306.9947204589844, + "loss": 0.3058, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7740447521209717, + "rewards/margins": 3.210219144821167, + "rewards/rejected": -4.984263896942139, + "step": 7980 + }, + { + "epoch": 0.26069034060822316, + "grad_norm": 4.763788223266602, + "learning_rate": 4.5659399745820705e-05, + "logits/chosen": 3.691816806793213, + "logits/rejected": 3.748791456222534, + "logps/chosen": -348.39862060546875, + "logps/rejected": -301.13763427734375, + "loss": 0.4577, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7358438968658447, + "rewards/margins": 2.541313648223877, + "rewards/rejected": -4.277157306671143, + "step": 8000 + }, + { + "epoch": 0.2613420664597437, + "grad_norm": 4.010183811187744, + "learning_rate": 4.564853738282226e-05, + "logits/chosen": 3.6070258617401123, + "logits/rejected": 3.749809980392456, + "logps/chosen": -341.3906555175781, + "logps/rejected": -326.67498779296875, + "loss": 0.428, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.071411609649658, + "rewards/margins": 2.5522077083587646, + "rewards/rejected": -4.623619556427002, + "step": 8020 + }, + { + "epoch": 0.26199379231126424, + "grad_norm": 3.2544002532958984, + "learning_rate": 4.5637675019823813e-05, + "logits/chosen": 3.5812556743621826, + "logits/rejected": 3.9464428424835205, + "logps/chosen": -346.33892822265625, + "logps/rejected": -315.54974365234375, + "loss": 0.3517, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2093210220336914, + "rewards/margins": 3.4255077838897705, + "rewards/rejected": -5.634829521179199, + "step": 8040 + }, + { + "epoch": 0.2626455181627848, + "grad_norm": 6.238029956817627, + "learning_rate": 4.562681265682537e-05, + "logits/chosen": 3.314490795135498, + "logits/rejected": 3.2426211833953857, + "logps/chosen": -314.9289245605469, + "logps/rejected": -329.18951416015625, + "loss": 0.5297, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.507831573486328, + "rewards/margins": 2.303008794784546, + "rewards/rejected": -4.810840606689453, + "step": 8060 + }, + { + "epoch": 0.2632972440143054, + "grad_norm": 4.787857532501221, + "learning_rate": 4.561595029382692e-05, + "logits/chosen": 3.2434916496276855, + "logits/rejected": 3.457843780517578, + "logps/chosen": -312.1732177734375, + "logps/rejected": -318.24444580078125, + "loss": 0.3872, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.196187734603882, + "rewards/margins": 3.9792320728302, + "rewards/rejected": -6.175419807434082, + "step": 8080 + }, + { + "epoch": 0.26394896986582594, + "grad_norm": 2.7614197731018066, + "learning_rate": 4.560508793082848e-05, + "logits/chosen": 3.4242916107177734, + "logits/rejected": 3.5563488006591797, + "logps/chosen": -363.93963623046875, + "logps/rejected": -361.54669189453125, + "loss": 0.5102, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.578137159347534, + "rewards/margins": 2.9398350715637207, + "rewards/rejected": -5.517972469329834, + "step": 8100 + }, + { + "epoch": 0.2646006957173465, + "grad_norm": 2.0972392559051514, + "learning_rate": 4.559422556783003e-05, + "logits/chosen": 3.319112777709961, + "logits/rejected": 3.5469422340393066, + "logps/chosen": -370.57110595703125, + "logps/rejected": -331.46795654296875, + "loss": 0.5043, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8189125061035156, + "rewards/margins": 2.693243980407715, + "rewards/rejected": -4.512156009674072, + "step": 8120 + }, + { + "epoch": 0.2652524215688671, + "grad_norm": 1.5650653839111328, + "learning_rate": 4.558336320483158e-05, + "logits/chosen": 3.6790413856506348, + "logits/rejected": 3.896918773651123, + "logps/chosen": -363.5812683105469, + "logps/rejected": -360.3746643066406, + "loss": 0.4833, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.911529541015625, + "rewards/margins": 2.809077262878418, + "rewards/rejected": -4.720607280731201, + "step": 8140 + }, + { + "epoch": 0.2659041474203876, + "grad_norm": 2.34002947807312, + "learning_rate": 4.557250084183313e-05, + "logits/chosen": 3.5628743171691895, + "logits/rejected": 3.680201768875122, + "logps/chosen": -332.8553161621094, + "logps/rejected": -330.65399169921875, + "loss": 0.4324, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0311901569366455, + "rewards/margins": 2.1808414459228516, + "rewards/rejected": -4.212031364440918, + "step": 8160 + }, + { + "epoch": 0.26655587327190816, + "grad_norm": 2.233198404312134, + "learning_rate": 4.556163847883469e-05, + "logits/chosen": 3.127061367034912, + "logits/rejected": 3.4876410961151123, + "logps/chosen": -341.2405700683594, + "logps/rejected": -322.2530822753906, + "loss": 0.4249, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.780561089515686, + "rewards/margins": 2.6313769817352295, + "rewards/rejected": -4.411937713623047, + "step": 8180 + }, + { + "epoch": 0.26720759912342873, + "grad_norm": 1.0969611406326294, + "learning_rate": 4.555077611583624e-05, + "logits/chosen": 3.150499105453491, + "logits/rejected": 3.4056639671325684, + "logps/chosen": -331.40557861328125, + "logps/rejected": -277.79290771484375, + "loss": 0.3618, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4930306673049927, + "rewards/margins": 2.5414071083068848, + "rewards/rejected": -4.034438133239746, + "step": 8200 + }, + { + "epoch": 0.2678593249749493, + "grad_norm": 0.8863662481307983, + "learning_rate": 4.553991375283779e-05, + "logits/chosen": 3.3133530616760254, + "logits/rejected": 3.4958672523498535, + "logps/chosen": -339.51422119140625, + "logps/rejected": -321.7318420410156, + "loss": 0.5543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9923007488250732, + "rewards/margins": 2.7806084156036377, + "rewards/rejected": -4.772909641265869, + "step": 8220 + }, + { + "epoch": 0.26851105082646987, + "grad_norm": 3.923710584640503, + "learning_rate": 4.552905138983935e-05, + "logits/chosen": 3.6772055625915527, + "logits/rejected": 3.813683271408081, + "logps/chosen": -390.89227294921875, + "logps/rejected": -331.9899597167969, + "loss": 0.5809, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.566809058189392, + "rewards/margins": 2.4164669513702393, + "rewards/rejected": -3.983275890350342, + "step": 8240 + }, + { + "epoch": 0.2691627766779904, + "grad_norm": 2.1035852432250977, + "learning_rate": 4.55181890268409e-05, + "logits/chosen": 3.5556697845458984, + "logits/rejected": 3.62249755859375, + "logps/chosen": -343.0182189941406, + "logps/rejected": -334.0406188964844, + "loss": 0.51, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0834193229675293, + "rewards/margins": 1.9725669622421265, + "rewards/rejected": -4.055986404418945, + "step": 8260 + }, + { + "epoch": 0.26981450252951095, + "grad_norm": 3.4449737071990967, + "learning_rate": 4.550732666384245e-05, + "logits/chosen": 3.6578261852264404, + "logits/rejected": 3.6768722534179688, + "logps/chosen": -370.0192565917969, + "logps/rejected": -338.3418884277344, + "loss": 0.5238, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.182738780975342, + "rewards/margins": 2.503995895385742, + "rewards/rejected": -4.686734676361084, + "step": 8280 + }, + { + "epoch": 0.2704662283810315, + "grad_norm": 0.1294803023338318, + "learning_rate": 4.549646430084401e-05, + "logits/chosen": 3.525536060333252, + "logits/rejected": 3.857084274291992, + "logps/chosen": -334.7774963378906, + "logps/rejected": -323.9651794433594, + "loss": 0.5702, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.828142762184143, + "rewards/margins": 2.898742198944092, + "rewards/rejected": -4.726884841918945, + "step": 8300 + }, + { + "epoch": 0.2711179542325521, + "grad_norm": 5.459997177124023, + "learning_rate": 4.5485601937845565e-05, + "logits/chosen": 3.590221405029297, + "logits/rejected": 3.633065700531006, + "logps/chosen": -344.7029724121094, + "logps/rejected": -294.59698486328125, + "loss": 0.3887, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7075653076171875, + "rewards/margins": 2.933676242828369, + "rewards/rejected": -4.641242027282715, + "step": 8320 + }, + { + "epoch": 0.27176968008407265, + "grad_norm": 1.9196809530258179, + "learning_rate": 4.5474739574847116e-05, + "logits/chosen": 3.4550864696502686, + "logits/rejected": 3.5390243530273438, + "logps/chosen": -351.7022399902344, + "logps/rejected": -316.38507080078125, + "loss": 0.3984, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8929857015609741, + "rewards/margins": 2.901988983154297, + "rewards/rejected": -4.7949748039245605, + "step": 8340 + }, + { + "epoch": 0.2724214059355932, + "grad_norm": 1.5713880062103271, + "learning_rate": 4.546387721184867e-05, + "logits/chosen": 3.6451447010040283, + "logits/rejected": 3.704840898513794, + "logps/chosen": -366.76141357421875, + "logps/rejected": -322.8424987792969, + "loss": 0.5457, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9532448053359985, + "rewards/margins": 2.2574639320373535, + "rewards/rejected": -4.210709095001221, + "step": 8360 + }, + { + "epoch": 0.27307313178711373, + "grad_norm": 3.307530641555786, + "learning_rate": 4.5453014848850224e-05, + "logits/chosen": 3.2394630908966064, + "logits/rejected": 3.6227314472198486, + "logps/chosen": -326.16046142578125, + "logps/rejected": -340.8074645996094, + "loss": 0.4696, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5892611742019653, + "rewards/margins": 2.958914279937744, + "rewards/rejected": -4.54817533493042, + "step": 8380 + }, + { + "epoch": 0.2737248576386343, + "grad_norm": 1.2538583278656006, + "learning_rate": 4.5442152485851775e-05, + "logits/chosen": 3.307188034057617, + "logits/rejected": 3.588085174560547, + "logps/chosen": -375.38299560546875, + "logps/rejected": -344.060302734375, + "loss": 0.3602, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8993251323699951, + "rewards/margins": 3.3459479808807373, + "rewards/rejected": -5.245273590087891, + "step": 8400 + }, + { + "epoch": 0.27437658349015487, + "grad_norm": 2.0936577320098877, + "learning_rate": 4.5431290122853326e-05, + "logits/chosen": 3.822705030441284, + "logits/rejected": 4.0103349685668945, + "logps/chosen": -379.7013854980469, + "logps/rejected": -328.181396484375, + "loss": 0.5444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2080719470977783, + "rewards/margins": 2.3151650428771973, + "rewards/rejected": -4.5232367515563965, + "step": 8420 + }, + { + "epoch": 0.27502830934167544, + "grad_norm": 2.033689260482788, + "learning_rate": 4.542042775985488e-05, + "logits/chosen": 3.633789539337158, + "logits/rejected": 3.7710750102996826, + "logps/chosen": -346.6510314941406, + "logps/rejected": -334.9984130859375, + "loss": 0.4096, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4043142795562744, + "rewards/margins": 2.940978765487671, + "rewards/rejected": -4.3452935218811035, + "step": 8440 + }, + { + "epoch": 0.275680035193196, + "grad_norm": 3.3777103424072266, + "learning_rate": 4.5409565396856434e-05, + "logits/chosen": 3.7728309631347656, + "logits/rejected": 4.048682689666748, + "logps/chosen": -352.7034606933594, + "logps/rejected": -301.36700439453125, + "loss": 0.6941, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.606689929962158, + "rewards/margins": 2.171027183532715, + "rewards/rejected": -4.777716636657715, + "step": 8460 + }, + { + "epoch": 0.2763317610447165, + "grad_norm": 3.234844923019409, + "learning_rate": 4.5398703033857985e-05, + "logits/chosen": 3.7538974285125732, + "logits/rejected": 3.7753207683563232, + "logps/chosen": -352.01568603515625, + "logps/rejected": -340.0157165527344, + "loss": 0.6452, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7744640111923218, + "rewards/margins": 1.9403581619262695, + "rewards/rejected": -3.714822292327881, + "step": 8480 + }, + { + "epoch": 0.2769834868962371, + "grad_norm": 3.9215433597564697, + "learning_rate": 4.5387840670859536e-05, + "logits/chosen": 3.583432674407959, + "logits/rejected": 3.814934492111206, + "logps/chosen": -324.2502746582031, + "logps/rejected": -308.94622802734375, + "loss": 0.4201, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.437041997909546, + "rewards/margins": 2.7530722618103027, + "rewards/rejected": -4.1901140213012695, + "step": 8500 + }, + { + "epoch": 0.27763521274775765, + "grad_norm": 3.2108545303344727, + "learning_rate": 4.537697830786109e-05, + "logits/chosen": 3.3293776512145996, + "logits/rejected": 3.5728726387023926, + "logps/chosen": -338.36126708984375, + "logps/rejected": -286.4284973144531, + "loss": 0.4191, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0033851861953735, + "rewards/margins": 2.446779727935791, + "rewards/rejected": -3.450165271759033, + "step": 8520 + }, + { + "epoch": 0.2782869385992782, + "grad_norm": 1.2097665071487427, + "learning_rate": 4.5366115944862644e-05, + "logits/chosen": 3.575974941253662, + "logits/rejected": 3.890552520751953, + "logps/chosen": -341.86163330078125, + "logps/rejected": -325.26287841796875, + "loss": 0.3298, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.993266761302948, + "rewards/margins": 3.111891269683838, + "rewards/rejected": -4.105157852172852, + "step": 8540 + }, + { + "epoch": 0.2789386644507988, + "grad_norm": 3.9247546195983887, + "learning_rate": 4.53552535818642e-05, + "logits/chosen": 3.9315438270568848, + "logits/rejected": 4.053153038024902, + "logps/chosen": -392.16571044921875, + "logps/rejected": -325.4526062011719, + "loss": 0.3734, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5123958587646484, + "rewards/margins": 2.922231674194336, + "rewards/rejected": -4.434627532958984, + "step": 8560 + }, + { + "epoch": 0.2795903903023193, + "grad_norm": 2.2938759326934814, + "learning_rate": 4.534439121886576e-05, + "logits/chosen": 3.8212802410125732, + "logits/rejected": 3.9654369354248047, + "logps/chosen": -344.2378234863281, + "logps/rejected": -332.36981201171875, + "loss": 0.3756, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.5532965660095215, + "rewards/margins": 3.255262851715088, + "rewards/rejected": -4.808559417724609, + "step": 8580 + }, + { + "epoch": 0.28024211615383987, + "grad_norm": 12.099452018737793, + "learning_rate": 4.533352885586731e-05, + "logits/chosen": 3.271491289138794, + "logits/rejected": 3.4017531871795654, + "logps/chosen": -338.65325927734375, + "logps/rejected": -321.01318359375, + "loss": 0.562, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.806122064590454, + "rewards/margins": 2.8680386543273926, + "rewards/rejected": -4.674160957336426, + "step": 8600 + }, + { + "epoch": 0.28089384200536044, + "grad_norm": 4.793270587921143, + "learning_rate": 4.532266649286886e-05, + "logits/chosen": 3.363713026046753, + "logits/rejected": 3.62322998046875, + "logps/chosen": -311.44049072265625, + "logps/rejected": -350.6631774902344, + "loss": 0.6585, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6307737827301025, + "rewards/margins": 1.9325225353240967, + "rewards/rejected": -4.563296318054199, + "step": 8620 + }, + { + "epoch": 0.281545567856881, + "grad_norm": 3.664407730102539, + "learning_rate": 4.531180412987042e-05, + "logits/chosen": 2.8773207664489746, + "logits/rejected": 2.959141492843628, + "logps/chosen": -295.7254333496094, + "logps/rejected": -316.8150634765625, + "loss": 0.5122, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0830161571502686, + "rewards/margins": 2.820885181427002, + "rewards/rejected": -4.903901100158691, + "step": 8640 + }, + { + "epoch": 0.2821972937084016, + "grad_norm": 5.915921688079834, + "learning_rate": 4.530094176687197e-05, + "logits/chosen": 3.4295616149902344, + "logits/rejected": 3.4545702934265137, + "logps/chosen": -379.422119140625, + "logps/rejected": -297.183837890625, + "loss": 0.4763, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.315561294555664, + "rewards/margins": 2.8411242961883545, + "rewards/rejected": -5.156685829162598, + "step": 8660 + }, + { + "epoch": 0.28284901955992214, + "grad_norm": 11.512049674987793, + "learning_rate": 4.529007940387352e-05, + "logits/chosen": 3.5757174491882324, + "logits/rejected": 3.8510475158691406, + "logps/chosen": -400.0611267089844, + "logps/rejected": -336.45550537109375, + "loss": 0.4129, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.340152382850647, + "rewards/margins": 3.346287965774536, + "rewards/rejected": -4.686440467834473, + "step": 8680 + }, + { + "epoch": 0.28350074541144266, + "grad_norm": 9.013387680053711, + "learning_rate": 4.527921704087507e-05, + "logits/chosen": 2.8965394496917725, + "logits/rejected": 3.2547717094421387, + "logps/chosen": -299.65167236328125, + "logps/rejected": -290.537841796875, + "loss": 0.5703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.37650990486145, + "rewards/margins": 2.394768238067627, + "rewards/rejected": -4.771277904510498, + "step": 8700 + }, + { + "epoch": 0.2841524712629632, + "grad_norm": 1.7633156776428223, + "learning_rate": 4.526835467787663e-05, + "logits/chosen": 3.5084621906280518, + "logits/rejected": 3.769824981689453, + "logps/chosen": -365.37335205078125, + "logps/rejected": -351.25592041015625, + "loss": 0.4323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7352489233016968, + "rewards/margins": 2.9418232440948486, + "rewards/rejected": -4.677071571350098, + "step": 8720 + }, + { + "epoch": 0.2848041971144838, + "grad_norm": 1.8055684566497803, + "learning_rate": 4.525749231487818e-05, + "logits/chosen": 3.875206708908081, + "logits/rejected": 4.0909743309021, + "logps/chosen": -387.11871337890625, + "logps/rejected": -323.6910095214844, + "loss": 0.5073, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3104965686798096, + "rewards/margins": 3.1324801445007324, + "rewards/rejected": -4.442976951599121, + "step": 8740 + }, + { + "epoch": 0.28545592296600436, + "grad_norm": 2.542753219604492, + "learning_rate": 4.524662995187973e-05, + "logits/chosen": 3.73907470703125, + "logits/rejected": 3.866537570953369, + "logps/chosen": -393.5443420410156, + "logps/rejected": -351.41741943359375, + "loss": 0.4221, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.826544165611267, + "rewards/margins": 3.297407627105713, + "rewards/rejected": -5.1239519119262695, + "step": 8760 + }, + { + "epoch": 0.28610764881752493, + "grad_norm": 3.9976935386657715, + "learning_rate": 4.523576758888129e-05, + "logits/chosen": 3.511364698410034, + "logits/rejected": 3.463611602783203, + "logps/chosen": -343.9238586425781, + "logps/rejected": -329.64178466796875, + "loss": 0.5124, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5029447078704834, + "rewards/margins": 3.124821186065674, + "rewards/rejected": -4.627765655517578, + "step": 8780 + }, + { + "epoch": 0.28675937466904544, + "grad_norm": 3.059037446975708, + "learning_rate": 4.522490522588284e-05, + "logits/chosen": 3.2572970390319824, + "logits/rejected": 3.530452013015747, + "logps/chosen": -321.3861389160156, + "logps/rejected": -272.4162902832031, + "loss": 0.5667, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.055300712585449, + "rewards/margins": 2.3047542572021484, + "rewards/rejected": -4.360054969787598, + "step": 8800 + }, + { + "epoch": 0.287411100520566, + "grad_norm": 3.71211576461792, + "learning_rate": 4.5214042862884396e-05, + "logits/chosen": 3.636218547821045, + "logits/rejected": 3.8842673301696777, + "logps/chosen": -372.4547119140625, + "logps/rejected": -316.7813415527344, + "loss": 0.4334, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2748242616653442, + "rewards/margins": 3.237598419189453, + "rewards/rejected": -4.51242208480835, + "step": 8820 + }, + { + "epoch": 0.2880628263720866, + "grad_norm": 1.399269461631775, + "learning_rate": 4.5203180499885946e-05, + "logits/chosen": 3.545428514480591, + "logits/rejected": 3.6413416862487793, + "logps/chosen": -353.1900329589844, + "logps/rejected": -283.983154296875, + "loss": 0.5381, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.376441240310669, + "rewards/margins": 2.09578013420105, + "rewards/rejected": -3.4722213745117188, + "step": 8840 + }, + { + "epoch": 0.28871455222360715, + "grad_norm": 4.196688175201416, + "learning_rate": 4.5192318136887504e-05, + "logits/chosen": 4.143765449523926, + "logits/rejected": 4.289937496185303, + "logps/chosen": -362.22113037109375, + "logps/rejected": -322.3675231933594, + "loss": 0.4979, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8496643304824829, + "rewards/margins": 2.199817180633545, + "rewards/rejected": -3.0494818687438965, + "step": 8860 + }, + { + "epoch": 0.2893662780751277, + "grad_norm": 2.1205220222473145, + "learning_rate": 4.5181455773889055e-05, + "logits/chosen": 4.139187812805176, + "logits/rejected": 4.100133895874023, + "logps/chosen": -348.56829833984375, + "logps/rejected": -301.92767333984375, + "loss": 0.5862, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2191951274871826, + "rewards/margins": 1.6747090816497803, + "rewards/rejected": -2.893904685974121, + "step": 8880 + }, + { + "epoch": 0.2900180039266483, + "grad_norm": 0.7834561467170715, + "learning_rate": 4.5170593410890606e-05, + "logits/chosen": 3.6781516075134277, + "logits/rejected": 3.8070156574249268, + "logps/chosen": -324.7522888183594, + "logps/rejected": -283.353515625, + "loss": 0.4743, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.120781660079956, + "rewards/margins": 1.7816203832626343, + "rewards/rejected": -2.902402400970459, + "step": 8900 + }, + { + "epoch": 0.2906697297781688, + "grad_norm": 2.4524714946746826, + "learning_rate": 4.515973104789216e-05, + "logits/chosen": 3.4068756103515625, + "logits/rejected": 3.6506378650665283, + "logps/chosen": -340.5556945800781, + "logps/rejected": -325.9671630859375, + "loss": 0.4066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1648627519607544, + "rewards/margins": 2.540494203567505, + "rewards/rejected": -3.7053565979003906, + "step": 8920 + }, + { + "epoch": 0.29132145562968936, + "grad_norm": 3.516655683517456, + "learning_rate": 4.5148868684893714e-05, + "logits/chosen": 3.740278720855713, + "logits/rejected": 3.932345151901245, + "logps/chosen": -342.7710266113281, + "logps/rejected": -333.99365234375, + "loss": 0.6584, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0803568363189697, + "rewards/margins": 2.0034472942352295, + "rewards/rejected": -4.083804130554199, + "step": 8940 + }, + { + "epoch": 0.29197318148120993, + "grad_norm": 4.830548286437988, + "learning_rate": 4.5138006321895265e-05, + "logits/chosen": 3.5358729362487793, + "logits/rejected": 3.8152072429656982, + "logps/chosen": -298.53704833984375, + "logps/rejected": -278.36041259765625, + "loss": 0.4741, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2485207319259644, + "rewards/margins": 2.642284870147705, + "rewards/rejected": -3.890805721282959, + "step": 8960 + }, + { + "epoch": 0.2926249073327305, + "grad_norm": 2.426832675933838, + "learning_rate": 4.512714395889682e-05, + "logits/chosen": 3.9057164192199707, + "logits/rejected": 4.003143310546875, + "logps/chosen": -336.3506164550781, + "logps/rejected": -311.91021728515625, + "loss": 0.5753, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7760217189788818, + "rewards/margins": 2.5678329467773438, + "rewards/rejected": -4.3438544273376465, + "step": 8980 + }, + { + "epoch": 0.29327663318425107, + "grad_norm": 3.667234420776367, + "learning_rate": 4.511628159589837e-05, + "logits/chosen": 3.9587090015411377, + "logits/rejected": 4.201164722442627, + "logps/chosen": -314.3923034667969, + "logps/rejected": -298.7734375, + "loss": 0.5159, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5578105449676514, + "rewards/margins": 1.8618930578231812, + "rewards/rejected": -3.4197044372558594, + "step": 9000 + }, + { + "epoch": 0.29327663318425107, + "eval_logits/chosen": 3.8263840675354004, + "eval_logits/rejected": 4.023495674133301, + "eval_logps/chosen": -368.6352233886719, + "eval_logps/rejected": -336.26220703125, + "eval_loss": 0.4398985505104065, + "eval_rewards/accuracies": 0.8161238431930542, + "eval_rewards/chosen": -1.4054591655731201, + "eval_rewards/margins": 2.5974409580230713, + "eval_rewards/rejected": -4.002900123596191, + "eval_runtime": 3545.34, + "eval_samples_per_second": 3.152, + "eval_steps_per_second": 3.152, + "step": 9000 + }, + { + "epoch": 0.2939283590357716, + "grad_norm": 6.5161871910095215, + "learning_rate": 4.5105419232899924e-05, + "logits/chosen": 3.617083787918091, + "logits/rejected": 3.8662962913513184, + "logps/chosen": -309.8385314941406, + "logps/rejected": -312.8013000488281, + "loss": 0.4433, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6353857517242432, + "rewards/margins": 2.238478899002075, + "rewards/rejected": -3.8738644123077393, + "step": 9020 + }, + { + "epoch": 0.29458008488729215, + "grad_norm": 3.261503219604492, + "learning_rate": 4.509455686990148e-05, + "logits/chosen": 3.5849609375, + "logits/rejected": 3.76127290725708, + "logps/chosen": -311.711669921875, + "logps/rejected": -302.106201171875, + "loss": 0.5793, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1276428699493408, + "rewards/margins": 2.1670217514038086, + "rewards/rejected": -3.2946648597717285, + "step": 9040 + }, + { + "epoch": 0.2952318107388127, + "grad_norm": 2.3924238681793213, + "learning_rate": 4.508369450690303e-05, + "logits/chosen": 3.6022274494171143, + "logits/rejected": 3.951603412628174, + "logps/chosen": -320.82891845703125, + "logps/rejected": -334.6712951660156, + "loss": 0.5494, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5615322589874268, + "rewards/margins": 2.510451078414917, + "rewards/rejected": -4.0719828605651855, + "step": 9060 + }, + { + "epoch": 0.2958835365903333, + "grad_norm": 1.6911005973815918, + "learning_rate": 4.507283214390459e-05, + "logits/chosen": 3.662975311279297, + "logits/rejected": 3.8544838428497314, + "logps/chosen": -346.1162414550781, + "logps/rejected": -310.0665588378906, + "loss": 0.5268, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0037676095962524, + "rewards/margins": 2.2885289192199707, + "rewards/rejected": -3.2922966480255127, + "step": 9080 + }, + { + "epoch": 0.29653526244185385, + "grad_norm": 2.8411056995391846, + "learning_rate": 4.506196978090614e-05, + "logits/chosen": 3.209453582763672, + "logits/rejected": 3.6680896282196045, + "logps/chosen": -298.0554504394531, + "logps/rejected": -282.3381652832031, + "loss": 0.6031, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6677906513214111, + "rewards/margins": 1.875893235206604, + "rewards/rejected": -3.5436840057373047, + "step": 9100 + }, + { + "epoch": 0.29718698829337437, + "grad_norm": 1.1753807067871094, + "learning_rate": 4.50511074179077e-05, + "logits/chosen": 3.7624969482421875, + "logits/rejected": 3.9776394367218018, + "logps/chosen": -334.857177734375, + "logps/rejected": -289.92132568359375, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7963011264801025, + "rewards/margins": 1.9761772155761719, + "rewards/rejected": -3.7724781036376953, + "step": 9120 + }, + { + "epoch": 0.29783871414489493, + "grad_norm": 5.15461540222168, + "learning_rate": 4.504024505490925e-05, + "logits/chosen": 3.6650314331054688, + "logits/rejected": 3.977968215942383, + "logps/chosen": -325.18243408203125, + "logps/rejected": -308.3213195800781, + "loss": 0.529, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.137129306793213, + "rewards/margins": 2.044630289077759, + "rewards/rejected": -3.1817593574523926, + "step": 9140 + }, + { + "epoch": 0.2984904399964155, + "grad_norm": 1.4225491285324097, + "learning_rate": 4.50293826919108e-05, + "logits/chosen": 3.960704803466797, + "logits/rejected": 4.219023704528809, + "logps/chosen": -377.869873046875, + "logps/rejected": -310.0888366699219, + "loss": 0.4438, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3575794696807861, + "rewards/margins": 2.490967273712158, + "rewards/rejected": -3.848546266555786, + "step": 9160 + }, + { + "epoch": 0.29914216584793607, + "grad_norm": 0.7915458679199219, + "learning_rate": 4.501852032891236e-05, + "logits/chosen": 3.7368316650390625, + "logits/rejected": 4.031809329986572, + "logps/chosen": -336.09625244140625, + "logps/rejected": -341.6094055175781, + "loss": 0.5711, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.315814733505249, + "rewards/margins": 2.5515666007995605, + "rewards/rejected": -3.8673815727233887, + "step": 9180 + }, + { + "epoch": 0.29979389169945664, + "grad_norm": 1.3117313385009766, + "learning_rate": 4.500765796591391e-05, + "logits/chosen": 3.389200210571289, + "logits/rejected": 3.5361480712890625, + "logps/chosen": -331.0273132324219, + "logps/rejected": -337.8938903808594, + "loss": 0.3809, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.775843858718872, + "rewards/margins": 2.331584930419922, + "rewards/rejected": -4.107428550720215, + "step": 9200 + }, + { + "epoch": 0.3004456175509772, + "grad_norm": 3.442030429840088, + "learning_rate": 4.499679560291546e-05, + "logits/chosen": 3.2440574169158936, + "logits/rejected": 3.551206111907959, + "logps/chosen": -356.6485900878906, + "logps/rejected": -326.7696228027344, + "loss": 0.5553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2641222476959229, + "rewards/margins": 1.957598328590393, + "rewards/rejected": -3.2217202186584473, + "step": 9220 + }, + { + "epoch": 0.3010973434024977, + "grad_norm": 0.23577813804149628, + "learning_rate": 4.4985933239917016e-05, + "logits/chosen": 3.424154281616211, + "logits/rejected": 3.5320372581481934, + "logps/chosen": -318.52471923828125, + "logps/rejected": -308.4123840332031, + "loss": 0.4342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9705381393432617, + "rewards/margins": 2.3879897594451904, + "rewards/rejected": -4.358527660369873, + "step": 9240 + }, + { + "epoch": 0.3017490692540183, + "grad_norm": 2.0184123516082764, + "learning_rate": 4.497507087691857e-05, + "logits/chosen": 3.0783936977386475, + "logits/rejected": 3.3940982818603516, + "logps/chosen": -288.1944885253906, + "logps/rejected": -305.1663818359375, + "loss": 0.5266, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.027756929397583, + "rewards/margins": 2.941011905670166, + "rewards/rejected": -4.968768119812012, + "step": 9260 + }, + { + "epoch": 0.30240079510553886, + "grad_norm": 0.781613290309906, + "learning_rate": 4.496420851392012e-05, + "logits/chosen": 3.693608522415161, + "logits/rejected": 3.8445560932159424, + "logps/chosen": -403.54302978515625, + "logps/rejected": -353.9906921386719, + "loss": 0.6459, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.024625778198242, + "rewards/margins": 2.8756420612335205, + "rewards/rejected": -4.900267601013184, + "step": 9280 + }, + { + "epoch": 0.3030525209570594, + "grad_norm": 2.8759913444519043, + "learning_rate": 4.495334615092167e-05, + "logits/chosen": 3.4836342334747314, + "logits/rejected": 3.4976983070373535, + "logps/chosen": -329.4403381347656, + "logps/rejected": -328.7731018066406, + "loss": 0.4392, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.243170738220215, + "rewards/margins": 2.6916470527648926, + "rewards/rejected": -4.934817314147949, + "step": 9300 + }, + { + "epoch": 0.30370424680858, + "grad_norm": 2.7101962566375732, + "learning_rate": 4.4942483787923226e-05, + "logits/chosen": 3.5029797554016113, + "logits/rejected": 3.7926249504089355, + "logps/chosen": -327.8048095703125, + "logps/rejected": -318.91510009765625, + "loss": 0.5505, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0254318714141846, + "rewards/margins": 2.7904491424560547, + "rewards/rejected": -4.81588077545166, + "step": 9320 + }, + { + "epoch": 0.3043559726601005, + "grad_norm": 1.3519618511199951, + "learning_rate": 4.493162142492478e-05, + "logits/chosen": 3.3287148475646973, + "logits/rejected": 3.4941649436950684, + "logps/chosen": -294.9105529785156, + "logps/rejected": -284.5166320800781, + "loss": 0.4025, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9584274291992188, + "rewards/margins": 2.338214159011841, + "rewards/rejected": -4.2966413497924805, + "step": 9340 + }, + { + "epoch": 0.3050076985116211, + "grad_norm": 2.0564424991607666, + "learning_rate": 4.4920759061926335e-05, + "logits/chosen": 3.501086711883545, + "logits/rejected": 3.822606325149536, + "logps/chosen": -327.44110107421875, + "logps/rejected": -311.3541564941406, + "loss": 0.6214, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2727396488189697, + "rewards/margins": 2.184450387954712, + "rewards/rejected": -4.45719051361084, + "step": 9360 + }, + { + "epoch": 0.30565942436314164, + "grad_norm": 2.603893756866455, + "learning_rate": 4.490989669892789e-05, + "logits/chosen": 3.5679996013641357, + "logits/rejected": 3.83121919631958, + "logps/chosen": -343.6649475097656, + "logps/rejected": -319.3695373535156, + "loss": 0.4758, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6868784427642822, + "rewards/margins": 2.383439540863037, + "rewards/rejected": -4.07031774520874, + "step": 9380 + }, + { + "epoch": 0.3063111502146622, + "grad_norm": 1.5246226787567139, + "learning_rate": 4.489903433592944e-05, + "logits/chosen": 3.6602911949157715, + "logits/rejected": 4.0019941329956055, + "logps/chosen": -349.5389099121094, + "logps/rejected": -312.7698059082031, + "loss": 0.5128, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6171514987945557, + "rewards/margins": 2.5252883434295654, + "rewards/rejected": -4.142439842224121, + "step": 9400 + }, + { + "epoch": 0.3069628760661828, + "grad_norm": 1.9521478414535522, + "learning_rate": 4.4888171972930994e-05, + "logits/chosen": 3.6648292541503906, + "logits/rejected": 3.797217607498169, + "logps/chosen": -337.5235595703125, + "logps/rejected": -302.31072998046875, + "loss": 0.5122, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9879196286201477, + "rewards/margins": 2.0048208236694336, + "rewards/rejected": -2.9927401542663574, + "step": 9420 + }, + { + "epoch": 0.30761460191770335, + "grad_norm": 2.8261003494262695, + "learning_rate": 4.487730960993255e-05, + "logits/chosen": 3.452899932861328, + "logits/rejected": 3.8598244190216064, + "logps/chosen": -316.0992126464844, + "logps/rejected": -296.22845458984375, + "loss": 0.4909, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3691569566726685, + "rewards/margins": 2.2936618328094482, + "rewards/rejected": -3.6628189086914062, + "step": 9440 + }, + { + "epoch": 0.30826632776922386, + "grad_norm": 1.813551664352417, + "learning_rate": 4.48664472469341e-05, + "logits/chosen": 3.8521480560302734, + "logits/rejected": 3.795984983444214, + "logps/chosen": -308.86871337890625, + "logps/rejected": -290.31036376953125, + "loss": 0.7334, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.667391061782837, + "rewards/margins": 1.4689973592758179, + "rewards/rejected": -3.1363883018493652, + "step": 9460 + }, + { + "epoch": 0.3089180536207444, + "grad_norm": 5.173272132873535, + "learning_rate": 4.485558488393565e-05, + "logits/chosen": 3.7644824981689453, + "logits/rejected": 3.9342644214630127, + "logps/chosen": -340.69073486328125, + "logps/rejected": -313.99798583984375, + "loss": 0.5139, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9903851747512817, + "rewards/margins": 2.1542420387268066, + "rewards/rejected": -3.144627332687378, + "step": 9480 + }, + { + "epoch": 0.309569779472265, + "grad_norm": 2.759246349334717, + "learning_rate": 4.4844722520937204e-05, + "logits/chosen": 3.488985538482666, + "logits/rejected": 3.738349199295044, + "logps/chosen": -346.55108642578125, + "logps/rejected": -291.1142578125, + "loss": 0.5008, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3026331663131714, + "rewards/margins": 2.5691723823547363, + "rewards/rejected": -3.8718056678771973, + "step": 9500 + }, + { + "epoch": 0.31022150532378556, + "grad_norm": 2.4257123470306396, + "learning_rate": 4.483386015793876e-05, + "logits/chosen": 3.6584270000457764, + "logits/rejected": 4.016824245452881, + "logps/chosen": -358.6921081542969, + "logps/rejected": -308.6827392578125, + "loss": 0.4885, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3355748653411865, + "rewards/margins": 2.387908935546875, + "rewards/rejected": -3.723484516143799, + "step": 9520 + }, + { + "epoch": 0.31087323117530613, + "grad_norm": 3.4306771755218506, + "learning_rate": 4.482299779494031e-05, + "logits/chosen": 3.7536416053771973, + "logits/rejected": 4.15143346786499, + "logps/chosen": -338.32183837890625, + "logps/rejected": -290.5627136230469, + "loss": 0.367, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2029896974563599, + "rewards/margins": 2.3646178245544434, + "rewards/rejected": -3.567607879638672, + "step": 9540 + }, + { + "epoch": 0.31152495702682664, + "grad_norm": 0.6663052439689636, + "learning_rate": 4.481213543194186e-05, + "logits/chosen": 3.743408203125, + "logits/rejected": 3.903038501739502, + "logps/chosen": -326.96527099609375, + "logps/rejected": -304.3977355957031, + "loss": 0.4909, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7007386684417725, + "rewards/margins": 2.471942186355591, + "rewards/rejected": -4.172680854797363, + "step": 9560 + }, + { + "epoch": 0.3121766828783472, + "grad_norm": 2.171869993209839, + "learning_rate": 4.480127306894342e-05, + "logits/chosen": 3.638559341430664, + "logits/rejected": 3.9771721363067627, + "logps/chosen": -339.3028259277344, + "logps/rejected": -328.5107421875, + "loss": 0.5505, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8985029458999634, + "rewards/margins": 2.714871883392334, + "rewards/rejected": -4.613374710083008, + "step": 9580 + }, + { + "epoch": 0.3128284087298678, + "grad_norm": 2.8080224990844727, + "learning_rate": 4.479041070594497e-05, + "logits/chosen": 3.641517162322998, + "logits/rejected": 3.9569740295410156, + "logps/chosen": -379.614013671875, + "logps/rejected": -360.7059631347656, + "loss": 0.4251, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1403510570526123, + "rewards/margins": 2.458681106567383, + "rewards/rejected": -4.599032402038574, + "step": 9600 + }, + { + "epoch": 0.31348013458138835, + "grad_norm": 0.3392791152000427, + "learning_rate": 4.477954834294653e-05, + "logits/chosen": 3.7876968383789062, + "logits/rejected": 4.012024879455566, + "logps/chosen": -341.76544189453125, + "logps/rejected": -320.7891540527344, + "loss": 0.4708, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6537872552871704, + "rewards/margins": 2.8646798133850098, + "rewards/rejected": -4.518466949462891, + "step": 9620 + }, + { + "epoch": 0.3141318604329089, + "grad_norm": 1.0330092906951904, + "learning_rate": 4.476868597994808e-05, + "logits/chosen": 3.530217409133911, + "logits/rejected": 3.893559217453003, + "logps/chosen": -372.3333435058594, + "logps/rejected": -288.981689453125, + "loss": 0.4383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6567367315292358, + "rewards/margins": 2.420663833618164, + "rewards/rejected": -4.0774006843566895, + "step": 9640 + }, + { + "epoch": 0.31478358628442943, + "grad_norm": 1.5930992364883423, + "learning_rate": 4.475782361694964e-05, + "logits/chosen": 3.425192356109619, + "logits/rejected": 3.6571319103240967, + "logps/chosen": -350.6258239746094, + "logps/rejected": -307.7820129394531, + "loss": 0.3503, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6431795358657837, + "rewards/margins": 3.2090821266174316, + "rewards/rejected": -4.852262020111084, + "step": 9660 + }, + { + "epoch": 0.31543531213595, + "grad_norm": 1.449768304824829, + "learning_rate": 4.474696125395119e-05, + "logits/chosen": 3.5968971252441406, + "logits/rejected": 3.9376063346862793, + "logps/chosen": -392.96160888671875, + "logps/rejected": -398.27386474609375, + "loss": 0.3389, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.299379587173462, + "rewards/margins": 4.444596767425537, + "rewards/rejected": -5.743976593017578, + "step": 9680 + }, + { + "epoch": 0.31608703798747056, + "grad_norm": 4.81788969039917, + "learning_rate": 4.473609889095274e-05, + "logits/chosen": 3.2937233448028564, + "logits/rejected": 3.473024845123291, + "logps/chosen": -380.17132568359375, + "logps/rejected": -337.57720947265625, + "loss": 0.8162, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1964242458343506, + "rewards/margins": 1.6070600748062134, + "rewards/rejected": -4.8034844398498535, + "step": 9700 + }, + { + "epoch": 0.31673876383899113, + "grad_norm": 2.9328510761260986, + "learning_rate": 4.4725236527954296e-05, + "logits/chosen": 3.7086799144744873, + "logits/rejected": 3.9347431659698486, + "logps/chosen": -350.24395751953125, + "logps/rejected": -332.920654296875, + "loss": 0.5223, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1425094604492188, + "rewards/margins": 2.433835744857788, + "rewards/rejected": -4.576344966888428, + "step": 9720 + }, + { + "epoch": 0.3173904896905117, + "grad_norm": 1.7941195964813232, + "learning_rate": 4.471437416495585e-05, + "logits/chosen": 3.8585312366485596, + "logits/rejected": 3.9519951343536377, + "logps/chosen": -391.709228515625, + "logps/rejected": -320.57098388671875, + "loss": 0.4033, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.721478819847107, + "rewards/margins": 2.8455796241760254, + "rewards/rejected": -4.567058563232422, + "step": 9740 + }, + { + "epoch": 0.31804221554203227, + "grad_norm": 1.0270287990570068, + "learning_rate": 4.47035118019574e-05, + "logits/chosen": 3.688901901245117, + "logits/rejected": 3.82330584526062, + "logps/chosen": -362.156982421875, + "logps/rejected": -287.0440979003906, + "loss": 0.409, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5976951122283936, + "rewards/margins": 2.3640055656433105, + "rewards/rejected": -3.961700439453125, + "step": 9760 + }, + { + "epoch": 0.3186939413935528, + "grad_norm": 6.997811317443848, + "learning_rate": 4.4692649438958955e-05, + "logits/chosen": 3.5211029052734375, + "logits/rejected": 3.6119773387908936, + "logps/chosen": -305.37469482421875, + "logps/rejected": -352.6788330078125, + "loss": 0.5491, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7464078664779663, + "rewards/margins": 2.474407434463501, + "rewards/rejected": -4.220815658569336, + "step": 9780 + }, + { + "epoch": 0.31934566724507335, + "grad_norm": 5.45841646194458, + "learning_rate": 4.4681787075960506e-05, + "logits/chosen": 3.436845302581787, + "logits/rejected": 3.5842223167419434, + "logps/chosen": -302.0278625488281, + "logps/rejected": -298.8910827636719, + "loss": 0.4815, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.700924277305603, + "rewards/margins": 2.291579008102417, + "rewards/rejected": -3.9925034046173096, + "step": 9800 + }, + { + "epoch": 0.3199973930965939, + "grad_norm": 0.48330411314964294, + "learning_rate": 4.467092471296206e-05, + "logits/chosen": 3.610800266265869, + "logits/rejected": 3.830319881439209, + "logps/chosen": -356.0140075683594, + "logps/rejected": -359.61981201171875, + "loss": 0.3988, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5989913940429688, + "rewards/margins": 2.970634937286377, + "rewards/rejected": -4.569626331329346, + "step": 9820 + }, + { + "epoch": 0.3206491189481145, + "grad_norm": 6.348766803741455, + "learning_rate": 4.466006234996361e-05, + "logits/chosen": 3.317770481109619, + "logits/rejected": 3.734898090362549, + "logps/chosen": -341.9024353027344, + "logps/rejected": -277.6325988769531, + "loss": 0.5328, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4116594791412354, + "rewards/margins": 2.4666390419006348, + "rewards/rejected": -3.878298282623291, + "step": 9840 + }, + { + "epoch": 0.32130084479963505, + "grad_norm": 6.690793037414551, + "learning_rate": 4.4649199986965165e-05, + "logits/chosen": 3.7262356281280518, + "logits/rejected": 3.8666679859161377, + "logps/chosen": -369.40936279296875, + "logps/rejected": -320.06512451171875, + "loss": 0.4547, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2105460166931152, + "rewards/margins": 3.1060287952423096, + "rewards/rejected": -5.316574573516846, + "step": 9860 + }, + { + "epoch": 0.32195257065115557, + "grad_norm": 0.8569322228431702, + "learning_rate": 4.463833762396672e-05, + "logits/chosen": 3.5357608795166016, + "logits/rejected": 3.625265598297119, + "logps/chosen": -395.91314697265625, + "logps/rejected": -363.5246276855469, + "loss": 0.5253, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.440115213394165, + "rewards/margins": 2.2981741428375244, + "rewards/rejected": -4.738289833068848, + "step": 9880 + }, + { + "epoch": 0.32260429650267614, + "grad_norm": 3.3964381217956543, + "learning_rate": 4.4627475260968274e-05, + "logits/chosen": 3.2099215984344482, + "logits/rejected": 3.524761915206909, + "logps/chosen": -372.435302734375, + "logps/rejected": -324.6745300292969, + "loss": 0.6272, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5859487056732178, + "rewards/margins": 2.7059805393218994, + "rewards/rejected": -5.291929721832275, + "step": 9900 + }, + { + "epoch": 0.3232560223541967, + "grad_norm": 5.906340599060059, + "learning_rate": 4.461661289796983e-05, + "logits/chosen": 3.238111972808838, + "logits/rejected": 3.61181902885437, + "logps/chosen": -334.48419189453125, + "logps/rejected": -324.1343078613281, + "loss": 0.4912, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1217002868652344, + "rewards/margins": 2.6962196826934814, + "rewards/rejected": -5.817919731140137, + "step": 9920 + }, + { + "epoch": 0.32390774820571727, + "grad_norm": 3.2983384132385254, + "learning_rate": 4.460575053497138e-05, + "logits/chosen": 3.508694887161255, + "logits/rejected": 3.641684055328369, + "logps/chosen": -384.41021728515625, + "logps/rejected": -347.062744140625, + "loss": 0.7579, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8338556289672852, + "rewards/margins": 1.722917914390564, + "rewards/rejected": -3.5567734241485596, + "step": 9940 + }, + { + "epoch": 0.32455947405723784, + "grad_norm": 2.4137582778930664, + "learning_rate": 4.459488817197293e-05, + "logits/chosen": 3.7443859577178955, + "logits/rejected": 4.035841941833496, + "logps/chosen": -367.27032470703125, + "logps/rejected": -324.42413330078125, + "loss": 0.409, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4093316793441772, + "rewards/margins": 3.350828170776367, + "rewards/rejected": -4.760159492492676, + "step": 9960 + }, + { + "epoch": 0.3252111999087584, + "grad_norm": 0.21268586814403534, + "learning_rate": 4.458402580897449e-05, + "logits/chosen": 3.804851531982422, + "logits/rejected": 3.941946506500244, + "logps/chosen": -344.4263610839844, + "logps/rejected": -292.1056213378906, + "loss": 0.4181, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4914989471435547, + "rewards/margins": 2.6681323051452637, + "rewards/rejected": -4.15963077545166, + "step": 9980 + }, + { + "epoch": 0.3258629257602789, + "grad_norm": 0.4361385405063629, + "learning_rate": 4.457316344597604e-05, + "logits/chosen": 3.7657508850097656, + "logits/rejected": 3.9364941120147705, + "logps/chosen": -394.3398742675781, + "logps/rejected": -333.41339111328125, + "loss": 0.5185, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2147597074508667, + "rewards/margins": 3.083263874053955, + "rewards/rejected": -4.298023700714111, + "step": 10000 + }, + { + "epoch": 0.3265146516117995, + "grad_norm": 0.92397141456604, + "learning_rate": 4.456230108297759e-05, + "logits/chosen": 3.4134116172790527, + "logits/rejected": 3.677297592163086, + "logps/chosen": -335.93988037109375, + "logps/rejected": -345.66070556640625, + "loss": 0.3918, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9625154733657837, + "rewards/margins": 2.5270514488220215, + "rewards/rejected": -4.489566802978516, + "step": 10020 + }, + { + "epoch": 0.32716637746332006, + "grad_norm": 1.0798659324645996, + "learning_rate": 4.455143871997914e-05, + "logits/chosen": 3.29542875289917, + "logits/rejected": 3.343093156814575, + "logps/chosen": -388.4664001464844, + "logps/rejected": -315.7383728027344, + "loss": 0.4534, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3606419563293457, + "rewards/margins": 3.2837307453155518, + "rewards/rejected": -5.64437198638916, + "step": 10040 + }, + { + "epoch": 0.3278181033148406, + "grad_norm": 2.505110740661621, + "learning_rate": 4.45405763569807e-05, + "logits/chosen": 3.490764617919922, + "logits/rejected": 3.6548125743865967, + "logps/chosen": -379.974609375, + "logps/rejected": -321.5982971191406, + "loss": 0.452, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4593539237976074, + "rewards/margins": 3.2461154460906982, + "rewards/rejected": -5.705469131469727, + "step": 10060 + }, + { + "epoch": 0.3284698291663612, + "grad_norm": 2.3425583839416504, + "learning_rate": 4.452971399398225e-05, + "logits/chosen": 3.512253522872925, + "logits/rejected": 3.6119461059570312, + "logps/chosen": -332.96685791015625, + "logps/rejected": -314.63214111328125, + "loss": 0.4511, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.239309787750244, + "rewards/margins": 2.9291017055511475, + "rewards/rejected": -5.168412208557129, + "step": 10080 + }, + { + "epoch": 0.3291215550178817, + "grad_norm": 1.8285279273986816, + "learning_rate": 4.45188516309838e-05, + "logits/chosen": 3.313106060028076, + "logits/rejected": 3.5827858448028564, + "logps/chosen": -355.9664306640625, + "logps/rejected": -369.6215515136719, + "loss": 0.4006, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.017279624938965, + "rewards/margins": 3.1108498573303223, + "rewards/rejected": -6.128129482269287, + "step": 10100 + }, + { + "epoch": 0.3297732808694023, + "grad_norm": 0.73393714427948, + "learning_rate": 4.450798926798536e-05, + "logits/chosen": 3.3473002910614014, + "logits/rejected": 3.7232298851013184, + "logps/chosen": -357.11846923828125, + "logps/rejected": -332.6880798339844, + "loss": 0.4171, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.860170841217041, + "rewards/margins": 2.836498737335205, + "rewards/rejected": -5.696669578552246, + "step": 10120 + }, + { + "epoch": 0.33042500672092284, + "grad_norm": 0.20540180802345276, + "learning_rate": 4.449712690498691e-05, + "logits/chosen": 3.392894744873047, + "logits/rejected": 3.4925849437713623, + "logps/chosen": -362.17913818359375, + "logps/rejected": -329.64031982421875, + "loss": 0.5648, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.220764636993408, + "rewards/margins": 2.6471309661865234, + "rewards/rejected": -5.867895603179932, + "step": 10140 + }, + { + "epoch": 0.3310767325724434, + "grad_norm": 3.8266518115997314, + "learning_rate": 4.448626454198847e-05, + "logits/chosen": 3.4722049236297607, + "logits/rejected": 3.6712231636047363, + "logps/chosen": -346.1315002441406, + "logps/rejected": -361.2139587402344, + "loss": 0.6516, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6788573265075684, + "rewards/margins": 2.6981091499328613, + "rewards/rejected": -6.376966953277588, + "step": 10160 + }, + { + "epoch": 0.331728458423964, + "grad_norm": 1.615379810333252, + "learning_rate": 4.4475402178990025e-05, + "logits/chosen": 3.6774723529815674, + "logits/rejected": 3.8945529460906982, + "logps/chosen": -383.6200866699219, + "logps/rejected": -356.04083251953125, + "loss": 0.5829, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.666144847869873, + "rewards/margins": 2.2384872436523438, + "rewards/rejected": -4.904632568359375, + "step": 10180 + }, + { + "epoch": 0.3323801842754845, + "grad_norm": 4.445291996002197, + "learning_rate": 4.4464539815991576e-05, + "logits/chosen": 3.7185134887695312, + "logits/rejected": 3.9799187183380127, + "logps/chosen": -303.6607971191406, + "logps/rejected": -325.8896484375, + "loss": 0.6324, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9965271949768066, + "rewards/margins": 1.8788385391235352, + "rewards/rejected": -4.875365734100342, + "step": 10200 + }, + { + "epoch": 0.33303191012700506, + "grad_norm": 1.4752097129821777, + "learning_rate": 4.445367745299313e-05, + "logits/chosen": 3.771758556365967, + "logits/rejected": 3.8583579063415527, + "logps/chosen": -380.38665771484375, + "logps/rejected": -366.69403076171875, + "loss": 0.4055, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.5790387392044067, + "rewards/margins": 3.509979248046875, + "rewards/rejected": -5.089017391204834, + "step": 10220 + }, + { + "epoch": 0.33368363597852563, + "grad_norm": 6.854084491729736, + "learning_rate": 4.444281508999468e-05, + "logits/chosen": 3.3323631286621094, + "logits/rejected": 3.6834912300109863, + "logps/chosen": -321.06549072265625, + "logps/rejected": -304.52447509765625, + "loss": 0.4541, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0501692295074463, + "rewards/margins": 2.4234869480133057, + "rewards/rejected": -4.473655700683594, + "step": 10240 + }, + { + "epoch": 0.3343353618300462, + "grad_norm": 1.9740872383117676, + "learning_rate": 4.4431952726996235e-05, + "logits/chosen": 3.465125560760498, + "logits/rejected": 3.531604051589966, + "logps/chosen": -317.67010498046875, + "logps/rejected": -319.6832275390625, + "loss": 0.5586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7327191829681396, + "rewards/margins": 2.599806308746338, + "rewards/rejected": -4.332525253295898, + "step": 10260 + }, + { + "epoch": 0.33498708768156676, + "grad_norm": 4.643021106719971, + "learning_rate": 4.4421090363997786e-05, + "logits/chosen": 3.8179421424865723, + "logits/rejected": 4.168804168701172, + "logps/chosen": -335.1571044921875, + "logps/rejected": -305.88726806640625, + "loss": 0.3768, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9124181270599365, + "rewards/margins": 2.3386640548706055, + "rewards/rejected": -4.251082420349121, + "step": 10280 + }, + { + "epoch": 0.33563881353308733, + "grad_norm": 2.5647428035736084, + "learning_rate": 4.441022800099934e-05, + "logits/chosen": 3.799095630645752, + "logits/rejected": 4.177992343902588, + "logps/chosen": -355.78961181640625, + "logps/rejected": -324.8876037597656, + "loss": 0.6025, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.285374164581299, + "rewards/margins": 2.2028610706329346, + "rewards/rejected": -4.4882354736328125, + "step": 10300 + }, + { + "epoch": 0.33629053938460785, + "grad_norm": 0.8981235027313232, + "learning_rate": 4.4399908756150815e-05, + "logits/chosen": 3.8348641395568848, + "logits/rejected": 3.997809886932373, + "logps/chosen": -407.8025817871094, + "logps/rejected": -342.26531982421875, + "loss": 0.6235, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.946146011352539, + "rewards/margins": 2.446617841720581, + "rewards/rejected": -4.392763614654541, + "step": 10320 + }, + { + "epoch": 0.3369422652361284, + "grad_norm": 2.206927537918091, + "learning_rate": 4.4389046393152366e-05, + "logits/chosen": 3.497490406036377, + "logits/rejected": 3.9018661975860596, + "logps/chosen": -338.4437561035156, + "logps/rejected": -309.6169738769531, + "loss": 0.4798, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5850552320480347, + "rewards/margins": 2.2147507667541504, + "rewards/rejected": -3.7998058795928955, + "step": 10340 + }, + { + "epoch": 0.337593991087649, + "grad_norm": 1.567094087600708, + "learning_rate": 4.437818403015392e-05, + "logits/chosen": 3.971118927001953, + "logits/rejected": 4.042828559875488, + "logps/chosen": -380.01226806640625, + "logps/rejected": -324.2684020996094, + "loss": 0.5644, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7006807327270508, + "rewards/margins": 2.0685744285583496, + "rewards/rejected": -3.7692551612854004, + "step": 10360 + }, + { + "epoch": 0.33824571693916955, + "grad_norm": 1.292428970336914, + "learning_rate": 4.4367321667155474e-05, + "logits/chosen": 3.5938963890075684, + "logits/rejected": 3.6586296558380127, + "logps/chosen": -323.9371032714844, + "logps/rejected": -344.3617248535156, + "loss": 0.4584, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.437395691871643, + "rewards/margins": 2.9715464115142822, + "rewards/rejected": -4.408942222595215, + "step": 10380 + }, + { + "epoch": 0.3388974427906901, + "grad_norm": 2.9936883449554443, + "learning_rate": 4.4356459304157025e-05, + "logits/chosen": 3.4684898853302, + "logits/rejected": 3.798779249191284, + "logps/chosen": -363.388427734375, + "logps/rejected": -341.1925048828125, + "loss": 0.3822, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9188976287841797, + "rewards/margins": 2.2067058086395264, + "rewards/rejected": -4.125603675842285, + "step": 10400 + }, + { + "epoch": 0.33954916864221063, + "grad_norm": 1.2344108819961548, + "learning_rate": 4.434559694115858e-05, + "logits/chosen": 3.624882936477661, + "logits/rejected": 3.834012508392334, + "logps/chosen": -347.9591369628906, + "logps/rejected": -329.72857666015625, + "loss": 0.4941, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.27724552154541, + "rewards/margins": 2.2211458683013916, + "rewards/rejected": -4.498391151428223, + "step": 10420 + }, + { + "epoch": 0.3402008944937312, + "grad_norm": 8.162324905395508, + "learning_rate": 4.433473457816013e-05, + "logits/chosen": 3.3366806507110596, + "logits/rejected": 3.4601693153381348, + "logps/chosen": -352.07635498046875, + "logps/rejected": -335.5511169433594, + "loss": 0.4979, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1385695934295654, + "rewards/margins": 2.3075006008148193, + "rewards/rejected": -4.446070194244385, + "step": 10440 + }, + { + "epoch": 0.34085262034525177, + "grad_norm": 1.046183466911316, + "learning_rate": 4.432387221516169e-05, + "logits/chosen": 3.0189576148986816, + "logits/rejected": 3.1540980339050293, + "logps/chosen": -352.5262451171875, + "logps/rejected": -336.0315856933594, + "loss": 0.6427, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.428128480911255, + "rewards/margins": 2.280261516571045, + "rewards/rejected": -4.708390235900879, + "step": 10460 + }, + { + "epoch": 0.34150434619677233, + "grad_norm": 2.4617202281951904, + "learning_rate": 4.431300985216324e-05, + "logits/chosen": 3.2250499725341797, + "logits/rejected": 3.5485939979553223, + "logps/chosen": -313.54669189453125, + "logps/rejected": -286.2921142578125, + "loss": 0.5445, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8870338201522827, + "rewards/margins": 2.1794629096984863, + "rewards/rejected": -4.066496849060059, + "step": 10480 + }, + { + "epoch": 0.3421560720482929, + "grad_norm": 1.1229192018508911, + "learning_rate": 4.43021474891648e-05, + "logits/chosen": 3.5575637817382812, + "logits/rejected": 3.6285393238067627, + "logps/chosen": -324.9263610839844, + "logps/rejected": -308.33685302734375, + "loss": 0.4447, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5326651334762573, + "rewards/margins": 2.074554920196533, + "rewards/rejected": -3.60722017288208, + "step": 10500 + }, + { + "epoch": 0.34280779789981347, + "grad_norm": 4.0524139404296875, + "learning_rate": 4.429128512616635e-05, + "logits/chosen": 3.455394744873047, + "logits/rejected": 3.780630111694336, + "logps/chosen": -331.90972900390625, + "logps/rejected": -304.0276794433594, + "loss": 0.482, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.506135106086731, + "rewards/margins": 2.3346657752990723, + "rewards/rejected": -3.840801239013672, + "step": 10520 + }, + { + "epoch": 0.343459523751334, + "grad_norm": 3.916215658187866, + "learning_rate": 4.42804227631679e-05, + "logits/chosen": 3.2933688163757324, + "logits/rejected": 3.4571259021759033, + "logps/chosen": -333.94952392578125, + "logps/rejected": -322.30169677734375, + "loss": 0.3944, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7473087310791016, + "rewards/margins": 2.6138644218444824, + "rewards/rejected": -4.361173152923584, + "step": 10540 + }, + { + "epoch": 0.34411124960285455, + "grad_norm": 1.5825163125991821, + "learning_rate": 4.426956040016946e-05, + "logits/chosen": 3.465935230255127, + "logits/rejected": 3.636981248855591, + "logps/chosen": -338.5223693847656, + "logps/rejected": -343.2163391113281, + "loss": 0.4853, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1044939756393433, + "rewards/margins": 3.0052871704101562, + "rewards/rejected": -4.109780788421631, + "step": 10560 + }, + { + "epoch": 0.3447629754543751, + "grad_norm": 1.8610090017318726, + "learning_rate": 4.425869803717101e-05, + "logits/chosen": 3.491664171218872, + "logits/rejected": 3.766364574432373, + "logps/chosen": -355.08526611328125, + "logps/rejected": -313.92633056640625, + "loss": 0.3656, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6987085342407227, + "rewards/margins": 2.4017281532287598, + "rewards/rejected": -4.100436687469482, + "step": 10580 + }, + { + "epoch": 0.3454147013058957, + "grad_norm": 3.975480318069458, + "learning_rate": 4.424783567417256e-05, + "logits/chosen": 3.471416473388672, + "logits/rejected": 3.591870069503784, + "logps/chosen": -319.38739013671875, + "logps/rejected": -275.87396240234375, + "loss": 0.3141, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6684595346450806, + "rewards/margins": 2.467505931854248, + "rewards/rejected": -4.135965824127197, + "step": 10600 + }, + { + "epoch": 0.34606642715741626, + "grad_norm": 0.5570864677429199, + "learning_rate": 4.423697331117411e-05, + "logits/chosen": 3.364400863647461, + "logits/rejected": 3.8207709789276123, + "logps/chosen": -341.9665222167969, + "logps/rejected": -344.85015869140625, + "loss": 0.494, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1351990699768066, + "rewards/margins": 3.0747814178466797, + "rewards/rejected": -5.209980487823486, + "step": 10620 + }, + { + "epoch": 0.34671815300893677, + "grad_norm": 1.050920844078064, + "learning_rate": 4.422611094817567e-05, + "logits/chosen": 3.4349257946014404, + "logits/rejected": 3.5788521766662598, + "logps/chosen": -334.48968505859375, + "logps/rejected": -303.88458251953125, + "loss": 0.5643, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3434200286865234, + "rewards/margins": 2.1386170387268066, + "rewards/rejected": -4.482036590576172, + "step": 10640 + }, + { + "epoch": 0.34736987886045734, + "grad_norm": 5.736017227172852, + "learning_rate": 4.421524858517722e-05, + "logits/chosen": 3.566901445388794, + "logits/rejected": 3.688910722732544, + "logps/chosen": -383.00201416015625, + "logps/rejected": -339.58758544921875, + "loss": 0.7314, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.870251417160034, + "rewards/margins": 1.8416427373886108, + "rewards/rejected": -4.711893558502197, + "step": 10660 + }, + { + "epoch": 0.3480216047119779, + "grad_norm": 2.358578681945801, + "learning_rate": 4.420438622217877e-05, + "logits/chosen": 3.6238760948181152, + "logits/rejected": 3.8725650310516357, + "logps/chosen": -392.30487060546875, + "logps/rejected": -354.3603210449219, + "loss": 0.6346, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.432647705078125, + "rewards/margins": 2.4274468421936035, + "rewards/rejected": -4.8600945472717285, + "step": 10680 + }, + { + "epoch": 0.3486733305634985, + "grad_norm": 2.0241873264312744, + "learning_rate": 4.419352385918033e-05, + "logits/chosen": 3.2871251106262207, + "logits/rejected": 3.429311752319336, + "logps/chosen": -305.30987548828125, + "logps/rejected": -296.7978515625, + "loss": 0.5088, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9764683246612549, + "rewards/margins": 2.0837225914001465, + "rewards/rejected": -4.060190677642822, + "step": 10700 + }, + { + "epoch": 0.34932505641501904, + "grad_norm": 2.580188512802124, + "learning_rate": 4.4182661496181885e-05, + "logits/chosen": 3.2286345958709717, + "logits/rejected": 3.5244126319885254, + "logps/chosen": -346.0224304199219, + "logps/rejected": -343.16143798828125, + "loss": 0.4197, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.55901038646698, + "rewards/margins": 2.995098352432251, + "rewards/rejected": -4.554108619689941, + "step": 10720 + }, + { + "epoch": 0.34997678226653955, + "grad_norm": 0.20709331333637238, + "learning_rate": 4.4171799133183436e-05, + "logits/chosen": 3.4604790210723877, + "logits/rejected": 3.829512357711792, + "logps/chosen": -321.8540344238281, + "logps/rejected": -300.69073486328125, + "loss": 0.2955, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2960121631622314, + "rewards/margins": 3.082033395767212, + "rewards/rejected": -4.378045558929443, + "step": 10740 + }, + { + "epoch": 0.3506285081180601, + "grad_norm": 2.7535502910614014, + "learning_rate": 4.416093677018499e-05, + "logits/chosen": 3.657384157180786, + "logits/rejected": 3.877140760421753, + "logps/chosen": -374.9261169433594, + "logps/rejected": -315.89947509765625, + "loss": 0.4299, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.460584282875061, + "rewards/margins": 3.2890923023223877, + "rewards/rejected": -4.749676704406738, + "step": 10760 + }, + { + "epoch": 0.3512802339695807, + "grad_norm": 1.629223346710205, + "learning_rate": 4.4150074407186544e-05, + "logits/chosen": 3.3064091205596924, + "logits/rejected": 3.6009349822998047, + "logps/chosen": -322.26849365234375, + "logps/rejected": -346.7171936035156, + "loss": 0.2675, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9964954853057861, + "rewards/margins": 3.302464008331299, + "rewards/rejected": -5.298959255218506, + "step": 10780 + }, + { + "epoch": 0.35193195982110126, + "grad_norm": 2.8201792240142822, + "learning_rate": 4.4139212044188095e-05, + "logits/chosen": 3.63964581489563, + "logits/rejected": 3.9823310375213623, + "logps/chosen": -361.19183349609375, + "logps/rejected": -317.1882019042969, + "loss": 0.6049, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.662169933319092, + "rewards/margins": 2.2988359928131104, + "rewards/rejected": -4.961006164550781, + "step": 10800 + }, + { + "epoch": 0.3525836856726218, + "grad_norm": 4.248741149902344, + "learning_rate": 4.4128349681189646e-05, + "logits/chosen": 3.4473156929016113, + "logits/rejected": 3.7461256980895996, + "logps/chosen": -381.2140808105469, + "logps/rejected": -310.56134033203125, + "loss": 0.5709, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.477440357208252, + "rewards/margins": 2.423964262008667, + "rewards/rejected": -4.90140438079834, + "step": 10820 + }, + { + "epoch": 0.3532354115241424, + "grad_norm": 3.291938543319702, + "learning_rate": 4.41174873181912e-05, + "logits/chosen": 3.2108635902404785, + "logits/rejected": 3.345414638519287, + "logps/chosen": -314.8421325683594, + "logps/rejected": -284.05364990234375, + "loss": 0.26, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7868480682373047, + "rewards/margins": 3.4226722717285156, + "rewards/rejected": -5.20952033996582, + "step": 10840 + }, + { + "epoch": 0.3538871373756629, + "grad_norm": 0.9282498955726624, + "learning_rate": 4.4106624955192754e-05, + "logits/chosen": 3.6154861450195312, + "logits/rejected": 3.652707576751709, + "logps/chosen": -321.1143493652344, + "logps/rejected": -284.8221740722656, + "loss": 0.5064, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1147263050079346, + "rewards/margins": 2.183751344680786, + "rewards/rejected": -4.298477649688721, + "step": 10860 + }, + { + "epoch": 0.3545388632271835, + "grad_norm": 3.162101984024048, + "learning_rate": 4.4095762592194305e-05, + "logits/chosen": 3.553257703781128, + "logits/rejected": 3.503340482711792, + "logps/chosen": -343.1835021972656, + "logps/rejected": -347.54327392578125, + "loss": 0.4685, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.363945484161377, + "rewards/margins": 2.6242809295654297, + "rewards/rejected": -4.988226413726807, + "step": 10880 + }, + { + "epoch": 0.35519058907870404, + "grad_norm": 5.055313587188721, + "learning_rate": 4.408490022919586e-05, + "logits/chosen": 3.401404619216919, + "logits/rejected": 3.3240439891815186, + "logps/chosen": -344.3009948730469, + "logps/rejected": -335.7080993652344, + "loss": 0.4441, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4307130575180054, + "rewards/margins": 2.7410905361175537, + "rewards/rejected": -4.1718034744262695, + "step": 10900 + }, + { + "epoch": 0.3558423149302246, + "grad_norm": 3.801144599914551, + "learning_rate": 4.407403786619741e-05, + "logits/chosen": 3.824571132659912, + "logits/rejected": 3.9547858238220215, + "logps/chosen": -375.7459716796875, + "logps/rejected": -335.0436096191406, + "loss": 0.6514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1173386573791504, + "rewards/margins": 2.002882242202759, + "rewards/rejected": -4.12022066116333, + "step": 10920 + }, + { + "epoch": 0.3564940407817452, + "grad_norm": 4.326809406280518, + "learning_rate": 4.4063175503198964e-05, + "logits/chosen": 3.65437650680542, + "logits/rejected": 3.5029380321502686, + "logps/chosen": -329.57000732421875, + "logps/rejected": -351.86688232421875, + "loss": 0.5712, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9835939407348633, + "rewards/margins": 2.301485300064087, + "rewards/rejected": -4.285079002380371, + "step": 10940 + }, + { + "epoch": 0.3571457666332657, + "grad_norm": 1.3762056827545166, + "learning_rate": 4.405231314020052e-05, + "logits/chosen": 3.8549797534942627, + "logits/rejected": 3.9246535301208496, + "logps/chosen": -370.4671936035156, + "logps/rejected": -331.8171691894531, + "loss": 0.5252, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6471666097640991, + "rewards/margins": 2.5070574283599854, + "rewards/rejected": -4.154223918914795, + "step": 10960 + }, + { + "epoch": 0.35779749248478626, + "grad_norm": 1.176217794418335, + "learning_rate": 4.404145077720208e-05, + "logits/chosen": 3.307586193084717, + "logits/rejected": 3.6171035766601562, + "logps/chosen": -328.5976257324219, + "logps/rejected": -298.7281799316406, + "loss": 0.6031, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.922369360923767, + "rewards/margins": 2.019834518432617, + "rewards/rejected": -3.9422035217285156, + "step": 10980 + }, + { + "epoch": 0.35844921833630683, + "grad_norm": 1.4974021911621094, + "learning_rate": 4.403058841420363e-05, + "logits/chosen": 3.554483413696289, + "logits/rejected": 3.4990601539611816, + "logps/chosen": -359.55780029296875, + "logps/rejected": -342.03045654296875, + "loss": 0.5578, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9004697799682617, + "rewards/margins": 2.679694414138794, + "rewards/rejected": -4.580163955688477, + "step": 11000 + }, + { + "epoch": 0.3591009441878274, + "grad_norm": 3.6114559173583984, + "learning_rate": 4.401972605120518e-05, + "logits/chosen": 3.0958876609802246, + "logits/rejected": 3.4934089183807373, + "logps/chosen": -362.36456298828125, + "logps/rejected": -353.20037841796875, + "loss": 0.4502, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8902183771133423, + "rewards/margins": 2.8141109943389893, + "rewards/rejected": -4.704329013824463, + "step": 11020 + }, + { + "epoch": 0.35975267003934797, + "grad_norm": 2.7333812713623047, + "learning_rate": 4.400886368820674e-05, + "logits/chosen": 3.6443405151367188, + "logits/rejected": 3.6178348064422607, + "logps/chosen": -323.36492919921875, + "logps/rejected": -284.8041076660156, + "loss": 0.4047, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6647770404815674, + "rewards/margins": 2.210761070251465, + "rewards/rejected": -3.8755383491516113, + "step": 11040 + }, + { + "epoch": 0.3604043958908685, + "grad_norm": 2.2631821632385254, + "learning_rate": 4.399800132520829e-05, + "logits/chosen": 3.4223594665527344, + "logits/rejected": 3.8633124828338623, + "logps/chosen": -337.3416748046875, + "logps/rejected": -347.54205322265625, + "loss": 0.4542, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6273596286773682, + "rewards/margins": 3.343634843826294, + "rewards/rejected": -4.97099494934082, + "step": 11060 + }, + { + "epoch": 0.36105612174238905, + "grad_norm": 1.7958793640136719, + "learning_rate": 4.398713896220984e-05, + "logits/chosen": 3.4736416339874268, + "logits/rejected": 3.842578172683716, + "logps/chosen": -331.64013671875, + "logps/rejected": -279.75750732421875, + "loss": 0.4858, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6328420639038086, + "rewards/margins": 2.6111977100372314, + "rewards/rejected": -4.244039535522461, + "step": 11080 + }, + { + "epoch": 0.3617078475939096, + "grad_norm": 2.1302788257598877, + "learning_rate": 4.39762765992114e-05, + "logits/chosen": 3.244720935821533, + "logits/rejected": 3.5359904766082764, + "logps/chosen": -293.40045166015625, + "logps/rejected": -276.6231384277344, + "loss": 0.3936, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1040229797363281, + "rewards/margins": 2.633626937866211, + "rewards/rejected": -3.7376503944396973, + "step": 11100 + }, + { + "epoch": 0.3623595734454302, + "grad_norm": 2.5296425819396973, + "learning_rate": 4.396541423621295e-05, + "logits/chosen": 3.7095787525177, + "logits/rejected": 3.8195388317108154, + "logps/chosen": -326.3127136230469, + "logps/rejected": -334.5335388183594, + "loss": 0.3946, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8512945175170898, + "rewards/margins": 3.106482982635498, + "rewards/rejected": -4.95777702331543, + "step": 11120 + }, + { + "epoch": 0.36301129929695075, + "grad_norm": 0.5692671537399292, + "learning_rate": 4.39545518732145e-05, + "logits/chosen": 3.5651721954345703, + "logits/rejected": 3.685485363006592, + "logps/chosen": -313.57196044921875, + "logps/rejected": -243.5947723388672, + "loss": 0.4585, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4404593706130981, + "rewards/margins": 2.566704511642456, + "rewards/rejected": -4.007164001464844, + "step": 11140 + }, + { + "epoch": 0.3636630251484713, + "grad_norm": 3.9556593894958496, + "learning_rate": 4.394368951021605e-05, + "logits/chosen": 3.624680995941162, + "logits/rejected": 3.6592354774475098, + "logps/chosen": -347.59478759765625, + "logps/rejected": -306.43133544921875, + "loss": 0.416, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9750378131866455, + "rewards/margins": 3.0040977001190186, + "rewards/rejected": -4.979135513305664, + "step": 11160 + }, + { + "epoch": 0.36431475099999183, + "grad_norm": 3.5555195808410645, + "learning_rate": 4.393282714721761e-05, + "logits/chosen": 3.609480619430542, + "logits/rejected": 3.6080756187438965, + "logps/chosen": -334.3541564941406, + "logps/rejected": -350.05426025390625, + "loss": 0.6314, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1083390712738037, + "rewards/margins": 2.657348155975342, + "rewards/rejected": -4.765686988830566, + "step": 11180 + }, + { + "epoch": 0.3649664768515124, + "grad_norm": 1.299296259880066, + "learning_rate": 4.392196478421916e-05, + "logits/chosen": 3.908452272415161, + "logits/rejected": 3.9536213874816895, + "logps/chosen": -331.8665771484375, + "logps/rejected": -310.02130126953125, + "loss": 0.3505, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2192894220352173, + "rewards/margins": 3.616054058074951, + "rewards/rejected": -4.835343360900879, + "step": 11200 + }, + { + "epoch": 0.36561820270303297, + "grad_norm": 3.7963221073150635, + "learning_rate": 4.3911102421220715e-05, + "logits/chosen": 3.349621534347534, + "logits/rejected": 3.656632900238037, + "logps/chosen": -320.89764404296875, + "logps/rejected": -295.4831848144531, + "loss": 0.3595, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4361002445220947, + "rewards/margins": 3.0685856342315674, + "rewards/rejected": -4.504685878753662, + "step": 11220 + }, + { + "epoch": 0.36626992855455354, + "grad_norm": 4.257942199707031, + "learning_rate": 4.3900240058222266e-05, + "logits/chosen": 3.5627522468566895, + "logits/rejected": 3.9327194690704346, + "logps/chosen": -321.5350036621094, + "logps/rejected": -322.82672119140625, + "loss": 0.4117, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8323482275009155, + "rewards/margins": 3.0803723335266113, + "rewards/rejected": -4.912720680236816, + "step": 11240 + }, + { + "epoch": 0.3669216544060741, + "grad_norm": 0.5033763647079468, + "learning_rate": 4.3889377695223824e-05, + "logits/chosen": 3.205371856689453, + "logits/rejected": 3.3783702850341797, + "logps/chosen": -321.47137451171875, + "logps/rejected": -308.9685363769531, + "loss": 0.4378, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6460357904434204, + "rewards/margins": 2.6270785331726074, + "rewards/rejected": -4.2731146812438965, + "step": 11260 + }, + { + "epoch": 0.3675733802575946, + "grad_norm": 1.3632124662399292, + "learning_rate": 4.3878515332225375e-05, + "logits/chosen": 3.825157880783081, + "logits/rejected": 3.992839813232422, + "logps/chosen": -387.4232177734375, + "logps/rejected": -328.17205810546875, + "loss": 0.4476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.304898262023926, + "rewards/margins": 3.0159971714019775, + "rewards/rejected": -5.320895195007324, + "step": 11280 + }, + { + "epoch": 0.3682251061091152, + "grad_norm": 1.7311369180679321, + "learning_rate": 4.386765296922693e-05, + "logits/chosen": 3.5802974700927734, + "logits/rejected": 3.8497061729431152, + "logps/chosen": -356.25714111328125, + "logps/rejected": -334.4027404785156, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2311758995056152, + "rewards/margins": 2.0191073417663574, + "rewards/rejected": -4.250283241271973, + "step": 11300 + }, + { + "epoch": 0.36887683196063575, + "grad_norm": 4.344323635101318, + "learning_rate": 4.385679060622848e-05, + "logits/chosen": 3.8088767528533936, + "logits/rejected": 3.799041748046875, + "logps/chosen": -335.0738830566406, + "logps/rejected": -350.8710021972656, + "loss": 0.4018, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.582035541534424, + "rewards/margins": 2.9863991737365723, + "rewards/rejected": -5.568434715270996, + "step": 11320 + }, + { + "epoch": 0.3695285578121563, + "grad_norm": 0.23493985831737518, + "learning_rate": 4.3845928243230034e-05, + "logits/chosen": 3.4453048706054688, + "logits/rejected": 3.553401231765747, + "logps/chosen": -343.9291076660156, + "logps/rejected": -320.41583251953125, + "loss": 0.4027, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1075358390808105, + "rewards/margins": 3.1418559551239014, + "rewards/rejected": -5.249392032623291, + "step": 11340 + }, + { + "epoch": 0.3701802836636769, + "grad_norm": 11.646072387695312, + "learning_rate": 4.3835065880231584e-05, + "logits/chosen": 3.737787961959839, + "logits/rejected": 3.876941204071045, + "logps/chosen": -353.13201904296875, + "logps/rejected": -346.79229736328125, + "loss": 0.3153, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.76946222782135, + "rewards/margins": 3.531555652618408, + "rewards/rejected": -5.3010172843933105, + "step": 11360 + }, + { + "epoch": 0.37083200951519746, + "grad_norm": 3.6672537326812744, + "learning_rate": 4.382420351723314e-05, + "logits/chosen": 3.6219489574432373, + "logits/rejected": 3.772390842437744, + "logps/chosen": -366.1280822753906, + "logps/rejected": -320.8765563964844, + "loss": 0.5605, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9970061779022217, + "rewards/margins": 2.186584949493408, + "rewards/rejected": -4.183590888977051, + "step": 11380 + }, + { + "epoch": 0.37148373536671797, + "grad_norm": 0.5019899606704712, + "learning_rate": 4.381334115423469e-05, + "logits/chosen": 3.775416851043701, + "logits/rejected": 3.9785194396972656, + "logps/chosen": -352.6353454589844, + "logps/rejected": -325.33953857421875, + "loss": 0.3495, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9451528787612915, + "rewards/margins": 2.7914023399353027, + "rewards/rejected": -4.736555099487305, + "step": 11400 + }, + { + "epoch": 0.37213546121823854, + "grad_norm": 1.7872065305709839, + "learning_rate": 4.3802478791236244e-05, + "logits/chosen": 3.6935863494873047, + "logits/rejected": 4.112163066864014, + "logps/chosen": -326.78912353515625, + "logps/rejected": -325.4741516113281, + "loss": 0.39, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5403411388397217, + "rewards/margins": 3.005526065826416, + "rewards/rejected": -4.545867443084717, + "step": 11420 + }, + { + "epoch": 0.3727871870697591, + "grad_norm": 4.543249607086182, + "learning_rate": 4.37916164282378e-05, + "logits/chosen": 3.7800159454345703, + "logits/rejected": 3.989711284637451, + "logps/chosen": -397.1561279296875, + "logps/rejected": -357.5032653808594, + "loss": 0.4428, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.242290735244751, + "rewards/margins": 3.022083282470703, + "rewards/rejected": -5.264374256134033, + "step": 11440 + }, + { + "epoch": 0.3734389129212797, + "grad_norm": 3.0298657417297363, + "learning_rate": 4.378075406523935e-05, + "logits/chosen": 3.813077449798584, + "logits/rejected": 4.137297630310059, + "logps/chosen": -344.75177001953125, + "logps/rejected": -305.33575439453125, + "loss": 0.5004, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7003045082092285, + "rewards/margins": 3.094273090362549, + "rewards/rejected": -5.794577121734619, + "step": 11460 + }, + { + "epoch": 0.37409063877280024, + "grad_norm": 10.251382827758789, + "learning_rate": 4.37698917022409e-05, + "logits/chosen": 3.455350399017334, + "logits/rejected": 3.74426007270813, + "logps/chosen": -364.74786376953125, + "logps/rejected": -312.1545715332031, + "loss": 0.4746, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7249152660369873, + "rewards/margins": 3.0114927291870117, + "rewards/rejected": -5.736408710479736, + "step": 11480 + }, + { + "epoch": 0.37474236462432076, + "grad_norm": 4.280351638793945, + "learning_rate": 4.375902933924246e-05, + "logits/chosen": 3.369277238845825, + "logits/rejected": 3.612431049346924, + "logps/chosen": -322.2083435058594, + "logps/rejected": -313.2590637207031, + "loss": 0.5116, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.205143451690674, + "rewards/margins": 2.8384108543395996, + "rewards/rejected": -5.043554782867432, + "step": 11500 + }, + { + "epoch": 0.3753940904758413, + "grad_norm": 0.6680411696434021, + "learning_rate": 4.374816697624402e-05, + "logits/chosen": 3.7566096782684326, + "logits/rejected": 3.8281478881835938, + "logps/chosen": -363.98663330078125, + "logps/rejected": -332.2403869628906, + "loss": 0.6035, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1790497303009033, + "rewards/margins": 2.5820538997650146, + "rewards/rejected": -4.761104106903076, + "step": 11520 + }, + { + "epoch": 0.3760458163273619, + "grad_norm": 0.9481142163276672, + "learning_rate": 4.373730461324557e-05, + "logits/chosen": 3.9847359657287598, + "logits/rejected": 4.125492095947266, + "logps/chosen": -381.23284912109375, + "logps/rejected": -367.43463134765625, + "loss": 0.5233, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9268887042999268, + "rewards/margins": 2.6301498413085938, + "rewards/rejected": -4.557038307189941, + "step": 11540 + }, + { + "epoch": 0.37669754217888246, + "grad_norm": 3.221562147140503, + "learning_rate": 4.372644225024712e-05, + "logits/chosen": 3.9760735034942627, + "logits/rejected": 4.249410152435303, + "logps/chosen": -383.7716369628906, + "logps/rejected": -321.61126708984375, + "loss": 0.3367, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.924197793006897, + "rewards/margins": 2.847888231277466, + "rewards/rejected": -4.772085666656494, + "step": 11560 + }, + { + "epoch": 0.37734926803040303, + "grad_norm": 6.362435817718506, + "learning_rate": 4.371557988724868e-05, + "logits/chosen": 3.8422343730926514, + "logits/rejected": 4.054407119750977, + "logps/chosen": -357.95269775390625, + "logps/rejected": -326.50067138671875, + "loss": 0.3823, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9230693578720093, + "rewards/margins": 3.4766299724578857, + "rewards/rejected": -5.3996992111206055, + "step": 11580 + }, + { + "epoch": 0.37800099388192354, + "grad_norm": 0.9828212857246399, + "learning_rate": 4.370471752425023e-05, + "logits/chosen": 3.722757339477539, + "logits/rejected": 3.7616779804229736, + "logps/chosen": -407.75787353515625, + "logps/rejected": -332.3234558105469, + "loss": 0.381, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3517346382141113, + "rewards/margins": 3.2201309204101562, + "rewards/rejected": -5.571866035461426, + "step": 11600 + }, + { + "epoch": 0.3786527197334441, + "grad_norm": 3.371204137802124, + "learning_rate": 4.369385516125178e-05, + "logits/chosen": 3.6887245178222656, + "logits/rejected": 3.8789920806884766, + "logps/chosen": -353.6229553222656, + "logps/rejected": -339.0666809082031, + "loss": 0.705, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.939401626586914, + "rewards/margins": 2.8989806175231934, + "rewards/rejected": -4.838382244110107, + "step": 11620 + }, + { + "epoch": 0.3793044455849647, + "grad_norm": 2.214597702026367, + "learning_rate": 4.3682992798253336e-05, + "logits/chosen": 3.7726123332977295, + "logits/rejected": 3.8387837409973145, + "logps/chosen": -336.7052307128906, + "logps/rejected": -306.5196838378906, + "loss": 0.4058, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.412620782852173, + "rewards/margins": 2.9142582416534424, + "rewards/rejected": -5.326879024505615, + "step": 11640 + }, + { + "epoch": 0.37995617143648525, + "grad_norm": 0.484022855758667, + "learning_rate": 4.367213043525489e-05, + "logits/chosen": 3.127805233001709, + "logits/rejected": 3.3845584392547607, + "logps/chosen": -293.6646423339844, + "logps/rejected": -318.4399108886719, + "loss": 0.2888, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.265531063079834, + "rewards/margins": 3.841726779937744, + "rewards/rejected": -6.107256889343262, + "step": 11660 + }, + { + "epoch": 0.3806078972880058, + "grad_norm": 1.8068422079086304, + "learning_rate": 4.366126807225644e-05, + "logits/chosen": 3.1855099201202393, + "logits/rejected": 3.4403927326202393, + "logps/chosen": -319.81048583984375, + "logps/rejected": -313.4847106933594, + "loss": 0.6684, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9998228549957275, + "rewards/margins": 3.270688533782959, + "rewards/rejected": -5.270511150360107, + "step": 11680 + }, + { + "epoch": 0.3812596231395264, + "grad_norm": 3.388317823410034, + "learning_rate": 4.3650405709257995e-05, + "logits/chosen": 3.487112045288086, + "logits/rejected": 3.7669601440429688, + "logps/chosen": -331.60919189453125, + "logps/rejected": -305.4278259277344, + "loss": 0.3717, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4534156322479248, + "rewards/margins": 2.709855794906616, + "rewards/rejected": -4.163271427154541, + "step": 11700 + }, + { + "epoch": 0.3819113489910469, + "grad_norm": 4.294909954071045, + "learning_rate": 4.364008646440947e-05, + "logits/chosen": 3.6298797130584717, + "logits/rejected": 3.7997002601623535, + "logps/chosen": -325.08685302734375, + "logps/rejected": -302.00042724609375, + "loss": 0.4662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5524864196777344, + "rewards/margins": 3.5389163494110107, + "rewards/rejected": -5.091403007507324, + "step": 11720 + }, + { + "epoch": 0.38256307484256746, + "grad_norm": 2.7950756549835205, + "learning_rate": 4.362922410141102e-05, + "logits/chosen": 3.650538682937622, + "logits/rejected": 3.8843655586242676, + "logps/chosen": -317.78607177734375, + "logps/rejected": -318.7015380859375, + "loss": 0.5648, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7471939325332642, + "rewards/margins": 2.281681537628174, + "rewards/rejected": -4.028875827789307, + "step": 11740 + }, + { + "epoch": 0.38321480069408803, + "grad_norm": 1.0001245737075806, + "learning_rate": 4.3618361738412575e-05, + "logits/chosen": 3.666996717453003, + "logits/rejected": 3.6402275562286377, + "logps/chosen": -368.7359619140625, + "logps/rejected": -327.90960693359375, + "loss": 0.3676, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2722398042678833, + "rewards/margins": 2.988297939300537, + "rewards/rejected": -4.260537624359131, + "step": 11760 + }, + { + "epoch": 0.3838665265456086, + "grad_norm": 3.948903799057007, + "learning_rate": 4.3607499375414126e-05, + "logits/chosen": 3.7209277153015137, + "logits/rejected": 3.932636260986328, + "logps/chosen": -361.9315185546875, + "logps/rejected": -346.66864013671875, + "loss": 0.7072, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7604142427444458, + "rewards/margins": 2.109536647796631, + "rewards/rejected": -2.869950771331787, + "step": 11780 + }, + { + "epoch": 0.38451825239712917, + "grad_norm": 0.3782423436641693, + "learning_rate": 4.3596637012415683e-05, + "logits/chosen": 3.8746604919433594, + "logits/rejected": 3.9157156944274902, + "logps/chosen": -326.90106201171875, + "logps/rejected": -283.24676513671875, + "loss": 0.3221, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8501815795898438, + "rewards/margins": 2.7029895782470703, + "rewards/rejected": -3.5531716346740723, + "step": 11800 + }, + { + "epoch": 0.3851699782486497, + "grad_norm": 6.061784267425537, + "learning_rate": 4.358577464941724e-05, + "logits/chosen": 3.583934783935547, + "logits/rejected": 3.800752639770508, + "logps/chosen": -307.4776306152344, + "logps/rejected": -315.40545654296875, + "loss": 0.4815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9976637959480286, + "rewards/margins": 2.117410898208618, + "rewards/rejected": -3.115074634552002, + "step": 11820 + }, + { + "epoch": 0.38582170410017025, + "grad_norm": 3.8281285762786865, + "learning_rate": 4.357491228641879e-05, + "logits/chosen": 3.62870454788208, + "logits/rejected": 3.616166591644287, + "logps/chosen": -363.96356201171875, + "logps/rejected": -299.6015625, + "loss": 0.5734, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1695754528045654, + "rewards/margins": 2.3099329471588135, + "rewards/rejected": -3.4795081615448, + "step": 11840 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 1.3454927206039429, + "learning_rate": 4.356404992342034e-05, + "logits/chosen": 3.2998619079589844, + "logits/rejected": 3.304891586303711, + "logps/chosen": -312.02813720703125, + "logps/rejected": -287.2205505371094, + "loss": 0.5023, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4448611736297607, + "rewards/margins": 2.236936569213867, + "rewards/rejected": -3.681797504425049, + "step": 11860 + }, + { + "epoch": 0.3871251558032114, + "grad_norm": 1.407904863357544, + "learning_rate": 4.35531875604219e-05, + "logits/chosen": 3.1892569065093994, + "logits/rejected": 3.519528865814209, + "logps/chosen": -321.7034912109375, + "logps/rejected": -307.32305908203125, + "loss": 0.3908, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1418211460113525, + "rewards/margins": 2.8386762142181396, + "rewards/rejected": -3.9804978370666504, + "step": 11880 + }, + { + "epoch": 0.38777688165473195, + "grad_norm": 2.998427391052246, + "learning_rate": 4.354232519742345e-05, + "logits/chosen": 3.701869249343872, + "logits/rejected": 3.8162097930908203, + "logps/chosen": -339.84051513671875, + "logps/rejected": -296.3586120605469, + "loss": 0.5597, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.999900221824646, + "rewards/margins": 2.482713222503662, + "rewards/rejected": -4.482613563537598, + "step": 11900 + }, + { + "epoch": 0.3884286075062525, + "grad_norm": 52.38420104980469, + "learning_rate": 4.3531462834425e-05, + "logits/chosen": 3.7470669746398926, + "logits/rejected": 3.8995914459228516, + "logps/chosen": -404.9337158203125, + "logps/rejected": -306.3583068847656, + "loss": 0.7205, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.141275644302368, + "rewards/margins": 2.8631486892700195, + "rewards/rejected": -5.004425048828125, + "step": 11920 + }, + { + "epoch": 0.38908033335777303, + "grad_norm": 0.9208816885948181, + "learning_rate": 4.352060047142655e-05, + "logits/chosen": 3.607513904571533, + "logits/rejected": 3.816890239715576, + "logps/chosen": -339.70880126953125, + "logps/rejected": -297.3851623535156, + "loss": 0.4272, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9037824869155884, + "rewards/margins": 2.8777241706848145, + "rewards/rejected": -3.7815067768096924, + "step": 11940 + }, + { + "epoch": 0.3897320592092936, + "grad_norm": 2.300916910171509, + "learning_rate": 4.350973810842811e-05, + "logits/chosen": 3.5310397148132324, + "logits/rejected": 3.44709849357605, + "logps/chosen": -329.5317077636719, + "logps/rejected": -338.1036682128906, + "loss": 0.5184, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8676782846450806, + "rewards/margins": 3.2898285388946533, + "rewards/rejected": -4.157506465911865, + "step": 11960 + }, + { + "epoch": 0.39038378506081417, + "grad_norm": 3.7314047813415527, + "learning_rate": 4.349887574542966e-05, + "logits/chosen": 3.535844326019287, + "logits/rejected": 3.79262113571167, + "logps/chosen": -328.71429443359375, + "logps/rejected": -313.13311767578125, + "loss": 0.3356, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6331891417503357, + "rewards/margins": 2.9136581420898438, + "rewards/rejected": -3.546847105026245, + "step": 11980 + }, + { + "epoch": 0.39103551091233474, + "grad_norm": 2.395857810974121, + "learning_rate": 4.348801338243121e-05, + "logits/chosen": 3.5986411571502686, + "logits/rejected": 3.783700466156006, + "logps/chosen": -342.5739440917969, + "logps/rejected": -331.14935302734375, + "loss": 0.4474, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6636075973510742, + "rewards/margins": 2.455627679824829, + "rewards/rejected": -4.119235038757324, + "step": 12000 + }, + { + "epoch": 0.3916872367638553, + "grad_norm": 3.562678575515747, + "learning_rate": 4.347715101943277e-05, + "logits/chosen": 3.3304316997528076, + "logits/rejected": 3.4099221229553223, + "logps/chosen": -324.8190612792969, + "logps/rejected": -303.53021240234375, + "loss": 0.527, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7376067638397217, + "rewards/margins": 2.965181827545166, + "rewards/rejected": -4.702788352966309, + "step": 12020 + }, + { + "epoch": 0.3923389626153758, + "grad_norm": 1.2083781957626343, + "learning_rate": 4.346628865643432e-05, + "logits/chosen": 3.69508695602417, + "logits/rejected": 3.688105821609497, + "logps/chosen": -387.6549072265625, + "logps/rejected": -389.7777404785156, + "loss": 0.4023, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6325773000717163, + "rewards/margins": 3.7430038452148438, + "rewards/rejected": -5.375580787658691, + "step": 12040 + }, + { + "epoch": 0.3929906884668964, + "grad_norm": 1.6576818227767944, + "learning_rate": 4.345542629343588e-05, + "logits/chosen": 3.3665542602539062, + "logits/rejected": 3.418402910232544, + "logps/chosen": -301.6120300292969, + "logps/rejected": -343.20086669921875, + "loss": 0.331, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2049729824066162, + "rewards/margins": 3.173840284347534, + "rewards/rejected": -4.37881326675415, + "step": 12060 + }, + { + "epoch": 0.39364241431841696, + "grad_norm": 1.357805848121643, + "learning_rate": 4.3444563930437435e-05, + "logits/chosen": 3.3910725116729736, + "logits/rejected": 3.2504799365997314, + "logps/chosen": -317.71795654296875, + "logps/rejected": -327.16748046875, + "loss": 0.6584, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7183316946029663, + "rewards/margins": 2.3167693614959717, + "rewards/rejected": -4.035101413726807, + "step": 12080 + }, + { + "epoch": 0.3942941401699375, + "grad_norm": 2.467449188232422, + "learning_rate": 4.3433701567438986e-05, + "logits/chosen": 3.716259002685547, + "logits/rejected": 3.8568034172058105, + "logps/chosen": -366.54449462890625, + "logps/rejected": -354.77691650390625, + "loss": 0.3008, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9387531280517578, + "rewards/margins": 3.40385103225708, + "rewards/rejected": -5.342604160308838, + "step": 12100 + }, + { + "epoch": 0.3949458660214581, + "grad_norm": 2.3277125358581543, + "learning_rate": 4.342283920444054e-05, + "logits/chosen": 3.5312983989715576, + "logits/rejected": 3.620636463165283, + "logps/chosen": -368.9273986816406, + "logps/rejected": -338.83331298828125, + "loss": 0.6238, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8713430166244507, + "rewards/margins": 2.684049367904663, + "rewards/rejected": -4.555392265319824, + "step": 12120 + }, + { + "epoch": 0.3955975918729786, + "grad_norm": 2.556208848953247, + "learning_rate": 4.341197684144209e-05, + "logits/chosen": 3.3417441844940186, + "logits/rejected": 3.595874786376953, + "logps/chosen": -334.4649353027344, + "logps/rejected": -311.868896484375, + "loss": 0.3705, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6837915182113647, + "rewards/margins": 3.3181090354919434, + "rewards/rejected": -5.001900672912598, + "step": 12140 + }, + { + "epoch": 0.3962493177244992, + "grad_norm": 1.9798723459243774, + "learning_rate": 4.3401114478443645e-05, + "logits/chosen": 3.481067657470703, + "logits/rejected": 3.5311756134033203, + "logps/chosen": -329.17816162109375, + "logps/rejected": -320.260009765625, + "loss": 0.3099, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4416725635528564, + "rewards/margins": 3.336735963821411, + "rewards/rejected": -4.778409004211426, + "step": 12160 + }, + { + "epoch": 0.39690104357601974, + "grad_norm": 0.5063221454620361, + "learning_rate": 4.3390252115445196e-05, + "logits/chosen": 3.252244472503662, + "logits/rejected": 3.3688480854034424, + "logps/chosen": -318.74578857421875, + "logps/rejected": -317.72650146484375, + "loss": 0.2936, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.965614676475525, + "rewards/margins": 3.103017807006836, + "rewards/rejected": -5.06863260269165, + "step": 12180 + }, + { + "epoch": 0.3975527694275403, + "grad_norm": 8.702651977539062, + "learning_rate": 4.3379389752446747e-05, + "logits/chosen": 3.656607151031494, + "logits/rejected": 3.7690608501434326, + "logps/chosen": -432.197265625, + "logps/rejected": -369.5443420410156, + "loss": 0.5757, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.105114459991455, + "rewards/margins": 3.060868740081787, + "rewards/rejected": -5.165982723236084, + "step": 12200 + }, + { + "epoch": 0.3982044952790609, + "grad_norm": 4.528327941894531, + "learning_rate": 4.3368527389448304e-05, + "logits/chosen": 3.3902289867401123, + "logits/rejected": 3.6993534564971924, + "logps/chosen": -296.9540710449219, + "logps/rejected": -311.8495178222656, + "loss": 0.5669, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4054691791534424, + "rewards/margins": 3.210843563079834, + "rewards/rejected": -4.6163129806518555, + "step": 12220 + }, + { + "epoch": 0.39885622113058145, + "grad_norm": 0.12198824435472488, + "learning_rate": 4.3357665026449855e-05, + "logits/chosen": 3.6999282836914062, + "logits/rejected": 3.8272957801818848, + "logps/chosen": -372.19140625, + "logps/rejected": -304.32012939453125, + "loss": 0.4199, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1408491134643555, + "rewards/margins": 3.279883623123169, + "rewards/rejected": -4.4207329750061035, + "step": 12240 + }, + { + "epoch": 0.39950794698210196, + "grad_norm": 1.688442349433899, + "learning_rate": 4.3346802663451406e-05, + "logits/chosen": 3.313678026199341, + "logits/rejected": 3.4257469177246094, + "logps/chosen": -288.9721984863281, + "logps/rejected": -310.8792724609375, + "loss": 0.6929, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9820258617401123, + "rewards/margins": 2.2608675956726074, + "rewards/rejected": -4.242894172668457, + "step": 12260 + }, + { + "epoch": 0.4001596728336225, + "grad_norm": 0.7567421197891235, + "learning_rate": 4.333594030045296e-05, + "logits/chosen": 3.885843276977539, + "logits/rejected": 3.9555416107177734, + "logps/chosen": -373.3453369140625, + "logps/rejected": -339.5411071777344, + "loss": 0.4488, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.523321509361267, + "rewards/margins": 2.743464708328247, + "rewards/rejected": -4.266785621643066, + "step": 12280 + }, + { + "epoch": 0.4008113986851431, + "grad_norm": 2.330420732498169, + "learning_rate": 4.3325077937454514e-05, + "logits/chosen": 3.7873566150665283, + "logits/rejected": 3.8886361122131348, + "logps/chosen": -368.4349365234375, + "logps/rejected": -333.76300048828125, + "loss": 0.4945, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4932124614715576, + "rewards/margins": 2.5369503498077393, + "rewards/rejected": -4.030163288116455, + "step": 12300 + }, + { + "epoch": 0.40146312453666366, + "grad_norm": 6.24009895324707, + "learning_rate": 4.331421557445607e-05, + "logits/chosen": 3.584535598754883, + "logits/rejected": 3.628854274749756, + "logps/chosen": -365.5126037597656, + "logps/rejected": -334.57196044921875, + "loss": 0.5288, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3033699989318848, + "rewards/margins": 3.5408778190612793, + "rewards/rejected": -5.844247341156006, + "step": 12320 + }, + { + "epoch": 0.40211485038818423, + "grad_norm": 1.7187838554382324, + "learning_rate": 4.330335321145762e-05, + "logits/chosen": 3.3652031421661377, + "logits/rejected": 3.484830379486084, + "logps/chosen": -381.9066467285156, + "logps/rejected": -319.69036865234375, + "loss": 0.2946, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.075692653656006, + "rewards/margins": 3.1289806365966797, + "rewards/rejected": -5.204672813415527, + "step": 12340 + }, + { + "epoch": 0.40276657623970474, + "grad_norm": 1.7886536121368408, + "learning_rate": 4.329249084845918e-05, + "logits/chosen": 3.765507221221924, + "logits/rejected": 4.070296287536621, + "logps/chosen": -339.2215881347656, + "logps/rejected": -329.38531494140625, + "loss": 0.6661, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.400078058242798, + "rewards/margins": 2.471862554550171, + "rewards/rejected": -4.871940612792969, + "step": 12360 + }, + { + "epoch": 0.4034183020912253, + "grad_norm": 1.859203815460205, + "learning_rate": 4.328162848546073e-05, + "logits/chosen": 3.692924976348877, + "logits/rejected": 3.803520917892456, + "logps/chosen": -371.83428955078125, + "logps/rejected": -342.41656494140625, + "loss": 0.5622, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9551122188568115, + "rewards/margins": 2.7755026817321777, + "rewards/rejected": -4.73061466217041, + "step": 12380 + }, + { + "epoch": 0.4040700279427459, + "grad_norm": 0.7780736088752747, + "learning_rate": 4.327076612246228e-05, + "logits/chosen": 3.633056640625, + "logits/rejected": 3.700576066970825, + "logps/chosen": -331.47857666015625, + "logps/rejected": -339.78826904296875, + "loss": 0.6789, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.919992208480835, + "rewards/margins": 2.7759547233581543, + "rewards/rejected": -4.695947170257568, + "step": 12400 + }, + { + "epoch": 0.40472175379426645, + "grad_norm": 4.111178874969482, + "learning_rate": 4.325990375946384e-05, + "logits/chosen": 3.6599929332733154, + "logits/rejected": 3.9432406425476074, + "logps/chosen": -389.91375732421875, + "logps/rejected": -409.49407958984375, + "loss": 0.4445, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.4134058952331543, + "rewards/margins": 3.342170238494873, + "rewards/rejected": -5.755576133728027, + "step": 12420 + }, + { + "epoch": 0.405373479645787, + "grad_norm": 1.2061026096343994, + "learning_rate": 4.324904139646539e-05, + "logits/chosen": 3.7934257984161377, + "logits/rejected": 3.782278537750244, + "logps/chosen": -365.1275939941406, + "logps/rejected": -323.4488525390625, + "loss": 0.611, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3127377033233643, + "rewards/margins": 2.193033218383789, + "rewards/rejected": -4.505771636962891, + "step": 12440 + }, + { + "epoch": 0.4060252054973076, + "grad_norm": 5.347169876098633, + "learning_rate": 4.323817903346694e-05, + "logits/chosen": 3.702843427658081, + "logits/rejected": 4.023158550262451, + "logps/chosen": -407.31939697265625, + "logps/rejected": -345.5549621582031, + "loss": 0.6737, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.553128480911255, + "rewards/margins": 2.649874210357666, + "rewards/rejected": -5.203002452850342, + "step": 12460 + }, + { + "epoch": 0.4066769313488281, + "grad_norm": 17.181686401367188, + "learning_rate": 4.32273166704685e-05, + "logits/chosen": 3.618859052658081, + "logits/rejected": 3.7493560314178467, + "logps/chosen": -273.17474365234375, + "logps/rejected": -308.8026428222656, + "loss": 0.5604, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2258973121643066, + "rewards/margins": 2.0935440063476562, + "rewards/rejected": -4.319440841674805, + "step": 12480 + }, + { + "epoch": 0.40732865720034866, + "grad_norm": 4.199854850769043, + "learning_rate": 4.321645430747005e-05, + "logits/chosen": 4.183228015899658, + "logits/rejected": 4.189120292663574, + "logps/chosen": -400.6385803222656, + "logps/rejected": -352.4070739746094, + "loss": 0.7179, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.478715419769287, + "rewards/margins": 1.5638701915740967, + "rewards/rejected": -4.042585849761963, + "step": 12500 + }, + { + "epoch": 0.40798038305186923, + "grad_norm": 1.345800518989563, + "learning_rate": 4.32055919444716e-05, + "logits/chosen": 4.056609153747559, + "logits/rejected": 4.055568218231201, + "logps/chosen": -307.87164306640625, + "logps/rejected": -271.30804443359375, + "loss": 0.519, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7688701152801514, + "rewards/margins": 2.1513478755950928, + "rewards/rejected": -3.920217990875244, + "step": 12520 + }, + { + "epoch": 0.4086321089033898, + "grad_norm": 2.1254355907440186, + "learning_rate": 4.319472958147315e-05, + "logits/chosen": 4.027743816375732, + "logits/rejected": 4.197320938110352, + "logps/chosen": -395.037109375, + "logps/rejected": -316.27740478515625, + "loss": 0.2931, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5805447101593018, + "rewards/margins": 2.816080093383789, + "rewards/rejected": -4.39662504196167, + "step": 12540 + }, + { + "epoch": 0.40928383475491037, + "grad_norm": 0.39576083421707153, + "learning_rate": 4.318386721847471e-05, + "logits/chosen": 4.022760391235352, + "logits/rejected": 4.007199287414551, + "logps/chosen": -346.9942321777344, + "logps/rejected": -292.81640625, + "loss": 0.2866, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0609064102172852, + "rewards/margins": 3.1145291328430176, + "rewards/rejected": -4.1754350662231445, + "step": 12560 + }, + { + "epoch": 0.4099355606064309, + "grad_norm": 33.38279342651367, + "learning_rate": 4.3173004855476266e-05, + "logits/chosen": 3.7020771503448486, + "logits/rejected": 3.897590160369873, + "logps/chosen": -383.10968017578125, + "logps/rejected": -384.3697509765625, + "loss": 0.563, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4304969310760498, + "rewards/margins": 3.001713514328003, + "rewards/rejected": -4.432210922241211, + "step": 12580 + }, + { + "epoch": 0.41058728645795145, + "grad_norm": 0.8287296891212463, + "learning_rate": 4.3162142492477816e-05, + "logits/chosen": 3.8498549461364746, + "logits/rejected": 3.9235873222351074, + "logps/chosen": -382.37713623046875, + "logps/rejected": -340.83551025390625, + "loss": 0.4039, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.07934308052063, + "rewards/margins": 2.8078789710998535, + "rewards/rejected": -4.8872222900390625, + "step": 12600 + }, + { + "epoch": 0.411239012309472, + "grad_norm": 3.9638593196868896, + "learning_rate": 4.3151280129479374e-05, + "logits/chosen": 3.627500057220459, + "logits/rejected": 3.727013349533081, + "logps/chosen": -328.94696044921875, + "logps/rejected": -334.8876037597656, + "loss": 0.535, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4072186946868896, + "rewards/margins": 2.8845784664154053, + "rewards/rejected": -5.291797161102295, + "step": 12620 + }, + { + "epoch": 0.4118907381609926, + "grad_norm": 1.6275748014450073, + "learning_rate": 4.3140417766480925e-05, + "logits/chosen": 3.5295958518981934, + "logits/rejected": 3.768634796142578, + "logps/chosen": -359.6811828613281, + "logps/rejected": -326.81036376953125, + "loss": 0.4548, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7886927127838135, + "rewards/margins": 2.952305316925049, + "rewards/rejected": -4.740997791290283, + "step": 12640 + }, + { + "epoch": 0.41254246401251315, + "grad_norm": 0.8576197028160095, + "learning_rate": 4.3129555403482476e-05, + "logits/chosen": 3.591726779937744, + "logits/rejected": 3.6170907020568848, + "logps/chosen": -329.43609619140625, + "logps/rejected": -322.6855163574219, + "loss": 0.5467, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.162381649017334, + "rewards/margins": 2.5638041496276855, + "rewards/rejected": -4.7261857986450195, + "step": 12660 + }, + { + "epoch": 0.41319418986403367, + "grad_norm": 0.7612417936325073, + "learning_rate": 4.311869304048403e-05, + "logits/chosen": 3.7060794830322266, + "logits/rejected": 3.900871753692627, + "logps/chosen": -350.6959533691406, + "logps/rejected": -343.0794677734375, + "loss": 0.4865, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.122817277908325, + "rewards/margins": 2.4820590019226074, + "rewards/rejected": -4.604876518249512, + "step": 12680 + }, + { + "epoch": 0.41384591571555424, + "grad_norm": 1.5382996797561646, + "learning_rate": 4.3107830677485584e-05, + "logits/chosen": 3.6306564807891846, + "logits/rejected": 3.763108730316162, + "logps/chosen": -356.82489013671875, + "logps/rejected": -329.9605407714844, + "loss": 0.4522, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8433507680892944, + "rewards/margins": 2.834918260574341, + "rewards/rejected": -4.678268909454346, + "step": 12700 + }, + { + "epoch": 0.4144976415670748, + "grad_norm": 1.0853314399719238, + "learning_rate": 4.3096968314487135e-05, + "logits/chosen": 3.6563758850097656, + "logits/rejected": 3.921879529953003, + "logps/chosen": -311.17071533203125, + "logps/rejected": -328.08795166015625, + "loss": 0.4364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.585893988609314, + "rewards/margins": 3.1826303005218506, + "rewards/rejected": -4.768524169921875, + "step": 12720 + }, + { + "epoch": 0.41514936741859537, + "grad_norm": 0.26267069578170776, + "learning_rate": 4.3086105951488685e-05, + "logits/chosen": 3.466041088104248, + "logits/rejected": 3.7299110889434814, + "logps/chosen": -358.18463134765625, + "logps/rejected": -334.6571350097656, + "loss": 0.6016, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3241546154022217, + "rewards/margins": 2.3338286876678467, + "rewards/rejected": -3.6579830646514893, + "step": 12740 + }, + { + "epoch": 0.41580109327011594, + "grad_norm": 0.1843084841966629, + "learning_rate": 4.307524358849024e-05, + "logits/chosen": 3.314426898956299, + "logits/rejected": 3.551971912384033, + "logps/chosen": -339.34405517578125, + "logps/rejected": -340.37567138671875, + "loss": 0.3123, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9765422940254211, + "rewards/margins": 3.1794190406799316, + "rewards/rejected": -4.155961036682129, + "step": 12760 + }, + { + "epoch": 0.4164528191216365, + "grad_norm": 2.430861473083496, + "learning_rate": 4.3064381225491794e-05, + "logits/chosen": 3.5076904296875, + "logits/rejected": 3.6386218070983887, + "logps/chosen": -337.60699462890625, + "logps/rejected": -360.49688720703125, + "loss": 0.6063, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8016231060028076, + "rewards/margins": 2.8846347332000732, + "rewards/rejected": -5.686257362365723, + "step": 12780 + }, + { + "epoch": 0.417104544973157, + "grad_norm": 0.552589476108551, + "learning_rate": 4.3053518862493345e-05, + "logits/chosen": 3.4429099559783936, + "logits/rejected": 3.492760419845581, + "logps/chosen": -310.24371337890625, + "logps/rejected": -327.47003173828125, + "loss": 0.5086, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.98531973361969, + "rewards/margins": 2.4584875106811523, + "rewards/rejected": -4.443807125091553, + "step": 12800 + }, + { + "epoch": 0.4177562708246776, + "grad_norm": 6.234446048736572, + "learning_rate": 4.30426564994949e-05, + "logits/chosen": 3.815825939178467, + "logits/rejected": 3.957737684249878, + "logps/chosen": -321.2431335449219, + "logps/rejected": -339.06329345703125, + "loss": 0.5152, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2018864154815674, + "rewards/margins": 2.1557302474975586, + "rewards/rejected": -4.357616424560547, + "step": 12820 + }, + { + "epoch": 0.41840799667619816, + "grad_norm": 2.6117823123931885, + "learning_rate": 4.303179413649645e-05, + "logits/chosen": 3.4432883262634277, + "logits/rejected": 3.648801803588867, + "logps/chosen": -342.73858642578125, + "logps/rejected": -311.27838134765625, + "loss": 0.5999, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8658899068832397, + "rewards/margins": 2.3472111225128174, + "rewards/rejected": -4.213100910186768, + "step": 12840 + }, + { + "epoch": 0.4190597225277187, + "grad_norm": 4.614185810089111, + "learning_rate": 4.302093177349801e-05, + "logits/chosen": 3.299344301223755, + "logits/rejected": 3.4229958057403564, + "logps/chosen": -361.4295959472656, + "logps/rejected": -333.4239196777344, + "loss": 0.2869, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3206312656402588, + "rewards/margins": 3.63513445854187, + "rewards/rejected": -4.955766201019287, + "step": 12860 + }, + { + "epoch": 0.4197114483792393, + "grad_norm": 2.2745704650878906, + "learning_rate": 4.301006941049956e-05, + "logits/chosen": 3.525524616241455, + "logits/rejected": 3.702922821044922, + "logps/chosen": -324.09234619140625, + "logps/rejected": -321.5767822265625, + "loss": 0.5156, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.461491107940674, + "rewards/margins": 2.5903408527374268, + "rewards/rejected": -5.051831245422363, + "step": 12880 + }, + { + "epoch": 0.4203631742307598, + "grad_norm": 1.3085801601409912, + "learning_rate": 4.299920704750112e-05, + "logits/chosen": 3.8288445472717285, + "logits/rejected": 3.7270150184631348, + "logps/chosen": -377.07568359375, + "logps/rejected": -309.7663269042969, + "loss": 0.6034, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3788716793060303, + "rewards/margins": 2.964334011077881, + "rewards/rejected": -4.34320592880249, + "step": 12900 + }, + { + "epoch": 0.4210149000822804, + "grad_norm": 0.4537847638130188, + "learning_rate": 4.298834468450267e-05, + "logits/chosen": 3.7291629314422607, + "logits/rejected": 3.817546844482422, + "logps/chosen": -387.5888671875, + "logps/rejected": -363.59027099609375, + "loss": 0.3507, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5874106884002686, + "rewards/margins": 3.349475383758545, + "rewards/rejected": -4.936886310577393, + "step": 12920 + }, + { + "epoch": 0.42166662593380094, + "grad_norm": 0.9246983528137207, + "learning_rate": 4.297748232150422e-05, + "logits/chosen": 3.8416874408721924, + "logits/rejected": 3.86517596244812, + "logps/chosen": -347.8785400390625, + "logps/rejected": -337.08984375, + "loss": 0.6956, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8227685689926147, + "rewards/margins": 2.4265637397766113, + "rewards/rejected": -4.249332427978516, + "step": 12940 + }, + { + "epoch": 0.4223183517853215, + "grad_norm": 2.7587714195251465, + "learning_rate": 4.296661995850578e-05, + "logits/chosen": 3.559530735015869, + "logits/rejected": 3.690886974334717, + "logps/chosen": -324.71759033203125, + "logps/rejected": -294.94207763671875, + "loss": 0.4504, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4720247983932495, + "rewards/margins": 2.496739149093628, + "rewards/rejected": -3.968763828277588, + "step": 12960 + }, + { + "epoch": 0.4229700776368421, + "grad_norm": 1.8480733633041382, + "learning_rate": 4.295575759550733e-05, + "logits/chosen": 3.2677714824676514, + "logits/rejected": 3.4362998008728027, + "logps/chosen": -342.03424072265625, + "logps/rejected": -317.2989196777344, + "loss": 0.3884, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2164740562438965, + "rewards/margins": 3.2077877521514893, + "rewards/rejected": -5.424261569976807, + "step": 12980 + }, + { + "epoch": 0.42362180348836265, + "grad_norm": 0.017314434051513672, + "learning_rate": 4.294489523250888e-05, + "logits/chosen": 3.5683302879333496, + "logits/rejected": 3.707947254180908, + "logps/chosen": -317.1650085449219, + "logps/rejected": -293.56060791015625, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2734057903289795, + "rewards/margins": 2.9396443367004395, + "rewards/rejected": -5.21304988861084, + "step": 13000 + }, + { + "epoch": 0.42427352933988316, + "grad_norm": 0.0646054819226265, + "learning_rate": 4.293403286951044e-05, + "logits/chosen": 3.567448377609253, + "logits/rejected": 3.8864524364471436, + "logps/chosen": -359.97686767578125, + "logps/rejected": -322.1175231933594, + "loss": 0.439, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6183210611343384, + "rewards/margins": 3.2060017585754395, + "rewards/rejected": -4.824322700500488, + "step": 13020 + }, + { + "epoch": 0.42492525519140373, + "grad_norm": 4.547952651977539, + "learning_rate": 4.292317050651199e-05, + "logits/chosen": 4.320982933044434, + "logits/rejected": 4.475030422210693, + "logps/chosen": -374.4197692871094, + "logps/rejected": -341.2873229980469, + "loss": 0.3237, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1541839838027954, + "rewards/margins": 3.0060198307037354, + "rewards/rejected": -4.16020393371582, + "step": 13040 + }, + { + "epoch": 0.4255769810429243, + "grad_norm": 2.146322727203369, + "learning_rate": 4.291230814351354e-05, + "logits/chosen": 3.73224139213562, + "logits/rejected": 3.862422466278076, + "logps/chosen": -333.84228515625, + "logps/rejected": -279.20855712890625, + "loss": 0.4656, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3819832801818848, + "rewards/margins": 2.54121732711792, + "rewards/rejected": -3.9232006072998047, + "step": 13060 + }, + { + "epoch": 0.42622870689444486, + "grad_norm": 1.59721040725708, + "learning_rate": 4.290144578051509e-05, + "logits/chosen": 4.079954624176025, + "logits/rejected": 4.063741683959961, + "logps/chosen": -371.37908935546875, + "logps/rejected": -352.6491394042969, + "loss": 0.3634, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.624230146408081, + "rewards/margins": 3.117866039276123, + "rewards/rejected": -4.742096424102783, + "step": 13080 + }, + { + "epoch": 0.42688043274596543, + "grad_norm": 2.9443321228027344, + "learning_rate": 4.289058341751665e-05, + "logits/chosen": 3.465580701828003, + "logits/rejected": 3.686521530151367, + "logps/chosen": -280.6343688964844, + "logps/rejected": -283.7994079589844, + "loss": 0.5293, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.199440836906433, + "rewards/margins": 2.3287596702575684, + "rewards/rejected": -3.528200626373291, + "step": 13100 + }, + { + "epoch": 0.42753215859748595, + "grad_norm": 3.1495652198791504, + "learning_rate": 4.2879721054518205e-05, + "logits/chosen": 3.6306400299072266, + "logits/rejected": 3.784029006958008, + "logps/chosen": -363.2959899902344, + "logps/rejected": -334.77337646484375, + "loss": 0.6185, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5566524267196655, + "rewards/margins": 2.0182344913482666, + "rewards/rejected": -3.5748867988586426, + "step": 13120 + }, + { + "epoch": 0.4281838844490065, + "grad_norm": 2.120983839035034, + "learning_rate": 4.2868858691519755e-05, + "logits/chosen": 4.009109020233154, + "logits/rejected": 4.07193660736084, + "logps/chosen": -345.17718505859375, + "logps/rejected": -338.0405578613281, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1824567317962646, + "rewards/margins": 3.3672873973846436, + "rewards/rejected": -4.549744606018066, + "step": 13140 + }, + { + "epoch": 0.4288356103005271, + "grad_norm": 8.427762031555176, + "learning_rate": 4.285799632852131e-05, + "logits/chosen": 3.654776096343994, + "logits/rejected": 3.800985336303711, + "logps/chosen": -342.7120666503906, + "logps/rejected": -311.67498779296875, + "loss": 0.4968, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9280378222465515, + "rewards/margins": 2.471269130706787, + "rewards/rejected": -3.3993067741394043, + "step": 13160 + }, + { + "epoch": 0.42948733615204765, + "grad_norm": 1.7667961120605469, + "learning_rate": 4.2847133965522864e-05, + "logits/chosen": 3.478361129760742, + "logits/rejected": 3.6397647857666016, + "logps/chosen": -358.45587158203125, + "logps/rejected": -303.935302734375, + "loss": 0.3975, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8232017755508423, + "rewards/margins": 3.349979877471924, + "rewards/rejected": -4.173181533813477, + "step": 13180 + }, + { + "epoch": 0.4301390620035682, + "grad_norm": 2.4327805042266846, + "learning_rate": 4.2836271602524414e-05, + "logits/chosen": 3.9604411125183105, + "logits/rejected": 4.144984245300293, + "logps/chosen": -377.28948974609375, + "logps/rejected": -356.50494384765625, + "loss": 0.3668, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5070197582244873, + "rewards/margins": 2.8016555309295654, + "rewards/rejected": -4.308675289154053, + "step": 13200 + }, + { + "epoch": 0.43079078785508873, + "grad_norm": 2.8595383167266846, + "learning_rate": 4.282540923952597e-05, + "logits/chosen": 3.8194942474365234, + "logits/rejected": 3.932110548019409, + "logps/chosen": -380.67523193359375, + "logps/rejected": -355.2630615234375, + "loss": 0.3855, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.602834701538086, + "rewards/margins": 2.9803130626678467, + "rewards/rejected": -4.583148002624512, + "step": 13220 + }, + { + "epoch": 0.4314425137066093, + "grad_norm": 1.6328562498092651, + "learning_rate": 4.281454687652752e-05, + "logits/chosen": 3.329616069793701, + "logits/rejected": 3.4686694145202637, + "logps/chosen": -281.0997009277344, + "logps/rejected": -291.8130798339844, + "loss": 0.6669, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8491913080215454, + "rewards/margins": 2.2377877235412598, + "rewards/rejected": -4.086978912353516, + "step": 13240 + }, + { + "epoch": 0.43209423955812987, + "grad_norm": 3.9666213989257812, + "learning_rate": 4.2803684513529074e-05, + "logits/chosen": 3.382355213165283, + "logits/rejected": 3.558976650238037, + "logps/chosen": -346.7284851074219, + "logps/rejected": -318.10089111328125, + "loss": 0.4534, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8450393676757812, + "rewards/margins": 2.834404468536377, + "rewards/rejected": -4.679444313049316, + "step": 13260 + }, + { + "epoch": 0.43274596540965043, + "grad_norm": 1.6502373218536377, + "learning_rate": 4.2792822150530624e-05, + "logits/chosen": 3.697809934616089, + "logits/rejected": 3.5738511085510254, + "logps/chosen": -313.17425537109375, + "logps/rejected": -289.12860107421875, + "loss": 0.5473, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5150504112243652, + "rewards/margins": 2.578733205795288, + "rewards/rejected": -4.093783378601074, + "step": 13280 + }, + { + "epoch": 0.433397691261171, + "grad_norm": 1.951598048210144, + "learning_rate": 4.278195978753218e-05, + "logits/chosen": 4.12760066986084, + "logits/rejected": 4.317046165466309, + "logps/chosen": -330.24920654296875, + "logps/rejected": -294.4374694824219, + "loss": 0.4315, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5071382522583008, + "rewards/margins": 2.6063530445098877, + "rewards/rejected": -4.113492012023926, + "step": 13300 + }, + { + "epoch": 0.43404941711269157, + "grad_norm": 1.6097701787948608, + "learning_rate": 4.277109742453373e-05, + "logits/chosen": 3.5789523124694824, + "logits/rejected": 3.716722011566162, + "logps/chosen": -319.05426025390625, + "logps/rejected": -315.6777648925781, + "loss": 0.4089, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.012183427810669, + "rewards/margins": 2.296419143676758, + "rewards/rejected": -3.3086025714874268, + "step": 13320 + }, + { + "epoch": 0.4347011429642121, + "grad_norm": 2.8055055141448975, + "learning_rate": 4.2760235061535283e-05, + "logits/chosen": 3.5423145294189453, + "logits/rejected": 3.7099761962890625, + "logps/chosen": -335.2253723144531, + "logps/rejected": -327.59765625, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0680524110794067, + "rewards/margins": 2.438927412033081, + "rewards/rejected": -3.5069797039031982, + "step": 13340 + }, + { + "epoch": 0.43535286881573265, + "grad_norm": 6.577881336212158, + "learning_rate": 4.274937269853684e-05, + "logits/chosen": 3.618968963623047, + "logits/rejected": 3.7641406059265137, + "logps/chosen": -302.3026123046875, + "logps/rejected": -324.03167724609375, + "loss": 0.4558, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0553334951400757, + "rewards/margins": 2.613264799118042, + "rewards/rejected": -3.6685986518859863, + "step": 13360 + }, + { + "epoch": 0.4360045946672532, + "grad_norm": 1.516269564628601, + "learning_rate": 4.27385103355384e-05, + "logits/chosen": 3.4310500621795654, + "logits/rejected": 3.5701968669891357, + "logps/chosen": -287.26141357421875, + "logps/rejected": -254.87332153320312, + "loss": 0.33, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.4912010133266449, + "rewards/margins": 2.7387828826904297, + "rewards/rejected": -3.2299842834472656, + "step": 13380 + }, + { + "epoch": 0.4366563205187738, + "grad_norm": 1.2417048215866089, + "learning_rate": 4.272764797253995e-05, + "logits/chosen": 3.9574742317199707, + "logits/rejected": 3.8710453510284424, + "logps/chosen": -320.7976989746094, + "logps/rejected": -293.2945861816406, + "loss": 0.4183, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1133002042770386, + "rewards/margins": 2.850548267364502, + "rewards/rejected": -3.96384859085083, + "step": 13400 + }, + { + "epoch": 0.43730804637029436, + "grad_norm": 6.367365837097168, + "learning_rate": 4.271678560954151e-05, + "logits/chosen": 3.8225455284118652, + "logits/rejected": 3.9219393730163574, + "logps/chosen": -385.0989685058594, + "logps/rejected": -348.3660888671875, + "loss": 0.5571, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8888866305351257, + "rewards/margins": 2.6461784839630127, + "rewards/rejected": -3.535065174102783, + "step": 13420 + }, + { + "epoch": 0.43795977222181487, + "grad_norm": 1.6615631580352783, + "learning_rate": 4.270592324654306e-05, + "logits/chosen": 3.8158111572265625, + "logits/rejected": 4.0432000160217285, + "logps/chosen": -330.37908935546875, + "logps/rejected": -291.40997314453125, + "loss": 0.4262, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4630588293075562, + "rewards/margins": 2.801931858062744, + "rewards/rejected": -4.26499080657959, + "step": 13440 + }, + { + "epoch": 0.43861149807333544, + "grad_norm": 1.8790671825408936, + "learning_rate": 4.269506088354461e-05, + "logits/chosen": 3.7931296825408936, + "logits/rejected": 3.9075493812561035, + "logps/chosen": -342.3168029785156, + "logps/rejected": -269.90802001953125, + "loss": 0.5088, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3370963335037231, + "rewards/margins": 2.1982831954956055, + "rewards/rejected": -3.535379409790039, + "step": 13460 + }, + { + "epoch": 0.439263223924856, + "grad_norm": 2.7771661281585693, + "learning_rate": 4.268419852054616e-05, + "logits/chosen": 3.7596306800842285, + "logits/rejected": 3.8817343711853027, + "logps/chosen": -382.4857177734375, + "logps/rejected": -374.0744934082031, + "loss": 0.3666, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.28460267186164856, + "rewards/margins": 4.213589668273926, + "rewards/rejected": -4.498192310333252, + "step": 13480 + }, + { + "epoch": 0.4399149497763766, + "grad_norm": 2.6781108379364014, + "learning_rate": 4.267333615754772e-05, + "logits/chosen": 3.490283966064453, + "logits/rejected": 3.7781243324279785, + "logps/chosen": -334.12200927734375, + "logps/rejected": -374.33306884765625, + "loss": 0.9284, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6984269618988037, + "rewards/margins": 2.1665148735046387, + "rewards/rejected": -3.8649418354034424, + "step": 13500 + }, + { + "epoch": 0.44056667562789714, + "grad_norm": 3.229506492614746, + "learning_rate": 4.266247379454927e-05, + "logits/chosen": 3.746748447418213, + "logits/rejected": 3.783280611038208, + "logps/chosen": -354.1898498535156, + "logps/rejected": -338.16143798828125, + "loss": 0.2918, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6958338022232056, + "rewards/margins": 3.265488386154175, + "rewards/rejected": -4.961321830749512, + "step": 13520 + }, + { + "epoch": 0.4412184014794177, + "grad_norm": 2.7408266067504883, + "learning_rate": 4.265161143155082e-05, + "logits/chosen": 3.27325701713562, + "logits/rejected": 3.3932347297668457, + "logps/chosen": -327.0648498535156, + "logps/rejected": -318.19232177734375, + "loss": 0.4668, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9349539279937744, + "rewards/margins": 2.745840549468994, + "rewards/rejected": -4.6807942390441895, + "step": 13540 + }, + { + "epoch": 0.4418701273309382, + "grad_norm": 1.0012565851211548, + "learning_rate": 4.2640749068552376e-05, + "logits/chosen": 3.6906485557556152, + "logits/rejected": 3.8346340656280518, + "logps/chosen": -368.0428771972656, + "logps/rejected": -345.9737854003906, + "loss": 0.4594, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1102027893066406, + "rewards/margins": 2.7960288524627686, + "rewards/rejected": -4.906231880187988, + "step": 13560 + }, + { + "epoch": 0.4425218531824588, + "grad_norm": 2.9696500301361084, + "learning_rate": 4.262988670555393e-05, + "logits/chosen": 3.0208160877227783, + "logits/rejected": 3.154541492462158, + "logps/chosen": -338.682861328125, + "logps/rejected": -321.25604248046875, + "loss": 0.3403, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1172432899475098, + "rewards/margins": 3.261385679244995, + "rewards/rejected": -5.378629207611084, + "step": 13580 + }, + { + "epoch": 0.44317357903397936, + "grad_norm": 1.9657313823699951, + "learning_rate": 4.261902434255548e-05, + "logits/chosen": 3.5435569286346436, + "logits/rejected": 3.692983627319336, + "logps/chosen": -334.5237731933594, + "logps/rejected": -279.8662109375, + "loss": 0.604, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5380539894104004, + "rewards/margins": 2.43196177482605, + "rewards/rejected": -4.970015525817871, + "step": 13600 + }, + { + "epoch": 0.4438253048854999, + "grad_norm": 0.9083979725837708, + "learning_rate": 4.2608161979557035e-05, + "logits/chosen": 3.523850917816162, + "logits/rejected": 3.626598834991455, + "logps/chosen": -341.474609375, + "logps/rejected": -326.36309814453125, + "loss": 0.4526, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9103485345840454, + "rewards/margins": 2.7037296295166016, + "rewards/rejected": -4.614078044891357, + "step": 13620 + }, + { + "epoch": 0.4444770307370205, + "grad_norm": 1.9875011444091797, + "learning_rate": 4.2597299616558586e-05, + "logits/chosen": 3.518125534057617, + "logits/rejected": 3.417619228363037, + "logps/chosen": -348.5337829589844, + "logps/rejected": -313.94354248046875, + "loss": 0.4633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.01532244682312, + "rewards/margins": 3.2495639324188232, + "rewards/rejected": -5.264885902404785, + "step": 13640 + }, + { + "epoch": 0.445128756588541, + "grad_norm": 2.673677921295166, + "learning_rate": 4.2586437253560143e-05, + "logits/chosen": 3.41801381111145, + "logits/rejected": 3.739126682281494, + "logps/chosen": -305.69097900390625, + "logps/rejected": -281.8565979003906, + "loss": 0.5562, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8910354375839233, + "rewards/margins": 2.2824432849884033, + "rewards/rejected": -4.173478603363037, + "step": 13660 + }, + { + "epoch": 0.4457804824400616, + "grad_norm": 0.21851778030395508, + "learning_rate": 4.2575574890561694e-05, + "logits/chosen": 3.8817222118377686, + "logits/rejected": 3.9144127368927, + "logps/chosen": -382.26263427734375, + "logps/rejected": -353.6990051269531, + "loss": 0.508, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.989606499671936, + "rewards/margins": 3.2681381702423096, + "rewards/rejected": -5.257744789123535, + "step": 13680 + }, + { + "epoch": 0.44643220829158214, + "grad_norm": 1.5602343082427979, + "learning_rate": 4.256471252756325e-05, + "logits/chosen": 3.628500461578369, + "logits/rejected": 3.7266337871551514, + "logps/chosen": -375.7085876464844, + "logps/rejected": -337.40679931640625, + "loss": 0.4766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.197489023208618, + "rewards/margins": 2.8087666034698486, + "rewards/rejected": -5.006255626678467, + "step": 13700 + }, + { + "epoch": 0.4470839341431027, + "grad_norm": 0.9838492274284363, + "learning_rate": 4.25538501645648e-05, + "logits/chosen": 3.702422618865967, + "logits/rejected": 3.8650174140930176, + "logps/chosen": -386.1387939453125, + "logps/rejected": -343.29962158203125, + "loss": 0.3859, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0374780893325806, + "rewards/margins": 3.343306064605713, + "rewards/rejected": -4.380784034729004, + "step": 13720 + }, + { + "epoch": 0.4477356599946233, + "grad_norm": 2.3733644485473633, + "learning_rate": 4.254298780156635e-05, + "logits/chosen": 3.532939910888672, + "logits/rejected": 3.7939491271972656, + "logps/chosen": -373.08880615234375, + "logps/rejected": -348.62347412109375, + "loss": 0.5305, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9499136209487915, + "rewards/margins": 2.6990549564361572, + "rewards/rejected": -4.648968696594238, + "step": 13740 + }, + { + "epoch": 0.4483873858461438, + "grad_norm": 4.409870624542236, + "learning_rate": 4.253212543856791e-05, + "logits/chosen": 3.689526319503784, + "logits/rejected": 3.9323036670684814, + "logps/chosen": -332.17437744140625, + "logps/rejected": -321.14630126953125, + "loss": 0.589, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.162722587585449, + "rewards/margins": 2.6632513999938965, + "rewards/rejected": -4.825973987579346, + "step": 13760 + }, + { + "epoch": 0.44903911169766436, + "grad_norm": 17.228805541992188, + "learning_rate": 4.252126307556946e-05, + "logits/chosen": 3.687373638153076, + "logits/rejected": 3.830448865890503, + "logps/chosen": -350.4100036621094, + "logps/rejected": -326.98577880859375, + "loss": 0.3881, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.025280237197876, + "rewards/margins": 2.9482266902923584, + "rewards/rejected": -3.9735069274902344, + "step": 13780 + }, + { + "epoch": 0.44969083754918493, + "grad_norm": 2.0345497131347656, + "learning_rate": 4.251040071257101e-05, + "logits/chosen": 3.3278796672821045, + "logits/rejected": 3.6605124473571777, + "logps/chosen": -384.2477111816406, + "logps/rejected": -327.7543029785156, + "loss": 0.425, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2701427936553955, + "rewards/margins": 3.2856249809265137, + "rewards/rejected": -4.555768013000488, + "step": 13800 + }, + { + "epoch": 0.4503425634007055, + "grad_norm": 4.841041564941406, + "learning_rate": 4.249953834957257e-05, + "logits/chosen": 3.73773455619812, + "logits/rejected": 3.6829421520233154, + "logps/chosen": -314.136962890625, + "logps/rejected": -294.3863830566406, + "loss": 0.4305, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5288673639297485, + "rewards/margins": 2.763309955596924, + "rewards/rejected": -4.292177200317383, + "step": 13820 + }, + { + "epoch": 0.45099428925222607, + "grad_norm": 1.3817723989486694, + "learning_rate": 4.248867598657412e-05, + "logits/chosen": 3.7377541065216064, + "logits/rejected": 3.7652924060821533, + "logps/chosen": -303.20440673828125, + "logps/rejected": -307.51214599609375, + "loss": 0.595, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0334932804107666, + "rewards/margins": 2.4844677448272705, + "rewards/rejected": -4.517961025238037, + "step": 13840 + }, + { + "epoch": 0.45164601510374663, + "grad_norm": 2.1433024406433105, + "learning_rate": 4.247781362357567e-05, + "logits/chosen": 3.1406283378601074, + "logits/rejected": 3.5088531970977783, + "logps/chosen": -322.926513671875, + "logps/rejected": -319.796875, + "loss": 0.3825, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1860984563827515, + "rewards/margins": 3.195648193359375, + "rewards/rejected": -4.381746292114258, + "step": 13860 + }, + { + "epoch": 0.45229774095526715, + "grad_norm": 1.3062396049499512, + "learning_rate": 4.246695126057722e-05, + "logits/chosen": 3.476621150970459, + "logits/rejected": 3.605358600616455, + "logps/chosen": -329.3778991699219, + "logps/rejected": -299.24713134765625, + "loss": 0.6672, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.549538493156433, + "rewards/margins": 1.9345362186431885, + "rewards/rejected": -3.484074831008911, + "step": 13880 + }, + { + "epoch": 0.4529494668067877, + "grad_norm": 2.9810519218444824, + "learning_rate": 4.245608889757878e-05, + "logits/chosen": 3.4027340412139893, + "logits/rejected": 3.5238890647888184, + "logps/chosen": -311.00970458984375, + "logps/rejected": -316.36724853515625, + "loss": 0.4829, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4165592193603516, + "rewards/margins": 2.7505698204040527, + "rewards/rejected": -4.167128562927246, + "step": 13900 + }, + { + "epoch": 0.4536011926583083, + "grad_norm": 1.243027925491333, + "learning_rate": 4.244522653458034e-05, + "logits/chosen": 3.453845977783203, + "logits/rejected": 3.4563403129577637, + "logps/chosen": -328.3252258300781, + "logps/rejected": -344.5174255371094, + "loss": 0.7053, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5184670686721802, + "rewards/margins": 2.6497020721435547, + "rewards/rejected": -4.1681694984436035, + "step": 13920 + }, + { + "epoch": 0.45425291850982885, + "grad_norm": 2.7285189628601074, + "learning_rate": 4.243436417158189e-05, + "logits/chosen": 3.4942848682403564, + "logits/rejected": 3.5999655723571777, + "logps/chosen": -352.5911865234375, + "logps/rejected": -349.7704162597656, + "loss": 0.3854, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.316042184829712, + "rewards/margins": 3.0839335918426514, + "rewards/rejected": -4.399975776672363, + "step": 13940 + }, + { + "epoch": 0.4549046443613494, + "grad_norm": 23.826108932495117, + "learning_rate": 4.2423501808583446e-05, + "logits/chosen": 3.3788046836853027, + "logits/rejected": 3.765568256378174, + "logps/chosen": -353.873046875, + "logps/rejected": -331.52667236328125, + "loss": 0.448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1698546409606934, + "rewards/margins": 2.7444894313812256, + "rewards/rejected": -4.91434383392334, + "step": 13960 + }, + { + "epoch": 0.45555637021286993, + "grad_norm": 5.622879981994629, + "learning_rate": 4.2412639445585e-05, + "logits/chosen": 3.769834041595459, + "logits/rejected": 3.8941073417663574, + "logps/chosen": -354.27130126953125, + "logps/rejected": -328.22857666015625, + "loss": 0.3568, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1946786642074585, + "rewards/margins": 3.3642783164978027, + "rewards/rejected": -4.558957099914551, + "step": 13980 + }, + { + "epoch": 0.4562080960643905, + "grad_norm": 1.5565975904464722, + "learning_rate": 4.240177708258655e-05, + "logits/chosen": 3.3806469440460205, + "logits/rejected": 3.6404953002929688, + "logps/chosen": -360.2562561035156, + "logps/rejected": -324.103515625, + "loss": 0.4784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.055300235748291, + "rewards/margins": 3.445620059967041, + "rewards/rejected": -5.500920295715332, + "step": 14000 + }, + { + "epoch": 0.45685982191591107, + "grad_norm": 1.0363861322402954, + "learning_rate": 4.23909147195881e-05, + "logits/chosen": 3.883195161819458, + "logits/rejected": 3.981616497039795, + "logps/chosen": -370.08343505859375, + "logps/rejected": -316.04986572265625, + "loss": 0.5902, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7780961990356445, + "rewards/margins": 2.547281503677368, + "rewards/rejected": -4.325377464294434, + "step": 14020 + }, + { + "epoch": 0.45751154776743164, + "grad_norm": 2.5167133808135986, + "learning_rate": 4.2380052356589656e-05, + "logits/chosen": 3.34661602973938, + "logits/rejected": 3.6861743927001953, + "logps/chosen": -323.7190246582031, + "logps/rejected": -328.17218017578125, + "loss": 0.4192, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.771026372909546, + "rewards/margins": 3.4171173572540283, + "rewards/rejected": -5.188143730163574, + "step": 14040 + }, + { + "epoch": 0.4581632736189522, + "grad_norm": 1.2961714267730713, + "learning_rate": 4.2369189993591207e-05, + "logits/chosen": 3.5283172130584717, + "logits/rejected": 3.7753500938415527, + "logps/chosen": -331.59637451171875, + "logps/rejected": -312.82012939453125, + "loss": 0.3829, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.502934217453003, + "rewards/margins": 3.283949375152588, + "rewards/rejected": -4.7868828773498535, + "step": 14060 + }, + { + "epoch": 0.4588149994704728, + "grad_norm": 1.2037428617477417, + "learning_rate": 4.235832763059276e-05, + "logits/chosen": 3.3986942768096924, + "logits/rejected": 3.5911800861358643, + "logps/chosen": -322.37750244140625, + "logps/rejected": -320.38885498046875, + "loss": 0.5525, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0832037925720215, + "rewards/margins": 2.2749390602111816, + "rewards/rejected": -4.358142852783203, + "step": 14080 + }, + { + "epoch": 0.4594667253219933, + "grad_norm": 2.2703630924224854, + "learning_rate": 4.2347465267594315e-05, + "logits/chosen": 3.897289752960205, + "logits/rejected": 3.8357551097869873, + "logps/chosen": -320.74884033203125, + "logps/rejected": -311.0357360839844, + "loss": 0.5576, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8242671489715576, + "rewards/margins": 2.211287021636963, + "rewards/rejected": -4.035553932189941, + "step": 14100 + }, + { + "epoch": 0.46011845117351385, + "grad_norm": 4.017509460449219, + "learning_rate": 4.2336602904595866e-05, + "logits/chosen": 4.075305938720703, + "logits/rejected": 4.0919928550720215, + "logps/chosen": -378.1620178222656, + "logps/rejected": -334.64508056640625, + "loss": 0.5029, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4949285984039307, + "rewards/margins": 2.643589735031128, + "rewards/rejected": -4.138518333435059, + "step": 14120 + }, + { + "epoch": 0.4607701770250344, + "grad_norm": 3.9083352088928223, + "learning_rate": 4.2325740541597416e-05, + "logits/chosen": 3.6824965476989746, + "logits/rejected": 3.6968817710876465, + "logps/chosen": -361.38385009765625, + "logps/rejected": -337.76251220703125, + "loss": 0.6693, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3040218353271484, + "rewards/margins": 2.238734483718872, + "rewards/rejected": -4.542756080627441, + "step": 14140 + }, + { + "epoch": 0.461421902876555, + "grad_norm": 1.5165528059005737, + "learning_rate": 4.2314878178598974e-05, + "logits/chosen": 3.4444305896759033, + "logits/rejected": 3.6298537254333496, + "logps/chosen": -307.2798156738281, + "logps/rejected": -308.5860900878906, + "loss": 0.4845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1299355030059814, + "rewards/margins": 2.6021063327789307, + "rewards/rejected": -4.732041358947754, + "step": 14160 + }, + { + "epoch": 0.46207362872807556, + "grad_norm": 2.1408419609069824, + "learning_rate": 4.230401581560053e-05, + "logits/chosen": 3.7091751098632812, + "logits/rejected": 3.8478050231933594, + "logps/chosen": -335.634033203125, + "logps/rejected": -297.1236572265625, + "loss": 0.4207, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1933029890060425, + "rewards/margins": 2.7244300842285156, + "rewards/rejected": -3.9177329540252686, + "step": 14180 + }, + { + "epoch": 0.46272535457959607, + "grad_norm": 1.482291579246521, + "learning_rate": 4.229315345260208e-05, + "logits/chosen": 3.9105193614959717, + "logits/rejected": 4.012207508087158, + "logps/chosen": -345.81439208984375, + "logps/rejected": -344.2801208496094, + "loss": 0.4469, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.549200415611267, + "rewards/margins": 2.951098918914795, + "rewards/rejected": -4.500298976898193, + "step": 14200 + }, + { + "epoch": 0.46337708043111664, + "grad_norm": 17.868303298950195, + "learning_rate": 4.228229108960363e-05, + "logits/chosen": 3.725156307220459, + "logits/rejected": 3.986436367034912, + "logps/chosen": -348.00518798828125, + "logps/rejected": -327.17840576171875, + "loss": 0.5121, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0719094276428223, + "rewards/margins": 2.791081190109253, + "rewards/rejected": -3.862990140914917, + "step": 14220 + }, + { + "epoch": 0.4640288062826372, + "grad_norm": 2.890226364135742, + "learning_rate": 4.227142872660519e-05, + "logits/chosen": 3.5744400024414062, + "logits/rejected": 3.681147336959839, + "logps/chosen": -348.7259521484375, + "logps/rejected": -351.08087158203125, + "loss": 0.4836, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2062058448791504, + "rewards/margins": 2.9201064109802246, + "rewards/rejected": -5.126312732696533, + "step": 14240 + }, + { + "epoch": 0.4646805321341578, + "grad_norm": 4.169720649719238, + "learning_rate": 4.226056636360674e-05, + "logits/chosen": 3.650028705596924, + "logits/rejected": 3.777833938598633, + "logps/chosen": -355.2131652832031, + "logps/rejected": -332.0047912597656, + "loss": 0.5492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4431816339492798, + "rewards/margins": 2.9158644676208496, + "rewards/rejected": -4.35904598236084, + "step": 14260 + }, + { + "epoch": 0.46533225798567834, + "grad_norm": 1.3889434337615967, + "learning_rate": 4.224970400060829e-05, + "logits/chosen": 3.6925792694091797, + "logits/rejected": 3.7470805644989014, + "logps/chosen": -318.71307373046875, + "logps/rejected": -295.34979248046875, + "loss": 0.5118, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4729039669036865, + "rewards/margins": 2.3735604286193848, + "rewards/rejected": -3.846464157104492, + "step": 14280 + }, + { + "epoch": 0.46598398383719886, + "grad_norm": 2.4545352458953857, + "learning_rate": 4.223884163760985e-05, + "logits/chosen": 3.1825079917907715, + "logits/rejected": 3.5964901447296143, + "logps/chosen": -349.251220703125, + "logps/rejected": -303.44085693359375, + "loss": 0.5137, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4721081256866455, + "rewards/margins": 2.7985949516296387, + "rewards/rejected": -4.270703315734863, + "step": 14300 + }, + { + "epoch": 0.4666357096887194, + "grad_norm": 1.2339797019958496, + "learning_rate": 4.22279792746114e-05, + "logits/chosen": 3.454552173614502, + "logits/rejected": 3.718561887741089, + "logps/chosen": -362.66790771484375, + "logps/rejected": -306.7494201660156, + "loss": 0.5328, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6760671138763428, + "rewards/margins": 2.974015712738037, + "rewards/rejected": -4.650082588195801, + "step": 14320 + }, + { + "epoch": 0.46728743554024, + "grad_norm": 4.49125337600708, + "learning_rate": 4.221711691161295e-05, + "logits/chosen": 3.622110366821289, + "logits/rejected": 3.7419848442077637, + "logps/chosen": -349.00982666015625, + "logps/rejected": -328.3883361816406, + "loss": 0.5758, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9976364374160767, + "rewards/margins": 2.359661102294922, + "rewards/rejected": -4.357296943664551, + "step": 14340 + }, + { + "epoch": 0.46793916139176056, + "grad_norm": 0.10062891244888306, + "learning_rate": 4.220625454861451e-05, + "logits/chosen": 3.1958634853363037, + "logits/rejected": 3.3053550720214844, + "logps/chosen": -346.233642578125, + "logps/rejected": -335.8998107910156, + "loss": 0.3997, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7456951141357422, + "rewards/margins": 3.0046238899230957, + "rewards/rejected": -4.750319004058838, + "step": 14360 + }, + { + "epoch": 0.46859088724328113, + "grad_norm": 0.22962142527103424, + "learning_rate": 4.219539218561606e-05, + "logits/chosen": 3.1932547092437744, + "logits/rejected": 3.4016761779785156, + "logps/chosen": -360.7402038574219, + "logps/rejected": -317.7936096191406, + "loss": 0.6165, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.649720549583435, + "rewards/margins": 3.0507028102874756, + "rewards/rejected": -4.700423717498779, + "step": 14380 + }, + { + "epoch": 0.4692426130948017, + "grad_norm": 2.968545913696289, + "learning_rate": 4.218452982261761e-05, + "logits/chosen": 3.7110981941223145, + "logits/rejected": 4.001956462860107, + "logps/chosen": -357.68988037109375, + "logps/rejected": -300.104736328125, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1591246128082275, + "rewards/margins": 2.305152416229248, + "rewards/rejected": -3.4642772674560547, + "step": 14400 + }, + { + "epoch": 0.4698943389463222, + "grad_norm": 3.789163112640381, + "learning_rate": 4.217366745961917e-05, + "logits/chosen": 3.2508044242858887, + "logits/rejected": 3.300715923309326, + "logps/chosen": -312.59478759765625, + "logps/rejected": -314.8308410644531, + "loss": 0.3869, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.581135630607605, + "rewards/margins": 2.420319080352783, + "rewards/rejected": -4.0014543533325195, + "step": 14420 + }, + { + "epoch": 0.4705460647978428, + "grad_norm": 5.469239711761475, + "learning_rate": 4.216280509662072e-05, + "logits/chosen": 3.8415980339050293, + "logits/rejected": 3.9172158241271973, + "logps/chosen": -347.052490234375, + "logps/rejected": -310.72137451171875, + "loss": 0.4284, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2094247341156006, + "rewards/margins": 2.3371105194091797, + "rewards/rejected": -3.546534776687622, + "step": 14440 + }, + { + "epoch": 0.47119779064936335, + "grad_norm": 0.6489500403404236, + "learning_rate": 4.2151942733622276e-05, + "logits/chosen": 3.3983654975891113, + "logits/rejected": 3.3676161766052246, + "logps/chosen": -291.0464782714844, + "logps/rejected": -260.6913146972656, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7042076587677002, + "rewards/margins": 2.5420098304748535, + "rewards/rejected": -4.246217727661133, + "step": 14460 + }, + { + "epoch": 0.4718495165008839, + "grad_norm": 3.0550410747528076, + "learning_rate": 4.214108037062383e-05, + "logits/chosen": 3.498088836669922, + "logits/rejected": 3.608278751373291, + "logps/chosen": -330.2873840332031, + "logps/rejected": -325.07159423828125, + "loss": 0.3987, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2204345017671585, + "rewards/margins": 2.5904440879821777, + "rewards/rejected": -2.8108787536621094, + "step": 14480 + }, + { + "epoch": 0.4725012423524045, + "grad_norm": 2.0359036922454834, + "learning_rate": 4.2130218007625385e-05, + "logits/chosen": 3.708559036254883, + "logits/rejected": 3.8153843879699707, + "logps/chosen": -373.28802490234375, + "logps/rejected": -320.50946044921875, + "loss": 0.2759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.771734893321991, + "rewards/margins": 3.1922309398651123, + "rewards/rejected": -3.963965892791748, + "step": 14500 + }, + { + "epoch": 0.473152968203925, + "grad_norm": 1.3363789319992065, + "learning_rate": 4.2119355644626936e-05, + "logits/chosen": 3.5944836139678955, + "logits/rejected": 3.7118561267852783, + "logps/chosen": -361.70135498046875, + "logps/rejected": -293.45697021484375, + "loss": 0.5229, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4630948305130005, + "rewards/margins": 2.223694324493408, + "rewards/rejected": -3.686789035797119, + "step": 14520 + }, + { + "epoch": 0.47380469405544556, + "grad_norm": 5.325522422790527, + "learning_rate": 4.2108493281628486e-05, + "logits/chosen": 3.340696334838867, + "logits/rejected": 3.309445858001709, + "logps/chosen": -341.36041259765625, + "logps/rejected": -292.9287109375, + "loss": 0.6173, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.769060492515564, + "rewards/margins": 2.086071729660034, + "rewards/rejected": -3.8551323413848877, + "step": 14540 + }, + { + "epoch": 0.47445641990696613, + "grad_norm": 4.299686431884766, + "learning_rate": 4.2097630918630044e-05, + "logits/chosen": 3.3760502338409424, + "logits/rejected": 3.711068630218506, + "logps/chosen": -359.52880859375, + "logps/rejected": -317.80364990234375, + "loss": 0.5273, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2503302097320557, + "rewards/margins": 2.579129457473755, + "rewards/rejected": -3.8294596672058105, + "step": 14560 + }, + { + "epoch": 0.4751081457584867, + "grad_norm": 2.0724246501922607, + "learning_rate": 4.2086768555631595e-05, + "logits/chosen": 3.3258957862854004, + "logits/rejected": 3.4639296531677246, + "logps/chosen": -291.50311279296875, + "logps/rejected": -293.77325439453125, + "loss": 0.5096, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2003743648529053, + "rewards/margins": 2.1704447269439697, + "rewards/rejected": -3.370819091796875, + "step": 14580 + }, + { + "epoch": 0.47575987161000727, + "grad_norm": 15.262664794921875, + "learning_rate": 4.2075906192633145e-05, + "logits/chosen": 3.3868155479431152, + "logits/rejected": 3.7057411670684814, + "logps/chosen": -344.84039306640625, + "logps/rejected": -285.6700439453125, + "loss": 0.4285, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.12129545211792, + "rewards/margins": 2.3421754837036133, + "rewards/rejected": -3.463470458984375, + "step": 14600 + }, + { + "epoch": 0.47641159746152784, + "grad_norm": 0.014786154963076115, + "learning_rate": 4.2065043829634696e-05, + "logits/chosen": 3.28703236579895, + "logits/rejected": 3.6624228954315186, + "logps/chosen": -329.32330322265625, + "logps/rejected": -303.60406494140625, + "loss": 0.5587, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5579572916030884, + "rewards/margins": 2.302518844604492, + "rewards/rejected": -3.860476016998291, + "step": 14620 + }, + { + "epoch": 0.47706332331304835, + "grad_norm": 5.586556911468506, + "learning_rate": 4.2054181466636254e-05, + "logits/chosen": 3.3039627075195312, + "logits/rejected": 3.622755527496338, + "logps/chosen": -313.5089111328125, + "logps/rejected": -264.9256286621094, + "loss": 0.5069, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4989489316940308, + "rewards/margins": 2.290510654449463, + "rewards/rejected": -3.7894599437713623, + "step": 14640 + }, + { + "epoch": 0.4777150491645689, + "grad_norm": 1.450537919998169, + "learning_rate": 4.2043319103637805e-05, + "logits/chosen": 3.7781665325164795, + "logits/rejected": 3.6984825134277344, + "logps/chosen": -347.5832214355469, + "logps/rejected": -337.61004638671875, + "loss": 0.6315, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2414346933364868, + "rewards/margins": 2.0186214447021484, + "rewards/rejected": -3.260056257247925, + "step": 14660 + }, + { + "epoch": 0.4783667750160895, + "grad_norm": 0.45089226961135864, + "learning_rate": 4.203245674063936e-05, + "logits/chosen": 3.6775271892547607, + "logits/rejected": 3.8912506103515625, + "logps/chosen": -375.1754455566406, + "logps/rejected": -333.30096435546875, + "loss": 0.3822, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2458837032318115, + "rewards/margins": 2.969456195831299, + "rewards/rejected": -4.2153401374816895, + "step": 14680 + }, + { + "epoch": 0.47901850086761005, + "grad_norm": 2.4119651317596436, + "learning_rate": 4.202159437764091e-05, + "logits/chosen": 3.9417121410369873, + "logits/rejected": 3.9372451305389404, + "logps/chosen": -337.1834411621094, + "logps/rejected": -295.8119812011719, + "loss": 0.4853, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2537612915039062, + "rewards/margins": 2.7133469581604004, + "rewards/rejected": -3.9671084880828857, + "step": 14700 + }, + { + "epoch": 0.4796702267191306, + "grad_norm": 0.20308837294578552, + "learning_rate": 4.201073201464247e-05, + "logits/chosen": 3.661510467529297, + "logits/rejected": 3.896693706512451, + "logps/chosen": -338.09130859375, + "logps/rejected": -306.55291748046875, + "loss": 0.4578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.137634515762329, + "rewards/margins": 2.7861812114715576, + "rewards/rejected": -3.923816204071045, + "step": 14720 + }, + { + "epoch": 0.48032195257065113, + "grad_norm": 4.114689826965332, + "learning_rate": 4.199986965164402e-05, + "logits/chosen": 3.526172161102295, + "logits/rejected": 3.924050807952881, + "logps/chosen": -366.44232177734375, + "logps/rejected": -317.1351013183594, + "loss": 0.6332, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4977644681930542, + "rewards/margins": 2.0066308975219727, + "rewards/rejected": -3.5043952465057373, + "step": 14740 + }, + { + "epoch": 0.4809736784221717, + "grad_norm": 2.288939952850342, + "learning_rate": 4.198900728864558e-05, + "logits/chosen": 3.2640318870544434, + "logits/rejected": 3.3251395225524902, + "logps/chosen": -303.35955810546875, + "logps/rejected": -291.01263427734375, + "loss": 0.4653, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9906131029129028, + "rewards/margins": 2.4119553565979004, + "rewards/rejected": -3.4025681018829346, + "step": 14760 + }, + { + "epoch": 0.48162540427369227, + "grad_norm": 1.656518578529358, + "learning_rate": 4.197814492564713e-05, + "logits/chosen": 3.5972061157226562, + "logits/rejected": 3.498135805130005, + "logps/chosen": -354.5940856933594, + "logps/rejected": -298.7793884277344, + "loss": 0.4224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8631625175476074, + "rewards/margins": 2.3348231315612793, + "rewards/rejected": -3.1979854106903076, + "step": 14780 + }, + { + "epoch": 0.48227713012521284, + "grad_norm": 6.6461710929870605, + "learning_rate": 4.196728256264868e-05, + "logits/chosen": 3.514954090118408, + "logits/rejected": 3.5941779613494873, + "logps/chosen": -353.52301025390625, + "logps/rejected": -356.6258239746094, + "loss": 0.64, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7083404064178467, + "rewards/margins": 2.3633363246917725, + "rewards/rejected": -4.071677207946777, + "step": 14800 + }, + { + "epoch": 0.4829288559767334, + "grad_norm": 1.7242003679275513, + "learning_rate": 4.195642019965023e-05, + "logits/chosen": 3.571730375289917, + "logits/rejected": 3.4855964183807373, + "logps/chosen": -313.3379211425781, + "logps/rejected": -285.1102294921875, + "loss": 0.5764, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8763672113418579, + "rewards/margins": 2.024977207183838, + "rewards/rejected": -2.9013442993164062, + "step": 14820 + }, + { + "epoch": 0.4835805818282539, + "grad_norm": 1.1042002439498901, + "learning_rate": 4.194555783665179e-05, + "logits/chosen": 3.7569515705108643, + "logits/rejected": 3.8087973594665527, + "logps/chosen": -349.9139709472656, + "logps/rejected": -316.28179931640625, + "loss": 0.5011, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17836830019950867, + "rewards/margins": 2.703259229660034, + "rewards/rejected": -2.8816275596618652, + "step": 14840 + }, + { + "epoch": 0.4842323076797745, + "grad_norm": 1.753777265548706, + "learning_rate": 4.193469547365334e-05, + "logits/chosen": 3.699982166290283, + "logits/rejected": 3.8265297412872314, + "logps/chosen": -341.1871337890625, + "logps/rejected": -303.2939453125, + "loss": 0.5094, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35738351941108704, + "rewards/margins": 1.6248226165771484, + "rewards/rejected": -1.982206106185913, + "step": 14860 + }, + { + "epoch": 0.48488403353129506, + "grad_norm": 0.3964464068412781, + "learning_rate": 4.192383311065489e-05, + "logits/chosen": 3.6040377616882324, + "logits/rejected": 3.8693931102752686, + "logps/chosen": -354.07794189453125, + "logps/rejected": -311.1929931640625, + "loss": 0.3283, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.6521664261817932, + "rewards/margins": 2.793647050857544, + "rewards/rejected": -3.4458136558532715, + "step": 14880 + }, + { + "epoch": 0.4855357593828156, + "grad_norm": 3.4738681316375732, + "learning_rate": 4.191297074765645e-05, + "logits/chosen": 3.310420274734497, + "logits/rejected": 3.548128604888916, + "logps/chosen": -358.5556335449219, + "logps/rejected": -316.9095458984375, + "loss": 0.4059, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3027551770210266, + "rewards/margins": 3.45452618598938, + "rewards/rejected": -3.757281541824341, + "step": 14900 + }, + { + "epoch": 0.4861874852343362, + "grad_norm": 4.026890754699707, + "learning_rate": 4.1902108384658e-05, + "logits/chosen": 3.32916259765625, + "logits/rejected": 3.576585054397583, + "logps/chosen": -361.57989501953125, + "logps/rejected": -309.031494140625, + "loss": 0.3553, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.7871868014335632, + "rewards/margins": 2.8756775856018066, + "rewards/rejected": -3.6628646850585938, + "step": 14920 + }, + { + "epoch": 0.48683921108585676, + "grad_norm": 0.9917084574699402, + "learning_rate": 4.189124602165955e-05, + "logits/chosen": 3.302093505859375, + "logits/rejected": 3.533346176147461, + "logps/chosen": -316.7646484375, + "logps/rejected": -293.11163330078125, + "loss": 0.3789, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.1624990701675415, + "rewards/margins": 2.8084776401519775, + "rewards/rejected": -3.9709770679473877, + "step": 14940 + }, + { + "epoch": 0.4874909369373773, + "grad_norm": 2.9306387901306152, + "learning_rate": 4.188038365866111e-05, + "logits/chosen": 3.6444449424743652, + "logits/rejected": 3.6875386238098145, + "logps/chosen": -344.4445495605469, + "logps/rejected": -316.8236999511719, + "loss": 0.6681, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4746867418289185, + "rewards/margins": 2.5684773921966553, + "rewards/rejected": -4.043164253234863, + "step": 14960 + }, + { + "epoch": 0.48814266278889784, + "grad_norm": 1.4096421003341675, + "learning_rate": 4.1869521295662665e-05, + "logits/chosen": 3.76751971244812, + "logits/rejected": 3.8006420135498047, + "logps/chosen": -364.1354064941406, + "logps/rejected": -376.2245178222656, + "loss": 0.4805, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6413671970367432, + "rewards/margins": 2.569162607192993, + "rewards/rejected": -4.210529804229736, + "step": 14980 + }, + { + "epoch": 0.4887943886404184, + "grad_norm": 1.7352946996688843, + "learning_rate": 4.1858658932664215e-05, + "logits/chosen": 2.8972396850585938, + "logits/rejected": 3.149217128753662, + "logps/chosen": -272.6822509765625, + "logps/rejected": -270.36065673828125, + "loss": 0.4516, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.512080430984497, + "rewards/margins": 2.51296329498291, + "rewards/rejected": -4.025043964385986, + "step": 15000 + }, + { + "epoch": 0.489446114491939, + "grad_norm": 0.5061320662498474, + "learning_rate": 4.1847796569665766e-05, + "logits/chosen": 3.3472931385040283, + "logits/rejected": 3.321147918701172, + "logps/chosen": -312.0174560546875, + "logps/rejected": -309.4783630371094, + "loss": 0.5107, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9987809658050537, + "rewards/margins": 2.6663341522216797, + "rewards/rejected": -4.6651153564453125, + "step": 15020 + }, + { + "epoch": 0.49009784034345955, + "grad_norm": 2.685601234436035, + "learning_rate": 4.1836934206667324e-05, + "logits/chosen": 3.773949146270752, + "logits/rejected": 3.8477859497070312, + "logps/chosen": -354.06072998046875, + "logps/rejected": -353.6343688964844, + "loss": 0.5077, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.800705909729004, + "rewards/margins": 2.670842170715332, + "rewards/rejected": -4.471548080444336, + "step": 15040 + }, + { + "epoch": 0.49074956619498006, + "grad_norm": 1.641852855682373, + "learning_rate": 4.1826071843668874e-05, + "logits/chosen": 3.364067792892456, + "logits/rejected": 3.2589924335479736, + "logps/chosen": -331.82159423828125, + "logps/rejected": -329.8224792480469, + "loss": 0.5438, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7925217151641846, + "rewards/margins": 3.0637881755828857, + "rewards/rejected": -4.85630989074707, + "step": 15060 + }, + { + "epoch": 0.4914012920465006, + "grad_norm": 4.680403709411621, + "learning_rate": 4.1815209480670425e-05, + "logits/chosen": 3.3109195232391357, + "logits/rejected": 3.3700695037841797, + "logps/chosen": -394.79290771484375, + "logps/rejected": -315.59771728515625, + "loss": 0.4043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6154779195785522, + "rewards/margins": 3.187312602996826, + "rewards/rejected": -4.802790641784668, + "step": 15080 + }, + { + "epoch": 0.4920530178980212, + "grad_norm": 0.8146421909332275, + "learning_rate": 4.180434711767198e-05, + "logits/chosen": 3.5446114540100098, + "logits/rejected": 3.5342979431152344, + "logps/chosen": -364.4675598144531, + "logps/rejected": -347.1686096191406, + "loss": 0.4182, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4964491128921509, + "rewards/margins": 3.1576952934265137, + "rewards/rejected": -4.654144287109375, + "step": 15100 + }, + { + "epoch": 0.49270474374954176, + "grad_norm": 3.4037933349609375, + "learning_rate": 4.1793484754673534e-05, + "logits/chosen": 3.4947476387023926, + "logits/rejected": 3.7031962871551514, + "logps/chosen": -354.00762939453125, + "logps/rejected": -299.16815185546875, + "loss": 0.3798, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8764108419418335, + "rewards/margins": 3.1328303813934326, + "rewards/rejected": -5.009241104125977, + "step": 15120 + }, + { + "epoch": 0.49335646960106233, + "grad_norm": 0.42618855834007263, + "learning_rate": 4.1782622391675084e-05, + "logits/chosen": 3.7083237171173096, + "logits/rejected": 3.815145969390869, + "logps/chosen": -351.77734375, + "logps/rejected": -333.7751770019531, + "loss": 0.3616, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.066049575805664, + "rewards/margins": 3.222905397415161, + "rewards/rejected": -5.288954734802246, + "step": 15140 + }, + { + "epoch": 0.49400819545258284, + "grad_norm": 0.5775102972984314, + "learning_rate": 4.177176002867664e-05, + "logits/chosen": 3.4220480918884277, + "logits/rejected": 3.76078724861145, + "logps/chosen": -345.40777587890625, + "logps/rejected": -287.42193603515625, + "loss": 0.5582, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5692507028579712, + "rewards/margins": 3.095266819000244, + "rewards/rejected": -4.664517402648926, + "step": 15160 + }, + { + "epoch": 0.4946599213041034, + "grad_norm": 1.7615548372268677, + "learning_rate": 4.176089766567819e-05, + "logits/chosen": 3.8851375579833984, + "logits/rejected": 3.8865272998809814, + "logps/chosen": -370.0118713378906, + "logps/rejected": -337.21405029296875, + "loss": 0.6079, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.793701171875, + "rewards/margins": 2.820493459701538, + "rewards/rejected": -4.614194869995117, + "step": 15180 + }, + { + "epoch": 0.495311647155624, + "grad_norm": 5.053243160247803, + "learning_rate": 4.1750035302679744e-05, + "logits/chosen": 3.476699113845825, + "logits/rejected": 3.6651339530944824, + "logps/chosen": -356.92706298828125, + "logps/rejected": -395.24151611328125, + "loss": 0.5476, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.670433521270752, + "rewards/margins": 2.9850192070007324, + "rewards/rejected": -5.655452728271484, + "step": 15200 + }, + { + "epoch": 0.49596337300714455, + "grad_norm": 6.702383041381836, + "learning_rate": 4.17391729396813e-05, + "logits/chosen": 3.392925977706909, + "logits/rejected": 3.592151641845703, + "logps/chosen": -340.83062744140625, + "logps/rejected": -332.7991638183594, + "loss": 0.4793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.427426815032959, + "rewards/margins": 3.6456291675567627, + "rewards/rejected": -5.073055744171143, + "step": 15220 + }, + { + "epoch": 0.4966150988586651, + "grad_norm": 1.1173582077026367, + "learning_rate": 4.172831057668285e-05, + "logits/chosen": 3.6007370948791504, + "logits/rejected": 3.65800404548645, + "logps/chosen": -331.67254638671875, + "logps/rejected": -347.58990478515625, + "loss": 0.5416, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2420620918273926, + "rewards/margins": 2.1340091228485107, + "rewards/rejected": -4.376070976257324, + "step": 15240 + }, + { + "epoch": 0.4972668247101857, + "grad_norm": 2.4206721782684326, + "learning_rate": 4.171744821368441e-05, + "logits/chosen": 3.469554901123047, + "logits/rejected": 3.6499435901641846, + "logps/chosen": -319.5582580566406, + "logps/rejected": -279.7436828613281, + "loss": 0.4119, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0358235836029053, + "rewards/margins": 2.746189832687378, + "rewards/rejected": -3.782013416290283, + "step": 15260 + }, + { + "epoch": 0.4979185505617062, + "grad_norm": 0.08608205616474152, + "learning_rate": 4.170658585068596e-05, + "logits/chosen": 3.852837324142456, + "logits/rejected": 3.8879427909851074, + "logps/chosen": -366.098876953125, + "logps/rejected": -320.6130065917969, + "loss": 0.521, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.330884337425232, + "rewards/margins": 2.6001954078674316, + "rewards/rejected": -3.931079387664795, + "step": 15280 + }, + { + "epoch": 0.49857027641322677, + "grad_norm": 2.8739254474639893, + "learning_rate": 4.169572348768752e-05, + "logits/chosen": 3.5962042808532715, + "logits/rejected": 3.8080296516418457, + "logps/chosen": -301.7193298339844, + "logps/rejected": -344.8818359375, + "loss": 0.8577, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.144989490509033, + "rewards/margins": 1.39706552028656, + "rewards/rejected": -3.542055606842041, + "step": 15300 + }, + { + "epoch": 0.49922200226474733, + "grad_norm": 2.180102825164795, + "learning_rate": 4.168486112468907e-05, + "logits/chosen": 3.366368055343628, + "logits/rejected": 3.522388458251953, + "logps/chosen": -326.22119140625, + "logps/rejected": -287.7239074707031, + "loss": 0.3478, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2263925075531006, + "rewards/margins": 3.843733310699463, + "rewards/rejected": -5.070125102996826, + "step": 15320 + }, + { + "epoch": 0.4998737281162679, + "grad_norm": 0.6016874313354492, + "learning_rate": 4.167399876169062e-05, + "logits/chosen": 3.573061466217041, + "logits/rejected": 3.7789433002471924, + "logps/chosen": -368.683349609375, + "logps/rejected": -326.470458984375, + "loss": 0.4777, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9281911849975586, + "rewards/margins": 2.415346622467041, + "rewards/rejected": -4.3435378074646, + "step": 15340 + }, + { + "epoch": 0.5005254539677885, + "grad_norm": 0.9157819151878357, + "learning_rate": 4.166313639869217e-05, + "logits/chosen": 3.425065517425537, + "logits/rejected": 3.6529288291931152, + "logps/chosen": -363.6746826171875, + "logps/rejected": -327.06671142578125, + "loss": 0.3365, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.162172555923462, + "rewards/margins": 3.69360089302063, + "rewards/rejected": -4.855772972106934, + "step": 15360 + }, + { + "epoch": 0.501177179819309, + "grad_norm": 4.570070266723633, + "learning_rate": 4.165227403569373e-05, + "logits/chosen": 3.4639930725097656, + "logits/rejected": 3.4795963764190674, + "logps/chosen": -329.36724853515625, + "logps/rejected": -300.8335266113281, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7524936199188232, + "rewards/margins": 2.3281006813049316, + "rewards/rejected": -4.080595016479492, + "step": 15380 + }, + { + "epoch": 0.5018289056708296, + "grad_norm": 0.20215186476707458, + "learning_rate": 4.164141167269528e-05, + "logits/chosen": 3.801093339920044, + "logits/rejected": 4.017409801483154, + "logps/chosen": -399.6871032714844, + "logps/rejected": -329.1607360839844, + "loss": 0.4726, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.520836591720581, + "rewards/margins": 2.9749317169189453, + "rewards/rejected": -4.4957685470581055, + "step": 15400 + }, + { + "epoch": 0.5024806315223501, + "grad_norm": 1.3046938180923462, + "learning_rate": 4.163054930969683e-05, + "logits/chosen": 3.7070858478546143, + "logits/rejected": 3.8484935760498047, + "logps/chosen": -338.8731384277344, + "logps/rejected": -304.99822998046875, + "loss": 0.5978, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0504438877105713, + "rewards/margins": 2.769437313079834, + "rewards/rejected": -4.819880485534668, + "step": 15420 + }, + { + "epoch": 0.5031323573738706, + "grad_norm": 0.9945696592330933, + "learning_rate": 4.161968694669839e-05, + "logits/chosen": 3.869770050048828, + "logits/rejected": 3.994523286819458, + "logps/chosen": -347.20159912109375, + "logps/rejected": -317.2839050292969, + "loss": 0.4546, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4922449588775635, + "rewards/margins": 3.1032469272613525, + "rewards/rejected": -5.595491886138916, + "step": 15440 + }, + { + "epoch": 0.5037840832253913, + "grad_norm": 10.780041694641113, + "learning_rate": 4.160882458369994e-05, + "logits/chosen": 3.9142308235168457, + "logits/rejected": 3.9705700874328613, + "logps/chosen": -360.0743103027344, + "logps/rejected": -316.78118896484375, + "loss": 0.5266, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.607308864593506, + "rewards/margins": 2.6259891986846924, + "rewards/rejected": -5.233298301696777, + "step": 15460 + }, + { + "epoch": 0.5044358090769118, + "grad_norm": 1.2159085273742676, + "learning_rate": 4.1597962220701495e-05, + "logits/chosen": 3.717681407928467, + "logits/rejected": 3.720759868621826, + "logps/chosen": -348.95867919921875, + "logps/rejected": -316.8052062988281, + "loss": 0.4914, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.767568826675415, + "rewards/margins": 2.4861559867858887, + "rewards/rejected": -4.253724575042725, + "step": 15480 + }, + { + "epoch": 0.5050875349284324, + "grad_norm": 4.078325271606445, + "learning_rate": 4.1587099857703046e-05, + "logits/chosen": 3.6590182781219482, + "logits/rejected": 3.927950620651245, + "logps/chosen": -298.08563232421875, + "logps/rejected": -319.44549560546875, + "loss": 0.6093, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.754119634628296, + "rewards/margins": 2.47947359085083, + "rewards/rejected": -4.233593463897705, + "step": 15500 + }, + { + "epoch": 0.5057392607799529, + "grad_norm": 0.9032250642776489, + "learning_rate": 4.1576237494704603e-05, + "logits/chosen": 3.4745216369628906, + "logits/rejected": 3.5621323585510254, + "logps/chosen": -330.21258544921875, + "logps/rejected": -330.0232238769531, + "loss": 0.5116, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9235280752182007, + "rewards/margins": 3.0629122257232666, + "rewards/rejected": -4.986440658569336, + "step": 15520 + }, + { + "epoch": 0.5063909866314734, + "grad_norm": 0.42977702617645264, + "learning_rate": 4.1565375131706154e-05, + "logits/chosen": 3.7340328693389893, + "logits/rejected": 3.992603302001953, + "logps/chosen": -345.383056640625, + "logps/rejected": -297.3928527832031, + "loss": 0.3434, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.4574832916259766, + "rewards/margins": 3.2521157264709473, + "rewards/rejected": -4.709599494934082, + "step": 15540 + }, + { + "epoch": 0.507042712482994, + "grad_norm": 1.9538853168487549, + "learning_rate": 4.1554512768707705e-05, + "logits/chosen": 3.646634340286255, + "logits/rejected": 3.8006489276885986, + "logps/chosen": -314.2568054199219, + "logps/rejected": -281.9151611328125, + "loss": 0.3662, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4686143398284912, + "rewards/margins": 3.4289677143096924, + "rewards/rejected": -4.897582054138184, + "step": 15560 + }, + { + "epoch": 0.5076944383345146, + "grad_norm": 0.6579074263572693, + "learning_rate": 4.154365040570926e-05, + "logits/chosen": 3.7959511280059814, + "logits/rejected": 3.732295274734497, + "logps/chosen": -339.03448486328125, + "logps/rejected": -338.84185791015625, + "loss": 0.3963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3363237380981445, + "rewards/margins": 3.1650023460388184, + "rewards/rejected": -4.501326084136963, + "step": 15580 + }, + { + "epoch": 0.5083461641860352, + "grad_norm": 4.002108573913574, + "learning_rate": 4.1532788042710813e-05, + "logits/chosen": 3.500581741333008, + "logits/rejected": 3.6410281658172607, + "logps/chosen": -322.2293701171875, + "logps/rejected": -347.401611328125, + "loss": 0.4557, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3929578065872192, + "rewards/margins": 2.8405635356903076, + "rewards/rejected": -4.233521461486816, + "step": 15600 + }, + { + "epoch": 0.5089978900375557, + "grad_norm": 0.5939142107963562, + "learning_rate": 4.1521925679712364e-05, + "logits/chosen": 3.972066879272461, + "logits/rejected": 4.000028610229492, + "logps/chosen": -409.2189025878906, + "logps/rejected": -280.6338806152344, + "loss": 0.3176, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.1303157806396484, + "rewards/margins": 3.086455821990967, + "rewards/rejected": -4.216771602630615, + "step": 15620 + }, + { + "epoch": 0.5096496158890763, + "grad_norm": 5.008413314819336, + "learning_rate": 4.151106331671392e-05, + "logits/chosen": 3.7483925819396973, + "logits/rejected": 3.892000198364258, + "logps/chosen": -341.32257080078125, + "logps/rejected": -287.5126953125, + "loss": 0.4695, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.939958930015564, + "rewards/margins": 2.8213517665863037, + "rewards/rejected": -4.7613115310668945, + "step": 15640 + }, + { + "epoch": 0.5103013417405968, + "grad_norm": 0.723612904548645, + "learning_rate": 4.150020095371547e-05, + "logits/chosen": 3.6659369468688965, + "logits/rejected": 3.7726798057556152, + "logps/chosen": -359.6225891113281, + "logps/rejected": -313.57720947265625, + "loss": 0.613, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0731561183929443, + "rewards/margins": 1.9486243724822998, + "rewards/rejected": -4.021780014038086, + "step": 15660 + }, + { + "epoch": 0.5109530675921173, + "grad_norm": 3.2487032413482666, + "learning_rate": 4.148933859071702e-05, + "logits/chosen": 3.8676562309265137, + "logits/rejected": 3.982008695602417, + "logps/chosen": -394.29339599609375, + "logps/rejected": -335.9927673339844, + "loss": 0.3996, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.3182036876678467, + "rewards/margins": 3.245335817337036, + "rewards/rejected": -4.563539981842041, + "step": 15680 + }, + { + "epoch": 0.511604793443638, + "grad_norm": 1.1098017692565918, + "learning_rate": 4.147847622771858e-05, + "logits/chosen": 3.5939583778381348, + "logits/rejected": 3.6337273120880127, + "logps/chosen": -325.5815734863281, + "logps/rejected": -327.13580322265625, + "loss": 0.4115, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8496888875961304, + "rewards/margins": 3.167327404022217, + "rewards/rejected": -5.0170159339904785, + "step": 15700 + }, + { + "epoch": 0.5122565192951585, + "grad_norm": 1.4389970302581787, + "learning_rate": 4.146761386472013e-05, + "logits/chosen": 3.5329856872558594, + "logits/rejected": 3.6322779655456543, + "logps/chosen": -347.55828857421875, + "logps/rejected": -311.47564697265625, + "loss": 0.3898, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.719853162765503, + "rewards/margins": 3.315898895263672, + "rewards/rejected": -5.035752296447754, + "step": 15720 + }, + { + "epoch": 0.5129082451466791, + "grad_norm": 4.7562055587768555, + "learning_rate": 4.145675150172168e-05, + "logits/chosen": 3.9222118854522705, + "logits/rejected": 3.870427370071411, + "logps/chosen": -358.184814453125, + "logps/rejected": -310.2081604003906, + "loss": 0.4735, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7451518774032593, + "rewards/margins": 2.972731590270996, + "rewards/rejected": -4.717883110046387, + "step": 15740 + }, + { + "epoch": 0.5135599709981996, + "grad_norm": 3.169837474822998, + "learning_rate": 4.144588913872324e-05, + "logits/chosen": 3.733020305633545, + "logits/rejected": 3.7261340618133545, + "logps/chosen": -328.55853271484375, + "logps/rejected": -324.67584228515625, + "loss": 0.3369, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.510311484336853, + "rewards/margins": 3.4878172874450684, + "rewards/rejected": -4.998128890991211, + "step": 15760 + }, + { + "epoch": 0.5142116968497201, + "grad_norm": 0.80384361743927, + "learning_rate": 4.14350267757248e-05, + "logits/chosen": 3.4576804637908936, + "logits/rejected": 3.630077838897705, + "logps/chosen": -337.4125061035156, + "logps/rejected": -307.8397521972656, + "loss": 0.5644, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.894124984741211, + "rewards/margins": 2.7032644748687744, + "rewards/rejected": -4.597389221191406, + "step": 15780 + }, + { + "epoch": 0.5148634227012407, + "grad_norm": 26.183696746826172, + "learning_rate": 4.142416441272635e-05, + "logits/chosen": 3.7028679847717285, + "logits/rejected": 4.079373836517334, + "logps/chosen": -345.3189697265625, + "logps/rejected": -341.0089416503906, + "loss": 0.8864, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3382635116577148, + "rewards/margins": 2.5244483947753906, + "rewards/rejected": -3.8627116680145264, + "step": 15800 + }, + { + "epoch": 0.5155151485527613, + "grad_norm": 0.2101546674966812, + "learning_rate": 4.14133020497279e-05, + "logits/chosen": 3.9090259075164795, + "logits/rejected": 4.099499702453613, + "logps/chosen": -397.5978698730469, + "logps/rejected": -325.7984313964844, + "loss": 0.4454, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2930047512054443, + "rewards/margins": 2.813814640045166, + "rewards/rejected": -4.1068196296691895, + "step": 15820 + }, + { + "epoch": 0.5161668744042819, + "grad_norm": 0.9144209027290344, + "learning_rate": 4.140243968672946e-05, + "logits/chosen": 3.6742610931396484, + "logits/rejected": 3.857326030731201, + "logps/chosen": -346.7101745605469, + "logps/rejected": -321.71588134765625, + "loss": 0.5541, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.362060308456421, + "rewards/margins": 2.3005576133728027, + "rewards/rejected": -3.6626181602478027, + "step": 15840 + }, + { + "epoch": 0.5168186002558024, + "grad_norm": 8.004899024963379, + "learning_rate": 4.139157732373101e-05, + "logits/chosen": 2.9503254890441895, + "logits/rejected": 3.447110652923584, + "logps/chosen": -317.6686096191406, + "logps/rejected": -310.48773193359375, + "loss": 0.558, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0278658866882324, + "rewards/margins": 2.5854299068450928, + "rewards/rejected": -4.613295555114746, + "step": 15860 + }, + { + "epoch": 0.5174703261073229, + "grad_norm": 3.4327127933502197, + "learning_rate": 4.138071496073256e-05, + "logits/chosen": 3.2345898151397705, + "logits/rejected": 3.436661958694458, + "logps/chosen": -345.5423583984375, + "logps/rejected": -289.4623718261719, + "loss": 0.287, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.1116974353790283, + "rewards/margins": 3.3239407539367676, + "rewards/rejected": -5.435637950897217, + "step": 15880 + }, + { + "epoch": 0.5181220519588435, + "grad_norm": 0.05722203478217125, + "learning_rate": 4.1369852597734116e-05, + "logits/chosen": 3.5891666412353516, + "logits/rejected": 3.745936632156372, + "logps/chosen": -336.4872131347656, + "logps/rejected": -303.2713317871094, + "loss": 0.5023, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8372518420219421, + "rewards/margins": 3.6052017211914062, + "rewards/rejected": -4.442452907562256, + "step": 15900 + }, + { + "epoch": 0.518773777810364, + "grad_norm": 1.3361940383911133, + "learning_rate": 4.1358990234735667e-05, + "logits/chosen": 3.7741916179656982, + "logits/rejected": 3.9126033782958984, + "logps/chosen": -375.30963134765625, + "logps/rejected": -360.95269775390625, + "loss": 0.5588, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6261955499649048, + "rewards/margins": 2.698951482772827, + "rewards/rejected": -4.325146675109863, + "step": 15920 + }, + { + "epoch": 0.5194255036618847, + "grad_norm": 3.319549322128296, + "learning_rate": 4.134812787173722e-05, + "logits/chosen": 3.559418201446533, + "logits/rejected": 3.831747055053711, + "logps/chosen": -329.0173034667969, + "logps/rejected": -316.142333984375, + "loss": 0.6256, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1652367115020752, + "rewards/margins": 2.426687717437744, + "rewards/rejected": -3.5919251441955566, + "step": 15940 + }, + { + "epoch": 0.5200772295134052, + "grad_norm": 3.423184871673584, + "learning_rate": 4.133726550873877e-05, + "logits/chosen": 3.844170093536377, + "logits/rejected": 3.902294635772705, + "logps/chosen": -361.55908203125, + "logps/rejected": -318.8043212890625, + "loss": 0.4884, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9674542546272278, + "rewards/margins": 2.491687774658203, + "rewards/rejected": -3.4591422080993652, + "step": 15960 + }, + { + "epoch": 0.5207289553649257, + "grad_norm": 2.1837732791900635, + "learning_rate": 4.1326403145740326e-05, + "logits/chosen": 3.4349913597106934, + "logits/rejected": 3.539803981781006, + "logps/chosen": -306.22222900390625, + "logps/rejected": -297.33209228515625, + "loss": 0.5053, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.549878716468811, + "rewards/margins": 2.0852017402648926, + "rewards/rejected": -3.635080337524414, + "step": 15980 + }, + { + "epoch": 0.5213806812164463, + "grad_norm": 4.013166427612305, + "learning_rate": 4.1315540782741877e-05, + "logits/chosen": 3.7467620372772217, + "logits/rejected": 3.765491485595703, + "logps/chosen": -395.00225830078125, + "logps/rejected": -325.0822448730469, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6641139388084412, + "rewards/margins": 3.4464759826660156, + "rewards/rejected": -4.110589504241943, + "step": 16000 + }, + { + "epoch": 0.5220324070679668, + "grad_norm": 2.477729320526123, + "learning_rate": 4.1305221537893355e-05, + "logits/chosen": 3.649651050567627, + "logits/rejected": 3.7299532890319824, + "logps/chosen": -347.60589599609375, + "logps/rejected": -307.5805358886719, + "loss": 0.3633, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.130279302597046, + "rewards/margins": 2.930504560470581, + "rewards/rejected": -4.060783863067627, + "step": 16020 + }, + { + "epoch": 0.5226841329194875, + "grad_norm": 1.155307650566101, + "learning_rate": 4.1294359174894906e-05, + "logits/chosen": 3.7098402976989746, + "logits/rejected": 3.7663092613220215, + "logps/chosen": -326.49261474609375, + "logps/rejected": -297.4537658691406, + "loss": 0.3503, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5457810163497925, + "rewards/margins": 2.9814963340759277, + "rewards/rejected": -4.527277946472168, + "step": 16040 + }, + { + "epoch": 0.523335858771008, + "grad_norm": 1.2863256931304932, + "learning_rate": 4.128349681189646e-05, + "logits/chosen": 3.577763080596924, + "logits/rejected": 3.603285312652588, + "logps/chosen": -361.15179443359375, + "logps/rejected": -346.88409423828125, + "loss": 0.4232, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.468557596206665, + "rewards/margins": 3.2160587310791016, + "rewards/rejected": -5.684616565704346, + "step": 16060 + }, + { + "epoch": 0.5239875846225285, + "grad_norm": 1.5742031335830688, + "learning_rate": 4.127263444889802e-05, + "logits/chosen": 3.8273441791534424, + "logits/rejected": 3.946514129638672, + "logps/chosen": -369.456787109375, + "logps/rejected": -320.40838623046875, + "loss": 0.5839, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6102749109268188, + "rewards/margins": 2.121644973754883, + "rewards/rejected": -3.731919765472412, + "step": 16080 + }, + { + "epoch": 0.5246393104740491, + "grad_norm": 1.4857606887817383, + "learning_rate": 4.126177208589957e-05, + "logits/chosen": 3.748373508453369, + "logits/rejected": 3.8536133766174316, + "logps/chosen": -347.0051574707031, + "logps/rejected": -285.1726989746094, + "loss": 0.5019, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7852360010147095, + "rewards/margins": 2.492359161376953, + "rewards/rejected": -4.277595520019531, + "step": 16100 + }, + { + "epoch": 0.5252910363255696, + "grad_norm": 1.869759440422058, + "learning_rate": 4.125090972290112e-05, + "logits/chosen": 3.517066240310669, + "logits/rejected": 3.647587537765503, + "logps/chosen": -369.77947998046875, + "logps/rejected": -304.6853942871094, + "loss": 0.4145, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2381502389907837, + "rewards/margins": 3.254039764404297, + "rewards/rejected": -4.492190361022949, + "step": 16120 + }, + { + "epoch": 0.5259427621770902, + "grad_norm": 5.7883195877075195, + "learning_rate": 4.124004735990267e-05, + "logits/chosen": 3.431819438934326, + "logits/rejected": 3.802522659301758, + "logps/chosen": -331.91607666015625, + "logps/rejected": -342.3179626464844, + "loss": 0.3326, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6007773876190186, + "rewards/margins": 3.8641514778137207, + "rewards/rejected": -5.46492862701416, + "step": 16140 + }, + { + "epoch": 0.5265944880286108, + "grad_norm": 1.5990535020828247, + "learning_rate": 4.122918499690423e-05, + "logits/chosen": 3.2471249103546143, + "logits/rejected": 3.332254409790039, + "logps/chosen": -337.47869873046875, + "logps/rejected": -319.64532470703125, + "loss": 0.3834, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5138659477233887, + "rewards/margins": 2.944131374359131, + "rewards/rejected": -4.4579973220825195, + "step": 16160 + }, + { + "epoch": 0.5272462138801314, + "grad_norm": 5.368670463562012, + "learning_rate": 4.121832263390578e-05, + "logits/chosen": 3.498886823654175, + "logits/rejected": 3.732400894165039, + "logps/chosen": -358.1736755371094, + "logps/rejected": -348.6121826171875, + "loss": 0.6187, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7429498434066772, + "rewards/margins": 2.4392049312591553, + "rewards/rejected": -4.182154655456543, + "step": 16180 + }, + { + "epoch": 0.5278979397316519, + "grad_norm": 3.191626787185669, + "learning_rate": 4.120746027090733e-05, + "logits/chosen": 3.4054245948791504, + "logits/rejected": 3.7150180339813232, + "logps/chosen": -321.72998046875, + "logps/rejected": -318.6777648925781, + "loss": 0.3249, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.748748779296875, + "rewards/margins": 3.4150421619415283, + "rewards/rejected": -5.163790702819824, + "step": 16200 + }, + { + "epoch": 0.5285496655831724, + "grad_norm": 2.9237923622131348, + "learning_rate": 4.119659790790889e-05, + "logits/chosen": 3.6551296710968018, + "logits/rejected": 3.739912509918213, + "logps/chosen": -353.5201721191406, + "logps/rejected": -336.2696838378906, + "loss": 0.382, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.286454677581787, + "rewards/margins": 3.3768439292907715, + "rewards/rejected": -5.663298606872559, + "step": 16220 + }, + { + "epoch": 0.529201391434693, + "grad_norm": 2.158386468887329, + "learning_rate": 4.118573554491044e-05, + "logits/chosen": 3.338775157928467, + "logits/rejected": 3.7133991718292236, + "logps/chosen": -299.24639892578125, + "logps/rejected": -289.835693359375, + "loss": 0.5203, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.333423614501953, + "rewards/margins": 2.8682479858398438, + "rewards/rejected": -5.201671600341797, + "step": 16240 + }, + { + "epoch": 0.5298531172862135, + "grad_norm": 2.826942205429077, + "learning_rate": 4.117487318191199e-05, + "logits/chosen": 3.8481624126434326, + "logits/rejected": 3.9103782176971436, + "logps/chosen": -393.7291259765625, + "logps/rejected": -339.6145935058594, + "loss": 0.4775, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5059187412261963, + "rewards/margins": 3.2910754680633545, + "rewards/rejected": -4.796994209289551, + "step": 16260 + }, + { + "epoch": 0.5305048431377342, + "grad_norm": 0.9285250902175903, + "learning_rate": 4.116401081891355e-05, + "logits/chosen": 3.542858839035034, + "logits/rejected": 3.683584213256836, + "logps/chosen": -362.3615417480469, + "logps/rejected": -344.5205993652344, + "loss": 0.4832, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.625976800918579, + "rewards/margins": 2.9759521484375, + "rewards/rejected": -4.6019287109375, + "step": 16280 + }, + { + "epoch": 0.5311565689892547, + "grad_norm": 6.2704081535339355, + "learning_rate": 4.11531484559151e-05, + "logits/chosen": 3.392164945602417, + "logits/rejected": 3.4304299354553223, + "logps/chosen": -344.7274475097656, + "logps/rejected": -329.4820861816406, + "loss": 0.6485, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8537143468856812, + "rewards/margins": 2.373487710952759, + "rewards/rejected": -4.227202415466309, + "step": 16300 + }, + { + "epoch": 0.5318082948407752, + "grad_norm": 2.2185757160186768, + "learning_rate": 4.114228609291666e-05, + "logits/chosen": 3.4158451557159424, + "logits/rejected": 3.5277886390686035, + "logps/chosen": -389.83990478515625, + "logps/rejected": -357.0296936035156, + "loss": 0.3648, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6296427249908447, + "rewards/margins": 3.8004043102264404, + "rewards/rejected": -5.430047512054443, + "step": 16320 + }, + { + "epoch": 0.5324600206922958, + "grad_norm": 0.818878710269928, + "learning_rate": 4.113142372991821e-05, + "logits/chosen": 3.725808620452881, + "logits/rejected": 3.7789597511291504, + "logps/chosen": -335.60809326171875, + "logps/rejected": -277.6468200683594, + "loss": 0.5577, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.979274034500122, + "rewards/margins": 2.969031810760498, + "rewards/rejected": -4.948306083679199, + "step": 16340 + }, + { + "epoch": 0.5331117465438163, + "grad_norm": 0.34118205308914185, + "learning_rate": 4.1120561366919766e-05, + "logits/chosen": 3.353182554244995, + "logits/rejected": 3.533583164215088, + "logps/chosen": -333.0008850097656, + "logps/rejected": -305.0059814453125, + "loss": 0.4247, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.175806999206543, + "rewards/margins": 2.6405856609344482, + "rewards/rejected": -4.81639289855957, + "step": 16360 + }, + { + "epoch": 0.533763472395337, + "grad_norm": 5.517192363739014, + "learning_rate": 4.1109699003921316e-05, + "logits/chosen": 3.83628511428833, + "logits/rejected": 3.8513317108154297, + "logps/chosen": -398.2156066894531, + "logps/rejected": -305.7725524902344, + "loss": 0.496, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.472460389137268, + "rewards/margins": 3.2583861351013184, + "rewards/rejected": -4.730846405029297, + "step": 16380 + }, + { + "epoch": 0.5344151982468575, + "grad_norm": 2.026738166809082, + "learning_rate": 4.109883664092287e-05, + "logits/chosen": 3.659313201904297, + "logits/rejected": 3.726097822189331, + "logps/chosen": -347.2936706542969, + "logps/rejected": -314.71405029296875, + "loss": 0.4944, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6843992471694946, + "rewards/margins": 2.739748477935791, + "rewards/rejected": -4.424147605895996, + "step": 16400 + }, + { + "epoch": 0.535066924098378, + "grad_norm": 2.181086778640747, + "learning_rate": 4.1087974277924425e-05, + "logits/chosen": 3.6710288524627686, + "logits/rejected": 3.7950050830841064, + "logps/chosen": -365.2923583984375, + "logps/rejected": -322.1693420410156, + "loss": 0.5305, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4690061807632446, + "rewards/margins": 2.793692111968994, + "rewards/rejected": -4.262698173522949, + "step": 16420 + }, + { + "epoch": 0.5357186499498986, + "grad_norm": 2.3000593185424805, + "learning_rate": 4.1077111914925976e-05, + "logits/chosen": 3.682410717010498, + "logits/rejected": 3.9131717681884766, + "logps/chosen": -315.7232971191406, + "logps/rejected": -289.01177978515625, + "loss": 0.6651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9497973918914795, + "rewards/margins": 2.81892728805542, + "rewards/rejected": -4.76872444152832, + "step": 16440 + }, + { + "epoch": 0.5363703758014191, + "grad_norm": 2.522037982940674, + "learning_rate": 4.1066249551927526e-05, + "logits/chosen": 3.533628463745117, + "logits/rejected": 3.6392955780029297, + "logps/chosen": -319.1247253417969, + "logps/rejected": -315.39874267578125, + "loss": 0.5313, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3738726377487183, + "rewards/margins": 2.4739222526550293, + "rewards/rejected": -3.847794771194458, + "step": 16460 + }, + { + "epoch": 0.5370221016529397, + "grad_norm": 7.092350482940674, + "learning_rate": 4.1055387188929084e-05, + "logits/chosen": 3.6232452392578125, + "logits/rejected": 3.789480686187744, + "logps/chosen": -356.3876953125, + "logps/rejected": -328.9256286621094, + "loss": 0.3774, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.128910779953003, + "rewards/margins": 3.431725025177002, + "rewards/rejected": -4.560635566711426, + "step": 16480 + }, + { + "epoch": 0.5376738275044602, + "grad_norm": 4.21872091293335, + "learning_rate": 4.1044524825930635e-05, + "logits/chosen": 3.8322136402130127, + "logits/rejected": 3.7086734771728516, + "logps/chosen": -395.21728515625, + "logps/rejected": -331.115234375, + "loss": 0.4768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5650203227996826, + "rewards/margins": 2.39536714553833, + "rewards/rejected": -3.9603874683380127, + "step": 16500 + }, + { + "epoch": 0.5383255533559808, + "grad_norm": 2.5179896354675293, + "learning_rate": 4.1033662462932185e-05, + "logits/chosen": 3.2857718467712402, + "logits/rejected": 3.381528854370117, + "logps/chosen": -316.5406188964844, + "logps/rejected": -296.9846496582031, + "loss": 0.3821, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3202725648880005, + "rewards/margins": 2.867684841156006, + "rewards/rejected": -4.187956809997559, + "step": 16520 + }, + { + "epoch": 0.5389772792075014, + "grad_norm": 4.915157318115234, + "learning_rate": 4.1022800099933736e-05, + "logits/chosen": 3.8155319690704346, + "logits/rejected": 3.875534772872925, + "logps/chosen": -341.617431640625, + "logps/rejected": -295.5185852050781, + "loss": 0.3071, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7325659990310669, + "rewards/margins": 2.9060864448547363, + "rewards/rejected": -3.6386520862579346, + "step": 16540 + }, + { + "epoch": 0.5396290050590219, + "grad_norm": 9.574320793151855, + "learning_rate": 4.1011937736935294e-05, + "logits/chosen": 3.291896104812622, + "logits/rejected": 3.3649649620056152, + "logps/chosen": -302.174072265625, + "logps/rejected": -323.4270935058594, + "loss": 0.756, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7479221820831299, + "rewards/margins": 2.310025215148926, + "rewards/rejected": -4.057946681976318, + "step": 16560 + }, + { + "epoch": 0.5402807309105425, + "grad_norm": 0.06672211736440659, + "learning_rate": 4.100107537393685e-05, + "logits/chosen": 3.3812053203582764, + "logits/rejected": 3.5315048694610596, + "logps/chosen": -326.1324768066406, + "logps/rejected": -309.76910400390625, + "loss": 0.336, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.642348289489746, + "rewards/margins": 3.2371697425842285, + "rewards/rejected": -4.879518032073975, + "step": 16580 + }, + { + "epoch": 0.540932456762063, + "grad_norm": 2.910672187805176, + "learning_rate": 4.09902130109384e-05, + "logits/chosen": 3.561270236968994, + "logits/rejected": 3.5272536277770996, + "logps/chosen": -368.1986999511719, + "logps/rejected": -355.74029541015625, + "loss": 0.4268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0716097354888916, + "rewards/margins": 2.6593213081359863, + "rewards/rejected": -3.730931043624878, + "step": 16600 + }, + { + "epoch": 0.5415841826135835, + "grad_norm": 3.409862518310547, + "learning_rate": 4.097935064793996e-05, + "logits/chosen": 3.4966647624969482, + "logits/rejected": 3.716545581817627, + "logps/chosen": -327.6935729980469, + "logps/rejected": -308.0021057128906, + "loss": 0.4825, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9047467708587646, + "rewards/margins": 2.195730686187744, + "rewards/rejected": -4.100477695465088, + "step": 16620 + }, + { + "epoch": 0.5422359084651042, + "grad_norm": 0.7334257364273071, + "learning_rate": 4.096848828494151e-05, + "logits/chosen": 3.658825635910034, + "logits/rejected": 3.7573211193084717, + "logps/chosen": -359.05780029296875, + "logps/rejected": -354.2613525390625, + "loss": 0.3774, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.091545343399048, + "rewards/margins": 3.177839517593384, + "rewards/rejected": -5.269384384155273, + "step": 16640 + }, + { + "epoch": 0.5428876343166247, + "grad_norm": 2.2928388118743896, + "learning_rate": 4.095762592194306e-05, + "logits/chosen": 3.469057559967041, + "logits/rejected": 3.4873008728027344, + "logps/chosen": -310.2811279296875, + "logps/rejected": -326.82183837890625, + "loss": 0.3781, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7444870471954346, + "rewards/margins": 3.2072784900665283, + "rewards/rejected": -4.951765537261963, + "step": 16660 + }, + { + "epoch": 0.5435393601681453, + "grad_norm": 1.9679019451141357, + "learning_rate": 4.094676355894462e-05, + "logits/chosen": 3.0370547771453857, + "logits/rejected": 3.1595029830932617, + "logps/chosen": -323.564697265625, + "logps/rejected": -265.3701477050781, + "loss": 0.2853, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.849034309387207, + "rewards/margins": 2.9134864807128906, + "rewards/rejected": -4.762520790100098, + "step": 16680 + }, + { + "epoch": 0.5441910860196658, + "grad_norm": 2.631176710128784, + "learning_rate": 4.093590119594617e-05, + "logits/chosen": 3.3245224952697754, + "logits/rejected": 3.3731250762939453, + "logps/chosen": -328.8118896484375, + "logps/rejected": -340.6634826660156, + "loss": 0.4646, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5418332815170288, + "rewards/margins": 2.792855739593506, + "rewards/rejected": -4.334689140319824, + "step": 16700 + }, + { + "epoch": 0.5448428118711864, + "grad_norm": 0.8185393214225769, + "learning_rate": 4.092503883294772e-05, + "logits/chosen": 3.4782772064208984, + "logits/rejected": 3.578273296356201, + "logps/chosen": -332.85888671875, + "logps/rejected": -319.64996337890625, + "loss": 0.3906, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.723750352859497, + "rewards/margins": 3.1395840644836426, + "rewards/rejected": -4.863334655761719, + "step": 16720 + }, + { + "epoch": 0.545494537722707, + "grad_norm": 2.3405966758728027, + "learning_rate": 4.091417646994927e-05, + "logits/chosen": 3.684209108352661, + "logits/rejected": 3.87311053276062, + "logps/chosen": -377.6273498535156, + "logps/rejected": -331.4395446777344, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1375551223754883, + "rewards/margins": 2.8587472438812256, + "rewards/rejected": -4.996302604675293, + "step": 16740 + }, + { + "epoch": 0.5461462635742275, + "grad_norm": 0.6701863408088684, + "learning_rate": 4.090331410695083e-05, + "logits/chosen": 3.316593885421753, + "logits/rejected": 3.480614185333252, + "logps/chosen": -353.3307189941406, + "logps/rejected": -280.45703125, + "loss": 0.7582, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5938360691070557, + "rewards/margins": 2.0317416191101074, + "rewards/rejected": -3.625577926635742, + "step": 16760 + }, + { + "epoch": 0.5467979894257481, + "grad_norm": 5.881904125213623, + "learning_rate": 4.089245174395238e-05, + "logits/chosen": 3.6448585987091064, + "logits/rejected": 3.8429951667785645, + "logps/chosen": -339.60321044921875, + "logps/rejected": -359.9998474121094, + "loss": 0.5717, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.07350798696279526, + "rewards/margins": 3.3085951805114746, + "rewards/rejected": -3.3821029663085938, + "step": 16780 + }, + { + "epoch": 0.5474497152772686, + "grad_norm": 1.1428108215332031, + "learning_rate": 4.088158938095393e-05, + "logits/chosen": 3.6344501972198486, + "logits/rejected": 3.7934017181396484, + "logps/chosen": -374.9032287597656, + "logps/rejected": -359.20452880859375, + "loss": 0.332, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.1779035329818726, + "rewards/margins": 3.775850772857666, + "rewards/rejected": -4.95375394821167, + "step": 16800 + }, + { + "epoch": 0.5481014411287892, + "grad_norm": 2.356426239013672, + "learning_rate": 4.087072701795549e-05, + "logits/chosen": 3.4311001300811768, + "logits/rejected": 3.5605225563049316, + "logps/chosen": -359.0180969238281, + "logps/rejected": -318.30999755859375, + "loss": 0.3621, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8283889293670654, + "rewards/margins": 2.9773643016815186, + "rewards/rejected": -4.805753231048584, + "step": 16820 + }, + { + "epoch": 0.5487531669803097, + "grad_norm": 2.710925340652466, + "learning_rate": 4.085986465495704e-05, + "logits/chosen": 3.7343127727508545, + "logits/rejected": 3.9404385089874268, + "logps/chosen": -377.3125915527344, + "logps/rejected": -312.35015869140625, + "loss": 0.6075, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7766954898834229, + "rewards/margins": 2.1392338275909424, + "rewards/rejected": -3.9159293174743652, + "step": 16840 + }, + { + "epoch": 0.5494048928318302, + "grad_norm": 2.772966146469116, + "learning_rate": 4.0849002291958596e-05, + "logits/chosen": 3.6029064655303955, + "logits/rejected": 3.7143890857696533, + "logps/chosen": -306.6961975097656, + "logps/rejected": -297.99432373046875, + "loss": 0.587, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7526648044586182, + "rewards/margins": 2.3926026821136475, + "rewards/rejected": -4.145267009735107, + "step": 16860 + }, + { + "epoch": 0.5500566186833509, + "grad_norm": 3.623116970062256, + "learning_rate": 4.083813992896015e-05, + "logits/chosen": 3.7068862915039062, + "logits/rejected": 3.7985644340515137, + "logps/chosen": -345.62042236328125, + "logps/rejected": -264.6666259765625, + "loss": 0.4186, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6022199392318726, + "rewards/margins": 2.474282741546631, + "rewards/rejected": -4.076502799987793, + "step": 16880 + }, + { + "epoch": 0.5507083445348714, + "grad_norm": 3.271564483642578, + "learning_rate": 4.0827277565961705e-05, + "logits/chosen": 3.340813159942627, + "logits/rejected": 3.9020438194274902, + "logps/chosen": -314.8501281738281, + "logps/rejected": -288.78936767578125, + "loss": 0.4077, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8934276103973389, + "rewards/margins": 2.882622241973877, + "rewards/rejected": -4.776049613952637, + "step": 16900 + }, + { + "epoch": 0.551360070386392, + "grad_norm": 5.812217712402344, + "learning_rate": 4.0816415202963255e-05, + "logits/chosen": 3.8133232593536377, + "logits/rejected": 3.7995574474334717, + "logps/chosen": -375.29351806640625, + "logps/rejected": -301.70697021484375, + "loss": 0.4793, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.349822998046875, + "rewards/margins": 2.773627519607544, + "rewards/rejected": -5.12345027923584, + "step": 16920 + }, + { + "epoch": 0.5520117962379125, + "grad_norm": 0.7375978231430054, + "learning_rate": 4.0805552839964806e-05, + "logits/chosen": 3.353435516357422, + "logits/rejected": 3.6642098426818848, + "logps/chosen": -375.56781005859375, + "logps/rejected": -322.7897644042969, + "loss": 0.28, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1606662273406982, + "rewards/margins": 3.669294834136963, + "rewards/rejected": -5.829960823059082, + "step": 16940 + }, + { + "epoch": 0.552663522089433, + "grad_norm": 0.2454719841480255, + "learning_rate": 4.0794690476966364e-05, + "logits/chosen": 3.5146191120147705, + "logits/rejected": 3.941361665725708, + "logps/chosen": -321.4756164550781, + "logps/rejected": -310.04296875, + "loss": 0.5943, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.340620517730713, + "rewards/margins": 3.1345906257629395, + "rewards/rejected": -5.475211143493652, + "step": 16960 + }, + { + "epoch": 0.5533152479409537, + "grad_norm": 4.969719409942627, + "learning_rate": 4.0783828113967914e-05, + "logits/chosen": 3.540800094604492, + "logits/rejected": 3.7951087951660156, + "logps/chosen": -344.8605041503906, + "logps/rejected": -293.1742248535156, + "loss": 0.4029, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8109657764434814, + "rewards/margins": 3.3112895488739014, + "rewards/rejected": -5.122254848480225, + "step": 16980 + }, + { + "epoch": 0.5539669737924742, + "grad_norm": 1.8236699104309082, + "learning_rate": 4.0772965750969465e-05, + "logits/chosen": 3.5320637226104736, + "logits/rejected": 3.861774444580078, + "logps/chosen": -342.4463806152344, + "logps/rejected": -304.013916015625, + "loss": 0.3386, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9659630060195923, + "rewards/margins": 3.0011353492736816, + "rewards/rejected": -4.967098236083984, + "step": 17000 + }, + { + "epoch": 0.5546186996439948, + "grad_norm": 1.152929663658142, + "learning_rate": 4.076210338797102e-05, + "logits/chosen": 3.5347347259521484, + "logits/rejected": 3.7421059608459473, + "logps/chosen": -326.6462707519531, + "logps/rejected": -328.9617614746094, + "loss": 0.5782, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.152693748474121, + "rewards/margins": 2.8025355339050293, + "rewards/rejected": -4.95522928237915, + "step": 17020 + }, + { + "epoch": 0.5552704254955153, + "grad_norm": 2.274355173110962, + "learning_rate": 4.0751241024972574e-05, + "logits/chosen": 3.8357536792755127, + "logits/rejected": 3.980527877807617, + "logps/chosen": -354.13714599609375, + "logps/rejected": -295.6404724121094, + "loss": 0.5078, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.443321704864502, + "rewards/margins": 2.6046719551086426, + "rewards/rejected": -5.0479936599731445, + "step": 17040 + }, + { + "epoch": 0.5559221513470358, + "grad_norm": 3.769198179244995, + "learning_rate": 4.0740378661974124e-05, + "logits/chosen": 3.968676805496216, + "logits/rejected": 3.983034133911133, + "logps/chosen": -336.93890380859375, + "logps/rejected": -334.6183776855469, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.129159450531006, + "rewards/margins": 2.718789577484131, + "rewards/rejected": -4.847949028015137, + "step": 17060 + }, + { + "epoch": 0.5565738771985564, + "grad_norm": 0.3462807238101959, + "learning_rate": 4.0729516298975675e-05, + "logits/chosen": 3.474073886871338, + "logits/rejected": 3.91508150100708, + "logps/chosen": -346.1643981933594, + "logps/rejected": -334.39959716796875, + "loss": 0.3838, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6531264781951904, + "rewards/margins": 3.431230068206787, + "rewards/rejected": -5.084356784820557, + "step": 17080 + }, + { + "epoch": 0.557225603050077, + "grad_norm": 0.09273256361484528, + "learning_rate": 4.071865393597723e-05, + "logits/chosen": 3.7312121391296387, + "logits/rejected": 3.9171993732452393, + "logps/chosen": -358.0005798339844, + "logps/rejected": -304.7071533203125, + "loss": 0.5557, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9384613037109375, + "rewards/margins": 2.985762119293213, + "rewards/rejected": -4.924223899841309, + "step": 17100 + }, + { + "epoch": 0.5578773289015976, + "grad_norm": 1.8675979375839233, + "learning_rate": 4.070779157297879e-05, + "logits/chosen": 3.656787395477295, + "logits/rejected": 3.8672378063201904, + "logps/chosen": -348.7405090332031, + "logps/rejected": -303.9360046386719, + "loss": 0.2734, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.691070318222046, + "rewards/margins": 3.901777744293213, + "rewards/rejected": -5.592848300933838, + "step": 17120 + }, + { + "epoch": 0.5585290547531181, + "grad_norm": 0.9890944361686707, + "learning_rate": 4.069692920998034e-05, + "logits/chosen": 3.7272510528564453, + "logits/rejected": 3.9403293132781982, + "logps/chosen": -332.17962646484375, + "logps/rejected": -308.3672790527344, + "loss": 0.5157, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8035573959350586, + "rewards/margins": 2.8520374298095703, + "rewards/rejected": -4.655594825744629, + "step": 17140 + }, + { + "epoch": 0.5591807806046386, + "grad_norm": 2.8779852390289307, + "learning_rate": 4.06860668469819e-05, + "logits/chosen": 3.8100593090057373, + "logits/rejected": 3.9946494102478027, + "logps/chosen": -366.68756103515625, + "logps/rejected": -294.27044677734375, + "loss": 0.3637, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.184539556503296, + "rewards/margins": 3.2165825366973877, + "rewards/rejected": -4.401122093200684, + "step": 17160 + }, + { + "epoch": 0.5598325064561592, + "grad_norm": 0.3714311122894287, + "learning_rate": 4.067520448398345e-05, + "logits/chosen": 3.865145206451416, + "logits/rejected": 3.8527369499206543, + "logps/chosen": -366.4514465332031, + "logps/rejected": -356.845947265625, + "loss": 0.6531, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6979496479034424, + "rewards/margins": 2.748065233230591, + "rewards/rejected": -4.446014404296875, + "step": 17180 + }, + { + "epoch": 0.5604842323076797, + "grad_norm": 1.5265766382217407, + "learning_rate": 4.0664342120985e-05, + "logits/chosen": 3.660073757171631, + "logits/rejected": 4.0379438400268555, + "logps/chosen": -359.2120361328125, + "logps/rejected": -345.4322814941406, + "loss": 0.457, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8405351638793945, + "rewards/margins": 2.496556520462036, + "rewards/rejected": -4.33709192276001, + "step": 17200 + }, + { + "epoch": 0.5611359581592004, + "grad_norm": 1.6934906244277954, + "learning_rate": 4.065347975798656e-05, + "logits/chosen": 4.09769344329834, + "logits/rejected": 4.173195838928223, + "logps/chosen": -341.73187255859375, + "logps/rejected": -343.60235595703125, + "loss": 0.5845, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9594299793243408, + "rewards/margins": 2.2126736640930176, + "rewards/rejected": -4.172102928161621, + "step": 17220 + }, + { + "epoch": 0.5617876840107209, + "grad_norm": 2.67818546295166, + "learning_rate": 4.064261739498811e-05, + "logits/chosen": 3.577247142791748, + "logits/rejected": 3.792029619216919, + "logps/chosen": -345.70648193359375, + "logps/rejected": -293.4059143066406, + "loss": 0.4722, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2796885967254639, + "rewards/margins": 3.5415942668914795, + "rewards/rejected": -4.821282863616943, + "step": 17240 + }, + { + "epoch": 0.5624394098622415, + "grad_norm": 6.768702030181885, + "learning_rate": 4.063175503198966e-05, + "logits/chosen": 3.9642555713653564, + "logits/rejected": 4.333308696746826, + "logps/chosen": -348.0759582519531, + "logps/rejected": -338.28912353515625, + "loss": 0.4802, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7338310480117798, + "rewards/margins": 2.632685899734497, + "rewards/rejected": -4.366517066955566, + "step": 17260 + }, + { + "epoch": 0.563091135713762, + "grad_norm": 0.4019780457019806, + "learning_rate": 4.062089266899121e-05, + "logits/chosen": 3.9689247608184814, + "logits/rejected": 3.9149041175842285, + "logps/chosen": -371.96832275390625, + "logps/rejected": -384.68975830078125, + "loss": 0.5485, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7645759582519531, + "rewards/margins": 2.517930269241333, + "rewards/rejected": -4.282505989074707, + "step": 17280 + }, + { + "epoch": 0.5637428615652825, + "grad_norm": 1.9647817611694336, + "learning_rate": 4.061003030599277e-05, + "logits/chosen": 3.540039539337158, + "logits/rejected": 3.8697190284729004, + "logps/chosen": -300.17437744140625, + "logps/rejected": -323.9010925292969, + "loss": 0.4784, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.933014154434204, + "rewards/margins": 3.0064451694488525, + "rewards/rejected": -4.939459800720215, + "step": 17300 + }, + { + "epoch": 0.5643945874168032, + "grad_norm": 3.6428775787353516, + "learning_rate": 4.059916794299432e-05, + "logits/chosen": 3.6496143341064453, + "logits/rejected": 3.9401772022247314, + "logps/chosen": -358.901123046875, + "logps/rejected": -331.24420166015625, + "loss": 0.4823, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6951717138290405, + "rewards/margins": 2.708016872406006, + "rewards/rejected": -4.403188705444336, + "step": 17320 + }, + { + "epoch": 0.5650463132683237, + "grad_norm": 4.074242115020752, + "learning_rate": 4.058830557999587e-05, + "logits/chosen": 3.4539589881896973, + "logits/rejected": 3.7641990184783936, + "logps/chosen": -305.31463623046875, + "logps/rejected": -282.3783874511719, + "loss": 0.558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8405523300170898, + "rewards/margins": 1.7144291400909424, + "rewards/rejected": -3.554981231689453, + "step": 17340 + }, + { + "epoch": 0.5656980391198443, + "grad_norm": 1.5191537141799927, + "learning_rate": 4.057744321699743e-05, + "logits/chosen": 3.391869306564331, + "logits/rejected": 3.679645538330078, + "logps/chosen": -299.1665954589844, + "logps/rejected": -291.9519348144531, + "loss": 0.3409, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4574081897735596, + "rewards/margins": 2.862790584564209, + "rewards/rejected": -5.320198059082031, + "step": 17360 + }, + { + "epoch": 0.5663497649713648, + "grad_norm": 5.320065975189209, + "learning_rate": 4.0566580853998984e-05, + "logits/chosen": 3.441061019897461, + "logits/rejected": 3.641382932662964, + "logps/chosen": -337.8479919433594, + "logps/rejected": -331.2164001464844, + "loss": 0.6094, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2253448963165283, + "rewards/margins": 2.298893451690674, + "rewards/rejected": -4.524237632751465, + "step": 17380 + }, + { + "epoch": 0.5670014908228853, + "grad_norm": 1.3383868932724, + "learning_rate": 4.0555718491000535e-05, + "logits/chosen": 3.4501731395721436, + "logits/rejected": 3.4527008533477783, + "logps/chosen": -328.55694580078125, + "logps/rejected": -329.4349670410156, + "loss": 0.5045, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.176740884780884, + "rewards/margins": 3.4652953147888184, + "rewards/rejected": -5.642036437988281, + "step": 17400 + }, + { + "epoch": 0.5676532166744059, + "grad_norm": 2.023404836654663, + "learning_rate": 4.054485612800209e-05, + "logits/chosen": 3.6103062629699707, + "logits/rejected": 3.8097338676452637, + "logps/chosen": -306.9372863769531, + "logps/rejected": -330.6369323730469, + "loss": 0.6259, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2131025791168213, + "rewards/margins": 2.1943199634552, + "rewards/rejected": -4.4074225425720215, + "step": 17420 + }, + { + "epoch": 0.5683049425259264, + "grad_norm": 2.5765914916992188, + "learning_rate": 4.0533993765003643e-05, + "logits/chosen": 3.260270357131958, + "logits/rejected": 3.305863857269287, + "logps/chosen": -282.70654296875, + "logps/rejected": -324.36492919921875, + "loss": 0.4266, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6937662363052368, + "rewards/margins": 3.0278351306915283, + "rewards/rejected": -4.7216010093688965, + "step": 17440 + }, + { + "epoch": 0.5689566683774471, + "grad_norm": 3.7042946815490723, + "learning_rate": 4.0523131402005194e-05, + "logits/chosen": 3.1352782249450684, + "logits/rejected": 3.3643975257873535, + "logps/chosen": -339.87774658203125, + "logps/rejected": -340.91790771484375, + "loss": 0.5062, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0296101570129395, + "rewards/margins": 3.1282060146331787, + "rewards/rejected": -5.157816410064697, + "step": 17460 + }, + { + "epoch": 0.5696083942289676, + "grad_norm": 3.2569921016693115, + "learning_rate": 4.0512269039006745e-05, + "logits/chosen": 3.2334465980529785, + "logits/rejected": 3.3571979999542236, + "logps/chosen": -351.53936767578125, + "logps/rejected": -315.5303039550781, + "loss": 0.4443, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.362166404724121, + "rewards/margins": 2.603855609893799, + "rewards/rejected": -4.966022491455078, + "step": 17480 + }, + { + "epoch": 0.5702601200804881, + "grad_norm": 3.8433151245117188, + "learning_rate": 4.05014066760083e-05, + "logits/chosen": 3.4125125408172607, + "logits/rejected": 3.5275940895080566, + "logps/chosen": -300.59588623046875, + "logps/rejected": -305.61981201171875, + "loss": 0.5771, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2861106395721436, + "rewards/margins": 2.303915023803711, + "rewards/rejected": -4.590025901794434, + "step": 17500 + }, + { + "epoch": 0.5709118459320087, + "grad_norm": 1.978237271308899, + "learning_rate": 4.049054431300985e-05, + "logits/chosen": 3.641714572906494, + "logits/rejected": 3.8399243354797363, + "logps/chosen": -381.3780822753906, + "logps/rejected": -290.4139099121094, + "loss": 0.553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5450420379638672, + "rewards/margins": 2.7729978561401367, + "rewards/rejected": -4.318039894104004, + "step": 17520 + }, + { + "epoch": 0.5715635717835292, + "grad_norm": 22.04918670654297, + "learning_rate": 4.0479681950011404e-05, + "logits/chosen": 3.574471950531006, + "logits/rejected": 3.809835910797119, + "logps/chosen": -361.33099365234375, + "logps/rejected": -330.52716064453125, + "loss": 0.7459, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.600752830505371, + "rewards/margins": 2.591087818145752, + "rewards/rejected": -4.191840648651123, + "step": 17540 + }, + { + "epoch": 0.5722152976350499, + "grad_norm": 4.3850274085998535, + "learning_rate": 4.046881958701296e-05, + "logits/chosen": 3.642549514770508, + "logits/rejected": 3.685814619064331, + "logps/chosen": -371.5742492675781, + "logps/rejected": -336.50140380859375, + "loss": 0.3817, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6984097957611084, + "rewards/margins": 3.10876202583313, + "rewards/rejected": -4.807171821594238, + "step": 17560 + }, + { + "epoch": 0.5728670234865704, + "grad_norm": 1.562686562538147, + "learning_rate": 4.045795722401451e-05, + "logits/chosen": 3.186112403869629, + "logits/rejected": 3.5212273597717285, + "logps/chosen": -324.05267333984375, + "logps/rejected": -280.87445068359375, + "loss": 0.4419, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0529239177703857, + "rewards/margins": 2.6884264945983887, + "rewards/rejected": -4.7413506507873535, + "step": 17580 + }, + { + "epoch": 0.5735187493380909, + "grad_norm": 1.038318395614624, + "learning_rate": 4.044709486101606e-05, + "logits/chosen": 3.437678813934326, + "logits/rejected": 3.7396507263183594, + "logps/chosen": -319.87554931640625, + "logps/rejected": -316.0446472167969, + "loss": 0.347, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1178579330444336, + "rewards/margins": 3.5060646533966064, + "rewards/rejected": -5.623922824859619, + "step": 17600 + }, + { + "epoch": 0.5741704751896115, + "grad_norm": 3.2043070793151855, + "learning_rate": 4.043623249801762e-05, + "logits/chosen": 3.503593921661377, + "logits/rejected": 3.4404499530792236, + "logps/chosen": -301.3606872558594, + "logps/rejected": -282.5544128417969, + "loss": 0.5674, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9794962406158447, + "rewards/margins": 1.8226343393325806, + "rewards/rejected": -3.8021304607391357, + "step": 17620 + }, + { + "epoch": 0.574822201041132, + "grad_norm": 3.1187689304351807, + "learning_rate": 4.042537013501917e-05, + "logits/chosen": 3.4095230102539062, + "logits/rejected": 3.519113540649414, + "logps/chosen": -336.3672790527344, + "logps/rejected": -322.92633056640625, + "loss": 0.5398, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3187344074249268, + "rewards/margins": 2.139423131942749, + "rewards/rejected": -4.458158016204834, + "step": 17640 + }, + { + "epoch": 0.5754739268926526, + "grad_norm": 4.452084541320801, + "learning_rate": 4.041450777202073e-05, + "logits/chosen": 3.11897611618042, + "logits/rejected": 3.544090747833252, + "logps/chosen": -307.207763671875, + "logps/rejected": -301.516845703125, + "loss": 0.481, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.120513916015625, + "rewards/margins": 2.756035327911377, + "rewards/rejected": -4.87654972076416, + "step": 17660 + }, + { + "epoch": 0.5761256527441732, + "grad_norm": 9.495017051696777, + "learning_rate": 4.040364540902228e-05, + "logits/chosen": 3.2818655967712402, + "logits/rejected": 3.6082897186279297, + "logps/chosen": -335.900634765625, + "logps/rejected": -311.44488525390625, + "loss": 0.4248, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3702540397644043, + "rewards/margins": 2.7787344455718994, + "rewards/rejected": -5.148988246917725, + "step": 17680 + }, + { + "epoch": 0.5767773785956937, + "grad_norm": 0.2564842402935028, + "learning_rate": 4.039278304602384e-05, + "logits/chosen": 3.3069815635681152, + "logits/rejected": 3.5689117908477783, + "logps/chosen": -340.54449462890625, + "logps/rejected": -336.52899169921875, + "loss": 0.4804, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.358238697052002, + "rewards/margins": 2.4655864238739014, + "rewards/rejected": -4.823824882507324, + "step": 17700 + }, + { + "epoch": 0.5774291044472143, + "grad_norm": 6.34815788269043, + "learning_rate": 4.038192068302539e-05, + "logits/chosen": 3.075221300125122, + "logits/rejected": 3.35197377204895, + "logps/chosen": -313.1931457519531, + "logps/rejected": -291.43927001953125, + "loss": 0.5126, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.993939757347107, + "rewards/margins": 2.6112704277038574, + "rewards/rejected": -4.605210304260254, + "step": 17720 + }, + { + "epoch": 0.5780808302987348, + "grad_norm": 1.5126179456710815, + "learning_rate": 4.037105832002694e-05, + "logits/chosen": 3.4160754680633545, + "logits/rejected": 3.7027747631073, + "logps/chosen": -341.41058349609375, + "logps/rejected": -323.6886901855469, + "loss": 0.5121, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4512180089950562, + "rewards/margins": 2.9465274810791016, + "rewards/rejected": -4.397745132446289, + "step": 17740 + }, + { + "epoch": 0.5787325561502554, + "grad_norm": 3.955166816711426, + "learning_rate": 4.03601959570285e-05, + "logits/chosen": 3.7477900981903076, + "logits/rejected": 3.910456895828247, + "logps/chosen": -340.849853515625, + "logps/rejected": -315.1708068847656, + "loss": 0.4449, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3211076259613037, + "rewards/margins": 3.3275184631347656, + "rewards/rejected": -4.648626327514648, + "step": 17760 + }, + { + "epoch": 0.5793842820017759, + "grad_norm": 1.9447165727615356, + "learning_rate": 4.034933359403005e-05, + "logits/chosen": 3.4632506370544434, + "logits/rejected": 3.6600565910339355, + "logps/chosen": -340.71002197265625, + "logps/rejected": -287.7596740722656, + "loss": 0.3343, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9385568499565125, + "rewards/margins": 2.724879026412964, + "rewards/rejected": -3.6634361743927, + "step": 17780 + }, + { + "epoch": 0.5800360078532966, + "grad_norm": 1.6213334798812866, + "learning_rate": 4.03384712310316e-05, + "logits/chosen": 3.9396347999572754, + "logits/rejected": 3.9338021278381348, + "logps/chosen": -332.6965026855469, + "logps/rejected": -292.20867919921875, + "loss": 0.6393, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4896384477615356, + "rewards/margins": 2.6620571613311768, + "rewards/rejected": -4.151695728302002, + "step": 17800 + }, + { + "epoch": 0.5806877337048171, + "grad_norm": 0.8459265232086182, + "learning_rate": 4.0327608868033156e-05, + "logits/chosen": 3.714707136154175, + "logits/rejected": 3.869851589202881, + "logps/chosen": -360.87835693359375, + "logps/rejected": -364.0660705566406, + "loss": 0.4009, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2614670991897583, + "rewards/margins": 2.9819159507751465, + "rewards/rejected": -4.243382930755615, + "step": 17820 + }, + { + "epoch": 0.5813394595563376, + "grad_norm": 4.475644111633301, + "learning_rate": 4.0316746505034707e-05, + "logits/chosen": 3.8674893379211426, + "logits/rejected": 3.8894851207733154, + "logps/chosen": -368.8748474121094, + "logps/rejected": -342.51806640625, + "loss": 0.4224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9671932458877563, + "rewards/margins": 3.215308427810669, + "rewards/rejected": -4.182501792907715, + "step": 17840 + }, + { + "epoch": 0.5819911854078582, + "grad_norm": 2.7617430686950684, + "learning_rate": 4.030588414203626e-05, + "logits/chosen": 3.2954788208007812, + "logits/rejected": 3.656978130340576, + "logps/chosen": -368.43133544921875, + "logps/rejected": -334.18914794921875, + "loss": 0.3738, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2788447141647339, + "rewards/margins": 3.168837070465088, + "rewards/rejected": -4.447681903839111, + "step": 17860 + }, + { + "epoch": 0.5826429112593787, + "grad_norm": 0.9533687233924866, + "learning_rate": 4.0295021779037815e-05, + "logits/chosen": 3.5746941566467285, + "logits/rejected": 3.7224490642547607, + "logps/chosen": -319.62762451171875, + "logps/rejected": -303.396728515625, + "loss": 0.3685, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.1220908164978027, + "rewards/margins": 2.8761866092681885, + "rewards/rejected": -3.9982776641845703, + "step": 17880 + }, + { + "epoch": 0.5832946371108993, + "grad_norm": 5.040199279785156, + "learning_rate": 4.0284159416039366e-05, + "logits/chosen": 3.4782111644744873, + "logits/rejected": 3.6831181049346924, + "logps/chosen": -332.53826904296875, + "logps/rejected": -301.15753173828125, + "loss": 0.3674, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9700590968132019, + "rewards/margins": 3.4207592010498047, + "rewards/rejected": -4.390818119049072, + "step": 17900 + }, + { + "epoch": 0.5839463629624199, + "grad_norm": 2.3014185428619385, + "learning_rate": 4.027329705304092e-05, + "logits/chosen": 2.990863084793091, + "logits/rejected": 3.1450207233428955, + "logps/chosen": -310.9618835449219, + "logps/rejected": -297.8345642089844, + "loss": 0.5574, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4787907600402832, + "rewards/margins": 2.175901412963867, + "rewards/rejected": -3.6546921730041504, + "step": 17920 + }, + { + "epoch": 0.5845980888139404, + "grad_norm": 1.0643277168273926, + "learning_rate": 4.0262434690042474e-05, + "logits/chosen": 2.9953103065490723, + "logits/rejected": 3.1037497520446777, + "logps/chosen": -300.0231628417969, + "logps/rejected": -284.1839294433594, + "loss": 0.4283, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8790050745010376, + "rewards/margins": 3.004833221435547, + "rewards/rejected": -4.883838653564453, + "step": 17940 + }, + { + "epoch": 0.585249814665461, + "grad_norm": 2.3158254623413086, + "learning_rate": 4.025157232704403e-05, + "logits/chosen": 3.2110774517059326, + "logits/rejected": 3.2449326515197754, + "logps/chosen": -317.14166259765625, + "logps/rejected": -352.529052734375, + "loss": 0.394, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4705578088760376, + "rewards/margins": 3.2474358081817627, + "rewards/rejected": -4.717993259429932, + "step": 17960 + }, + { + "epoch": 0.5859015405169815, + "grad_norm": 3.332915782928467, + "learning_rate": 4.024070996404558e-05, + "logits/chosen": 3.547272205352783, + "logits/rejected": 3.560359239578247, + "logps/chosen": -335.4058532714844, + "logps/rejected": -315.2486267089844, + "loss": 0.5035, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.056124210357666, + "rewards/margins": 2.877519130706787, + "rewards/rejected": -4.933643341064453, + "step": 17980 + }, + { + "epoch": 0.5865532663685021, + "grad_norm": 1.863903522491455, + "learning_rate": 4.022984760104713e-05, + "logits/chosen": 2.771902561187744, + "logits/rejected": 2.8957951068878174, + "logps/chosen": -324.15118408203125, + "logps/rejected": -321.2676696777344, + "loss": 0.5395, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8934465646743774, + "rewards/margins": 3.1752521991729736, + "rewards/rejected": -5.068698406219482, + "step": 18000 + }, + { + "epoch": 0.5865532663685021, + "eval_logits/chosen": 3.310800552368164, + "eval_logits/rejected": 3.4121129512786865, + "eval_logps/chosen": -375.6473693847656, + "eval_logps/rejected": -351.6201171875, + "eval_loss": 0.436347097158432, + "eval_rewards/accuracies": 0.8298138976097107, + "eval_rewards/chosen": -2.1066739559173584, + "eval_rewards/margins": 3.432015895843506, + "eval_rewards/rejected": -5.538689613342285, + "eval_runtime": 3545.6886, + "eval_samples_per_second": 3.152, + "eval_steps_per_second": 3.152, + "step": 18000 + }, + { + "epoch": 0.5872049922200226, + "grad_norm": 2.5957157611846924, + "learning_rate": 4.021898523804869e-05, + "logits/chosen": 3.2940821647644043, + "logits/rejected": 3.4354755878448486, + "logps/chosen": -322.11614990234375, + "logps/rejected": -331.75445556640625, + "loss": 0.7567, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6956398487091064, + "rewards/margins": 2.5553596019744873, + "rewards/rejected": -4.250999450683594, + "step": 18020 + }, + { + "epoch": 0.5878567180715432, + "grad_norm": 4.038511753082275, + "learning_rate": 4.020812287505024e-05, + "logits/chosen": 2.9881608486175537, + "logits/rejected": 3.316256046295166, + "logps/chosen": -302.816162109375, + "logps/rejected": -255.8801727294922, + "loss": 0.3763, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9304767847061157, + "rewards/margins": 2.912667989730835, + "rewards/rejected": -3.8431448936462402, + "step": 18040 + }, + { + "epoch": 0.5885084439230638, + "grad_norm": 2.012795925140381, + "learning_rate": 4.019726051205179e-05, + "logits/chosen": 3.545429229736328, + "logits/rejected": 3.609666109085083, + "logps/chosen": -333.89813232421875, + "logps/rejected": -309.8428039550781, + "loss": 0.5302, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6525447368621826, + "rewards/margins": 2.670718193054199, + "rewards/rejected": -4.323262691497803, + "step": 18060 + }, + { + "epoch": 0.5891601697745843, + "grad_norm": 9.193358421325684, + "learning_rate": 4.018639814905334e-05, + "logits/chosen": 3.1883339881896973, + "logits/rejected": 3.3963074684143066, + "logps/chosen": -311.74017333984375, + "logps/rejected": -300.39886474609375, + "loss": 0.3879, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.1716463565826416, + "rewards/margins": 3.2274131774902344, + "rewards/rejected": -4.399058818817139, + "step": 18080 + }, + { + "epoch": 0.5898118956261049, + "grad_norm": 0.7603635787963867, + "learning_rate": 4.01755357860549e-05, + "logits/chosen": 2.957515001296997, + "logits/rejected": 3.187289237976074, + "logps/chosen": -313.54248046875, + "logps/rejected": -295.8938293457031, + "loss": 0.4503, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6259231567382812, + "rewards/margins": 2.9575607776641846, + "rewards/rejected": -4.583484172821045, + "step": 18100 + }, + { + "epoch": 0.5904636214776254, + "grad_norm": 0.755395770072937, + "learning_rate": 4.016467342305645e-05, + "logits/chosen": 3.4891796112060547, + "logits/rejected": 3.7362003326416016, + "logps/chosen": -352.5565490722656, + "logps/rejected": -303.4577331542969, + "loss": 0.602, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8553367853164673, + "rewards/margins": 2.303165912628174, + "rewards/rejected": -4.15850305557251, + "step": 18120 + }, + { + "epoch": 0.591115347329146, + "grad_norm": 5.777568340301514, + "learning_rate": 4.0153811060058e-05, + "logits/chosen": 3.648038387298584, + "logits/rejected": 3.8113937377929688, + "logps/chosen": -368.3232421875, + "logps/rejected": -334.57611083984375, + "loss": 0.5449, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2446398735046387, + "rewards/margins": 2.414698600769043, + "rewards/rejected": -3.6593384742736816, + "step": 18140 + }, + { + "epoch": 0.5917670731806666, + "grad_norm": 7.8638105392456055, + "learning_rate": 4.014294869705956e-05, + "logits/chosen": 3.5632553100585938, + "logits/rejected": 3.701063871383667, + "logps/chosen": -337.70709228515625, + "logps/rejected": -310.5272216796875, + "loss": 0.5628, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7964757680892944, + "rewards/margins": 1.9012861251831055, + "rewards/rejected": -3.6977622509002686, + "step": 18160 + }, + { + "epoch": 0.5924187990321871, + "grad_norm": 6.575500965118408, + "learning_rate": 4.013208633406112e-05, + "logits/chosen": 2.944819211959839, + "logits/rejected": 3.104720115661621, + "logps/chosen": -307.7134704589844, + "logps/rejected": -301.8155822753906, + "loss": 0.4444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4495397806167603, + "rewards/margins": 3.2617359161376953, + "rewards/rejected": -4.711276054382324, + "step": 18180 + }, + { + "epoch": 0.5930705248837077, + "grad_norm": 0.6965501308441162, + "learning_rate": 4.012176708921259e-05, + "logits/chosen": 3.315643310546875, + "logits/rejected": 3.610255479812622, + "logps/chosen": -321.8348083496094, + "logps/rejected": -312.671875, + "loss": 0.4848, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7503213286399841, + "rewards/margins": 2.511058807373047, + "rewards/rejected": -3.261380434036255, + "step": 18200 + }, + { + "epoch": 0.5937222507352282, + "grad_norm": 2.1830203533172607, + "learning_rate": 4.0110904726214146e-05, + "logits/chosen": 3.7529404163360596, + "logits/rejected": 3.7481796741485596, + "logps/chosen": -310.85748291015625, + "logps/rejected": -314.3260498046875, + "loss": 0.5176, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7268819808959961, + "rewards/margins": 2.3061602115631104, + "rewards/rejected": -3.0330419540405273, + "step": 18220 + }, + { + "epoch": 0.5943739765867487, + "grad_norm": 0.9977461099624634, + "learning_rate": 4.01000423632157e-05, + "logits/chosen": 3.4569122791290283, + "logits/rejected": 3.5643444061279297, + "logps/chosen": -340.548095703125, + "logps/rejected": -309.8306579589844, + "loss": 0.3634, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.831283450126648, + "rewards/margins": 2.3681979179382324, + "rewards/rejected": -3.199481248855591, + "step": 18240 + }, + { + "epoch": 0.5950257024382694, + "grad_norm": 1.8719713687896729, + "learning_rate": 4.008918000021725e-05, + "logits/chosen": 3.669773578643799, + "logits/rejected": 3.754978656768799, + "logps/chosen": -334.97528076171875, + "logps/rejected": -284.08074951171875, + "loss": 0.5565, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7252652645111084, + "rewards/margins": 2.146529197692871, + "rewards/rejected": -3.8717944622039795, + "step": 18260 + }, + { + "epoch": 0.5956774282897899, + "grad_norm": 4.388335227966309, + "learning_rate": 4.0078317637218806e-05, + "logits/chosen": 3.366436719894409, + "logits/rejected": 3.486630916595459, + "logps/chosen": -324.04632568359375, + "logps/rejected": -268.3299865722656, + "loss": 0.6378, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4497116804122925, + "rewards/margins": 1.9772497415542603, + "rewards/rejected": -3.4269611835479736, + "step": 18280 + }, + { + "epoch": 0.5963291541413105, + "grad_norm": 4.011209964752197, + "learning_rate": 4.006799839237028e-05, + "logits/chosen": 3.8007278442382812, + "logits/rejected": 3.775372266769409, + "logps/chosen": -388.3985900878906, + "logps/rejected": -296.55194091796875, + "loss": 0.451, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6161425113677979, + "rewards/margins": 2.4371018409729004, + "rewards/rejected": -4.053244590759277, + "step": 18300 + }, + { + "epoch": 0.596980879992831, + "grad_norm": 5.670206069946289, + "learning_rate": 4.0057136029371835e-05, + "logits/chosen": 3.258296489715576, + "logits/rejected": 3.4027342796325684, + "logps/chosen": -365.7374267578125, + "logps/rejected": -279.41131591796875, + "loss": 0.5014, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6571169495582581, + "rewards/margins": 2.795506238937378, + "rewards/rejected": -3.452622890472412, + "step": 18320 + }, + { + "epoch": 0.5976326058443516, + "grad_norm": 1.3576884269714355, + "learning_rate": 4.0046273666373385e-05, + "logits/chosen": 3.478680372238159, + "logits/rejected": 3.8474628925323486, + "logps/chosen": -303.7761535644531, + "logps/rejected": -290.67047119140625, + "loss": 0.4473, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.3157848119735718, + "rewards/margins": 3.008692979812622, + "rewards/rejected": -4.324477672576904, + "step": 18340 + }, + { + "epoch": 0.5982843316958721, + "grad_norm": 1.8705177307128906, + "learning_rate": 4.0035411303374936e-05, + "logits/chosen": 3.021446943283081, + "logits/rejected": 3.326259136199951, + "logps/chosen": -312.05242919921875, + "logps/rejected": -295.47589111328125, + "loss": 0.3909, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3760240077972412, + "rewards/margins": 2.967862129211426, + "rewards/rejected": -4.343886375427246, + "step": 18360 + }, + { + "epoch": 0.5989360575473927, + "grad_norm": 5.721783638000488, + "learning_rate": 4.0024548940376494e-05, + "logits/chosen": 3.392443895339966, + "logits/rejected": 3.6891911029815674, + "logps/chosen": -337.7597351074219, + "logps/rejected": -291.61981201171875, + "loss": 0.4762, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7535444498062134, + "rewards/margins": 2.738443374633789, + "rewards/rejected": -4.491988182067871, + "step": 18380 + }, + { + "epoch": 0.5995877833989133, + "grad_norm": 2.8708760738372803, + "learning_rate": 4.0013686577378045e-05, + "logits/chosen": 3.2736740112304688, + "logits/rejected": 3.4569365978240967, + "logps/chosen": -308.95550537109375, + "logps/rejected": -295.20465087890625, + "loss": 0.5979, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0745208263397217, + "rewards/margins": 2.600034713745117, + "rewards/rejected": -4.674555778503418, + "step": 18400 + }, + { + "epoch": 0.6002395092504338, + "grad_norm": 6.474599838256836, + "learning_rate": 4.0002824214379595e-05, + "logits/chosen": 3.2207818031311035, + "logits/rejected": 3.522545576095581, + "logps/chosen": -325.37677001953125, + "logps/rejected": -311.7696228027344, + "loss": 0.592, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.052625894546509, + "rewards/margins": 2.2769885063171387, + "rewards/rejected": -4.329614639282227, + "step": 18420 + }, + { + "epoch": 0.6008912351019544, + "grad_norm": 2.3741376399993896, + "learning_rate": 3.9991961851381146e-05, + "logits/chosen": 3.5620810985565186, + "logits/rejected": 3.9146037101745605, + "logps/chosen": -365.71563720703125, + "logps/rejected": -335.5256652832031, + "loss": 0.4213, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0146617889404297, + "rewards/margins": 2.915069341659546, + "rewards/rejected": -4.929731845855713, + "step": 18440 + }, + { + "epoch": 0.6015429609534749, + "grad_norm": 2.14677357673645, + "learning_rate": 3.9981099488382704e-05, + "logits/chosen": 3.790473222732544, + "logits/rejected": 3.8479301929473877, + "logps/chosen": -366.6490783691406, + "logps/rejected": -301.74517822265625, + "loss": 0.6001, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3176045417785645, + "rewards/margins": 2.254146099090576, + "rewards/rejected": -4.571751117706299, + "step": 18460 + }, + { + "epoch": 0.6021946868049954, + "grad_norm": 1.3271434307098389, + "learning_rate": 3.9970237125384254e-05, + "logits/chosen": 3.514371395111084, + "logits/rejected": 3.6699156761169434, + "logps/chosen": -377.32586669921875, + "logps/rejected": -310.52191162109375, + "loss": 0.416, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0017426013946533, + "rewards/margins": 2.3986763954162598, + "rewards/rejected": -4.400418758392334, + "step": 18480 + }, + { + "epoch": 0.6028464126565161, + "grad_norm": 3.448354721069336, + "learning_rate": 3.995937476238581e-05, + "logits/chosen": 3.2743630409240723, + "logits/rejected": 3.7201621532440186, + "logps/chosen": -298.7265625, + "logps/rejected": -296.42889404296875, + "loss": 0.6664, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.328596591949463, + "rewards/margins": 2.1581058502197266, + "rewards/rejected": -4.4867024421691895, + "step": 18500 + }, + { + "epoch": 0.6034981385080366, + "grad_norm": 3.4806525707244873, + "learning_rate": 3.994851239938737e-05, + "logits/chosen": 3.3150744438171387, + "logits/rejected": 3.493018627166748, + "logps/chosen": -360.4976806640625, + "logps/rejected": -308.64556884765625, + "loss": 0.5479, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9855728149414062, + "rewards/margins": 2.9146780967712402, + "rewards/rejected": -4.9002509117126465, + "step": 18520 + }, + { + "epoch": 0.6041498643595572, + "grad_norm": 2.7560172080993652, + "learning_rate": 3.993765003638892e-05, + "logits/chosen": 3.6066536903381348, + "logits/rejected": 3.7437469959259033, + "logps/chosen": -370.82965087890625, + "logps/rejected": -311.9354248046875, + "loss": 0.6157, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.37862229347229, + "rewards/margins": 2.419196367263794, + "rewards/rejected": -4.797818183898926, + "step": 18540 + }, + { + "epoch": 0.6048015902110777, + "grad_norm": 3.069085121154785, + "learning_rate": 3.992678767339047e-05, + "logits/chosen": 3.8198204040527344, + "logits/rejected": 4.116447925567627, + "logps/chosen": -380.87030029296875, + "logps/rejected": -318.9344177246094, + "loss": 0.2861, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.3275418281555176, + "rewards/margins": 3.2319445610046387, + "rewards/rejected": -5.559487342834473, + "step": 18560 + }, + { + "epoch": 0.6054533160625982, + "grad_norm": 2.522555351257324, + "learning_rate": 3.991592531039203e-05, + "logits/chosen": 3.7673239707946777, + "logits/rejected": 3.9950473308563232, + "logps/chosen": -393.992919921875, + "logps/rejected": -322.1138916015625, + "loss": 0.3132, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6423273086547852, + "rewards/margins": 3.1877386569976807, + "rewards/rejected": -4.830065727233887, + "step": 18580 + }, + { + "epoch": 0.6061050419141188, + "grad_norm": 9.082642555236816, + "learning_rate": 3.990506294739358e-05, + "logits/chosen": 3.4170188903808594, + "logits/rejected": 3.8239874839782715, + "logps/chosen": -328.7779846191406, + "logps/rejected": -309.2220458984375, + "loss": 0.5512, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4360764026641846, + "rewards/margins": 2.3185582160949707, + "rewards/rejected": -3.754634141921997, + "step": 18600 + }, + { + "epoch": 0.6067567677656394, + "grad_norm": 21.918582916259766, + "learning_rate": 3.989420058439513e-05, + "logits/chosen": 3.526024580001831, + "logits/rejected": 3.5010387897491455, + "logps/chosen": -361.7472839355469, + "logps/rejected": -309.97052001953125, + "loss": 0.4402, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.351060152053833, + "rewards/margins": 3.093116044998169, + "rewards/rejected": -4.44417667388916, + "step": 18620 + }, + { + "epoch": 0.60740849361716, + "grad_norm": 4.462892532348633, + "learning_rate": 3.988333822139668e-05, + "logits/chosen": 3.7151827812194824, + "logits/rejected": 3.8236727714538574, + "logps/chosen": -341.02130126953125, + "logps/rejected": -299.0995788574219, + "loss": 0.4262, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.472713589668274, + "rewards/margins": 3.4269027709960938, + "rewards/rejected": -4.899616241455078, + "step": 18640 + }, + { + "epoch": 0.6080602194686805, + "grad_norm": 1.0803979635238647, + "learning_rate": 3.987247585839824e-05, + "logits/chosen": 3.5556983947753906, + "logits/rejected": 3.964763641357422, + "logps/chosen": -333.8385314941406, + "logps/rejected": -309.97052001953125, + "loss": 0.6024, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5955044031143188, + "rewards/margins": 2.87147855758667, + "rewards/rejected": -4.466982841491699, + "step": 18660 + }, + { + "epoch": 0.608711945320201, + "grad_norm": 4.873453617095947, + "learning_rate": 3.986161349539979e-05, + "logits/chosen": 3.546138048171997, + "logits/rejected": 3.691342830657959, + "logps/chosen": -355.6633605957031, + "logps/rejected": -285.75909423828125, + "loss": 0.4906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0761959552764893, + "rewards/margins": 2.199890613555908, + "rewards/rejected": -3.2760868072509766, + "step": 18680 + }, + { + "epoch": 0.6093636711717216, + "grad_norm": 2.587024211883545, + "learning_rate": 3.985075113240134e-05, + "logits/chosen": 3.734363555908203, + "logits/rejected": 3.9552788734436035, + "logps/chosen": -330.51983642578125, + "logps/rejected": -315.78668212890625, + "loss": 0.3431, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1176984310150146, + "rewards/margins": 3.410437822341919, + "rewards/rejected": -4.528136253356934, + "step": 18700 + }, + { + "epoch": 0.6100153970232421, + "grad_norm": 3.4571878910064697, + "learning_rate": 3.98398887694029e-05, + "logits/chosen": 3.8625476360321045, + "logits/rejected": 3.954423427581787, + "logps/chosen": -383.26007080078125, + "logps/rejected": -316.6346435546875, + "loss": 0.3793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.266793966293335, + "rewards/margins": 3.4062600135803223, + "rewards/rejected": -4.6730546951293945, + "step": 18720 + }, + { + "epoch": 0.6106671228747628, + "grad_norm": 3.123440980911255, + "learning_rate": 3.982902640640445e-05, + "logits/chosen": 3.5470166206359863, + "logits/rejected": 3.7446277141571045, + "logps/chosen": -333.4368896484375, + "logps/rejected": -319.863525390625, + "loss": 0.474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6079514026641846, + "rewards/margins": 2.4742302894592285, + "rewards/rejected": -4.082181453704834, + "step": 18740 + }, + { + "epoch": 0.6113188487262833, + "grad_norm": 1.7755727767944336, + "learning_rate": 3.9818164043406006e-05, + "logits/chosen": 3.471806287765503, + "logits/rejected": 3.619201183319092, + "logps/chosen": -347.91717529296875, + "logps/rejected": -280.59796142578125, + "loss": 0.5059, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5664212703704834, + "rewards/margins": 2.3185062408447266, + "rewards/rejected": -3.884927272796631, + "step": 18760 + }, + { + "epoch": 0.6119705745778038, + "grad_norm": 2.0905370712280273, + "learning_rate": 3.9807301680407564e-05, + "logits/chosen": 3.3287651538848877, + "logits/rejected": 3.5188400745391846, + "logps/chosen": -315.3193664550781, + "logps/rejected": -300.9823913574219, + "loss": 0.4016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.403077483177185, + "rewards/margins": 2.825239419937134, + "rewards/rejected": -4.228316783905029, + "step": 18780 + }, + { + "epoch": 0.6126223004293244, + "grad_norm": 2.5644285678863525, + "learning_rate": 3.9796439317409114e-05, + "logits/chosen": 3.5415775775909424, + "logits/rejected": 3.627997636795044, + "logps/chosen": -259.7340393066406, + "logps/rejected": -295.64971923828125, + "loss": 0.4509, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4007269144058228, + "rewards/margins": 2.5188117027282715, + "rewards/rejected": -3.9195380210876465, + "step": 18800 + }, + { + "epoch": 0.6132740262808449, + "grad_norm": 0.2214728593826294, + "learning_rate": 3.9785576954410665e-05, + "logits/chosen": 3.621983289718628, + "logits/rejected": 3.5586345195770264, + "logps/chosen": -369.643310546875, + "logps/rejected": -321.0818786621094, + "loss": 0.4107, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0346928834915161, + "rewards/margins": 3.191709518432617, + "rewards/rejected": -4.226402759552002, + "step": 18820 + }, + { + "epoch": 0.6139257521323656, + "grad_norm": 0.5524157285690308, + "learning_rate": 3.9774714591412216e-05, + "logits/chosen": 3.330725908279419, + "logits/rejected": 3.4062609672546387, + "logps/chosen": -331.4629821777344, + "logps/rejected": -324.7911071777344, + "loss": 0.3808, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3790148496627808, + "rewards/margins": 2.491116762161255, + "rewards/rejected": -3.870131731033325, + "step": 18840 + }, + { + "epoch": 0.6145774779838861, + "grad_norm": 2.045783519744873, + "learning_rate": 3.9763852228413774e-05, + "logits/chosen": 3.2717413902282715, + "logits/rejected": 3.392688035964966, + "logps/chosen": -319.34967041015625, + "logps/rejected": -311.4097595214844, + "loss": 0.5541, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.774640679359436, + "rewards/margins": 3.0357038974761963, + "rewards/rejected": -4.810344696044922, + "step": 18860 + }, + { + "epoch": 0.6152292038354067, + "grad_norm": 0.5215424299240112, + "learning_rate": 3.9752989865415324e-05, + "logits/chosen": 3.4908089637756348, + "logits/rejected": 3.5030980110168457, + "logps/chosen": -349.65374755859375, + "logps/rejected": -311.2232666015625, + "loss": 0.3137, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0555943250656128, + "rewards/margins": 3.9508328437805176, + "rewards/rejected": -5.006426811218262, + "step": 18880 + }, + { + "epoch": 0.6158809296869272, + "grad_norm": 1.3025132417678833, + "learning_rate": 3.9742127502416875e-05, + "logits/chosen": 3.9726898670196533, + "logits/rejected": 3.9323887825012207, + "logps/chosen": -373.7646484375, + "logps/rejected": -340.16632080078125, + "loss": 0.5204, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5578163862228394, + "rewards/margins": 2.983687162399292, + "rewards/rejected": -4.541503429412842, + "step": 18900 + }, + { + "epoch": 0.6165326555384477, + "grad_norm": 0.7326942682266235, + "learning_rate": 3.973126513941843e-05, + "logits/chosen": 3.6144027709960938, + "logits/rejected": 3.605034351348877, + "logps/chosen": -298.5893859863281, + "logps/rejected": -287.22662353515625, + "loss": 0.5083, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.95272696018219, + "rewards/margins": 1.876752495765686, + "rewards/rejected": -3.829479217529297, + "step": 18920 + }, + { + "epoch": 0.6171843813899683, + "grad_norm": 0.9239898324012756, + "learning_rate": 3.9720402776419983e-05, + "logits/chosen": 3.467298984527588, + "logits/rejected": 3.6636757850646973, + "logps/chosen": -352.3357849121094, + "logps/rejected": -339.9361877441406, + "loss": 0.3662, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9484732151031494, + "rewards/margins": 3.5968894958496094, + "rewards/rejected": -5.5453619956970215, + "step": 18940 + }, + { + "epoch": 0.6178361072414889, + "grad_norm": 5.150781631469727, + "learning_rate": 3.9709540413421534e-05, + "logits/chosen": 3.2553305625915527, + "logits/rejected": 3.478367567062378, + "logps/chosen": -326.439453125, + "logps/rejected": -319.0692138671875, + "loss": 0.467, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8872982263565063, + "rewards/margins": 3.622343063354492, + "rewards/rejected": -5.509641170501709, + "step": 18960 + }, + { + "epoch": 0.6184878330930095, + "grad_norm": 0.7945571541786194, + "learning_rate": 3.9698678050423085e-05, + "logits/chosen": 3.6768486499786377, + "logits/rejected": 3.664959669113159, + "logps/chosen": -392.16912841796875, + "logps/rejected": -357.7842102050781, + "loss": 0.4666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4172310829162598, + "rewards/margins": 2.8682339191436768, + "rewards/rejected": -4.285465240478516, + "step": 18980 + }, + { + "epoch": 0.61913955894453, + "grad_norm": 1.8456300497055054, + "learning_rate": 3.968781568742464e-05, + "logits/chosen": 3.6367347240448, + "logits/rejected": 3.6566994190216064, + "logps/chosen": -358.912353515625, + "logps/rejected": -290.32147216796875, + "loss": 0.4864, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6211140155792236, + "rewards/margins": 2.5069525241851807, + "rewards/rejected": -4.128066062927246, + "step": 19000 + }, + { + "epoch": 0.6197912847960505, + "grad_norm": 4.9705891609191895, + "learning_rate": 3.96769533244262e-05, + "logits/chosen": 3.1787631511688232, + "logits/rejected": 3.2455430030822754, + "logps/chosen": -361.09393310546875, + "logps/rejected": -353.8075256347656, + "loss": 0.5598, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7758700847625732, + "rewards/margins": 2.4687352180480957, + "rewards/rejected": -4.24460506439209, + "step": 19020 + }, + { + "epoch": 0.6204430106475711, + "grad_norm": 5.083608627319336, + "learning_rate": 3.966609096142775e-05, + "logits/chosen": 2.992757558822632, + "logits/rejected": 3.256584644317627, + "logps/chosen": -295.07318115234375, + "logps/rejected": -258.32440185546875, + "loss": 0.3639, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4866377115249634, + "rewards/margins": 3.7236030101776123, + "rewards/rejected": -5.210240840911865, + "step": 19040 + }, + { + "epoch": 0.6210947364990916, + "grad_norm": 0.9281842112541199, + "learning_rate": 3.965522859842931e-05, + "logits/chosen": 3.2902495861053467, + "logits/rejected": 3.5459389686584473, + "logps/chosen": -351.30084228515625, + "logps/rejected": -301.80078125, + "loss": 0.4424, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.516605257987976, + "rewards/margins": 2.992044448852539, + "rewards/rejected": -4.5086493492126465, + "step": 19060 + }, + { + "epoch": 0.6217464623506123, + "grad_norm": 3.9713988304138184, + "learning_rate": 3.964436623543086e-05, + "logits/chosen": 3.6793036460876465, + "logits/rejected": 4.002519130706787, + "logps/chosen": -352.5762939453125, + "logps/rejected": -329.3010559082031, + "loss": 0.3656, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.4209917783737183, + "rewards/margins": 3.615274429321289, + "rewards/rejected": -5.036265850067139, + "step": 19080 + }, + { + "epoch": 0.6223981882021328, + "grad_norm": 1.983890175819397, + "learning_rate": 3.963350387243241e-05, + "logits/chosen": 3.528904438018799, + "logits/rejected": 3.5189387798309326, + "logps/chosen": -336.3255310058594, + "logps/rejected": -344.82830810546875, + "loss": 0.5276, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9705445766448975, + "rewards/margins": 2.535355806350708, + "rewards/rejected": -4.5059003829956055, + "step": 19100 + }, + { + "epoch": 0.6230499140536533, + "grad_norm": 0.3357907235622406, + "learning_rate": 3.962264150943397e-05, + "logits/chosen": 3.167767286300659, + "logits/rejected": 3.253293514251709, + "logps/chosen": -332.3590393066406, + "logps/rejected": -303.12213134765625, + "loss": 0.4072, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.672928810119629, + "rewards/margins": 3.0425422191619873, + "rewards/rejected": -4.715470790863037, + "step": 19120 + }, + { + "epoch": 0.6237016399051739, + "grad_norm": 0.9049257040023804, + "learning_rate": 3.961177914643552e-05, + "logits/chosen": 3.5652854442596436, + "logits/rejected": 3.7714874744415283, + "logps/chosen": -309.5381774902344, + "logps/rejected": -308.47491455078125, + "loss": 0.4701, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.131868600845337, + "rewards/margins": 3.0233206748962402, + "rewards/rejected": -4.15518856048584, + "step": 19140 + }, + { + "epoch": 0.6243533657566944, + "grad_norm": 4.944677829742432, + "learning_rate": 3.960091678343707e-05, + "logits/chosen": 3.2379233837127686, + "logits/rejected": 3.4963583946228027, + "logps/chosen": -298.97967529296875, + "logps/rejected": -275.2109375, + "loss": 0.4866, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9990742802619934, + "rewards/margins": 2.6006522178649902, + "rewards/rejected": -3.599726438522339, + "step": 19160 + }, + { + "epoch": 0.625005091608215, + "grad_norm": 0.7646414041519165, + "learning_rate": 3.959005442043862e-05, + "logits/chosen": 3.1143558025360107, + "logits/rejected": 3.237168550491333, + "logps/chosen": -343.02947998046875, + "logps/rejected": -320.7533874511719, + "loss": 0.3605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9229118824005127, + "rewards/margins": 2.8919198513031006, + "rewards/rejected": -4.814831733703613, + "step": 19180 + }, + { + "epoch": 0.6256568174597356, + "grad_norm": 0.9479644298553467, + "learning_rate": 3.957919205744018e-05, + "logits/chosen": 3.500481128692627, + "logits/rejected": 3.5129458904266357, + "logps/chosen": -295.0979919433594, + "logps/rejected": -308.25103759765625, + "loss": 0.5046, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5264489650726318, + "rewards/margins": 2.5298476219177246, + "rewards/rejected": -4.0562968254089355, + "step": 19200 + }, + { + "epoch": 0.6263085433112561, + "grad_norm": 1.4565051794052124, + "learning_rate": 3.956832969444173e-05, + "logits/chosen": 3.336275815963745, + "logits/rejected": 3.561845064163208, + "logps/chosen": -302.78509521484375, + "logps/rejected": -305.86004638671875, + "loss": 0.5969, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1548341512680054, + "rewards/margins": 2.3679542541503906, + "rewards/rejected": -3.5227882862091064, + "step": 19220 + }, + { + "epoch": 0.6269602691627767, + "grad_norm": 3.177062749862671, + "learning_rate": 3.955746733144328e-05, + "logits/chosen": 3.315197706222534, + "logits/rejected": 3.43986177444458, + "logps/chosen": -268.1351013183594, + "logps/rejected": -309.3914794921875, + "loss": 0.5118, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3845019340515137, + "rewards/margins": 2.4910507202148438, + "rewards/rejected": -3.8755524158477783, + "step": 19240 + }, + { + "epoch": 0.6276119950142972, + "grad_norm": 2.22853684425354, + "learning_rate": 3.954660496844484e-05, + "logits/chosen": 3.153921604156494, + "logits/rejected": 3.2185378074645996, + "logps/chosen": -355.2002868652344, + "logps/rejected": -309.925537109375, + "loss": 0.523, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.378328561782837, + "rewards/margins": 2.486236095428467, + "rewards/rejected": -3.864564895629883, + "step": 19260 + }, + { + "epoch": 0.6282637208658178, + "grad_norm": 0.977475106716156, + "learning_rate": 3.9535742605446394e-05, + "logits/chosen": 3.551189422607422, + "logits/rejected": 3.5839035511016846, + "logps/chosen": -350.2050476074219, + "logps/rejected": -330.66510009765625, + "loss": 0.6044, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2381550073623657, + "rewards/margins": 2.360369920730591, + "rewards/rejected": -3.598524570465088, + "step": 19280 + }, + { + "epoch": 0.6289154467173383, + "grad_norm": 4.459108352661133, + "learning_rate": 3.9524880242447945e-05, + "logits/chosen": 3.2462775707244873, + "logits/rejected": 3.541706085205078, + "logps/chosen": -303.80218505859375, + "logps/rejected": -317.00799560546875, + "loss": 0.4199, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3923001289367676, + "rewards/margins": 2.86799955368042, + "rewards/rejected": -4.260300159454346, + "step": 19300 + }, + { + "epoch": 0.6295671725688589, + "grad_norm": 4.174319744110107, + "learning_rate": 3.95140178794495e-05, + "logits/chosen": 3.0774033069610596, + "logits/rejected": 3.3162803649902344, + "logps/chosen": -354.8174743652344, + "logps/rejected": -338.46533203125, + "loss": 0.6008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6020421981811523, + "rewards/margins": 2.5936684608459473, + "rewards/rejected": -4.1957106590271, + "step": 19320 + }, + { + "epoch": 0.6302188984203795, + "grad_norm": 1.9600739479064941, + "learning_rate": 3.950315551645105e-05, + "logits/chosen": 3.5853524208068848, + "logits/rejected": 3.7000343799591064, + "logps/chosen": -334.78399658203125, + "logps/rejected": -316.5585021972656, + "loss": 0.395, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5037925243377686, + "rewards/margins": 2.685370683670044, + "rewards/rejected": -4.189163684844971, + "step": 19340 + }, + { + "epoch": 0.6308706242719, + "grad_norm": 2.4314639568328857, + "learning_rate": 3.9492293153452604e-05, + "logits/chosen": 3.383265972137451, + "logits/rejected": 3.422830581665039, + "logps/chosen": -352.1787109375, + "logps/rejected": -305.85003662109375, + "loss": 0.5386, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3398616313934326, + "rewards/margins": 2.726954936981201, + "rewards/rejected": -4.066817283630371, + "step": 19360 + }, + { + "epoch": 0.6315223501234206, + "grad_norm": 5.614832878112793, + "learning_rate": 3.9481430790454155e-05, + "logits/chosen": 3.246943950653076, + "logits/rejected": 3.603067398071289, + "logps/chosen": -316.44598388671875, + "logps/rejected": -326.0803527832031, + "loss": 0.4071, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2901053428649902, + "rewards/margins": 3.112995147705078, + "rewards/rejected": -4.40310001373291, + "step": 19380 + }, + { + "epoch": 0.6321740759749411, + "grad_norm": 5.04454231262207, + "learning_rate": 3.947056842745571e-05, + "logits/chosen": 3.132392406463623, + "logits/rejected": 3.4166812896728516, + "logps/chosen": -313.3570251464844, + "logps/rejected": -317.5492248535156, + "loss": 0.5204, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0677616596221924, + "rewards/margins": 3.070457935333252, + "rewards/rejected": -5.138220310211182, + "step": 19400 + }, + { + "epoch": 0.6328258018264618, + "grad_norm": 0.10098158568143845, + "learning_rate": 3.945970606445726e-05, + "logits/chosen": 3.122389316558838, + "logits/rejected": 3.412381649017334, + "logps/chosen": -340.5827331542969, + "logps/rejected": -351.9416809082031, + "loss": 0.3255, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7759456038475037, + "rewards/margins": 3.6682097911834717, + "rewards/rejected": -4.444155693054199, + "step": 19420 + }, + { + "epoch": 0.6334775276779823, + "grad_norm": 7.099296569824219, + "learning_rate": 3.9448843701458814e-05, + "logits/chosen": 3.4515442848205566, + "logits/rejected": 3.395374298095703, + "logps/chosen": -309.71038818359375, + "logps/rejected": -323.5845947265625, + "loss": 0.6885, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8201767206192017, + "rewards/margins": 2.0592801570892334, + "rewards/rejected": -3.8794567584991455, + "step": 19440 + }, + { + "epoch": 0.6341292535295028, + "grad_norm": 0.8941229581832886, + "learning_rate": 3.943798133846037e-05, + "logits/chosen": 3.441427230834961, + "logits/rejected": 3.607173204421997, + "logps/chosen": -375.0367736816406, + "logps/rejected": -394.4779968261719, + "loss": 0.4503, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7424789667129517, + "rewards/margins": 3.0076072216033936, + "rewards/rejected": -4.750086307525635, + "step": 19460 + }, + { + "epoch": 0.6347809793810234, + "grad_norm": 1.353521704673767, + "learning_rate": 3.942711897546192e-05, + "logits/chosen": 3.1310665607452393, + "logits/rejected": 3.2805843353271484, + "logps/chosen": -351.73809814453125, + "logps/rejected": -310.72235107421875, + "loss": 0.5441, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6937813758850098, + "rewards/margins": 2.8710694313049316, + "rewards/rejected": -5.564850807189941, + "step": 19480 + }, + { + "epoch": 0.6354327052325439, + "grad_norm": 2.8023386001586914, + "learning_rate": 3.941625661246347e-05, + "logits/chosen": 3.496990919113159, + "logits/rejected": 3.524289608001709, + "logps/chosen": -336.36578369140625, + "logps/rejected": -302.1412658691406, + "loss": 0.4007, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.838313341140747, + "rewards/margins": 2.9107184410095215, + "rewards/rejected": -4.749032020568848, + "step": 19500 + }, + { + "epoch": 0.6360844310840645, + "grad_norm": 1.3422963619232178, + "learning_rate": 3.940539424946503e-05, + "logits/chosen": 3.597015380859375, + "logits/rejected": 3.637216091156006, + "logps/chosen": -350.7108154296875, + "logps/rejected": -347.2540588378906, + "loss": 0.3792, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7881743907928467, + "rewards/margins": 3.1651365756988525, + "rewards/rejected": -4.953311443328857, + "step": 19520 + }, + { + "epoch": 0.636736156935585, + "grad_norm": 2.1525344848632812, + "learning_rate": 3.93950750046165e-05, + "logits/chosen": 3.580639362335205, + "logits/rejected": 3.56132173538208, + "logps/chosen": -365.85888671875, + "logps/rejected": -290.9951477050781, + "loss": 0.4068, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5271011590957642, + "rewards/margins": 2.8636555671691895, + "rewards/rejected": -4.390756607055664, + "step": 19540 + }, + { + "epoch": 0.6373878827871056, + "grad_norm": 1.6133606433868408, + "learning_rate": 3.938421264161806e-05, + "logits/chosen": 3.3090033531188965, + "logits/rejected": 3.272639036178589, + "logps/chosen": -318.5869445800781, + "logps/rejected": -304.8014831542969, + "loss": 0.7743, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2672886848449707, + "rewards/margins": 2.329317569732666, + "rewards/rejected": -4.596606254577637, + "step": 19560 + }, + { + "epoch": 0.6380396086386262, + "grad_norm": 6.414629936218262, + "learning_rate": 3.937335027861962e-05, + "logits/chosen": 3.3973021507263184, + "logits/rejected": 3.4422245025634766, + "logps/chosen": -377.5535583496094, + "logps/rejected": -356.0341796875, + "loss": 0.5709, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8786745071411133, + "rewards/margins": 3.0378577709198, + "rewards/rejected": -4.916532039642334, + "step": 19580 + }, + { + "epoch": 0.6386913344901467, + "grad_norm": 2.2359111309051514, + "learning_rate": 3.936248791562117e-05, + "logits/chosen": 3.155230760574341, + "logits/rejected": 3.1192522048950195, + "logps/chosen": -337.15985107421875, + "logps/rejected": -325.78875732421875, + "loss": 0.5708, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6572837829589844, + "rewards/margins": 3.1855576038360596, + "rewards/rejected": -4.842841148376465, + "step": 19600 + }, + { + "epoch": 0.6393430603416673, + "grad_norm": 0.6637324094772339, + "learning_rate": 3.935162555262272e-05, + "logits/chosen": 3.5265953540802, + "logits/rejected": 3.6749179363250732, + "logps/chosen": -367.9244384765625, + "logps/rejected": -337.52703857421875, + "loss": 0.5241, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.457371473312378, + "rewards/margins": 3.2460265159606934, + "rewards/rejected": -4.70339822769165, + "step": 19620 + }, + { + "epoch": 0.6399947861931878, + "grad_norm": 2.1849613189697266, + "learning_rate": 3.9340763189624277e-05, + "logits/chosen": 3.4153265953063965, + "logits/rejected": 3.4553260803222656, + "logps/chosen": -366.6961669921875, + "logps/rejected": -344.6565246582031, + "loss": 0.4875, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9304059743881226, + "rewards/margins": 3.1651830673217773, + "rewards/rejected": -4.0955891609191895, + "step": 19640 + }, + { + "epoch": 0.6406465120447083, + "grad_norm": 5.936824798583984, + "learning_rate": 3.932990082662583e-05, + "logits/chosen": 3.1338274478912354, + "logits/rejected": 3.4223217964172363, + "logps/chosen": -347.55377197265625, + "logps/rejected": -324.0847473144531, + "loss": 0.4435, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.39532470703125, + "rewards/margins": 3.50474214553833, + "rewards/rejected": -4.90006685256958, + "step": 19660 + }, + { + "epoch": 0.641298237896229, + "grad_norm": 2.48595929145813, + "learning_rate": 3.931903846362738e-05, + "logits/chosen": 3.3386833667755127, + "logits/rejected": 3.390775203704834, + "logps/chosen": -322.21417236328125, + "logps/rejected": -306.3620300292969, + "loss": 0.3349, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.3596961498260498, + "rewards/margins": 2.684802770614624, + "rewards/rejected": -4.044499397277832, + "step": 19680 + }, + { + "epoch": 0.6419499637477495, + "grad_norm": 0.07292710989713669, + "learning_rate": 3.9308176100628936e-05, + "logits/chosen": 3.2787437438964844, + "logits/rejected": 3.38873291015625, + "logps/chosen": -312.8807678222656, + "logps/rejected": -297.09588623046875, + "loss": 0.4191, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.881347894668579, + "rewards/margins": 2.3950817584991455, + "rewards/rejected": -4.276429653167725, + "step": 19700 + }, + { + "epoch": 0.6426016895992701, + "grad_norm": 0.7391329407691956, + "learning_rate": 3.9297313737630486e-05, + "logits/chosen": 3.5370864868164062, + "logits/rejected": 3.7352840900421143, + "logps/chosen": -381.291015625, + "logps/rejected": -308.7089538574219, + "loss": 0.5381, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9785598516464233, + "rewards/margins": 2.6801745891571045, + "rewards/rejected": -4.6587347984313965, + "step": 19720 + }, + { + "epoch": 0.6432534154507906, + "grad_norm": 4.602273941040039, + "learning_rate": 3.928645137463204e-05, + "logits/chosen": 3.4546284675598145, + "logits/rejected": 3.467883348464966, + "logps/chosen": -350.2622375488281, + "logps/rejected": -334.5744323730469, + "loss": 0.5596, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4904863834381104, + "rewards/margins": 2.7154693603515625, + "rewards/rejected": -4.205955505371094, + "step": 19740 + }, + { + "epoch": 0.6439051413023111, + "grad_norm": 4.088013648986816, + "learning_rate": 3.927558901163359e-05, + "logits/chosen": 3.4461638927459717, + "logits/rejected": 3.4304442405700684, + "logps/chosen": -297.0211181640625, + "logps/rejected": -319.40313720703125, + "loss": 0.4804, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2771811485290527, + "rewards/margins": 2.545445203781128, + "rewards/rejected": -4.822625637054443, + "step": 19760 + }, + { + "epoch": 0.6445568671538318, + "grad_norm": 0.8878589272499084, + "learning_rate": 3.9264726648635146e-05, + "logits/chosen": 3.4904232025146484, + "logits/rejected": 3.6577389240264893, + "logps/chosen": -318.65142822265625, + "logps/rejected": -330.30743408203125, + "loss": 0.5542, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6078665256500244, + "rewards/margins": 2.950582981109619, + "rewards/rejected": -4.558449745178223, + "step": 19780 + }, + { + "epoch": 0.6452085930053523, + "grad_norm": 4.5998735427856445, + "learning_rate": 3.9253864285636696e-05, + "logits/chosen": 3.3129234313964844, + "logits/rejected": 3.474853038787842, + "logps/chosen": -341.9805603027344, + "logps/rejected": -329.73876953125, + "loss": 0.471, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9095308780670166, + "rewards/margins": 3.0393710136413574, + "rewards/rejected": -4.948901653289795, + "step": 19800 + }, + { + "epoch": 0.6458603188568729, + "grad_norm": 2.4448301792144775, + "learning_rate": 3.9243001922638254e-05, + "logits/chosen": 3.152587413787842, + "logits/rejected": 3.1711220741271973, + "logps/chosen": -326.91424560546875, + "logps/rejected": -310.0340270996094, + "loss": 0.4215, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3940107822418213, + "rewards/margins": 2.8387444019317627, + "rewards/rejected": -4.232754707336426, + "step": 19820 + }, + { + "epoch": 0.6465120447083934, + "grad_norm": 3.0045993328094482, + "learning_rate": 3.9232139559639805e-05, + "logits/chosen": 3.3790011405944824, + "logits/rejected": 3.471426486968994, + "logps/chosen": -368.1590270996094, + "logps/rejected": -323.92547607421875, + "loss": 0.5694, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5254552364349365, + "rewards/margins": 2.3537850379943848, + "rewards/rejected": -3.8792405128479004, + "step": 19840 + }, + { + "epoch": 0.6471637705599139, + "grad_norm": 10.098546981811523, + "learning_rate": 3.922127719664136e-05, + "logits/chosen": 3.4230704307556152, + "logits/rejected": 3.596004009246826, + "logps/chosen": -358.4360046386719, + "logps/rejected": -308.83099365234375, + "loss": 0.5426, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.321645975112915, + "rewards/margins": 2.813256025314331, + "rewards/rejected": -4.134902477264404, + "step": 19860 + }, + { + "epoch": 0.6478154964114345, + "grad_norm": 1.8737163543701172, + "learning_rate": 3.921041483364291e-05, + "logits/chosen": 3.537632703781128, + "logits/rejected": 3.5876307487487793, + "logps/chosen": -383.8858337402344, + "logps/rejected": -350.607177734375, + "loss": 0.2606, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.3626372814178467, + "rewards/margins": 3.608996629714966, + "rewards/rejected": -4.971633434295654, + "step": 19880 + }, + { + "epoch": 0.6484672222629551, + "grad_norm": 0.2646341323852539, + "learning_rate": 3.919955247064447e-05, + "logits/chosen": 3.5380330085754395, + "logits/rejected": 3.549985885620117, + "logps/chosen": -315.25091552734375, + "logps/rejected": -369.5877990722656, + "loss": 0.4204, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7169513702392578, + "rewards/margins": 3.1539130210876465, + "rewards/rejected": -4.870863914489746, + "step": 19900 + }, + { + "epoch": 0.6491189481144757, + "grad_norm": 22.031612396240234, + "learning_rate": 3.918869010764602e-05, + "logits/chosen": 3.070392608642578, + "logits/rejected": 3.429988145828247, + "logps/chosen": -368.4327087402344, + "logps/rejected": -310.09442138671875, + "loss": 0.4138, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0307726860046387, + "rewards/margins": 3.133884906768799, + "rewards/rejected": -5.1646575927734375, + "step": 19920 + }, + { + "epoch": 0.6497706739659962, + "grad_norm": 5.004989147186279, + "learning_rate": 3.917782774464757e-05, + "logits/chosen": 3.6923670768737793, + "logits/rejected": 3.8766403198242188, + "logps/chosen": -358.22552490234375, + "logps/rejected": -317.41815185546875, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3306249380111694, + "rewards/margins": 3.0694117546081543, + "rewards/rejected": -4.400036811828613, + "step": 19940 + }, + { + "epoch": 0.6504223998175168, + "grad_norm": 0.599509060382843, + "learning_rate": 3.916696538164912e-05, + "logits/chosen": 3.3241569995880127, + "logits/rejected": 3.2766921520233154, + "logps/chosen": -332.3446350097656, + "logps/rejected": -321.1459655761719, + "loss": 0.6773, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.396334409713745, + "rewards/margins": 2.4122917652130127, + "rewards/rejected": -4.808626651763916, + "step": 19960 + }, + { + "epoch": 0.6510741256690373, + "grad_norm": 3.6987802982330322, + "learning_rate": 3.915610301865068e-05, + "logits/chosen": 3.2719008922576904, + "logits/rejected": 3.3451647758483887, + "logps/chosen": -321.00494384765625, + "logps/rejected": -292.44378662109375, + "loss": 0.3578, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1837027072906494, + "rewards/margins": 3.064793825149536, + "rewards/rejected": -4.2484965324401855, + "step": 19980 + }, + { + "epoch": 0.6517258515205578, + "grad_norm": 3.659048318862915, + "learning_rate": 3.914524065565223e-05, + "logits/chosen": 3.2672665119171143, + "logits/rejected": 3.3744874000549316, + "logps/chosen": -316.31170654296875, + "logps/rejected": -274.20843505859375, + "loss": 0.3789, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.717909812927246, + "rewards/margins": 2.520117998123169, + "rewards/rejected": -4.238027572631836, + "step": 20000 + }, + { + "epoch": 0.6523775773720785, + "grad_norm": 3.000448226928711, + "learning_rate": 3.913437829265378e-05, + "logits/chosen": 3.2857112884521484, + "logits/rejected": 3.372576951980591, + "logps/chosen": -346.6449890136719, + "logps/rejected": -345.1919860839844, + "loss": 0.3397, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9413458108901978, + "rewards/margins": 3.545016050338745, + "rewards/rejected": -5.486361503601074, + "step": 20020 + }, + { + "epoch": 0.653029303223599, + "grad_norm": 1.3172948360443115, + "learning_rate": 3.912351592965534e-05, + "logits/chosen": 3.3743629455566406, + "logits/rejected": 3.394214153289795, + "logps/chosen": -347.2186279296875, + "logps/rejected": -314.5482177734375, + "loss": 0.4195, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9626489877700806, + "rewards/margins": 3.3591041564941406, + "rewards/rejected": -5.32175350189209, + "step": 20040 + }, + { + "epoch": 0.6536810290751196, + "grad_norm": 1.6202671527862549, + "learning_rate": 3.911265356665689e-05, + "logits/chosen": 3.20696759223938, + "logits/rejected": 3.5630507469177246, + "logps/chosen": -378.2550354003906, + "logps/rejected": -310.6284484863281, + "loss": 0.4862, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.60207200050354, + "rewards/margins": 2.9576613903045654, + "rewards/rejected": -4.5597333908081055, + "step": 20060 + }, + { + "epoch": 0.6543327549266401, + "grad_norm": 0.07133996486663818, + "learning_rate": 3.910179120365844e-05, + "logits/chosen": 3.47986102104187, + "logits/rejected": 3.623274564743042, + "logps/chosen": -359.50726318359375, + "logps/rejected": -334.93988037109375, + "loss": 0.4219, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6603485345840454, + "rewards/margins": 3.4773566722869873, + "rewards/rejected": -5.137705326080322, + "step": 20080 + }, + { + "epoch": 0.6549844807781606, + "grad_norm": 2.9411802291870117, + "learning_rate": 3.909092884066e-05, + "logits/chosen": 3.7678966522216797, + "logits/rejected": 3.7268924713134766, + "logps/chosen": -398.3627624511719, + "logps/rejected": -354.728271484375, + "loss": 0.4765, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2242152690887451, + "rewards/margins": 3.6993374824523926, + "rewards/rejected": -4.923552513122559, + "step": 20100 + }, + { + "epoch": 0.6556362066296813, + "grad_norm": 1.1059304475784302, + "learning_rate": 3.9080066477661556e-05, + "logits/chosen": 3.4366676807403564, + "logits/rejected": 3.703725814819336, + "logps/chosen": -374.0931701660156, + "logps/rejected": -334.1124267578125, + "loss": 0.4989, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.506476879119873, + "rewards/margins": 2.858935594558716, + "rewards/rejected": -5.36541223526001, + "step": 20120 + }, + { + "epoch": 0.6562879324812018, + "grad_norm": 0.9430385231971741, + "learning_rate": 3.906920411466311e-05, + "logits/chosen": 3.162987232208252, + "logits/rejected": 3.0564723014831543, + "logps/chosen": -361.38153076171875, + "logps/rejected": -303.9256896972656, + "loss": 0.3572, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8581148386001587, + "rewards/margins": 3.5234813690185547, + "rewards/rejected": -5.381596565246582, + "step": 20140 + }, + { + "epoch": 0.6569396583327224, + "grad_norm": 3.626615524291992, + "learning_rate": 3.905834175166466e-05, + "logits/chosen": 3.0255532264709473, + "logits/rejected": 3.0041675567626953, + "logps/chosen": -321.5042419433594, + "logps/rejected": -289.5610656738281, + "loss": 0.464, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5334889888763428, + "rewards/margins": 2.544335126876831, + "rewards/rejected": -5.077824115753174, + "step": 20160 + }, + { + "epoch": 0.6575913841842429, + "grad_norm": 6.370234966278076, + "learning_rate": 3.9047479388666215e-05, + "logits/chosen": 3.3732597827911377, + "logits/rejected": 3.387669086456299, + "logps/chosen": -354.15057373046875, + "logps/rejected": -295.7207336425781, + "loss": 0.5894, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2939231395721436, + "rewards/margins": 3.0241539478302, + "rewards/rejected": -5.318077087402344, + "step": 20180 + }, + { + "epoch": 0.6582431100357634, + "grad_norm": 0.06987281143665314, + "learning_rate": 3.9036617025667766e-05, + "logits/chosen": 3.315424680709839, + "logits/rejected": 3.285170793533325, + "logps/chosen": -337.8129577636719, + "logps/rejected": -311.6724853515625, + "loss": 0.3606, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.811836838722229, + "rewards/margins": 3.7643496990203857, + "rewards/rejected": -5.576186656951904, + "step": 20200 + }, + { + "epoch": 0.658894835887284, + "grad_norm": 1.8764126300811768, + "learning_rate": 3.902575466266932e-05, + "logits/chosen": 3.4227683544158936, + "logits/rejected": 3.5711536407470703, + "logps/chosen": -336.27850341796875, + "logps/rejected": -318.95196533203125, + "loss": 0.4156, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.012530565261841, + "rewards/margins": 3.165566921234131, + "rewards/rejected": -5.178097724914551, + "step": 20220 + }, + { + "epoch": 0.6595465617388045, + "grad_norm": 0.10744510591030121, + "learning_rate": 3.9014892299670875e-05, + "logits/chosen": 3.293997287750244, + "logits/rejected": 3.6408839225769043, + "logps/chosen": -334.2550354003906, + "logps/rejected": -340.8355407714844, + "loss": 0.423, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1219284534454346, + "rewards/margins": 4.496148586273193, + "rewards/rejected": -6.618077278137207, + "step": 20240 + }, + { + "epoch": 0.6601982875903252, + "grad_norm": 2.127849817276001, + "learning_rate": 3.9004029936672425e-05, + "logits/chosen": 3.674894332885742, + "logits/rejected": 3.773191452026367, + "logps/chosen": -361.13433837890625, + "logps/rejected": -310.352783203125, + "loss": 0.3942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4843621253967285, + "rewards/margins": 3.454446315765381, + "rewards/rejected": -5.938807964324951, + "step": 20260 + }, + { + "epoch": 0.6608500134418457, + "grad_norm": 0.09437773376703262, + "learning_rate": 3.8993167573673976e-05, + "logits/chosen": 3.455535888671875, + "logits/rejected": 3.727184295654297, + "logps/chosen": -367.006103515625, + "logps/rejected": -353.98382568359375, + "loss": 0.5363, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5522940158843994, + "rewards/margins": 3.543210983276367, + "rewards/rejected": -6.095504283905029, + "step": 20280 + }, + { + "epoch": 0.6615017392933662, + "grad_norm": 2.412036418914795, + "learning_rate": 3.8982305210675534e-05, + "logits/chosen": 3.094606876373291, + "logits/rejected": 3.3465476036071777, + "logps/chosen": -332.26324462890625, + "logps/rejected": -356.65087890625, + "loss": 0.5983, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.652311325073242, + "rewards/margins": 3.2994067668914795, + "rewards/rejected": -5.951718330383301, + "step": 20300 + }, + { + "epoch": 0.6621534651448868, + "grad_norm": 0.6312927007675171, + "learning_rate": 3.8971442847677084e-05, + "logits/chosen": 3.2404682636260986, + "logits/rejected": 3.3796894550323486, + "logps/chosen": -337.1199035644531, + "logps/rejected": -329.43450927734375, + "loss": 0.5486, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5400660037994385, + "rewards/margins": 2.9520339965820312, + "rewards/rejected": -5.492099761962891, + "step": 20320 + }, + { + "epoch": 0.6628051909964073, + "grad_norm": 0.3084004819393158, + "learning_rate": 3.8960580484678635e-05, + "logits/chosen": 3.3390514850616455, + "logits/rejected": 3.500886917114258, + "logps/chosen": -346.08831787109375, + "logps/rejected": -320.7940368652344, + "loss": 0.3421, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.1955547332763672, + "rewards/margins": 3.6035256385803223, + "rewards/rejected": -4.799079895019531, + "step": 20340 + }, + { + "epoch": 0.663456916847928, + "grad_norm": 5.507288455963135, + "learning_rate": 3.894971812168019e-05, + "logits/chosen": 3.460303544998169, + "logits/rejected": 3.435040235519409, + "logps/chosen": -367.2547302246094, + "logps/rejected": -327.5873718261719, + "loss": 0.5599, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9385855197906494, + "rewards/margins": 3.0062994956970215, + "rewards/rejected": -4.944884777069092, + "step": 20360 + }, + { + "epoch": 0.6641086426994485, + "grad_norm": 3.4413187503814697, + "learning_rate": 3.893885575868175e-05, + "logits/chosen": 3.6177241802215576, + "logits/rejected": 3.6549789905548096, + "logps/chosen": -354.5645446777344, + "logps/rejected": -311.04095458984375, + "loss": 0.5226, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1917619705200195, + "rewards/margins": 2.7232775688171387, + "rewards/rejected": -4.915040016174316, + "step": 20380 + }, + { + "epoch": 0.664760368550969, + "grad_norm": 8.292364120483398, + "learning_rate": 3.89279933956833e-05, + "logits/chosen": 3.161081552505493, + "logits/rejected": 3.203016996383667, + "logps/chosen": -352.87847900390625, + "logps/rejected": -307.96929931640625, + "loss": 0.6006, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5593409538269043, + "rewards/margins": 2.5222296714782715, + "rewards/rejected": -5.081570625305176, + "step": 20400 + }, + { + "epoch": 0.6654120944024896, + "grad_norm": 2.167703151702881, + "learning_rate": 3.891713103268485e-05, + "logits/chosen": 3.7856059074401855, + "logits/rejected": 3.9373557567596436, + "logps/chosen": -362.3009033203125, + "logps/rejected": -326.89324951171875, + "loss": 0.5793, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.162818431854248, + "rewards/margins": 2.8668787479400635, + "rewards/rejected": -5.029696941375732, + "step": 20420 + }, + { + "epoch": 0.6660638202540101, + "grad_norm": 1.9279916286468506, + "learning_rate": 3.890626866968641e-05, + "logits/chosen": 3.128927707672119, + "logits/rejected": 3.249457836151123, + "logps/chosen": -350.1006164550781, + "logps/rejected": -314.74652099609375, + "loss": 0.4546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.118530511856079, + "rewards/margins": 3.031510591506958, + "rewards/rejected": -5.150040626525879, + "step": 20440 + }, + { + "epoch": 0.6667155461055307, + "grad_norm": 0.17516936361789703, + "learning_rate": 3.889540630668796e-05, + "logits/chosen": 3.123413562774658, + "logits/rejected": 3.4085211753845215, + "logps/chosen": -367.6123352050781, + "logps/rejected": -358.85003662109375, + "loss": 0.5732, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.207348585128784, + "rewards/margins": 2.9482436180114746, + "rewards/rejected": -5.155592441558838, + "step": 20460 + }, + { + "epoch": 0.6673672719570513, + "grad_norm": 1.2108180522918701, + "learning_rate": 3.888454394368951e-05, + "logits/chosen": 3.4915242195129395, + "logits/rejected": 3.5776124000549316, + "logps/chosen": -342.6974182128906, + "logps/rejected": -330.14404296875, + "loss": 0.3629, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1622064113616943, + "rewards/margins": 3.3193061351776123, + "rewards/rejected": -5.481513023376465, + "step": 20480 + }, + { + "epoch": 0.6680189978085719, + "grad_norm": 1.7955764532089233, + "learning_rate": 3.887368158069107e-05, + "logits/chosen": 3.2714343070983887, + "logits/rejected": 3.653643846511841, + "logps/chosen": -390.3486328125, + "logps/rejected": -382.2767333984375, + "loss": 0.8384, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7495187520980835, + "rewards/margins": 2.4551780223846436, + "rewards/rejected": -4.2046966552734375, + "step": 20500 + }, + { + "epoch": 0.6686707236600924, + "grad_norm": 0.8009448051452637, + "learning_rate": 3.886281921769262e-05, + "logits/chosen": 3.3725104331970215, + "logits/rejected": 3.4375247955322266, + "logps/chosen": -328.0099182128906, + "logps/rejected": -325.5618591308594, + "loss": 0.5731, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0238335132598877, + "rewards/margins": 2.653426170349121, + "rewards/rejected": -4.677260398864746, + "step": 20520 + }, + { + "epoch": 0.6693224495116129, + "grad_norm": 0.9690284729003906, + "learning_rate": 3.885195685469417e-05, + "logits/chosen": 3.542149782180786, + "logits/rejected": 3.469170093536377, + "logps/chosen": -379.41265869140625, + "logps/rejected": -345.1938171386719, + "loss": 0.3806, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2079551219940186, + "rewards/margins": 3.279738187789917, + "rewards/rejected": -4.4876933097839355, + "step": 20540 + }, + { + "epoch": 0.6699741753631335, + "grad_norm": 4.455617427825928, + "learning_rate": 3.884109449169572e-05, + "logits/chosen": 3.4841721057891846, + "logits/rejected": 3.4616570472717285, + "logps/chosen": -383.4967346191406, + "logps/rejected": -357.54583740234375, + "loss": 0.4883, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9380474090576172, + "rewards/margins": 2.8150296211242676, + "rewards/rejected": -4.753077030181885, + "step": 20560 + }, + { + "epoch": 0.670625901214654, + "grad_norm": 6.2683210372924805, + "learning_rate": 3.883023212869728e-05, + "logits/chosen": 3.2346503734588623, + "logits/rejected": 3.4070732593536377, + "logps/chosen": -373.44879150390625, + "logps/rejected": -352.11627197265625, + "loss": 0.5938, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1467537879943848, + "rewards/margins": 2.3998754024505615, + "rewards/rejected": -4.546629428863525, + "step": 20580 + }, + { + "epoch": 0.6712776270661747, + "grad_norm": 3.8302125930786133, + "learning_rate": 3.881936976569883e-05, + "logits/chosen": 3.226444959640503, + "logits/rejected": 3.4109749794006348, + "logps/chosen": -348.16644287109375, + "logps/rejected": -335.25726318359375, + "loss": 0.4184, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6342408657073975, + "rewards/margins": 2.732154369354248, + "rewards/rejected": -5.366394996643066, + "step": 20600 + }, + { + "epoch": 0.6719293529176952, + "grad_norm": 2.0471110343933105, + "learning_rate": 3.880850740270039e-05, + "logits/chosen": 3.4205424785614014, + "logits/rejected": 3.5574791431427, + "logps/chosen": -306.6393127441406, + "logps/rejected": -271.32122802734375, + "loss": 0.4637, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.171657085418701, + "rewards/margins": 2.270089626312256, + "rewards/rejected": -4.441746711730957, + "step": 20620 + }, + { + "epoch": 0.6725810787692157, + "grad_norm": 0.26078343391418457, + "learning_rate": 3.879764503970194e-05, + "logits/chosen": 3.5520145893096924, + "logits/rejected": 3.636333465576172, + "logps/chosen": -371.79632568359375, + "logps/rejected": -310.3235168457031, + "loss": 0.4024, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.012709140777588, + "rewards/margins": 3.0971767902374268, + "rewards/rejected": -5.109886169433594, + "step": 20640 + }, + { + "epoch": 0.6732328046207363, + "grad_norm": 0.3218671679496765, + "learning_rate": 3.8786782676703495e-05, + "logits/chosen": 3.376868486404419, + "logits/rejected": 3.5773448944091797, + "logps/chosen": -350.05267333984375, + "logps/rejected": -324.7908020019531, + "loss": 0.5778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7403361797332764, + "rewards/margins": 2.854771137237549, + "rewards/rejected": -4.5951080322265625, + "step": 20660 + }, + { + "epoch": 0.6738845304722568, + "grad_norm": 0.6019384860992432, + "learning_rate": 3.8775920313705046e-05, + "logits/chosen": 3.5327587127685547, + "logits/rejected": 3.790583372116089, + "logps/chosen": -346.799072265625, + "logps/rejected": -320.125, + "loss": 0.4836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.683978796005249, + "rewards/margins": 2.723376750946045, + "rewards/rejected": -4.407355308532715, + "step": 20680 + }, + { + "epoch": 0.6745362563237775, + "grad_norm": 4.369143486022949, + "learning_rate": 3.87650579507066e-05, + "logits/chosen": 3.6994261741638184, + "logits/rejected": 3.863690137863159, + "logps/chosen": -346.74249267578125, + "logps/rejected": -342.3338317871094, + "loss": 0.4045, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9918636083602905, + "rewards/margins": 3.298447370529175, + "rewards/rejected": -5.290310859680176, + "step": 20700 + }, + { + "epoch": 0.675187982175298, + "grad_norm": 2.586404800415039, + "learning_rate": 3.8754195587708154e-05, + "logits/chosen": 3.693908214569092, + "logits/rejected": 3.747920513153076, + "logps/chosen": -363.1207580566406, + "logps/rejected": -340.3548889160156, + "loss": 0.4962, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.061084270477295, + "rewards/margins": 2.8440141677856445, + "rewards/rejected": -4.905097961425781, + "step": 20720 + }, + { + "epoch": 0.6758397080268185, + "grad_norm": 0.7376147508621216, + "learning_rate": 3.8743333224709705e-05, + "logits/chosen": 3.0531322956085205, + "logits/rejected": 3.3908133506774902, + "logps/chosen": -330.383056640625, + "logps/rejected": -296.0280456542969, + "loss": 0.546, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6319917440414429, + "rewards/margins": 3.001018762588501, + "rewards/rejected": -4.633010387420654, + "step": 20740 + }, + { + "epoch": 0.6764914338783391, + "grad_norm": 7.074551105499268, + "learning_rate": 3.8732470861711256e-05, + "logits/chosen": 3.0721547603607178, + "logits/rejected": 3.424983263015747, + "logps/chosen": -349.9570617675781, + "logps/rejected": -337.2801513671875, + "loss": 0.5223, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.349423885345459, + "rewards/margins": 2.647810459136963, + "rewards/rejected": -4.997234344482422, + "step": 20760 + }, + { + "epoch": 0.6771431597298596, + "grad_norm": 1.371158242225647, + "learning_rate": 3.8721608498712813e-05, + "logits/chosen": 3.630999803543091, + "logits/rejected": 3.6684677600860596, + "logps/chosen": -377.3924865722656, + "logps/rejected": -354.29364013671875, + "loss": 0.4679, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8689937591552734, + "rewards/margins": 3.151010751724243, + "rewards/rejected": -5.0200042724609375, + "step": 20780 + }, + { + "epoch": 0.6777948855813802, + "grad_norm": 1.1973590850830078, + "learning_rate": 3.8710746135714364e-05, + "logits/chosen": 3.5426411628723145, + "logits/rejected": 3.639854907989502, + "logps/chosen": -331.2402648925781, + "logps/rejected": -323.26495361328125, + "loss": 0.4405, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8243896961212158, + "rewards/margins": 2.8419926166534424, + "rewards/rejected": -4.666382312774658, + "step": 20800 + }, + { + "epoch": 0.6784466114329007, + "grad_norm": 3.959381580352783, + "learning_rate": 3.8699883772715915e-05, + "logits/chosen": 3.3097610473632812, + "logits/rejected": 3.6652438640594482, + "logps/chosen": -317.6466369628906, + "logps/rejected": -320.42724609375, + "loss": 0.6132, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.931734323501587, + "rewards/margins": 2.6714718341827393, + "rewards/rejected": -4.603205680847168, + "step": 20820 + }, + { + "epoch": 0.6790983372844213, + "grad_norm": 0.46483314037323, + "learning_rate": 3.868902140971747e-05, + "logits/chosen": 3.551482677459717, + "logits/rejected": 3.752831220626831, + "logps/chosen": -383.33282470703125, + "logps/rejected": -359.1151428222656, + "loss": 0.4411, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.793811559677124, + "rewards/margins": 3.3159854412078857, + "rewards/rejected": -5.109796524047852, + "step": 20840 + }, + { + "epoch": 0.6797500631359419, + "grad_norm": 10.252052307128906, + "learning_rate": 3.867815904671902e-05, + "logits/chosen": 3.328542709350586, + "logits/rejected": 3.397671937942505, + "logps/chosen": -308.1998596191406, + "logps/rejected": -324.8957214355469, + "loss": 0.5844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2355268001556396, + "rewards/margins": 2.5004329681396484, + "rewards/rejected": -4.735960483551025, + "step": 20860 + }, + { + "epoch": 0.6804017889874624, + "grad_norm": 0.14137886464595795, + "learning_rate": 3.8667296683720574e-05, + "logits/chosen": 3.1781132221221924, + "logits/rejected": 3.5945117473602295, + "logps/chosen": -328.84356689453125, + "logps/rejected": -331.4468078613281, + "loss": 0.4791, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9396613836288452, + "rewards/margins": 3.136963367462158, + "rewards/rejected": -5.076624870300293, + "step": 20880 + }, + { + "epoch": 0.681053514838983, + "grad_norm": 2.904726982116699, + "learning_rate": 3.865643432072213e-05, + "logits/chosen": 3.5887763500213623, + "logits/rejected": 3.6618475914001465, + "logps/chosen": -324.84234619140625, + "logps/rejected": -297.5805969238281, + "loss": 0.4878, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3079590797424316, + "rewards/margins": 1.9098981618881226, + "rewards/rejected": -3.2178573608398438, + "step": 20900 + }, + { + "epoch": 0.6817052406905035, + "grad_norm": 4.19849967956543, + "learning_rate": 3.864557195772369e-05, + "logits/chosen": 3.616370677947998, + "logits/rejected": 3.617431640625, + "logps/chosen": -332.74468994140625, + "logps/rejected": -342.197509765625, + "loss": 0.5191, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.448578119277954, + "rewards/margins": 2.177644729614258, + "rewards/rejected": -3.626223087310791, + "step": 20920 + }, + { + "epoch": 0.682356966542024, + "grad_norm": 7.133583068847656, + "learning_rate": 3.863470959472524e-05, + "logits/chosen": 3.744885206222534, + "logits/rejected": 3.9495723247528076, + "logps/chosen": -402.69219970703125, + "logps/rejected": -298.7810974121094, + "loss": 0.5564, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8966888189315796, + "rewards/margins": 2.03254771232605, + "rewards/rejected": -3.929236650466919, + "step": 20940 + }, + { + "epoch": 0.6830086923935447, + "grad_norm": 6.972297191619873, + "learning_rate": 3.862384723172679e-05, + "logits/chosen": 3.481825590133667, + "logits/rejected": 3.573028564453125, + "logps/chosen": -351.2462463378906, + "logps/rejected": -295.93780517578125, + "loss": 0.4964, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.029327154159546, + "rewards/margins": 2.5525717735290527, + "rewards/rejected": -3.5818989276885986, + "step": 20960 + }, + { + "epoch": 0.6836604182450652, + "grad_norm": 1.6138761043548584, + "learning_rate": 3.861298486872835e-05, + "logits/chosen": 3.915090560913086, + "logits/rejected": 3.947878360748291, + "logps/chosen": -345.8470153808594, + "logps/rejected": -304.48828125, + "loss": 0.5433, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4468594789505005, + "rewards/margins": 2.2465834617614746, + "rewards/rejected": -3.6934428215026855, + "step": 20980 + }, + { + "epoch": 0.6843121440965858, + "grad_norm": 2.5024173259735107, + "learning_rate": 3.86021225057299e-05, + "logits/chosen": 3.738349199295044, + "logits/rejected": 3.6453144550323486, + "logps/chosen": -328.16162109375, + "logps/rejected": -332.3053283691406, + "loss": 0.4237, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9340952038764954, + "rewards/margins": 2.3410282135009766, + "rewards/rejected": -3.275123119354248, + "step": 21000 + }, + { + "epoch": 0.6849638699481063, + "grad_norm": 1.09121572971344, + "learning_rate": 3.859126014273145e-05, + "logits/chosen": 3.090567111968994, + "logits/rejected": 3.456371307373047, + "logps/chosen": -388.05096435546875, + "logps/rejected": -307.8350524902344, + "loss": 0.8938, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.254058361053467, + "rewards/margins": 2.0189731121063232, + "rewards/rejected": -4.273031711578369, + "step": 21020 + }, + { + "epoch": 0.6856155957996269, + "grad_norm": 0.3532474637031555, + "learning_rate": 3.858039777973301e-05, + "logits/chosen": 3.6975674629211426, + "logits/rejected": 3.8353283405303955, + "logps/chosen": -391.2301025390625, + "logps/rejected": -326.76336669921875, + "loss": 0.4483, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4476985931396484, + "rewards/margins": 2.711021900177002, + "rewards/rejected": -4.158720016479492, + "step": 21040 + }, + { + "epoch": 0.6862673216511475, + "grad_norm": 5.218931674957275, + "learning_rate": 3.856953541673456e-05, + "logits/chosen": 3.663755416870117, + "logits/rejected": 3.8141860961914062, + "logps/chosen": -343.288330078125, + "logps/rejected": -304.7303771972656, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4561741352081299, + "rewards/margins": 2.0335376262664795, + "rewards/rejected": -3.4897117614746094, + "step": 21060 + }, + { + "epoch": 0.686919047502668, + "grad_norm": 4.003526210784912, + "learning_rate": 3.855867305373611e-05, + "logits/chosen": 3.783435344696045, + "logits/rejected": 3.5947723388671875, + "logps/chosen": -311.4997863769531, + "logps/rejected": -301.0281066894531, + "loss": 0.4689, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.089508056640625, + "rewards/margins": 2.620144844055176, + "rewards/rejected": -3.7096526622772217, + "step": 21080 + }, + { + "epoch": 0.6875707733541886, + "grad_norm": 1.8330601453781128, + "learning_rate": 3.854781069073766e-05, + "logits/chosen": 3.2453479766845703, + "logits/rejected": 3.553417921066284, + "logps/chosen": -338.66143798828125, + "logps/rejected": -281.6866455078125, + "loss": 0.6096, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6057127714157104, + "rewards/margins": 2.0694005489349365, + "rewards/rejected": -2.6751129627227783, + "step": 21100 + }, + { + "epoch": 0.6882224992057091, + "grad_norm": 2.0527238845825195, + "learning_rate": 3.853694832773922e-05, + "logits/chosen": 3.264657497406006, + "logits/rejected": 3.4418110847473145, + "logps/chosen": -289.82525634765625, + "logps/rejected": -308.4505615234375, + "loss": 0.3907, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1009771823883057, + "rewards/margins": 2.3897604942321777, + "rewards/rejected": -3.4907379150390625, + "step": 21120 + }, + { + "epoch": 0.6888742250572297, + "grad_norm": 9.871136665344238, + "learning_rate": 3.852608596474077e-05, + "logits/chosen": 3.3097052574157715, + "logits/rejected": 3.4684231281280518, + "logps/chosen": -325.83050537109375, + "logps/rejected": -293.7611389160156, + "loss": 0.5052, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2503324747085571, + "rewards/margins": 2.52783465385437, + "rewards/rejected": -3.7781670093536377, + "step": 21140 + }, + { + "epoch": 0.6895259509087502, + "grad_norm": 1.0994741916656494, + "learning_rate": 3.8515223601742326e-05, + "logits/chosen": 3.1488096714019775, + "logits/rejected": 3.2249367237091064, + "logps/chosen": -314.9317932128906, + "logps/rejected": -334.80096435546875, + "loss": 0.4672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.018358826637268, + "rewards/margins": 2.343449831008911, + "rewards/rejected": -3.3618087768554688, + "step": 21160 + }, + { + "epoch": 0.6901776767602708, + "grad_norm": 1.609290361404419, + "learning_rate": 3.850436123874388e-05, + "logits/chosen": 3.2346158027648926, + "logits/rejected": 3.363706111907959, + "logps/chosen": -332.9775085449219, + "logps/rejected": -289.58477783203125, + "loss": 0.411, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8785052299499512, + "rewards/margins": 2.562814235687256, + "rewards/rejected": -3.441319227218628, + "step": 21180 + }, + { + "epoch": 0.6908294026117914, + "grad_norm": 3.2903337478637695, + "learning_rate": 3.8493498875745434e-05, + "logits/chosen": 3.1524245738983154, + "logits/rejected": 3.2483978271484375, + "logps/chosen": -334.7339782714844, + "logps/rejected": -313.1120300292969, + "loss": 0.4935, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0854992866516113, + "rewards/margins": 2.571559190750122, + "rewards/rejected": -4.6570587158203125, + "step": 21200 + }, + { + "epoch": 0.6914811284633119, + "grad_norm": 2.062347650527954, + "learning_rate": 3.8482636512746985e-05, + "logits/chosen": 3.2597174644470215, + "logits/rejected": 3.271530866622925, + "logps/chosen": -337.67376708984375, + "logps/rejected": -307.40216064453125, + "loss": 0.5129, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7528455257415771, + "rewards/margins": 2.8344454765319824, + "rewards/rejected": -4.5872907638549805, + "step": 21220 + }, + { + "epoch": 0.6921328543148325, + "grad_norm": 2.2723498344421387, + "learning_rate": 3.847177414974854e-05, + "logits/chosen": 3.1067395210266113, + "logits/rejected": 3.342149019241333, + "logps/chosen": -343.897705078125, + "logps/rejected": -309.700927734375, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.723557710647583, + "rewards/margins": 2.4679229259490967, + "rewards/rejected": -4.19148063659668, + "step": 21240 + }, + { + "epoch": 0.692784580166353, + "grad_norm": 0.6910951733589172, + "learning_rate": 3.846091178675009e-05, + "logits/chosen": 3.2611355781555176, + "logits/rejected": 3.5351524353027344, + "logps/chosen": -292.2054138183594, + "logps/rejected": -304.84710693359375, + "loss": 0.4638, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5467113256454468, + "rewards/margins": 2.620183229446411, + "rewards/rejected": -4.166894912719727, + "step": 21260 + }, + { + "epoch": 0.6934363060178735, + "grad_norm": 3.7038400173187256, + "learning_rate": 3.8450049423751644e-05, + "logits/chosen": 3.4116244316101074, + "logits/rejected": 3.5083413124084473, + "logps/chosen": -329.3558349609375, + "logps/rejected": -299.97137451171875, + "loss": 0.6308, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.281141996383667, + "rewards/margins": 2.0680909156799316, + "rewards/rejected": -4.3492326736450195, + "step": 21280 + }, + { + "epoch": 0.6940880318693942, + "grad_norm": 2.0531575679779053, + "learning_rate": 3.8439187060753195e-05, + "logits/chosen": 3.4955649375915527, + "logits/rejected": 3.695335865020752, + "logps/chosen": -353.09197998046875, + "logps/rejected": -311.6591796875, + "loss": 0.2991, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7012516260147095, + "rewards/margins": 2.7134881019592285, + "rewards/rejected": -4.414740085601807, + "step": 21300 + }, + { + "epoch": 0.6947397577209147, + "grad_norm": 5.189874172210693, + "learning_rate": 3.842832469775475e-05, + "logits/chosen": 3.5852699279785156, + "logits/rejected": 3.698227643966675, + "logps/chosen": -344.55316162109375, + "logps/rejected": -352.6553649902344, + "loss": 0.4126, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9253299236297607, + "rewards/margins": 2.5654971599578857, + "rewards/rejected": -4.4908270835876465, + "step": 21320 + }, + { + "epoch": 0.6953914835724353, + "grad_norm": 2.2334694862365723, + "learning_rate": 3.84174623347563e-05, + "logits/chosen": 3.68957781791687, + "logits/rejected": 3.8472225666046143, + "logps/chosen": -335.3594665527344, + "logps/rejected": -302.3918762207031, + "loss": 0.51, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.761803388595581, + "rewards/margins": 2.365959882736206, + "rewards/rejected": -4.127763748168945, + "step": 21340 + }, + { + "epoch": 0.6960432094239558, + "grad_norm": 1.5711177587509155, + "learning_rate": 3.8406599971757854e-05, + "logits/chosen": 4.200415134429932, + "logits/rejected": 4.115564823150635, + "logps/chosen": -385.6088562011719, + "logps/rejected": -328.2865295410156, + "loss": 0.4284, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6532251834869385, + "rewards/margins": 2.225555896759033, + "rewards/rejected": -3.8787810802459717, + "step": 21360 + }, + { + "epoch": 0.6966949352754763, + "grad_norm": 0.12029710412025452, + "learning_rate": 3.839573760875941e-05, + "logits/chosen": 3.5016071796417236, + "logits/rejected": 3.6186680793762207, + "logps/chosen": -354.42926025390625, + "logps/rejected": -320.08172607421875, + "loss": 0.368, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9417064189910889, + "rewards/margins": 2.613837242126465, + "rewards/rejected": -4.555543422698975, + "step": 21380 + }, + { + "epoch": 0.697346661126997, + "grad_norm": 2.607489585876465, + "learning_rate": 3.838487524576096e-05, + "logits/chosen": 3.194122314453125, + "logits/rejected": 3.484304428100586, + "logps/chosen": -365.45428466796875, + "logps/rejected": -296.10687255859375, + "loss": 0.5108, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4208920001983643, + "rewards/margins": 2.5000548362731934, + "rewards/rejected": -4.920947074890137, + "step": 21400 + }, + { + "epoch": 0.6979983869785175, + "grad_norm": 3.4998624324798584, + "learning_rate": 3.837401288276252e-05, + "logits/chosen": 3.5911927223205566, + "logits/rejected": 3.7236380577087402, + "logps/chosen": -392.81146240234375, + "logps/rejected": -374.3353576660156, + "loss": 0.4267, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.172281503677368, + "rewards/margins": 3.150073289871216, + "rewards/rejected": -5.322354316711426, + "step": 21420 + }, + { + "epoch": 0.6986501128300381, + "grad_norm": 0.3274494707584381, + "learning_rate": 3.836315051976407e-05, + "logits/chosen": 3.5479302406311035, + "logits/rejected": 3.6297125816345215, + "logps/chosen": -397.0652770996094, + "logps/rejected": -343.6028137207031, + "loss": 0.3921, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8733265399932861, + "rewards/margins": 3.173555612564087, + "rewards/rejected": -5.046881675720215, + "step": 21440 + }, + { + "epoch": 0.6993018386815586, + "grad_norm": 0.911907434463501, + "learning_rate": 3.835228815676563e-05, + "logits/chosen": 3.2202224731445312, + "logits/rejected": 3.3314175605773926, + "logps/chosen": -333.04949951171875, + "logps/rejected": -289.6731262207031, + "loss": 0.587, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.381779909133911, + "rewards/margins": 2.1537559032440186, + "rewards/rejected": -4.53553581237793, + "step": 21460 + }, + { + "epoch": 0.6999535645330791, + "grad_norm": 3.4866416454315186, + "learning_rate": 3.834142579376718e-05, + "logits/chosen": 3.3655402660369873, + "logits/rejected": 3.5364327430725098, + "logps/chosen": -299.2911682128906, + "logps/rejected": -283.9482421875, + "loss": 0.5351, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.4823403358459473, + "rewards/margins": 2.424748420715332, + "rewards/rejected": -4.9070892333984375, + "step": 21480 + }, + { + "epoch": 0.7006052903845997, + "grad_norm": 3.9578027725219727, + "learning_rate": 3.833056343076873e-05, + "logits/chosen": 3.3741888999938965, + "logits/rejected": 3.6941161155700684, + "logps/chosen": -326.83233642578125, + "logps/rejected": -345.3719482421875, + "loss": 0.5598, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9963356256484985, + "rewards/margins": 2.4554638862609863, + "rewards/rejected": -4.451799392700195, + "step": 21500 + }, + { + "epoch": 0.7012570162361202, + "grad_norm": 1.5167444944381714, + "learning_rate": 3.831970106777029e-05, + "logits/chosen": 3.6478512287139893, + "logits/rejected": 3.8114266395568848, + "logps/chosen": -353.50836181640625, + "logps/rejected": -329.47747802734375, + "loss": 0.542, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5274691581726074, + "rewards/margins": 2.583324909210205, + "rewards/rejected": -5.110794544219971, + "step": 21520 + }, + { + "epoch": 0.7019087420876409, + "grad_norm": 0.3681613802909851, + "learning_rate": 3.830883870477184e-05, + "logits/chosen": 3.1865501403808594, + "logits/rejected": 3.316135883331299, + "logps/chosen": -322.4212951660156, + "logps/rejected": -302.1346435546875, + "loss": 0.4129, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.175194263458252, + "rewards/margins": 3.266791582107544, + "rewards/rejected": -5.441986083984375, + "step": 21540 + }, + { + "epoch": 0.7025604679391614, + "grad_norm": 1.326474666595459, + "learning_rate": 3.829797634177339e-05, + "logits/chosen": 3.7256221771240234, + "logits/rejected": 3.690718412399292, + "logps/chosen": -370.6858215332031, + "logps/rejected": -358.3293151855469, + "loss": 0.3172, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8864425420761108, + "rewards/margins": 2.9802968502044678, + "rewards/rejected": -4.866738796234131, + "step": 21560 + }, + { + "epoch": 0.7032121937906819, + "grad_norm": 2.823676347732544, + "learning_rate": 3.8287113978774946e-05, + "logits/chosen": 3.3635990619659424, + "logits/rejected": 3.5000853538513184, + "logps/chosen": -332.84478759765625, + "logps/rejected": -317.3389892578125, + "loss": 0.4442, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8795665502548218, + "rewards/margins": 2.707719087600708, + "rewards/rejected": -4.587285041809082, + "step": 21580 + }, + { + "epoch": 0.7038639196422025, + "grad_norm": 2.0342719554901123, + "learning_rate": 3.82762516157765e-05, + "logits/chosen": 3.579624652862549, + "logits/rejected": 3.561023235321045, + "logps/chosen": -344.1999206542969, + "logps/rejected": -306.3910217285156, + "loss": 0.5855, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8945653438568115, + "rewards/margins": 2.280744791030884, + "rewards/rejected": -4.1753106117248535, + "step": 21600 + }, + { + "epoch": 0.704515645493723, + "grad_norm": 0.6349130272865295, + "learning_rate": 3.826538925277805e-05, + "logits/chosen": 3.580258846282959, + "logits/rejected": 3.6688761711120605, + "logps/chosen": -337.2831726074219, + "logps/rejected": -337.13616943359375, + "loss": 0.2459, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.162952184677124, + "rewards/margins": 3.958064317703247, + "rewards/rejected": -5.121016502380371, + "step": 21620 + }, + { + "epoch": 0.7051673713452437, + "grad_norm": 1.86649489402771, + "learning_rate": 3.8254526889779606e-05, + "logits/chosen": 3.517533779144287, + "logits/rejected": 3.6518311500549316, + "logps/chosen": -331.71820068359375, + "logps/rejected": -286.5494079589844, + "loss": 0.5869, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.062399387359619, + "rewards/margins": 2.546614408493042, + "rewards/rejected": -4.60901403427124, + "step": 21640 + }, + { + "epoch": 0.7058190971967642, + "grad_norm": 2.343430757522583, + "learning_rate": 3.8243664526781156e-05, + "logits/chosen": 3.54669189453125, + "logits/rejected": 3.629176378250122, + "logps/chosen": -339.4483642578125, + "logps/rejected": -331.95086669921875, + "loss": 0.3094, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.089369773864746, + "rewards/margins": 3.5131354331970215, + "rewards/rejected": -5.602505207061768, + "step": 21660 + }, + { + "epoch": 0.7064708230482848, + "grad_norm": 2.944786310195923, + "learning_rate": 3.823280216378271e-05, + "logits/chosen": 3.6560044288635254, + "logits/rejected": 3.6902873516082764, + "logps/chosen": -426.8848571777344, + "logps/rejected": -357.6672058105469, + "loss": 0.5121, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.596374273300171, + "rewards/margins": 3.3238468170166016, + "rewards/rejected": -4.920220851898193, + "step": 21680 + }, + { + "epoch": 0.7071225488998053, + "grad_norm": 1.1900473833084106, + "learning_rate": 3.8221939800784265e-05, + "logits/chosen": 3.4991073608398438, + "logits/rejected": 3.7222702503204346, + "logps/chosen": -350.18408203125, + "logps/rejected": -319.85467529296875, + "loss": 0.4965, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9418354034423828, + "rewards/margins": 2.942699432373047, + "rewards/rejected": -4.884535312652588, + "step": 21700 + }, + { + "epoch": 0.7077742747513258, + "grad_norm": 1.7245692014694214, + "learning_rate": 3.821107743778582e-05, + "logits/chosen": 3.606349229812622, + "logits/rejected": 3.763760805130005, + "logps/chosen": -342.22821044921875, + "logps/rejected": -305.4476318359375, + "loss": 0.3215, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7169866561889648, + "rewards/margins": 3.2057273387908936, + "rewards/rejected": -4.922713756561279, + "step": 21720 + }, + { + "epoch": 0.7084260006028464, + "grad_norm": 0.6960054636001587, + "learning_rate": 3.820021507478737e-05, + "logits/chosen": 3.519076108932495, + "logits/rejected": 3.751657009124756, + "logps/chosen": -319.5457763671875, + "logps/rejected": -309.4517517089844, + "loss": 0.4305, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6492621898651123, + "rewards/margins": 3.015961170196533, + "rewards/rejected": -4.665223598480225, + "step": 21740 + }, + { + "epoch": 0.709077726454367, + "grad_norm": 4.153099536895752, + "learning_rate": 3.8189352711788924e-05, + "logits/chosen": 3.3919997215270996, + "logits/rejected": 3.582909345626831, + "logps/chosen": -359.8260498046875, + "logps/rejected": -355.55084228515625, + "loss": 0.4248, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1269712448120117, + "rewards/margins": 3.2757461071014404, + "rewards/rejected": -5.402717113494873, + "step": 21760 + }, + { + "epoch": 0.7097294523058876, + "grad_norm": 4.563737869262695, + "learning_rate": 3.817849034879048e-05, + "logits/chosen": 3.260505199432373, + "logits/rejected": 3.272390365600586, + "logps/chosen": -322.7164001464844, + "logps/rejected": -318.0915832519531, + "loss": 0.4478, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.597365140914917, + "rewards/margins": 3.1509242057800293, + "rewards/rejected": -5.748289585113525, + "step": 21780 + }, + { + "epoch": 0.7103811781574081, + "grad_norm": 0.5963456034660339, + "learning_rate": 3.816762798579203e-05, + "logits/chosen": 3.3585236072540283, + "logits/rejected": 3.5585758686065674, + "logps/chosen": -335.61907958984375, + "logps/rejected": -345.45037841796875, + "loss": 0.3649, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3397207260131836, + "rewards/margins": 3.3752429485321045, + "rewards/rejected": -5.714963436126709, + "step": 21800 + }, + { + "epoch": 0.7110329040089286, + "grad_norm": 2.4971888065338135, + "learning_rate": 3.815676562279358e-05, + "logits/chosen": 3.532276153564453, + "logits/rejected": 3.6437995433807373, + "logps/chosen": -327.0250549316406, + "logps/rejected": -311.074951171875, + "loss": 0.7314, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4155356884002686, + "rewards/margins": 2.438997745513916, + "rewards/rejected": -4.8545331954956055, + "step": 21820 + }, + { + "epoch": 0.7116846298604492, + "grad_norm": 1.3393093347549438, + "learning_rate": 3.8145903259795134e-05, + "logits/chosen": 3.347628116607666, + "logits/rejected": 3.3550376892089844, + "logps/chosen": -322.0609130859375, + "logps/rejected": -305.5943298339844, + "loss": 0.546, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.436509132385254, + "rewards/margins": 2.649077892303467, + "rewards/rejected": -5.085587501525879, + "step": 21840 + }, + { + "epoch": 0.7123363557119697, + "grad_norm": 4.578582286834717, + "learning_rate": 3.813504089679669e-05, + "logits/chosen": 3.3475894927978516, + "logits/rejected": 3.4183897972106934, + "logps/chosen": -341.9515075683594, + "logps/rejected": -338.07562255859375, + "loss": 0.5569, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.212087869644165, + "rewards/margins": 3.035635471343994, + "rewards/rejected": -5.247723579406738, + "step": 21860 + }, + { + "epoch": 0.7129880815634904, + "grad_norm": 1.0481832027435303, + "learning_rate": 3.812417853379824e-05, + "logits/chosen": 3.658911943435669, + "logits/rejected": 4.014595985412598, + "logps/chosen": -343.6416931152344, + "logps/rejected": -345.1085205078125, + "loss": 0.4849, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0480990409851074, + "rewards/margins": 2.165860652923584, + "rewards/rejected": -4.213959693908691, + "step": 21880 + }, + { + "epoch": 0.7136398074150109, + "grad_norm": 0.411677747964859, + "learning_rate": 3.811331617079979e-05, + "logits/chosen": 3.976579189300537, + "logits/rejected": 4.004709243774414, + "logps/chosen": -395.12066650390625, + "logps/rejected": -364.07501220703125, + "loss": 0.5948, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6994807720184326, + "rewards/margins": 2.8869094848632812, + "rewards/rejected": -5.586390495300293, + "step": 21900 + }, + { + "epoch": 0.7142915332665314, + "grad_norm": 24.414209365844727, + "learning_rate": 3.810245380780135e-05, + "logits/chosen": 3.5133583545684814, + "logits/rejected": 3.6953914165496826, + "logps/chosen": -325.5815734863281, + "logps/rejected": -330.7919616699219, + "loss": 0.5462, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.602473020553589, + "rewards/margins": 2.3515877723693848, + "rewards/rejected": -4.9540605545043945, + "step": 21920 + }, + { + "epoch": 0.714943259118052, + "grad_norm": 4.043552398681641, + "learning_rate": 3.80915914448029e-05, + "logits/chosen": 3.551884412765503, + "logits/rejected": 3.8000190258026123, + "logps/chosen": -365.1644592285156, + "logps/rejected": -344.98138427734375, + "loss": 0.3674, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.0908522605895996, + "rewards/margins": 3.5392098426818848, + "rewards/rejected": -5.630062103271484, + "step": 21940 + }, + { + "epoch": 0.7155949849695725, + "grad_norm": 0.2710435688495636, + "learning_rate": 3.808072908180446e-05, + "logits/chosen": 3.3614368438720703, + "logits/rejected": 3.434101104736328, + "logps/chosen": -333.4077453613281, + "logps/rejected": -328.53973388671875, + "loss": 0.6238, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0510764122009277, + "rewards/margins": 2.634768009185791, + "rewards/rejected": -4.685843467712402, + "step": 21960 + }, + { + "epoch": 0.7162467108210931, + "grad_norm": 1.0387470722198486, + "learning_rate": 3.8069866718806016e-05, + "logits/chosen": 3.3074259757995605, + "logits/rejected": 3.4959492683410645, + "logps/chosen": -328.2149963378906, + "logps/rejected": -310.68011474609375, + "loss": 0.6606, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3653600215911865, + "rewards/margins": 2.5629019737243652, + "rewards/rejected": -4.928261756896973, + "step": 21980 + }, + { + "epoch": 0.7168984366726137, + "grad_norm": 6.231029987335205, + "learning_rate": 3.805900435580757e-05, + "logits/chosen": 3.7578530311584473, + "logits/rejected": 3.758235216140747, + "logps/chosen": -324.8170471191406, + "logps/rejected": -315.3425598144531, + "loss": 0.4738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2567389011383057, + "rewards/margins": 2.7116501331329346, + "rewards/rejected": -4.96838903427124, + "step": 22000 + }, + { + "epoch": 0.7175501625241342, + "grad_norm": 0.57881098985672, + "learning_rate": 3.804814199280912e-05, + "logits/chosen": 3.744925022125244, + "logits/rejected": 3.6658968925476074, + "logps/chosen": -337.2420349121094, + "logps/rejected": -320.6252136230469, + "loss": 0.5178, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.188575267791748, + "rewards/margins": 3.2630627155303955, + "rewards/rejected": -5.451638221740723, + "step": 22020 + }, + { + "epoch": 0.7182018883756548, + "grad_norm": 1.632339358329773, + "learning_rate": 3.803727962981067e-05, + "logits/chosen": 3.3785502910614014, + "logits/rejected": 3.548257350921631, + "logps/chosen": -346.1852111816406, + "logps/rejected": -321.1504821777344, + "loss": 0.7266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9730584621429443, + "rewards/margins": 1.907340407371521, + "rewards/rejected": -3.880398988723755, + "step": 22040 + }, + { + "epoch": 0.7188536142271753, + "grad_norm": 3.420295000076294, + "learning_rate": 3.8026417266812226e-05, + "logits/chosen": 3.278266191482544, + "logits/rejected": 3.443297863006592, + "logps/chosen": -312.17486572265625, + "logps/rejected": -307.2089538574219, + "loss": 0.3794, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7551473379135132, + "rewards/margins": 2.998765468597412, + "rewards/rejected": -4.753912925720215, + "step": 22060 + }, + { + "epoch": 0.7195053400786959, + "grad_norm": 1.4495775699615479, + "learning_rate": 3.801555490381378e-05, + "logits/chosen": 3.589463472366333, + "logits/rejected": 3.748676300048828, + "logps/chosen": -363.72552490234375, + "logps/rejected": -353.0551452636719, + "loss": 0.7325, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5085411071777344, + "rewards/margins": 2.4467272758483887, + "rewards/rejected": -4.955268383026123, + "step": 22080 + }, + { + "epoch": 0.7201570659302164, + "grad_norm": 4.982332706451416, + "learning_rate": 3.800469254081533e-05, + "logits/chosen": 3.0167860984802246, + "logits/rejected": 3.1121208667755127, + "logps/chosen": -332.80963134765625, + "logps/rejected": -318.25970458984375, + "loss": 0.3625, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.428891658782959, + "rewards/margins": 2.9847025871276855, + "rewards/rejected": -5.4135942459106445, + "step": 22100 + }, + { + "epoch": 0.720808791781737, + "grad_norm": 3.3324708938598633, + "learning_rate": 3.7993830177816885e-05, + "logits/chosen": 3.648646831512451, + "logits/rejected": 3.576538562774658, + "logps/chosen": -373.75177001953125, + "logps/rejected": -329.08935546875, + "loss": 0.4908, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.370399236679077, + "rewards/margins": 3.2107207775115967, + "rewards/rejected": -5.581120491027832, + "step": 22120 + }, + { + "epoch": 0.7214605176332576, + "grad_norm": 2.3131213188171387, + "learning_rate": 3.7982967814818436e-05, + "logits/chosen": 3.225161075592041, + "logits/rejected": 3.4243202209472656, + "logps/chosen": -319.07049560546875, + "logps/rejected": -339.1788330078125, + "loss": 0.6397, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5132627487182617, + "rewards/margins": 2.133107900619507, + "rewards/rejected": -4.646371364593506, + "step": 22140 + }, + { + "epoch": 0.7221122434847781, + "grad_norm": 5.135356903076172, + "learning_rate": 3.797210545181999e-05, + "logits/chosen": 3.138841152191162, + "logits/rejected": 3.62556529045105, + "logps/chosen": -328.2182922363281, + "logps/rejected": -293.053955078125, + "loss": 0.4183, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7395671606063843, + "rewards/margins": 2.6438136100769043, + "rewards/rejected": -4.383381366729736, + "step": 22160 + }, + { + "epoch": 0.7227639693362987, + "grad_norm": 1.0580010414123535, + "learning_rate": 3.7961243088821545e-05, + "logits/chosen": 3.6846976280212402, + "logits/rejected": 3.816509246826172, + "logps/chosen": -325.61138916015625, + "logps/rejected": -256.0003967285156, + "loss": 0.3957, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5076693296432495, + "rewards/margins": 2.7752652168273926, + "rewards/rejected": -4.282934665679932, + "step": 22180 + }, + { + "epoch": 0.7234156951878192, + "grad_norm": 1.598496437072754, + "learning_rate": 3.7950380725823095e-05, + "logits/chosen": 3.648146390914917, + "logits/rejected": 3.809351682662964, + "logps/chosen": -331.0506286621094, + "logps/rejected": -306.63848876953125, + "loss": 0.5555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.328855276107788, + "rewards/margins": 2.3058362007141113, + "rewards/rejected": -3.6346912384033203, + "step": 22200 + }, + { + "epoch": 0.7240674210393399, + "grad_norm": 1.2863894701004028, + "learning_rate": 3.793951836282465e-05, + "logits/chosen": 3.7161126136779785, + "logits/rejected": 4.020954608917236, + "logps/chosen": -364.5186767578125, + "logps/rejected": -288.8224182128906, + "loss": 0.4104, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8631480932235718, + "rewards/margins": 3.445284366607666, + "rewards/rejected": -4.308432102203369, + "step": 22220 + }, + { + "epoch": 0.7247191468908604, + "grad_norm": 0.6411238312721252, + "learning_rate": 3.7928655999826204e-05, + "logits/chosen": 3.4387097358703613, + "logits/rejected": 3.6466357707977295, + "logps/chosen": -289.4892578125, + "logps/rejected": -279.48822021484375, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4176324605941772, + "rewards/margins": 3.0572288036346436, + "rewards/rejected": -4.4748616218566895, + "step": 22240 + }, + { + "epoch": 0.7253708727423809, + "grad_norm": 2.282459259033203, + "learning_rate": 3.791779363682776e-05, + "logits/chosen": 3.8045029640197754, + "logits/rejected": 3.6156177520751953, + "logps/chosen": -332.7469787597656, + "logps/rejected": -326.3101501464844, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7437849044799805, + "rewards/margins": 2.1020328998565674, + "rewards/rejected": -3.845818281173706, + "step": 22260 + }, + { + "epoch": 0.7260225985939015, + "grad_norm": 2.6611361503601074, + "learning_rate": 3.790693127382931e-05, + "logits/chosen": 3.7861275672912598, + "logits/rejected": 3.6842968463897705, + "logps/chosen": -336.2262878417969, + "logps/rejected": -327.66900634765625, + "loss": 0.4943, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6280415058135986, + "rewards/margins": 2.6393320560455322, + "rewards/rejected": -4.267373561859131, + "step": 22280 + }, + { + "epoch": 0.726674324445422, + "grad_norm": 0.15359249711036682, + "learning_rate": 3.789606891083086e-05, + "logits/chosen": 3.5872998237609863, + "logits/rejected": 3.6224045753479004, + "logps/chosen": -348.71063232421875, + "logps/rejected": -342.13726806640625, + "loss": 0.4061, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2739371061325073, + "rewards/margins": 3.2458877563476562, + "rewards/rejected": -4.5198259353637695, + "step": 22300 + }, + { + "epoch": 0.7273260502969426, + "grad_norm": 1.9535549879074097, + "learning_rate": 3.788520654783242e-05, + "logits/chosen": 3.6596450805664062, + "logits/rejected": 3.751981019973755, + "logps/chosen": -354.0664978027344, + "logps/rejected": -365.078857421875, + "loss": 0.4717, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.222489595413208, + "rewards/margins": 3.370190143585205, + "rewards/rejected": -4.592679500579834, + "step": 22320 + }, + { + "epoch": 0.7279777761484632, + "grad_norm": 2.6201858520507812, + "learning_rate": 3.787434418483397e-05, + "logits/chosen": 3.83868408203125, + "logits/rejected": 3.9362151622772217, + "logps/chosen": -419.518310546875, + "logps/rejected": -341.9678649902344, + "loss": 0.4534, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2661902904510498, + "rewards/margins": 2.8387646675109863, + "rewards/rejected": -4.104955196380615, + "step": 22340 + }, + { + "epoch": 0.7286295019999837, + "grad_norm": 1.765866994857788, + "learning_rate": 3.786348182183552e-05, + "logits/chosen": 3.3413338661193848, + "logits/rejected": 3.6121914386749268, + "logps/chosen": -343.91015625, + "logps/rejected": -325.1333923339844, + "loss": 0.4644, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.626220703125, + "rewards/margins": 2.514645576477051, + "rewards/rejected": -4.140866279602051, + "step": 22360 + }, + { + "epoch": 0.7292812278515043, + "grad_norm": 4.636353492736816, + "learning_rate": 3.785261945883708e-05, + "logits/chosen": 3.283365249633789, + "logits/rejected": 3.422976016998291, + "logps/chosen": -336.91595458984375, + "logps/rejected": -316.90625, + "loss": 0.4372, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9661502838134766, + "rewards/margins": 2.466262102127075, + "rewards/rejected": -4.432412624359131, + "step": 22380 + }, + { + "epoch": 0.7299329537030248, + "grad_norm": 4.376764297485352, + "learning_rate": 3.784175709583863e-05, + "logits/chosen": 3.433152437210083, + "logits/rejected": 3.3965237140655518, + "logps/chosen": -343.51715087890625, + "logps/rejected": -341.05926513671875, + "loss": 0.5549, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.090252161026001, + "rewards/margins": 3.0624186992645264, + "rewards/rejected": -5.152670860290527, + "step": 22400 + }, + { + "epoch": 0.7305846795545454, + "grad_norm": 5.503464221954346, + "learning_rate": 3.783089473284018e-05, + "logits/chosen": 3.211778163909912, + "logits/rejected": 3.2299530506134033, + "logps/chosen": -334.17205810546875, + "logps/rejected": -327.8750915527344, + "loss": 0.4995, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5732386112213135, + "rewards/margins": 2.3506007194519043, + "rewards/rejected": -4.923839092254639, + "step": 22420 + }, + { + "epoch": 0.7312364054060659, + "grad_norm": 3.920180082321167, + "learning_rate": 3.782003236984173e-05, + "logits/chosen": 3.3586838245391846, + "logits/rejected": 3.6010756492614746, + "logps/chosen": -367.0420837402344, + "logps/rejected": -338.1452331542969, + "loss": 0.3842, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7891552448272705, + "rewards/margins": 2.546212673187256, + "rewards/rejected": -4.3353681564331055, + "step": 22440 + }, + { + "epoch": 0.7318881312575864, + "grad_norm": 2.003779411315918, + "learning_rate": 3.780917000684329e-05, + "logits/chosen": 3.434288740158081, + "logits/rejected": 3.383876323699951, + "logps/chosen": -354.12445068359375, + "logps/rejected": -334.51214599609375, + "loss": 0.5606, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.141420841217041, + "rewards/margins": 2.5544211864471436, + "rewards/rejected": -4.6958417892456055, + "step": 22460 + }, + { + "epoch": 0.7325398571091071, + "grad_norm": 2.9156253337860107, + "learning_rate": 3.779830764384485e-05, + "logits/chosen": 3.2650794982910156, + "logits/rejected": 3.453015089035034, + "logps/chosen": -349.8980712890625, + "logps/rejected": -307.7830505371094, + "loss": 0.5267, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.087709903717041, + "rewards/margins": 2.454648971557617, + "rewards/rejected": -4.542359352111816, + "step": 22480 + }, + { + "epoch": 0.7331915829606276, + "grad_norm": 0.987515926361084, + "learning_rate": 3.77874452808464e-05, + "logits/chosen": 3.3803696632385254, + "logits/rejected": 3.5196945667266846, + "logps/chosen": -311.90203857421875, + "logps/rejected": -295.09637451171875, + "loss": 0.372, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.1394273042678833, + "rewards/margins": 2.674057722091675, + "rewards/rejected": -3.8134849071502686, + "step": 22500 + }, + { + "epoch": 0.7338433088121482, + "grad_norm": 1.9073387384414673, + "learning_rate": 3.7776582917847955e-05, + "logits/chosen": 3.382948637008667, + "logits/rejected": 3.4293198585510254, + "logps/chosen": -357.82952880859375, + "logps/rejected": -306.18194580078125, + "loss": 0.3205, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4169354438781738, + "rewards/margins": 2.871950149536133, + "rewards/rejected": -4.288886070251465, + "step": 22520 + }, + { + "epoch": 0.7344950346636687, + "grad_norm": 2.4137961864471436, + "learning_rate": 3.7765720554849506e-05, + "logits/chosen": 3.2588794231414795, + "logits/rejected": 3.5412087440490723, + "logps/chosen": -342.8609924316406, + "logps/rejected": -310.1871643066406, + "loss": 0.3918, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8668019771575928, + "rewards/margins": 2.9935927391052246, + "rewards/rejected": -4.860394477844238, + "step": 22540 + }, + { + "epoch": 0.7351467605151892, + "grad_norm": 5.102714538574219, + "learning_rate": 3.775485819185106e-05, + "logits/chosen": 3.1851723194122314, + "logits/rejected": 3.225435972213745, + "logps/chosen": -350.69915771484375, + "logps/rejected": -296.4798889160156, + "loss": 0.4344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2191741466522217, + "rewards/margins": 2.520902395248413, + "rewards/rejected": -4.740077018737793, + "step": 22560 + }, + { + "epoch": 0.7357984863667099, + "grad_norm": 1.4880962371826172, + "learning_rate": 3.7743995828852614e-05, + "logits/chosen": 3.293652057647705, + "logits/rejected": 3.50384521484375, + "logps/chosen": -360.0761413574219, + "logps/rejected": -290.99871826171875, + "loss": 0.3695, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.335993528366089, + "rewards/margins": 2.800096273422241, + "rewards/rejected": -5.13608980178833, + "step": 22580 + }, + { + "epoch": 0.7364502122182304, + "grad_norm": 7.140813827514648, + "learning_rate": 3.7733133465854165e-05, + "logits/chosen": 3.654139995574951, + "logits/rejected": 3.5957818031311035, + "logps/chosen": -339.66070556640625, + "logps/rejected": -334.0869140625, + "loss": 0.607, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5187408924102783, + "rewards/margins": 2.4235007762908936, + "rewards/rejected": -4.94224214553833, + "step": 22600 + }, + { + "epoch": 0.737101938069751, + "grad_norm": 8.979181289672852, + "learning_rate": 3.7722271102855716e-05, + "logits/chosen": 3.5020878314971924, + "logits/rejected": 3.5499813556671143, + "logps/chosen": -357.6037292480469, + "logps/rejected": -300.6266784667969, + "loss": 0.4418, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1994314193725586, + "rewards/margins": 3.136584758758545, + "rewards/rejected": -5.336016654968262, + "step": 22620 + }, + { + "epoch": 0.7377536639212715, + "grad_norm": 2.3303728103637695, + "learning_rate": 3.771140873985727e-05, + "logits/chosen": 3.151902675628662, + "logits/rejected": 3.261934757232666, + "logps/chosen": -350.948974609375, + "logps/rejected": -319.5929260253906, + "loss": 0.3181, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9249871969223022, + "rewards/margins": 3.871777057647705, + "rewards/rejected": -5.796764373779297, + "step": 22640 + }, + { + "epoch": 0.738405389772792, + "grad_norm": 3.652282238006592, + "learning_rate": 3.7700546376858824e-05, + "logits/chosen": 2.926344394683838, + "logits/rejected": 3.08003568649292, + "logps/chosen": -355.38189697265625, + "logps/rejected": -356.7275085449219, + "loss": 0.5301, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.343350887298584, + "rewards/margins": 2.99977707862854, + "rewards/rejected": -5.343128204345703, + "step": 22660 + }, + { + "epoch": 0.7390571156243126, + "grad_norm": 0.21230655908584595, + "learning_rate": 3.7689684013860375e-05, + "logits/chosen": 3.398728609085083, + "logits/rejected": 3.5067577362060547, + "logps/chosen": -364.617919921875, + "logps/rejected": -342.10064697265625, + "loss": 0.293, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4119327068328857, + "rewards/margins": 4.032964706420898, + "rewards/rejected": -6.444897651672363, + "step": 22680 + }, + { + "epoch": 0.7397088414758332, + "grad_norm": 3.2051498889923096, + "learning_rate": 3.7678821650861926e-05, + "logits/chosen": 3.628197431564331, + "logits/rejected": 3.656071186065674, + "logps/chosen": -367.2054138183594, + "logps/rejected": -374.28509521484375, + "loss": 0.5521, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.904029369354248, + "rewards/margins": 2.6546201705932617, + "rewards/rejected": -5.55864953994751, + "step": 22700 + }, + { + "epoch": 0.7403605673273538, + "grad_norm": 1.9656331539154053, + "learning_rate": 3.7667959287863483e-05, + "logits/chosen": 3.2390053272247314, + "logits/rejected": 3.3283615112304688, + "logps/chosen": -384.3541564941406, + "logps/rejected": -352.5851135253906, + "loss": 0.3631, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8604636192321777, + "rewards/margins": 3.717639446258545, + "rewards/rejected": -6.578103065490723, + "step": 22720 + }, + { + "epoch": 0.7410122931788743, + "grad_norm": 3.243748188018799, + "learning_rate": 3.7657096924865034e-05, + "logits/chosen": 3.115088701248169, + "logits/rejected": 3.3221688270568848, + "logps/chosen": -351.61810302734375, + "logps/rejected": -347.03778076171875, + "loss": 0.411, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7123193740844727, + "rewards/margins": 3.5658211708068848, + "rewards/rejected": -6.278140544891357, + "step": 22740 + }, + { + "epoch": 0.7416640190303949, + "grad_norm": 7.467217922210693, + "learning_rate": 3.764623456186659e-05, + "logits/chosen": 3.3287742137908936, + "logits/rejected": 3.5149154663085938, + "logps/chosen": -371.6351623535156, + "logps/rejected": -324.3486328125, + "loss": 0.5844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3947463035583496, + "rewards/margins": 3.2377426624298096, + "rewards/rejected": -5.632489204406738, + "step": 22760 + }, + { + "epoch": 0.7423157448819154, + "grad_norm": 2.321016788482666, + "learning_rate": 3.763537219886815e-05, + "logits/chosen": 3.409680128097534, + "logits/rejected": 3.5034477710723877, + "logps/chosen": -346.5894470214844, + "logps/rejected": -364.4905090332031, + "loss": 0.6494, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6856160163879395, + "rewards/margins": 3.2906157970428467, + "rewards/rejected": -5.976232051849365, + "step": 22780 + }, + { + "epoch": 0.7429674707334359, + "grad_norm": 7.895336627960205, + "learning_rate": 3.76245098358697e-05, + "logits/chosen": 3.495850086212158, + "logits/rejected": 3.699805736541748, + "logps/chosen": -349.8553161621094, + "logps/rejected": -317.9339599609375, + "loss": 0.5566, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0403404235839844, + "rewards/margins": 2.423142433166504, + "rewards/rejected": -5.463482856750488, + "step": 22800 + }, + { + "epoch": 0.7436191965849566, + "grad_norm": 8.776105880737305, + "learning_rate": 3.761364747287125e-05, + "logits/chosen": 3.139760732650757, + "logits/rejected": 3.1704795360565186, + "logps/chosen": -318.7049255371094, + "logps/rejected": -315.24176025390625, + "loss": 0.6271, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6212997436523438, + "rewards/margins": 3.07081937789917, + "rewards/rejected": -5.692119121551514, + "step": 22820 + }, + { + "epoch": 0.7442709224364771, + "grad_norm": 3.432093620300293, + "learning_rate": 3.76027851098728e-05, + "logits/chosen": 3.4397079944610596, + "logits/rejected": 3.711737871170044, + "logps/chosen": -333.7914123535156, + "logps/rejected": -331.48388671875, + "loss": 0.5124, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5885751247406006, + "rewards/margins": 2.6885247230529785, + "rewards/rejected": -5.277099609375, + "step": 22840 + }, + { + "epoch": 0.7449226482879977, + "grad_norm": 6.115138053894043, + "learning_rate": 3.759192274687436e-05, + "logits/chosen": 3.49834942817688, + "logits/rejected": 3.6201329231262207, + "logps/chosen": -366.0537109375, + "logps/rejected": -320.6615905761719, + "loss": 0.6214, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.146857738494873, + "rewards/margins": 2.5469043254852295, + "rewards/rejected": -4.693762302398682, + "step": 22860 + }, + { + "epoch": 0.7455743741395182, + "grad_norm": 3.75648832321167, + "learning_rate": 3.758106038387591e-05, + "logits/chosen": 3.4078376293182373, + "logits/rejected": 3.360525608062744, + "logps/chosen": -321.88201904296875, + "logps/rejected": -312.9386291503906, + "loss": 0.4874, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.252690315246582, + "rewards/margins": 2.6296887397766113, + "rewards/rejected": -4.882379055023193, + "step": 22880 + }, + { + "epoch": 0.7462260999910387, + "grad_norm": 1.4789985418319702, + "learning_rate": 3.757019802087746e-05, + "logits/chosen": 3.4044671058654785, + "logits/rejected": 3.494729518890381, + "logps/chosen": -324.6876525878906, + "logps/rejected": -286.6428527832031, + "loss": 0.717, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2618296146392822, + "rewards/margins": 2.0796751976013184, + "rewards/rejected": -4.34150505065918, + "step": 22900 + }, + { + "epoch": 0.7468778258425594, + "grad_norm": 2.6215426921844482, + "learning_rate": 3.755933565787902e-05, + "logits/chosen": 3.4501500129699707, + "logits/rejected": 3.4627537727355957, + "logps/chosen": -339.89093017578125, + "logps/rejected": -288.9854431152344, + "loss": 0.5231, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7433974742889404, + "rewards/margins": 2.2571022510528564, + "rewards/rejected": -4.000500202178955, + "step": 22920 + }, + { + "epoch": 0.7475295516940799, + "grad_norm": 2.217971086502075, + "learning_rate": 3.754847329488057e-05, + "logits/chosen": 3.4780426025390625, + "logits/rejected": 3.5621254444122314, + "logps/chosen": -365.9189758300781, + "logps/rejected": -343.75787353515625, + "loss": 0.3585, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7297147512435913, + "rewards/margins": 3.0341196060180664, + "rewards/rejected": -4.763834476470947, + "step": 22940 + }, + { + "epoch": 0.7481812775456005, + "grad_norm": 5.532500267028809, + "learning_rate": 3.753761093188212e-05, + "logits/chosen": 3.2689411640167236, + "logits/rejected": 3.4185142517089844, + "logps/chosen": -286.86993408203125, + "logps/rejected": -282.69451904296875, + "loss": 0.4163, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.273036241531372, + "rewards/margins": 2.2401673793792725, + "rewards/rejected": -4.513204097747803, + "step": 22960 + }, + { + "epoch": 0.748833003397121, + "grad_norm": 5.138960361480713, + "learning_rate": 3.752674856888368e-05, + "logits/chosen": 3.3054566383361816, + "logits/rejected": 3.1661922931671143, + "logps/chosen": -349.17083740234375, + "logps/rejected": -321.9953308105469, + "loss": 0.4632, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.408017873764038, + "rewards/margins": 3.1078057289123535, + "rewards/rejected": -5.515823841094971, + "step": 22980 + }, + { + "epoch": 0.7494847292486415, + "grad_norm": 0.6333909034729004, + "learning_rate": 3.751588620588523e-05, + "logits/chosen": 3.2454535961151123, + "logits/rejected": 3.43822979927063, + "logps/chosen": -329.69085693359375, + "logps/rejected": -305.5981140136719, + "loss": 0.4236, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.193498134613037, + "rewards/margins": 2.8050143718719482, + "rewards/rejected": -4.998513221740723, + "step": 23000 + }, + { + "epoch": 0.7501364551001621, + "grad_norm": 2.9221904277801514, + "learning_rate": 3.7505023842886786e-05, + "logits/chosen": 3.1601836681365967, + "logits/rejected": 3.337139844894409, + "logps/chosen": -348.6711730957031, + "logps/rejected": -316.4305419921875, + "loss": 0.4372, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.683940887451172, + "rewards/margins": 2.6876413822174072, + "rewards/rejected": -5.37158203125, + "step": 23020 + }, + { + "epoch": 0.7507881809516826, + "grad_norm": 1.2444899082183838, + "learning_rate": 3.749416147988834e-05, + "logits/chosen": 2.9036989212036133, + "logits/rejected": 2.9356048107147217, + "logps/chosen": -346.78826904296875, + "logps/rejected": -333.7370300292969, + "loss": 0.4071, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.0683560371398926, + "rewards/margins": 4.014636039733887, + "rewards/rejected": -6.082992076873779, + "step": 23040 + }, + { + "epoch": 0.7514399068032033, + "grad_norm": 11.017748832702637, + "learning_rate": 3.7483299116889894e-05, + "logits/chosen": 3.2852187156677246, + "logits/rejected": 3.4975078105926514, + "logps/chosen": -368.548828125, + "logps/rejected": -294.9757995605469, + "loss": 0.6234, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.046086311340332, + "rewards/margins": 1.889421820640564, + "rewards/rejected": -4.9355082511901855, + "step": 23060 + }, + { + "epoch": 0.7520916326547238, + "grad_norm": 0.33963149785995483, + "learning_rate": 3.7472436753891445e-05, + "logits/chosen": 3.1685783863067627, + "logits/rejected": 3.5324928760528564, + "logps/chosen": -357.5009765625, + "logps/rejected": -344.1461486816406, + "loss": 0.4213, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.206937074661255, + "rewards/margins": 3.160104990005493, + "rewards/rejected": -5.36704158782959, + "step": 23080 + }, + { + "epoch": 0.7527433585062443, + "grad_norm": 5.027831554412842, + "learning_rate": 3.7461574390892996e-05, + "logits/chosen": 3.106109142303467, + "logits/rejected": 3.3308205604553223, + "logps/chosen": -341.0044250488281, + "logps/rejected": -344.33856201171875, + "loss": 0.4212, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4601023197174072, + "rewards/margins": 3.313267230987549, + "rewards/rejected": -5.773369789123535, + "step": 23100 + }, + { + "epoch": 0.7533950843577649, + "grad_norm": 3.6428520679473877, + "learning_rate": 3.745071202789455e-05, + "logits/chosen": 3.1732232570648193, + "logits/rejected": 3.3613364696502686, + "logps/chosen": -362.01165771484375, + "logps/rejected": -322.58685302734375, + "loss": 0.514, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.473142385482788, + "rewards/margins": 3.5327651500701904, + "rewards/rejected": -5.00590705871582, + "step": 23120 + }, + { + "epoch": 0.7540468102092854, + "grad_norm": 1.1273646354675293, + "learning_rate": 3.7439849664896104e-05, + "logits/chosen": 3.5447134971618652, + "logits/rejected": 3.7498810291290283, + "logps/chosen": -369.24432373046875, + "logps/rejected": -302.7729797363281, + "loss": 0.3108, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4358289241790771, + "rewards/margins": 3.3272509574890137, + "rewards/rejected": -4.763079643249512, + "step": 23140 + }, + { + "epoch": 0.7546985360608061, + "grad_norm": 0.42768657207489014, + "learning_rate": 3.7428987301897655e-05, + "logits/chosen": 3.778062105178833, + "logits/rejected": 3.8972668647766113, + "logps/chosen": -366.53997802734375, + "logps/rejected": -336.0467529296875, + "loss": 0.4525, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9232234954833984, + "rewards/margins": 3.4215540885925293, + "rewards/rejected": -5.3447771072387695, + "step": 23160 + }, + { + "epoch": 0.7553502619123266, + "grad_norm": 0.19566819071769714, + "learning_rate": 3.7418124938899206e-05, + "logits/chosen": 3.076159954071045, + "logits/rejected": 3.445331573486328, + "logps/chosen": -320.40045166015625, + "logps/rejected": -291.7132568359375, + "loss": 0.3756, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8389685153961182, + "rewards/margins": 2.6811816692352295, + "rewards/rejected": -4.520150184631348, + "step": 23180 + }, + { + "epoch": 0.7560019877638471, + "grad_norm": 0.6297158598899841, + "learning_rate": 3.740726257590076e-05, + "logits/chosen": 3.4832985401153564, + "logits/rejected": 3.655977964401245, + "logps/chosen": -344.30364990234375, + "logps/rejected": -287.92596435546875, + "loss": 0.4273, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6207935214042664, + "rewards/margins": 3.2037761211395264, + "rewards/rejected": -3.8245692253112793, + "step": 23200 + }, + { + "epoch": 0.7566537136153677, + "grad_norm": 1.2873421907424927, + "learning_rate": 3.7396400212902314e-05, + "logits/chosen": 3.3671786785125732, + "logits/rejected": 3.515021800994873, + "logps/chosen": -302.7511291503906, + "logps/rejected": -302.33160400390625, + "loss": 0.4911, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9616146087646484, + "rewards/margins": 2.29976224899292, + "rewards/rejected": -4.26137638092041, + "step": 23220 + }, + { + "epoch": 0.7573054394668882, + "grad_norm": 4.256545066833496, + "learning_rate": 3.7385537849903865e-05, + "logits/chosen": 3.713819980621338, + "logits/rejected": 3.8465869426727295, + "logps/chosen": -346.6053161621094, + "logps/rejected": -344.8159484863281, + "loss": 0.7191, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9516340494155884, + "rewards/margins": 2.716041088104248, + "rewards/rejected": -4.6676740646362305, + "step": 23240 + }, + { + "epoch": 0.7579571653184088, + "grad_norm": 1.4395934343338013, + "learning_rate": 3.737467548690542e-05, + "logits/chosen": 3.4795429706573486, + "logits/rejected": 3.5344913005828857, + "logps/chosen": -324.94805908203125, + "logps/rejected": -306.74462890625, + "loss": 0.4302, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5753086805343628, + "rewards/margins": 2.8498446941375732, + "rewards/rejected": -4.4251532554626465, + "step": 23260 + }, + { + "epoch": 0.7586088911699294, + "grad_norm": 1.1134041547775269, + "learning_rate": 3.736381312390698e-05, + "logits/chosen": 3.5267837047576904, + "logits/rejected": 3.608834743499756, + "logps/chosen": -380.9120178222656, + "logps/rejected": -364.76214599609375, + "loss": 0.4824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4293283224105835, + "rewards/margins": 2.855729818344116, + "rewards/rejected": -4.28505802154541, + "step": 23280 + }, + { + "epoch": 0.75926061702145, + "grad_norm": 1.3385136127471924, + "learning_rate": 3.735295076090853e-05, + "logits/chosen": 3.613523483276367, + "logits/rejected": 3.6867496967315674, + "logps/chosen": -343.95648193359375, + "logps/rejected": -298.56707763671875, + "loss": 0.3198, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.218388319015503, + "rewards/margins": 3.4038264751434326, + "rewards/rejected": -4.622215270996094, + "step": 23300 + }, + { + "epoch": 0.7599123428729705, + "grad_norm": 5.048826694488525, + "learning_rate": 3.734208839791009e-05, + "logits/chosen": 3.1981756687164307, + "logits/rejected": 3.281113386154175, + "logps/chosen": -331.6693420410156, + "logps/rejected": -359.75146484375, + "loss": 0.2694, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.6559431552886963, + "rewards/margins": 3.7310631275177, + "rewards/rejected": -5.3870062828063965, + "step": 23320 + }, + { + "epoch": 0.760564068724491, + "grad_norm": 1.758998990058899, + "learning_rate": 3.733122603491164e-05, + "logits/chosen": 3.1428542137145996, + "logits/rejected": 3.2647793292999268, + "logps/chosen": -309.8661804199219, + "logps/rejected": -310.29888916015625, + "loss": 0.6619, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3721706867218018, + "rewards/margins": 2.8883001804351807, + "rewards/rejected": -5.260470390319824, + "step": 23340 + }, + { + "epoch": 0.7612157945760116, + "grad_norm": 6.460838317871094, + "learning_rate": 3.732036367191319e-05, + "logits/chosen": 3.3734512329101562, + "logits/rejected": 3.443211078643799, + "logps/chosen": -341.62835693359375, + "logps/rejected": -318.14581298828125, + "loss": 0.4398, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.0272984504699707, + "rewards/margins": 2.7642617225646973, + "rewards/rejected": -4.79155969619751, + "step": 23360 + }, + { + "epoch": 0.7618675204275321, + "grad_norm": 3.3611223697662354, + "learning_rate": 3.730950130891474e-05, + "logits/chosen": 3.4443676471710205, + "logits/rejected": 3.58886981010437, + "logps/chosen": -342.00689697265625, + "logps/rejected": -318.5663146972656, + "loss": 0.4009, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5541741847991943, + "rewards/margins": 2.8938887119293213, + "rewards/rejected": -4.448062896728516, + "step": 23380 + }, + { + "epoch": 0.7625192462790528, + "grad_norm": 3.850600481033325, + "learning_rate": 3.72986389459163e-05, + "logits/chosen": 3.362529754638672, + "logits/rejected": 3.55169939994812, + "logps/chosen": -299.63653564453125, + "logps/rejected": -257.32757568359375, + "loss": 0.3537, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5265064239501953, + "rewards/margins": 3.077235698699951, + "rewards/rejected": -4.603742599487305, + "step": 23400 + }, + { + "epoch": 0.7631709721305733, + "grad_norm": 0.06721451878547668, + "learning_rate": 3.728777658291785e-05, + "logits/chosen": 3.8226921558380127, + "logits/rejected": 3.815512180328369, + "logps/chosen": -344.7593078613281, + "logps/rejected": -342.515380859375, + "loss": 0.4308, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7375402450561523, + "rewards/margins": 3.2098565101623535, + "rewards/rejected": -4.947396278381348, + "step": 23420 + }, + { + "epoch": 0.7638226979820938, + "grad_norm": 2.809234857559204, + "learning_rate": 3.72769142199194e-05, + "logits/chosen": 3.521742343902588, + "logits/rejected": 3.60115385055542, + "logps/chosen": -367.16619873046875, + "logps/rejected": -356.9707946777344, + "loss": 0.5196, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.232528805732727, + "rewards/margins": 2.7599551677703857, + "rewards/rejected": -3.9924843311309814, + "step": 23440 + }, + { + "epoch": 0.7644744238336144, + "grad_norm": 5.572142124176025, + "learning_rate": 3.726605185692096e-05, + "logits/chosen": 3.513861894607544, + "logits/rejected": 3.5407261848449707, + "logps/chosen": -333.21038818359375, + "logps/rejected": -315.0457458496094, + "loss": 0.4661, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8378175497055054, + "rewards/margins": 3.0057005882263184, + "rewards/rejected": -4.843518257141113, + "step": 23460 + }, + { + "epoch": 0.7651261496851349, + "grad_norm": 3.254809617996216, + "learning_rate": 3.725518949392251e-05, + "logits/chosen": 3.0985782146453857, + "logits/rejected": 3.209951400756836, + "logps/chosen": -315.0631103515625, + "logps/rejected": -335.2816467285156, + "loss": 0.448, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9486052989959717, + "rewards/margins": 2.770564079284668, + "rewards/rejected": -5.719169616699219, + "step": 23480 + }, + { + "epoch": 0.7657778755366556, + "grad_norm": 7.809262275695801, + "learning_rate": 3.724432713092406e-05, + "logits/chosen": 3.018662929534912, + "logits/rejected": 3.2213146686553955, + "logps/chosen": -355.6850280761719, + "logps/rejected": -337.57757568359375, + "loss": 0.4904, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9719349145889282, + "rewards/margins": 3.55253529548645, + "rewards/rejected": -5.524470329284668, + "step": 23500 + }, + { + "epoch": 0.7664296013881761, + "grad_norm": 4.312654972076416, + "learning_rate": 3.7233464767925616e-05, + "logits/chosen": 3.7703425884246826, + "logits/rejected": 3.828975200653076, + "logps/chosen": -349.86297607421875, + "logps/rejected": -296.434326171875, + "loss": 0.5077, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1937882900238037, + "rewards/margins": 2.519746780395508, + "rewards/rejected": -4.713535308837891, + "step": 23520 + }, + { + "epoch": 0.7670813272396966, + "grad_norm": 1.7208468914031982, + "learning_rate": 3.722260240492717e-05, + "logits/chosen": 3.757579803466797, + "logits/rejected": 3.8546135425567627, + "logps/chosen": -360.02490234375, + "logps/rejected": -317.5433044433594, + "loss": 0.5256, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.217406749725342, + "rewards/margins": 2.598574161529541, + "rewards/rejected": -4.815981864929199, + "step": 23540 + }, + { + "epoch": 0.7677330530912172, + "grad_norm": 2.676509380340576, + "learning_rate": 3.7211740041928725e-05, + "logits/chosen": 3.5126781463623047, + "logits/rejected": 3.495448589324951, + "logps/chosen": -345.73968505859375, + "logps/rejected": -328.80078125, + "loss": 0.2801, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.063267230987549, + "rewards/margins": 3.171787977218628, + "rewards/rejected": -5.235054969787598, + "step": 23560 + }, + { + "epoch": 0.7683847789427377, + "grad_norm": 7.270565986633301, + "learning_rate": 3.7200877678930276e-05, + "logits/chosen": 4.011735439300537, + "logits/rejected": 4.014187335968018, + "logps/chosen": -407.185791015625, + "logps/rejected": -307.9431457519531, + "loss": 0.3922, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3236687183380127, + "rewards/margins": 2.66232967376709, + "rewards/rejected": -4.985998630523682, + "step": 23580 + }, + { + "epoch": 0.7690365047942583, + "grad_norm": 3.7753961086273193, + "learning_rate": 3.719001531593183e-05, + "logits/chosen": 3.773883819580078, + "logits/rejected": 3.7766525745391846, + "logps/chosen": -377.7391662597656, + "logps/rejected": -337.51702880859375, + "loss": 0.4381, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3600353002548218, + "rewards/margins": 2.9937973022460938, + "rewards/rejected": -4.353832721710205, + "step": 23600 + }, + { + "epoch": 0.7696882306457788, + "grad_norm": 5.624262809753418, + "learning_rate": 3.7179152952933384e-05, + "logits/chosen": 3.407733917236328, + "logits/rejected": 3.5443997383117676, + "logps/chosen": -387.77398681640625, + "logps/rejected": -300.4818115234375, + "loss": 0.2996, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.2382636070251465, + "rewards/margins": 2.9740614891052246, + "rewards/rejected": -5.212325096130371, + "step": 23620 + }, + { + "epoch": 0.7703399564972994, + "grad_norm": 25.44963836669922, + "learning_rate": 3.7168290589934935e-05, + "logits/chosen": 3.6728274822235107, + "logits/rejected": 3.7907447814941406, + "logps/chosen": -324.99407958984375, + "logps/rejected": -292.67718505859375, + "loss": 0.3928, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9286158084869385, + "rewards/margins": 2.642987012863159, + "rewards/rejected": -4.571602821350098, + "step": 23640 + }, + { + "epoch": 0.77099168234882, + "grad_norm": 0.9626504778862, + "learning_rate": 3.715742822693649e-05, + "logits/chosen": 3.6259562969207764, + "logits/rejected": 3.8531577587127686, + "logps/chosen": -314.70343017578125, + "logps/rejected": -288.85601806640625, + "loss": 0.6456, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6278213262557983, + "rewards/margins": 2.1365506649017334, + "rewards/rejected": -3.764371871948242, + "step": 23660 + }, + { + "epoch": 0.7716434082003405, + "grad_norm": 1.0168826580047607, + "learning_rate": 3.714656586393804e-05, + "logits/chosen": 3.5530548095703125, + "logits/rejected": 3.535391330718994, + "logps/chosen": -337.21905517578125, + "logps/rejected": -325.302978515625, + "loss": 0.6289, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4292047023773193, + "rewards/margins": 2.4324944019317627, + "rewards/rejected": -3.861698865890503, + "step": 23680 + }, + { + "epoch": 0.7722951340518611, + "grad_norm": 4.156672477722168, + "learning_rate": 3.7135703500939594e-05, + "logits/chosen": 3.4780449867248535, + "logits/rejected": 3.7479515075683594, + "logps/chosen": -342.77825927734375, + "logps/rejected": -313.67584228515625, + "loss": 0.417, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.815835952758789, + "rewards/margins": 3.439213275909424, + "rewards/rejected": -5.255048751831055, + "step": 23700 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 2.5179874897003174, + "learning_rate": 3.712484113794115e-05, + "logits/chosen": 3.803856372833252, + "logits/rejected": 3.8717198371887207, + "logps/chosen": -349.73046875, + "logps/rejected": -333.78692626953125, + "loss": 0.5306, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9183059930801392, + "rewards/margins": 2.305891513824463, + "rewards/rejected": -4.2241973876953125, + "step": 23720 + }, + { + "epoch": 0.7735985857549021, + "grad_norm": 3.8765032291412354, + "learning_rate": 3.71139787749427e-05, + "logits/chosen": 3.964439868927002, + "logits/rejected": 3.951498031616211, + "logps/chosen": -364.6172790527344, + "logps/rejected": -323.790283203125, + "loss": 0.7138, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.352557420730591, + "rewards/margins": 1.9815452098846436, + "rewards/rejected": -4.334102630615234, + "step": 23740 + }, + { + "epoch": 0.7742503116064228, + "grad_norm": 3.753230333328247, + "learning_rate": 3.710311641194425e-05, + "logits/chosen": 3.4447226524353027, + "logits/rejected": 3.67427134513855, + "logps/chosen": -321.0355529785156, + "logps/rejected": -319.59423828125, + "loss": 0.3724, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3797669410705566, + "rewards/margins": 2.505765199661255, + "rewards/rejected": -4.885531902313232, + "step": 23760 + }, + { + "epoch": 0.7749020374579433, + "grad_norm": 6.043906211853027, + "learning_rate": 3.7092254048945804e-05, + "logits/chosen": 3.362619400024414, + "logits/rejected": 3.47294282913208, + "logps/chosen": -344.50201416015625, + "logps/rejected": -307.133544921875, + "loss": 0.4964, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.075915575027466, + "rewards/margins": 2.5194218158721924, + "rewards/rejected": -4.595337867736816, + "step": 23780 + }, + { + "epoch": 0.7755537633094639, + "grad_norm": 1.040571689605713, + "learning_rate": 3.708139168594736e-05, + "logits/chosen": 3.604151964187622, + "logits/rejected": 3.6903209686279297, + "logps/chosen": -348.1711730957031, + "logps/rejected": -334.7171630859375, + "loss": 0.3785, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4593347311019897, + "rewards/margins": 3.5485739707946777, + "rewards/rejected": -5.007908821105957, + "step": 23800 + }, + { + "epoch": 0.7762054891609844, + "grad_norm": 1.6289451122283936, + "learning_rate": 3.707052932294892e-05, + "logits/chosen": 3.5395026206970215, + "logits/rejected": 3.661818742752075, + "logps/chosen": -363.6900939941406, + "logps/rejected": -310.8099060058594, + "loss": 0.3545, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0596133470535278, + "rewards/margins": 3.4413979053497314, + "rewards/rejected": -4.501010894775391, + "step": 23820 + }, + { + "epoch": 0.776857215012505, + "grad_norm": 3.417560577392578, + "learning_rate": 3.705966695995047e-05, + "logits/chosen": 3.4836297035217285, + "logits/rejected": 3.558716297149658, + "logps/chosen": -311.4240417480469, + "logps/rejected": -298.95416259765625, + "loss": 0.5663, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4007859230041504, + "rewards/margins": 2.16257905960083, + "rewards/rejected": -4.5633649826049805, + "step": 23840 + }, + { + "epoch": 0.7775089408640256, + "grad_norm": 0.9461150169372559, + "learning_rate": 3.704880459695203e-05, + "logits/chosen": 3.5685982704162598, + "logits/rejected": 3.6906864643096924, + "logps/chosen": -364.47991943359375, + "logps/rejected": -322.11541748046875, + "loss": 0.376, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6700159311294556, + "rewards/margins": 3.3080806732177734, + "rewards/rejected": -4.9780964851379395, + "step": 23860 + }, + { + "epoch": 0.7781606667155461, + "grad_norm": 2.4282474517822266, + "learning_rate": 3.703794223395358e-05, + "logits/chosen": 3.5904700756073, + "logits/rejected": 3.898613691329956, + "logps/chosen": -334.869873046875, + "logps/rejected": -283.08447265625, + "loss": 0.3402, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9479694366455078, + "rewards/margins": 2.853135585784912, + "rewards/rejected": -4.80110502243042, + "step": 23880 + }, + { + "epoch": 0.7788123925670667, + "grad_norm": 0.31342095136642456, + "learning_rate": 3.702707987095513e-05, + "logits/chosen": 3.664710521697998, + "logits/rejected": 3.709294080734253, + "logps/chosen": -351.32672119140625, + "logps/rejected": -328.4574279785156, + "loss": 0.3866, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4155932664871216, + "rewards/margins": 2.969046115875244, + "rewards/rejected": -4.384639739990234, + "step": 23900 + }, + { + "epoch": 0.7794641184185872, + "grad_norm": 2.0893566608428955, + "learning_rate": 3.7016217507956686e-05, + "logits/chosen": 3.670619249343872, + "logits/rejected": 3.723498582839966, + "logps/chosen": -347.82275390625, + "logps/rejected": -327.2398681640625, + "loss": 0.3494, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5446202754974365, + "rewards/margins": 3.6206631660461426, + "rewards/rejected": -5.165283679962158, + "step": 23920 + }, + { + "epoch": 0.7801158442701078, + "grad_norm": 1.033855676651001, + "learning_rate": 3.700535514495824e-05, + "logits/chosen": 3.732922077178955, + "logits/rejected": 3.8392205238342285, + "logps/chosen": -371.46160888671875, + "logps/rejected": -358.3462219238281, + "loss": 0.6109, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.978276252746582, + "rewards/margins": 2.6869449615478516, + "rewards/rejected": -4.665221214294434, + "step": 23940 + }, + { + "epoch": 0.7807675701216283, + "grad_norm": 3.0384035110473633, + "learning_rate": 3.699449278195979e-05, + "logits/chosen": 3.3823254108428955, + "logits/rejected": 3.434314012527466, + "logps/chosen": -315.8406982421875, + "logps/rejected": -268.89093017578125, + "loss": 0.63, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7956314086914062, + "rewards/margins": 2.478952646255493, + "rewards/rejected": -4.2745842933654785, + "step": 23960 + }, + { + "epoch": 0.7814192959731489, + "grad_norm": 3.249018907546997, + "learning_rate": 3.698363041896134e-05, + "logits/chosen": 3.667713165283203, + "logits/rejected": 3.740166425704956, + "logps/chosen": -376.1891784667969, + "logps/rejected": -331.5907897949219, + "loss": 0.5264, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7669575214385986, + "rewards/margins": 2.4616405963897705, + "rewards/rejected": -4.228598117828369, + "step": 23980 + }, + { + "epoch": 0.7820710218246695, + "grad_norm": 1.3071693181991577, + "learning_rate": 3.6972768055962896e-05, + "logits/chosen": 3.7711944580078125, + "logits/rejected": 3.8320224285125732, + "logps/chosen": -352.7505187988281, + "logps/rejected": -343.43817138671875, + "loss": 0.3572, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.842886209487915, + "rewards/margins": 3.1670994758605957, + "rewards/rejected": -5.009985446929932, + "step": 24000 + }, + { + "epoch": 0.78272274767619, + "grad_norm": 0.19331379234790802, + "learning_rate": 3.696190569296445e-05, + "logits/chosen": 3.7209954261779785, + "logits/rejected": 3.770921230316162, + "logps/chosen": -396.32855224609375, + "logps/rejected": -357.6484680175781, + "loss": 0.3467, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.654482126235962, + "rewards/margins": 3.4364662170410156, + "rewards/rejected": -5.090948581695557, + "step": 24020 + }, + { + "epoch": 0.7833744735277106, + "grad_norm": 1.6407912969589233, + "learning_rate": 3.6951043329966e-05, + "logits/chosen": 3.982651472091675, + "logits/rejected": 4.007279396057129, + "logps/chosen": -361.5669250488281, + "logps/rejected": -330.86041259765625, + "loss": 0.4854, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9618860483169556, + "rewards/margins": 3.004291534423828, + "rewards/rejected": -4.966177940368652, + "step": 24040 + }, + { + "epoch": 0.7840261993792311, + "grad_norm": 2.458097457885742, + "learning_rate": 3.6940180966967555e-05, + "logits/chosen": 3.30729603767395, + "logits/rejected": 3.422302722930908, + "logps/chosen": -337.2447204589844, + "logps/rejected": -338.5492248535156, + "loss": 0.4256, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3625714778900146, + "rewards/margins": 3.2680504322052, + "rewards/rejected": -5.630621910095215, + "step": 24060 + }, + { + "epoch": 0.7846779252307516, + "grad_norm": 4.820045471191406, + "learning_rate": 3.692931860396911e-05, + "logits/chosen": 3.2338147163391113, + "logits/rejected": 3.4531707763671875, + "logps/chosen": -338.30047607421875, + "logps/rejected": -341.3011169433594, + "loss": 0.6814, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.031013011932373, + "rewards/margins": 1.8470653295516968, + "rewards/rejected": -3.878077983856201, + "step": 24080 + }, + { + "epoch": 0.7853296510822723, + "grad_norm": 0.37513306736946106, + "learning_rate": 3.6918456240970664e-05, + "logits/chosen": 3.5352816581726074, + "logits/rejected": 3.62825083732605, + "logps/chosen": -340.7555847167969, + "logps/rejected": -314.91583251953125, + "loss": 0.4187, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6209957599639893, + "rewards/margins": 2.481998920440674, + "rewards/rejected": -5.102994441986084, + "step": 24100 + }, + { + "epoch": 0.7859813769337928, + "grad_norm": 4.632561206817627, + "learning_rate": 3.690759387797222e-05, + "logits/chosen": 3.715977907180786, + "logits/rejected": 3.871361255645752, + "logps/chosen": -364.20111083984375, + "logps/rejected": -315.0688781738281, + "loss": 0.3682, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1783480644226074, + "rewards/margins": 3.4266433715820312, + "rewards/rejected": -5.604991912841797, + "step": 24120 + }, + { + "epoch": 0.7866331027853134, + "grad_norm": 2.543501138687134, + "learning_rate": 3.689673151497377e-05, + "logits/chosen": 3.6909778118133545, + "logits/rejected": 3.667430877685547, + "logps/chosen": -365.2169189453125, + "logps/rejected": -337.13409423828125, + "loss": 0.4493, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1802937984466553, + "rewards/margins": 3.0247905254364014, + "rewards/rejected": -5.205084323883057, + "step": 24140 + }, + { + "epoch": 0.7872848286368339, + "grad_norm": 4.028918266296387, + "learning_rate": 3.688586915197532e-05, + "logits/chosen": 3.6792006492614746, + "logits/rejected": 3.638352870941162, + "logps/chosen": -349.93316650390625, + "logps/rejected": -381.82745361328125, + "loss": 0.4645, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.190001964569092, + "rewards/margins": 3.363067150115967, + "rewards/rejected": -5.553069114685059, + "step": 24160 + }, + { + "epoch": 0.7879365544883544, + "grad_norm": 3.7722415924072266, + "learning_rate": 3.6875006788976874e-05, + "logits/chosen": 3.4383692741394043, + "logits/rejected": 3.6620426177978516, + "logps/chosen": -350.6599426269531, + "logps/rejected": -293.41455078125, + "loss": 0.3779, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.515434503555298, + "rewards/margins": 2.9738259315490723, + "rewards/rejected": -5.489260673522949, + "step": 24180 + }, + { + "epoch": 0.788588280339875, + "grad_norm": 1.1235781908035278, + "learning_rate": 3.686414442597843e-05, + "logits/chosen": 3.8294453620910645, + "logits/rejected": 4.012373447418213, + "logps/chosen": -385.64276123046875, + "logps/rejected": -370.53753662109375, + "loss": 0.4734, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5308938026428223, + "rewards/margins": 3.166905403137207, + "rewards/rejected": -5.697798728942871, + "step": 24200 + }, + { + "epoch": 0.7892400061913956, + "grad_norm": 0.06286653131246567, + "learning_rate": 3.685328206297998e-05, + "logits/chosen": 3.3390815258026123, + "logits/rejected": 3.545916795730591, + "logps/chosen": -324.7923278808594, + "logps/rejected": -320.8528747558594, + "loss": 0.4073, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6971638202667236, + "rewards/margins": 3.1277480125427246, + "rewards/rejected": -5.824911117553711, + "step": 24220 + }, + { + "epoch": 0.7898917320429162, + "grad_norm": 1.8928941488265991, + "learning_rate": 3.684241969998153e-05, + "logits/chosen": 3.7227978706359863, + "logits/rejected": 3.8031699657440186, + "logps/chosen": -368.05303955078125, + "logps/rejected": -331.95538330078125, + "loss": 0.338, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.634087085723877, + "rewards/margins": 3.220451831817627, + "rewards/rejected": -5.854538917541504, + "step": 24240 + }, + { + "epoch": 0.7905434578944367, + "grad_norm": 7.031154632568359, + "learning_rate": 3.683155733698309e-05, + "logits/chosen": 3.35931396484375, + "logits/rejected": 3.317551851272583, + "logps/chosen": -361.414794921875, + "logps/rejected": -345.6872863769531, + "loss": 0.538, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.89127254486084, + "rewards/margins": 2.656954288482666, + "rewards/rejected": -5.548226356506348, + "step": 24260 + }, + { + "epoch": 0.7911951837459572, + "grad_norm": 1.7397034168243408, + "learning_rate": 3.682123809213456e-05, + "logits/chosen": 3.2779934406280518, + "logits/rejected": 3.1955764293670654, + "logps/chosen": -355.09454345703125, + "logps/rejected": -323.8433532714844, + "loss": 0.3642, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.336418867111206, + "rewards/margins": 3.4195332527160645, + "rewards/rejected": -5.755951881408691, + "step": 24280 + }, + { + "epoch": 0.7918469095974778, + "grad_norm": 2.0791938304901123, + "learning_rate": 3.681037572913612e-05, + "logits/chosen": 3.2158520221710205, + "logits/rejected": 3.3841190338134766, + "logps/chosen": -299.36151123046875, + "logps/rejected": -304.69415283203125, + "loss": 0.2937, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1680777072906494, + "rewards/margins": 2.9662585258483887, + "rewards/rejected": -5.134335994720459, + "step": 24300 + }, + { + "epoch": 0.7924986354489983, + "grad_norm": 0.3133808970451355, + "learning_rate": 3.679951336613767e-05, + "logits/chosen": 3.4771676063537598, + "logits/rejected": 3.574669599533081, + "logps/chosen": -356.460693359375, + "logps/rejected": -320.12396240234375, + "loss": 0.4488, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8803181648254395, + "rewards/margins": 3.3513686656951904, + "rewards/rejected": -6.231686592102051, + "step": 24320 + }, + { + "epoch": 0.793150361300519, + "grad_norm": 3.7413272857666016, + "learning_rate": 3.678865100313922e-05, + "logits/chosen": 3.188067674636841, + "logits/rejected": 3.1677238941192627, + "logps/chosen": -332.1863708496094, + "logps/rejected": -332.0634765625, + "loss": 0.508, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.3836052417755127, + "rewards/margins": 3.531461715698242, + "rewards/rejected": -5.915066242218018, + "step": 24340 + }, + { + "epoch": 0.7938020871520395, + "grad_norm": 4.605832099914551, + "learning_rate": 3.677778864014078e-05, + "logits/chosen": 3.3215975761413574, + "logits/rejected": 3.337371349334717, + "logps/chosen": -305.4317932128906, + "logps/rejected": -353.03753662109375, + "loss": 0.4803, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.494236946105957, + "rewards/margins": 3.1551461219787598, + "rewards/rejected": -5.649382591247559, + "step": 24360 + }, + { + "epoch": 0.7944538130035601, + "grad_norm": 0.3521760404109955, + "learning_rate": 3.6766926277142336e-05, + "logits/chosen": 3.503007173538208, + "logits/rejected": 3.5406441688537598, + "logps/chosen": -370.98516845703125, + "logps/rejected": -342.5919189453125, + "loss": 0.4038, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.466806650161743, + "rewards/margins": 3.159654140472412, + "rewards/rejected": -5.626460552215576, + "step": 24380 + }, + { + "epoch": 0.7951055388550806, + "grad_norm": 1.2162681818008423, + "learning_rate": 3.675606391414389e-05, + "logits/chosen": 2.975961446762085, + "logits/rejected": 3.068547487258911, + "logps/chosen": -363.8763427734375, + "logps/rejected": -357.84136962890625, + "loss": 0.4801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5392720699310303, + "rewards/margins": 2.924267530441284, + "rewards/rejected": -5.4635396003723145, + "step": 24400 + }, + { + "epoch": 0.7957572647066011, + "grad_norm": 4.781440734863281, + "learning_rate": 3.674520155114544e-05, + "logits/chosen": 3.3733532428741455, + "logits/rejected": 3.5617382526397705, + "logps/chosen": -344.24273681640625, + "logps/rejected": -315.8865661621094, + "loss": 0.3318, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.4956603050231934, + "rewards/margins": 3.1944737434387207, + "rewards/rejected": -5.690134048461914, + "step": 24420 + }, + { + "epoch": 0.7964089905581218, + "grad_norm": 4.465220928192139, + "learning_rate": 3.6734339188146995e-05, + "logits/chosen": 3.2964751720428467, + "logits/rejected": 3.440962314605713, + "logps/chosen": -367.40301513671875, + "logps/rejected": -345.5003967285156, + "loss": 0.5404, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4089415073394775, + "rewards/margins": 2.654980182647705, + "rewards/rejected": -5.0639214515686035, + "step": 24440 + }, + { + "epoch": 0.7970607164096423, + "grad_norm": 2.682356119155884, + "learning_rate": 3.6723476825148546e-05, + "logits/chosen": 3.6233620643615723, + "logits/rejected": 3.7383880615234375, + "logps/chosen": -398.9897155761719, + "logps/rejected": -334.59429931640625, + "loss": 0.3107, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4135833978652954, + "rewards/margins": 3.2388534545898438, + "rewards/rejected": -4.65243673324585, + "step": 24460 + }, + { + "epoch": 0.7977124422611629, + "grad_norm": 1.1161458492279053, + "learning_rate": 3.67126144621501e-05, + "logits/chosen": 3.5874314308166504, + "logits/rejected": 3.886218547821045, + "logps/chosen": -360.5286865234375, + "logps/rejected": -311.61724853515625, + "loss": 0.6534, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.849212169647217, + "rewards/margins": 1.9456287622451782, + "rewards/rejected": -4.7948408126831055, + "step": 24480 + }, + { + "epoch": 0.7983641681126834, + "grad_norm": 2.3975934982299805, + "learning_rate": 3.6701752099151654e-05, + "logits/chosen": 3.426569700241089, + "logits/rejected": 3.5006752014160156, + "logps/chosen": -334.1938781738281, + "logps/rejected": -364.1211853027344, + "loss": 0.3565, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8151003122329712, + "rewards/margins": 3.461827516555786, + "rewards/rejected": -5.276927947998047, + "step": 24500 + }, + { + "epoch": 0.7990158939642039, + "grad_norm": 0.27778521180152893, + "learning_rate": 3.6690889736153205e-05, + "logits/chosen": 3.236952543258667, + "logits/rejected": 3.5152745246887207, + "logps/chosen": -365.9968566894531, + "logps/rejected": -341.6545715332031, + "loss": 0.4013, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5377906560897827, + "rewards/margins": 4.079667091369629, + "rewards/rejected": -5.617457389831543, + "step": 24520 + }, + { + "epoch": 0.7996676198157245, + "grad_norm": 2.2421512603759766, + "learning_rate": 3.6680027373154756e-05, + "logits/chosen": 3.3201751708984375, + "logits/rejected": 3.4687857627868652, + "logps/chosen": -317.66961669921875, + "logps/rejected": -302.43499755859375, + "loss": 0.3235, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.0269947052001953, + "rewards/margins": 3.3546230792999268, + "rewards/rejected": -5.381618499755859, + "step": 24540 + }, + { + "epoch": 0.800319345667245, + "grad_norm": 3.2166006565093994, + "learning_rate": 3.666916501015631e-05, + "logits/chosen": 3.5780177116394043, + "logits/rejected": 3.6169235706329346, + "logps/chosen": -307.6004638671875, + "logps/rejected": -302.6830139160156, + "loss": 0.5757, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5795674324035645, + "rewards/margins": 2.290017604827881, + "rewards/rejected": -4.869584560394287, + "step": 24560 + }, + { + "epoch": 0.8009710715187657, + "grad_norm": 4.259017467498779, + "learning_rate": 3.6658302647157864e-05, + "logits/chosen": 3.4302356243133545, + "logits/rejected": 3.418390989303589, + "logps/chosen": -348.18719482421875, + "logps/rejected": -329.558837890625, + "loss": 0.3646, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4582339525222778, + "rewards/margins": 3.6063849925994873, + "rewards/rejected": -5.064619064331055, + "step": 24580 + }, + { + "epoch": 0.8016227973702862, + "grad_norm": 1.1517192125320435, + "learning_rate": 3.6647440284159415e-05, + "logits/chosen": 3.474863052368164, + "logits/rejected": 3.5579516887664795, + "logps/chosen": -352.2987365722656, + "logps/rejected": -295.0191345214844, + "loss": 0.3823, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8629165887832642, + "rewards/margins": 2.8149497509002686, + "rewards/rejected": -4.6778669357299805, + "step": 24600 + }, + { + "epoch": 0.8022745232218067, + "grad_norm": 4.47021484375, + "learning_rate": 3.663657792116097e-05, + "logits/chosen": 3.3707892894744873, + "logits/rejected": 3.606003999710083, + "logps/chosen": -317.1146545410156, + "logps/rejected": -318.48138427734375, + "loss": 0.3731, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8236634731292725, + "rewards/margins": 3.3398499488830566, + "rewards/rejected": -5.163513660430908, + "step": 24620 + }, + { + "epoch": 0.8029262490733273, + "grad_norm": 0.918743371963501, + "learning_rate": 3.662571555816252e-05, + "logits/chosen": 3.379387617111206, + "logits/rejected": 3.3652901649475098, + "logps/chosen": -316.59124755859375, + "logps/rejected": -308.8219909667969, + "loss": 0.6402, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.123072385787964, + "rewards/margins": 2.974033832550049, + "rewards/rejected": -5.097105979919434, + "step": 24640 + }, + { + "epoch": 0.8035779749248478, + "grad_norm": 1.1717959642410278, + "learning_rate": 3.661485319516408e-05, + "logits/chosen": 3.494335889816284, + "logits/rejected": 3.525918483734131, + "logps/chosen": -314.04290771484375, + "logps/rejected": -315.52825927734375, + "loss": 0.4035, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8511072397232056, + "rewards/margins": 3.3999874591827393, + "rewards/rejected": -5.251094341278076, + "step": 24660 + }, + { + "epoch": 0.8042297007763685, + "grad_norm": 0.5235368013381958, + "learning_rate": 3.660399083216563e-05, + "logits/chosen": 3.4910645484924316, + "logits/rejected": 3.5233817100524902, + "logps/chosen": -325.5826721191406, + "logps/rejected": -278.22003173828125, + "loss": 0.3845, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.629913330078125, + "rewards/margins": 3.2336769104003906, + "rewards/rejected": -4.863590240478516, + "step": 24680 + }, + { + "epoch": 0.804881426627889, + "grad_norm": 8.08488941192627, + "learning_rate": 3.659312846916718e-05, + "logits/chosen": 3.520231246948242, + "logits/rejected": 3.4090209007263184, + "logps/chosen": -367.5302734375, + "logps/rejected": -324.5213928222656, + "loss": 0.7061, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8299806118011475, + "rewards/margins": 2.640333652496338, + "rewards/rejected": -5.470314025878906, + "step": 24700 + }, + { + "epoch": 0.8055331524794095, + "grad_norm": 0.7817164659500122, + "learning_rate": 3.658226610616874e-05, + "logits/chosen": 3.5440878868103027, + "logits/rejected": 3.7102859020233154, + "logps/chosen": -372.67816162109375, + "logps/rejected": -355.355224609375, + "loss": 0.4333, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7240591049194336, + "rewards/margins": 3.5478272438049316, + "rewards/rejected": -5.271886348724365, + "step": 24720 + }, + { + "epoch": 0.8061848783309301, + "grad_norm": 3.705432415008545, + "learning_rate": 3.657140374317029e-05, + "logits/chosen": 3.5358688831329346, + "logits/rejected": 3.5860965251922607, + "logps/chosen": -337.44512939453125, + "logps/rejected": -293.09429931640625, + "loss": 0.3626, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7609729766845703, + "rewards/margins": 3.107577323913574, + "rewards/rejected": -4.8685503005981445, + "step": 24740 + }, + { + "epoch": 0.8068366041824506, + "grad_norm": 2.463428020477295, + "learning_rate": 3.656054138017184e-05, + "logits/chosen": 3.6178250312805176, + "logits/rejected": 3.499302387237549, + "logps/chosen": -312.77862548828125, + "logps/rejected": -318.66864013671875, + "loss": 0.4717, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.6476480960845947, + "rewards/margins": 3.198173999786377, + "rewards/rejected": -5.845822811126709, + "step": 24760 + }, + { + "epoch": 0.8074883300339712, + "grad_norm": 5.393563270568848, + "learning_rate": 3.65496790171734e-05, + "logits/chosen": 3.369873046875, + "logits/rejected": 3.3213348388671875, + "logps/chosen": -340.13189697265625, + "logps/rejected": -321.34515380859375, + "loss": 0.4617, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.957271933555603, + "rewards/margins": 3.3983356952667236, + "rewards/rejected": -5.355607509613037, + "step": 24780 + }, + { + "epoch": 0.8081400558854918, + "grad_norm": 1.7931452989578247, + "learning_rate": 3.653881665417495e-05, + "logits/chosen": 3.4947121143341064, + "logits/rejected": 3.6059658527374268, + "logps/chosen": -347.5293884277344, + "logps/rejected": -373.16986083984375, + "loss": 0.5293, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8850539922714233, + "rewards/margins": 3.5619633197784424, + "rewards/rejected": -5.447017669677734, + "step": 24800 + }, + { + "epoch": 0.8087917817370123, + "grad_norm": 5.28917121887207, + "learning_rate": 3.65279542911765e-05, + "logits/chosen": 2.9884419441223145, + "logits/rejected": 3.1415064334869385, + "logps/chosen": -330.80157470703125, + "logps/rejected": -286.3647155761719, + "loss": 0.6241, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.157731294631958, + "rewards/margins": 2.7329680919647217, + "rewards/rejected": -4.8906989097595215, + "step": 24820 + }, + { + "epoch": 0.8094435075885329, + "grad_norm": 1.4327057600021362, + "learning_rate": 3.651709192817806e-05, + "logits/chosen": 3.6945621967315674, + "logits/rejected": 3.6356563568115234, + "logps/chosen": -376.7503967285156, + "logps/rejected": -370.42095947265625, + "loss": 0.3355, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.160430669784546, + "rewards/margins": 3.6897952556610107, + "rewards/rejected": -4.850225925445557, + "step": 24840 + }, + { + "epoch": 0.8100952334400534, + "grad_norm": 1.8133670091629028, + "learning_rate": 3.650622956517961e-05, + "logits/chosen": 3.216445207595825, + "logits/rejected": 3.29854154586792, + "logps/chosen": -364.7332763671875, + "logps/rejected": -292.4669189453125, + "loss": 0.5183, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8048969507217407, + "rewards/margins": 2.410057544708252, + "rewards/rejected": -4.214953899383545, + "step": 24860 + }, + { + "epoch": 0.810746959291574, + "grad_norm": 4.434045791625977, + "learning_rate": 3.649536720218116e-05, + "logits/chosen": 3.591386318206787, + "logits/rejected": 3.809741973876953, + "logps/chosen": -390.28021240234375, + "logps/rejected": -379.1455993652344, + "loss": 0.7074, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.452135682106018, + "rewards/margins": 2.7539496421813965, + "rewards/rejected": -4.206085205078125, + "step": 24880 + }, + { + "epoch": 0.8113986851430945, + "grad_norm": 1.5098117589950562, + "learning_rate": 3.648450483918272e-05, + "logits/chosen": 3.278196334838867, + "logits/rejected": 3.2428855895996094, + "logps/chosen": -342.4667053222656, + "logps/rejected": -294.8621826171875, + "loss": 0.4259, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.8503568768501282, + "rewards/margins": 3.2003188133239746, + "rewards/rejected": -4.050675868988037, + "step": 24900 + }, + { + "epoch": 0.8120504109946152, + "grad_norm": 3.0081560611724854, + "learning_rate": 3.6473642476184275e-05, + "logits/chosen": 3.645432233810425, + "logits/rejected": 3.579200267791748, + "logps/chosen": -355.83135986328125, + "logps/rejected": -295.4815979003906, + "loss": 0.4672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.583003044128418, + "rewards/margins": 3.0793979167938232, + "rewards/rejected": -4.66240119934082, + "step": 24920 + }, + { + "epoch": 0.8127021368461357, + "grad_norm": 1.0057852268218994, + "learning_rate": 3.6462780113185826e-05, + "logits/chosen": 3.685605525970459, + "logits/rejected": 3.8187363147735596, + "logps/chosen": -293.9223327636719, + "logps/rejected": -290.2452087402344, + "loss": 0.6374, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.747941255569458, + "rewards/margins": 2.143510103225708, + "rewards/rejected": -3.891451358795166, + "step": 24940 + }, + { + "epoch": 0.8133538626976562, + "grad_norm": 1.2024226188659668, + "learning_rate": 3.6451917750187377e-05, + "logits/chosen": 3.5755629539489746, + "logits/rejected": 3.750025510787964, + "logps/chosen": -337.1495056152344, + "logps/rejected": -340.103515625, + "loss": 0.3033, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.7570875287055969, + "rewards/margins": 3.785094738006592, + "rewards/rejected": -4.542181968688965, + "step": 24960 + }, + { + "epoch": 0.8140055885491768, + "grad_norm": 1.5838627815246582, + "learning_rate": 3.6441055387188934e-05, + "logits/chosen": 3.6421897411346436, + "logits/rejected": 3.6459567546844482, + "logps/chosen": -348.8547668457031, + "logps/rejected": -353.40899658203125, + "loss": 0.5391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8159160614013672, + "rewards/margins": 2.7586498260498047, + "rewards/rejected": -4.574565887451172, + "step": 24980 + }, + { + "epoch": 0.8146573144006973, + "grad_norm": 1.4964066743850708, + "learning_rate": 3.6430193024190485e-05, + "logits/chosen": 3.476656675338745, + "logits/rejected": 3.507190704345703, + "logps/chosen": -347.4669494628906, + "logps/rejected": -309.55792236328125, + "loss": 0.3011, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.2787758111953735, + "rewards/margins": 3.158006191253662, + "rewards/rejected": -4.436781883239746, + "step": 25000 + }, + { + "epoch": 0.815309040252218, + "grad_norm": 2.301429271697998, + "learning_rate": 3.6419330661192036e-05, + "logits/chosen": 3.257385730743408, + "logits/rejected": 3.524919033050537, + "logps/chosen": -340.69677734375, + "logps/rejected": -344.3763732910156, + "loss": 0.2289, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3114588260650635, + "rewards/margins": 4.711578369140625, + "rewards/rejected": -6.023036479949951, + "step": 25020 + }, + { + "epoch": 0.8159607661037385, + "grad_norm": 7.794856071472168, + "learning_rate": 3.640846829819359e-05, + "logits/chosen": 3.2308907508850098, + "logits/rejected": 3.3781845569610596, + "logps/chosen": -354.6158752441406, + "logps/rejected": -327.6434631347656, + "loss": 0.4101, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.679173469543457, + "rewards/margins": 3.410614490509033, + "rewards/rejected": -5.089788436889648, + "step": 25040 + }, + { + "epoch": 0.816612491955259, + "grad_norm": 6.2247796058654785, + "learning_rate": 3.6397605935195144e-05, + "logits/chosen": 3.392632246017456, + "logits/rejected": 3.381539821624756, + "logps/chosen": -343.0557556152344, + "logps/rejected": -295.55657958984375, + "loss": 0.4776, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7010408639907837, + "rewards/margins": 3.047861099243164, + "rewards/rejected": -4.748901844024658, + "step": 25060 + }, + { + "epoch": 0.8172642178067796, + "grad_norm": 2.5157785415649414, + "learning_rate": 3.6386743572196695e-05, + "logits/chosen": 3.765227794647217, + "logits/rejected": 3.8123576641082764, + "logps/chosen": -321.5022277832031, + "logps/rejected": -310.2965087890625, + "loss": 0.5127, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3366224765777588, + "rewards/margins": 2.697890281677246, + "rewards/rejected": -4.034512519836426, + "step": 25080 + }, + { + "epoch": 0.8179159436583001, + "grad_norm": 0.015219015069305897, + "learning_rate": 3.6375881209198246e-05, + "logits/chosen": 3.2639660835266113, + "logits/rejected": 3.4479737281799316, + "logps/chosen": -290.12567138671875, + "logps/rejected": -299.11029052734375, + "loss": 0.2758, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.3287267684936523, + "rewards/margins": 3.787318706512451, + "rewards/rejected": -5.1160454750061035, + "step": 25100 + }, + { + "epoch": 0.8185676695098207, + "grad_norm": 2.8239758014678955, + "learning_rate": 3.63650188461998e-05, + "logits/chosen": 3.1937201023101807, + "logits/rejected": 3.1583328247070312, + "logps/chosen": -324.8180847167969, + "logps/rejected": -292.32611083984375, + "loss": 0.3905, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8145414590835571, + "rewards/margins": 3.4143505096435547, + "rewards/rejected": -5.2288923263549805, + "step": 25120 + }, + { + "epoch": 0.8192193953613413, + "grad_norm": 6.627508163452148, + "learning_rate": 3.6354156483201354e-05, + "logits/chosen": 3.3070340156555176, + "logits/rejected": 3.390657424926758, + "logps/chosen": -350.3944091796875, + "logps/rejected": -331.14105224609375, + "loss": 0.4986, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0414235591888428, + "rewards/margins": 2.9124369621276855, + "rewards/rejected": -3.9538605213165283, + "step": 25140 + }, + { + "epoch": 0.8198711212128618, + "grad_norm": 2.3775484561920166, + "learning_rate": 3.634329412020291e-05, + "logits/chosen": 3.6524195671081543, + "logits/rejected": 3.6734938621520996, + "logps/chosen": -355.5890808105469, + "logps/rejected": -339.243896484375, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3115158081054688, + "rewards/margins": 2.925363063812256, + "rewards/rejected": -4.236878871917725, + "step": 25160 + }, + { + "epoch": 0.8205228470643824, + "grad_norm": 4.715437412261963, + "learning_rate": 3.633243175720447e-05, + "logits/chosen": 3.2750728130340576, + "logits/rejected": 3.4255268573760986, + "logps/chosen": -308.3150634765625, + "logps/rejected": -326.957275390625, + "loss": 0.599, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6405328512191772, + "rewards/margins": 2.745941638946533, + "rewards/rejected": -4.386474609375, + "step": 25180 + }, + { + "epoch": 0.8211745729159029, + "grad_norm": 2.3287341594696045, + "learning_rate": 3.632156939420602e-05, + "logits/chosen": 3.5172581672668457, + "logits/rejected": 3.5152783393859863, + "logps/chosen": -381.2432861328125, + "logps/rejected": -358.7724609375, + "loss": 0.6362, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6145572662353516, + "rewards/margins": 2.898841142654419, + "rewards/rejected": -4.51339864730835, + "step": 25200 + }, + { + "epoch": 0.8218262987674235, + "grad_norm": 4.831684589385986, + "learning_rate": 3.631070703120757e-05, + "logits/chosen": 3.003539562225342, + "logits/rejected": 3.085090398788452, + "logps/chosen": -330.482666015625, + "logps/rejected": -332.05975341796875, + "loss": 0.5047, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.535635232925415, + "rewards/margins": 3.119096517562866, + "rewards/rejected": -4.654731273651123, + "step": 25220 + }, + { + "epoch": 0.822478024618944, + "grad_norm": 1.7735525369644165, + "learning_rate": 3.629984466820913e-05, + "logits/chosen": 2.985342025756836, + "logits/rejected": 3.2286903858184814, + "logps/chosen": -332.45654296875, + "logps/rejected": -314.766357421875, + "loss": 0.4398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4269497394561768, + "rewards/margins": 3.3107943534851074, + "rewards/rejected": -4.737743854522705, + "step": 25240 + }, + { + "epoch": 0.8231297504704645, + "grad_norm": 6.965205192565918, + "learning_rate": 3.628898230521068e-05, + "logits/chosen": 3.694955348968506, + "logits/rejected": 3.5970420837402344, + "logps/chosen": -351.89581298828125, + "logps/rejected": -320.17816162109375, + "loss": 0.6019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6593239307403564, + "rewards/margins": 2.272726535797119, + "rewards/rejected": -3.9320507049560547, + "step": 25260 + }, + { + "epoch": 0.8237814763219852, + "grad_norm": 1.3825725317001343, + "learning_rate": 3.627811994221223e-05, + "logits/chosen": 3.3292019367218018, + "logits/rejected": 3.6621692180633545, + "logps/chosen": -321.4964599609375, + "logps/rejected": -319.9980773925781, + "loss": 0.4152, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.373295783996582, + "rewards/margins": 3.4838759899139404, + "rewards/rejected": -4.857171535491943, + "step": 25280 + }, + { + "epoch": 0.8244332021735057, + "grad_norm": 2.067270517349243, + "learning_rate": 3.626725757921378e-05, + "logits/chosen": 3.4871420860290527, + "logits/rejected": 3.662963390350342, + "logps/chosen": -410.29107666015625, + "logps/rejected": -343.59149169921875, + "loss": 0.3626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3981457948684692, + "rewards/margins": 3.6815826892852783, + "rewards/rejected": -5.079728126525879, + "step": 25300 + }, + { + "epoch": 0.8250849280250263, + "grad_norm": 5.074583530426025, + "learning_rate": 3.625639521621534e-05, + "logits/chosen": 3.2508206367492676, + "logits/rejected": 3.332643508911133, + "logps/chosen": -314.803955078125, + "logps/rejected": -278.15362548828125, + "loss": 0.487, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5955023765563965, + "rewards/margins": 2.6977906227111816, + "rewards/rejected": -5.293292999267578, + "step": 25320 + }, + { + "epoch": 0.8257366538765468, + "grad_norm": 7.152611255645752, + "learning_rate": 3.624553285321689e-05, + "logits/chosen": 3.100259304046631, + "logits/rejected": 3.151468276977539, + "logps/chosen": -328.26580810546875, + "logps/rejected": -355.338623046875, + "loss": 0.3919, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7549060583114624, + "rewards/margins": 4.224123954772949, + "rewards/rejected": -5.979029655456543, + "step": 25340 + }, + { + "epoch": 0.8263883797280673, + "grad_norm": 2.2922041416168213, + "learning_rate": 3.623467049021844e-05, + "logits/chosen": 3.20904541015625, + "logits/rejected": 3.3522751331329346, + "logps/chosen": -350.62353515625, + "logps/rejected": -314.281005859375, + "loss": 0.5249, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8388372659683228, + "rewards/margins": 2.781514883041382, + "rewards/rejected": -4.620351791381836, + "step": 25360 + }, + { + "epoch": 0.827040105579588, + "grad_norm": 3.6014459133148193, + "learning_rate": 3.622380812722e-05, + "logits/chosen": 3.4234962463378906, + "logits/rejected": 3.5376548767089844, + "logps/chosen": -382.79541015625, + "logps/rejected": -303.87451171875, + "loss": 0.4998, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0417706966400146, + "rewards/margins": 2.6020002365112305, + "rewards/rejected": -4.643770694732666, + "step": 25380 + }, + { + "epoch": 0.8276918314311085, + "grad_norm": 3.99904465675354, + "learning_rate": 3.621294576422155e-05, + "logits/chosen": 3.6094672679901123, + "logits/rejected": 3.7706313133239746, + "logps/chosen": -355.4876708984375, + "logps/rejected": -356.7953186035156, + "loss": 0.634, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2257165908813477, + "rewards/margins": 3.1743321418762207, + "rewards/rejected": -5.400048732757568, + "step": 25400 + }, + { + "epoch": 0.8283435572826291, + "grad_norm": 0.34481149911880493, + "learning_rate": 3.6202083401223106e-05, + "logits/chosen": 3.1264424324035645, + "logits/rejected": 3.111078977584839, + "logps/chosen": -336.5526428222656, + "logps/rejected": -329.3305358886719, + "loss": 0.5561, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3234734535217285, + "rewards/margins": 3.5797524452209473, + "rewards/rejected": -5.903225421905518, + "step": 25420 + }, + { + "epoch": 0.8289952831341496, + "grad_norm": 1.8668386936187744, + "learning_rate": 3.6191221038224656e-05, + "logits/chosen": 3.295574188232422, + "logits/rejected": 3.4497647285461426, + "logps/chosen": -319.38018798828125, + "logps/rejected": -339.4562683105469, + "loss": 0.3561, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.99005126953125, + "rewards/margins": 3.3658173084259033, + "rewards/rejected": -5.355868816375732, + "step": 25440 + }, + { + "epoch": 0.8296470089856702, + "grad_norm": 0.19569970667362213, + "learning_rate": 3.6180358675226214e-05, + "logits/chosen": 3.3807997703552246, + "logits/rejected": 3.479478359222412, + "logps/chosen": -375.4799499511719, + "logps/rejected": -342.1688537597656, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3665084838867188, + "rewards/margins": 3.3150932788848877, + "rewards/rejected": -4.681601524353027, + "step": 25460 + }, + { + "epoch": 0.8302987348371907, + "grad_norm": 2.84078049659729, + "learning_rate": 3.6169496312227765e-05, + "logits/chosen": 3.1683189868927, + "logits/rejected": 3.2614798545837402, + "logps/chosen": -342.5557861328125, + "logps/rejected": -329.0782470703125, + "loss": 0.6481, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9824336767196655, + "rewards/margins": 2.858229160308838, + "rewards/rejected": -4.840662956237793, + "step": 25480 + }, + { + "epoch": 0.8309504606887113, + "grad_norm": 1.956852674484253, + "learning_rate": 3.6158633949229315e-05, + "logits/chosen": 3.170921802520752, + "logits/rejected": 3.3648993968963623, + "logps/chosen": -354.13116455078125, + "logps/rejected": -294.2445068359375, + "loss": 0.3353, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6877466440200806, + "rewards/margins": 2.977231502532959, + "rewards/rejected": -4.66497802734375, + "step": 25500 + }, + { + "epoch": 0.8316021865402319, + "grad_norm": 2.512688398361206, + "learning_rate": 3.614777158623087e-05, + "logits/chosen": 3.5231716632843018, + "logits/rejected": 3.5582072734832764, + "logps/chosen": -354.77569580078125, + "logps/rejected": -359.541748046875, + "loss": 0.3844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7137610912322998, + "rewards/margins": 3.5767416954040527, + "rewards/rejected": -5.290503025054932, + "step": 25520 + }, + { + "epoch": 0.8322539123917524, + "grad_norm": 1.646874189376831, + "learning_rate": 3.6136909223232424e-05, + "logits/chosen": 3.613010883331299, + "logits/rejected": 3.6795783042907715, + "logps/chosen": -381.29583740234375, + "logps/rejected": -395.0981750488281, + "loss": 0.2974, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.0498557090759277, + "rewards/margins": 3.886963367462158, + "rewards/rejected": -5.936819553375244, + "step": 25540 + }, + { + "epoch": 0.832905638243273, + "grad_norm": 1.0667665004730225, + "learning_rate": 3.6126046860233975e-05, + "logits/chosen": 3.3749382495880127, + "logits/rejected": 3.396446704864502, + "logps/chosen": -350.5771179199219, + "logps/rejected": -290.64910888671875, + "loss": 0.4839, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.24798583984375, + "rewards/margins": 3.0776267051696777, + "rewards/rejected": -5.325612545013428, + "step": 25560 + }, + { + "epoch": 0.8335573640947935, + "grad_norm": 7.650717258453369, + "learning_rate": 3.611518449723553e-05, + "logits/chosen": 3.133927822113037, + "logits/rejected": 3.1960813999176025, + "logps/chosen": -349.2331237792969, + "logps/rejected": -326.288818359375, + "loss": 0.5807, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9328818321228027, + "rewards/margins": 2.6829957962036133, + "rewards/rejected": -5.615877628326416, + "step": 25580 + }, + { + "epoch": 0.834209089946314, + "grad_norm": 1.4691661596298218, + "learning_rate": 3.610432213423708e-05, + "logits/chosen": 3.48353910446167, + "logits/rejected": 3.4093010425567627, + "logps/chosen": -326.14520263671875, + "logps/rejected": -292.2113037109375, + "loss": 0.4752, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7433735132217407, + "rewards/margins": 2.825223445892334, + "rewards/rejected": -4.568597316741943, + "step": 25600 + }, + { + "epoch": 0.8348608157978347, + "grad_norm": 4.292339324951172, + "learning_rate": 3.6093459771238634e-05, + "logits/chosen": 3.263719081878662, + "logits/rejected": 3.2759041786193848, + "logps/chosen": -324.74517822265625, + "logps/rejected": -320.0804138183594, + "loss": 0.4924, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9622224569320679, + "rewards/margins": 2.8260087966918945, + "rewards/rejected": -4.788231372833252, + "step": 25620 + }, + { + "epoch": 0.8355125416493552, + "grad_norm": 0.44252637028694153, + "learning_rate": 3.608259740824019e-05, + "logits/chosen": 3.174247980117798, + "logits/rejected": 3.0579071044921875, + "logps/chosen": -303.34259033203125, + "logps/rejected": -312.9439697265625, + "loss": 0.4673, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.323648452758789, + "rewards/margins": 3.456422805786133, + "rewards/rejected": -5.78007173538208, + "step": 25640 + }, + { + "epoch": 0.8361642675008758, + "grad_norm": 4.4394426345825195, + "learning_rate": 3.607173504524174e-05, + "logits/chosen": 2.8770413398742676, + "logits/rejected": 3.111905574798584, + "logps/chosen": -307.94146728515625, + "logps/rejected": -315.2080383300781, + "loss": 0.5463, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3541061878204346, + "rewards/margins": 2.664602041244507, + "rewards/rejected": -5.018708229064941, + "step": 25660 + }, + { + "epoch": 0.8368159933523963, + "grad_norm": 0.9920825362205505, + "learning_rate": 3.60608726822433e-05, + "logits/chosen": 3.4827911853790283, + "logits/rejected": 3.4661102294921875, + "logps/chosen": -364.78094482421875, + "logps/rejected": -359.7891845703125, + "loss": 0.6046, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5242934226989746, + "rewards/margins": 2.9853360652923584, + "rewards/rejected": -5.509629726409912, + "step": 25680 + }, + { + "epoch": 0.8374677192039168, + "grad_norm": 1.0426942110061646, + "learning_rate": 3.605001031924485e-05, + "logits/chosen": 3.5121712684631348, + "logits/rejected": 3.651942729949951, + "logps/chosen": -347.601806640625, + "logps/rejected": -316.36065673828125, + "loss": 0.4169, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6321558952331543, + "rewards/margins": 2.5757431983947754, + "rewards/rejected": -5.20789909362793, + "step": 25700 + }, + { + "epoch": 0.8381194450554375, + "grad_norm": 1.8525638580322266, + "learning_rate": 3.603914795624641e-05, + "logits/chosen": 3.4435722827911377, + "logits/rejected": 3.444121837615967, + "logps/chosen": -361.8270263671875, + "logps/rejected": -323.8814392089844, + "loss": 0.5205, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.079486608505249, + "rewards/margins": 2.3994226455688477, + "rewards/rejected": -4.478909015655518, + "step": 25720 + }, + { + "epoch": 0.838771170906958, + "grad_norm": 0.7796317338943481, + "learning_rate": 3.602828559324796e-05, + "logits/chosen": 3.536731243133545, + "logits/rejected": 3.6159679889678955, + "logps/chosen": -339.379638671875, + "logps/rejected": -336.57476806640625, + "loss": 0.6366, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5387063026428223, + "rewards/margins": 2.3887479305267334, + "rewards/rejected": -4.927454471588135, + "step": 25740 + }, + { + "epoch": 0.8394228967584786, + "grad_norm": 6.42546272277832, + "learning_rate": 3.601742323024951e-05, + "logits/chosen": 3.41184663772583, + "logits/rejected": 3.420415163040161, + "logps/chosen": -330.7108459472656, + "logps/rejected": -293.8144226074219, + "loss": 0.5232, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2670519351959229, + "rewards/margins": 2.6377546787261963, + "rewards/rejected": -3.904806613922119, + "step": 25760 + }, + { + "epoch": 0.8400746226099991, + "grad_norm": 11.569037437438965, + "learning_rate": 3.600656086725107e-05, + "logits/chosen": 3.2189738750457764, + "logits/rejected": 3.3819003105163574, + "logps/chosen": -340.59759521484375, + "logps/rejected": -315.1823425292969, + "loss": 0.5078, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2543518543243408, + "rewards/margins": 2.98321533203125, + "rewards/rejected": -4.237566947937012, + "step": 25780 + }, + { + "epoch": 0.8407263484615196, + "grad_norm": 1.8982901573181152, + "learning_rate": 3.599569850425262e-05, + "logits/chosen": 3.3934032917022705, + "logits/rejected": 3.5461559295654297, + "logps/chosen": -334.6742248535156, + "logps/rejected": -308.64642333984375, + "loss": 0.4993, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.489864468574524, + "rewards/margins": 2.6146087646484375, + "rewards/rejected": -4.104472637176514, + "step": 25800 + }, + { + "epoch": 0.8413780743130402, + "grad_norm": 2.005161762237549, + "learning_rate": 3.598483614125417e-05, + "logits/chosen": 3.2533211708068848, + "logits/rejected": 3.283612012863159, + "logps/chosen": -345.14300537109375, + "logps/rejected": -311.1353454589844, + "loss": 0.451, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5025060176849365, + "rewards/margins": 2.9265236854553223, + "rewards/rejected": -4.429029941558838, + "step": 25820 + }, + { + "epoch": 0.8420298001645607, + "grad_norm": 2.4000117778778076, + "learning_rate": 3.5973973778255726e-05, + "logits/chosen": 3.4675564765930176, + "logits/rejected": 3.474109649658203, + "logps/chosen": -350.98382568359375, + "logps/rejected": -299.0833435058594, + "loss": 0.3776, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6565042734146118, + "rewards/margins": 3.185575008392334, + "rewards/rejected": -4.842079162597656, + "step": 25840 + }, + { + "epoch": 0.8426815260160814, + "grad_norm": 0.359535813331604, + "learning_rate": 3.596311141525728e-05, + "logits/chosen": 3.1790502071380615, + "logits/rejected": 3.3290200233459473, + "logps/chosen": -344.1402893066406, + "logps/rejected": -278.22100830078125, + "loss": 0.4804, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5000580549240112, + "rewards/margins": 2.7368276119232178, + "rewards/rejected": -4.2368855476379395, + "step": 25860 + }, + { + "epoch": 0.8433332518676019, + "grad_norm": 0.21457529067993164, + "learning_rate": 3.595224905225883e-05, + "logits/chosen": 3.4656624794006348, + "logits/rejected": 3.54821515083313, + "logps/chosen": -336.7251892089844, + "logps/rejected": -326.2616271972656, + "loss": 0.4385, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.339255928993225, + "rewards/margins": 3.077634334564209, + "rewards/rejected": -4.4168901443481445, + "step": 25880 + }, + { + "epoch": 0.8439849777191224, + "grad_norm": 3.4836537837982178, + "learning_rate": 3.594138668926038e-05, + "logits/chosen": 3.3521010875701904, + "logits/rejected": 3.5858688354492188, + "logps/chosen": -318.01727294921875, + "logps/rejected": -292.00628662109375, + "loss": 0.4114, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3279342651367188, + "rewards/margins": 3.1503114700317383, + "rewards/rejected": -4.478245735168457, + "step": 25900 + }, + { + "epoch": 0.844636703570643, + "grad_norm": 1.345394253730774, + "learning_rate": 3.5930524326261936e-05, + "logits/chosen": 3.3238818645477295, + "logits/rejected": 3.4428000450134277, + "logps/chosen": -343.6374206542969, + "logps/rejected": -303.3056640625, + "loss": 0.4301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.581610083580017, + "rewards/margins": 2.8071441650390625, + "rewards/rejected": -4.388753890991211, + "step": 25920 + }, + { + "epoch": 0.8452884294221635, + "grad_norm": 4.5713887214660645, + "learning_rate": 3.591966196326349e-05, + "logits/chosen": 3.78568959236145, + "logits/rejected": 3.647357225418091, + "logps/chosen": -329.69097900390625, + "logps/rejected": -330.73309326171875, + "loss": 0.7703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.677570104598999, + "rewards/margins": 2.8208017349243164, + "rewards/rejected": -4.4983720779418945, + "step": 25940 + }, + { + "epoch": 0.8459401552736842, + "grad_norm": 2.4732134342193604, + "learning_rate": 3.5908799600265044e-05, + "logits/chosen": 3.4433982372283936, + "logits/rejected": 3.527550220489502, + "logps/chosen": -339.76080322265625, + "logps/rejected": -304.5992431640625, + "loss": 0.4242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5101317167282104, + "rewards/margins": 2.889075756072998, + "rewards/rejected": -4.399207592010498, + "step": 25960 + }, + { + "epoch": 0.8465918811252047, + "grad_norm": 0.8794378638267517, + "learning_rate": 3.58979372372666e-05, + "logits/chosen": 3.7154221534729004, + "logits/rejected": 3.7958343029022217, + "logps/chosen": -324.5883483886719, + "logps/rejected": -310.5425109863281, + "loss": 0.4105, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.313846230506897, + "rewards/margins": 2.6356072425842285, + "rewards/rejected": -3.9494528770446777, + "step": 25980 + }, + { + "epoch": 0.8472436069767253, + "grad_norm": 4.088620185852051, + "learning_rate": 3.588707487426815e-05, + "logits/chosen": 3.4784579277038574, + "logits/rejected": 3.6289124488830566, + "logps/chosen": -327.22296142578125, + "logps/rejected": -290.1322937011719, + "loss": 0.3481, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.4626051187515259, + "rewards/margins": 3.3034934997558594, + "rewards/rejected": -4.766098499298096, + "step": 26000 + }, + { + "epoch": 0.8478953328282458, + "grad_norm": 6.505405426025391, + "learning_rate": 3.5876212511269704e-05, + "logits/chosen": 3.3890647888183594, + "logits/rejected": 3.612422466278076, + "logps/chosen": -327.1246337890625, + "logps/rejected": -331.1424560546875, + "loss": 0.4985, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3407875299453735, + "rewards/margins": 2.4421041011810303, + "rewards/rejected": -3.7828917503356934, + "step": 26020 + }, + { + "epoch": 0.8485470586797663, + "grad_norm": 3.975090980529785, + "learning_rate": 3.5865350148271254e-05, + "logits/chosen": 3.480647325515747, + "logits/rejected": 3.5570759773254395, + "logps/chosen": -341.12158203125, + "logps/rejected": -346.1964416503906, + "loss": 0.3273, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4392184615135193, + "rewards/margins": 2.987049102783203, + "rewards/rejected": -3.426267147064209, + "step": 26040 + }, + { + "epoch": 0.8491987845312869, + "grad_norm": 0.9531853795051575, + "learning_rate": 3.585448778527281e-05, + "logits/chosen": 3.7941925525665283, + "logits/rejected": 3.762270450592041, + "logps/chosen": -364.4023742675781, + "logps/rejected": -304.4715881347656, + "loss": 0.4011, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3476892709732056, + "rewards/margins": 2.947596549987793, + "rewards/rejected": -4.295286178588867, + "step": 26060 + }, + { + "epoch": 0.8498505103828075, + "grad_norm": 1.0676908493041992, + "learning_rate": 3.584362542227436e-05, + "logits/chosen": 3.5139050483703613, + "logits/rejected": 3.3977150917053223, + "logps/chosen": -328.39593505859375, + "logps/rejected": -316.44134521484375, + "loss": 0.6065, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4320751428604126, + "rewards/margins": 2.6968777179718018, + "rewards/rejected": -4.128952980041504, + "step": 26080 + }, + { + "epoch": 0.8505022362343281, + "grad_norm": 2.725385904312134, + "learning_rate": 3.5832763059275914e-05, + "logits/chosen": 3.5845274925231934, + "logits/rejected": 3.5824055671691895, + "logps/chosen": -354.77838134765625, + "logps/rejected": -312.89117431640625, + "loss": 0.412, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2250951528549194, + "rewards/margins": 3.1428983211517334, + "rewards/rejected": -4.367993354797363, + "step": 26100 + }, + { + "epoch": 0.8511539620858486, + "grad_norm": 2.5242557525634766, + "learning_rate": 3.582190069627747e-05, + "logits/chosen": 3.4717764854431152, + "logits/rejected": 3.5164241790771484, + "logps/chosen": -352.33123779296875, + "logps/rejected": -307.09783935546875, + "loss": 0.6217, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5261870622634888, + "rewards/margins": 2.6940600872039795, + "rewards/rejected": -4.2202467918396, + "step": 26120 + }, + { + "epoch": 0.8518056879373691, + "grad_norm": 1.1929785013198853, + "learning_rate": 3.581103833327902e-05, + "logits/chosen": 3.7919201850891113, + "logits/rejected": 3.8433470726013184, + "logps/chosen": -395.7960510253906, + "logps/rejected": -367.97381591796875, + "loss": 0.5669, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.857879638671875, + "rewards/margins": 2.5414986610412598, + "rewards/rejected": -4.399378299713135, + "step": 26140 + }, + { + "epoch": 0.8524574137888897, + "grad_norm": 3.5556461811065674, + "learning_rate": 3.580017597028057e-05, + "logits/chosen": 3.2463912963867188, + "logits/rejected": 3.341749668121338, + "logps/chosen": -325.7276611328125, + "logps/rejected": -320.9314270019531, + "loss": 0.3905, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5402228832244873, + "rewards/margins": 3.3099365234375, + "rewards/rejected": -4.850159645080566, + "step": 26160 + }, + { + "epoch": 0.8531091396404102, + "grad_norm": 0.5317373871803284, + "learning_rate": 3.578931360728213e-05, + "logits/chosen": 3.531881332397461, + "logits/rejected": 3.6071231365203857, + "logps/chosen": -385.2152099609375, + "logps/rejected": -314.29388427734375, + "loss": 0.3884, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5731868743896484, + "rewards/margins": 2.897745132446289, + "rewards/rejected": -4.470931529998779, + "step": 26180 + }, + { + "epoch": 0.8537608654919309, + "grad_norm": 3.063145160675049, + "learning_rate": 3.577845124428368e-05, + "logits/chosen": 3.2730560302734375, + "logits/rejected": 3.334845781326294, + "logps/chosen": -335.35247802734375, + "logps/rejected": -328.0179748535156, + "loss": 0.3936, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.004499912261963, + "rewards/margins": 3.1404831409454346, + "rewards/rejected": -5.14498233795166, + "step": 26200 + }, + { + "epoch": 0.8544125913434514, + "grad_norm": 6.203271865844727, + "learning_rate": 3.576758888128524e-05, + "logits/chosen": 3.5893783569335938, + "logits/rejected": 3.6550655364990234, + "logps/chosen": -408.029541015625, + "logps/rejected": -343.0603332519531, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.467949628829956, + "rewards/margins": 3.940901517868042, + "rewards/rejected": -5.408851623535156, + "step": 26220 + }, + { + "epoch": 0.8550643171949719, + "grad_norm": 1.5271143913269043, + "learning_rate": 3.575672651828679e-05, + "logits/chosen": 3.3301594257354736, + "logits/rejected": 3.4934451580047607, + "logps/chosen": -343.2996826171875, + "logps/rejected": -342.1629638671875, + "loss": 0.3893, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1000919342041016, + "rewards/margins": 3.243826389312744, + "rewards/rejected": -5.3439178466796875, + "step": 26240 + }, + { + "epoch": 0.8557160430464925, + "grad_norm": 5.563521862030029, + "learning_rate": 3.574586415528835e-05, + "logits/chosen": 3.8802719116210938, + "logits/rejected": 3.856919050216675, + "logps/chosen": -363.3992614746094, + "logps/rejected": -320.35406494140625, + "loss": 0.5175, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.159895420074463, + "rewards/margins": 3.479245662689209, + "rewards/rejected": -5.63914155960083, + "step": 26260 + }, + { + "epoch": 0.856367768898013, + "grad_norm": 1.7391654253005981, + "learning_rate": 3.57350017922899e-05, + "logits/chosen": 3.0420982837677, + "logits/rejected": 3.291583299636841, + "logps/chosen": -344.11944580078125, + "logps/rejected": -297.71966552734375, + "loss": 0.5162, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.425823211669922, + "rewards/margins": 3.105224609375, + "rewards/rejected": -5.531047821044922, + "step": 26280 + }, + { + "epoch": 0.8570194947495337, + "grad_norm": 2.666801929473877, + "learning_rate": 3.572413942929145e-05, + "logits/chosen": 3.4599125385284424, + "logits/rejected": 3.4232890605926514, + "logps/chosen": -374.572021484375, + "logps/rejected": -332.940185546875, + "loss": 0.6506, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.941674828529358, + "rewards/margins": 2.9437129497528076, + "rewards/rejected": -4.885387420654297, + "step": 26300 + }, + { + "epoch": 0.8576712206010542, + "grad_norm": 4.224674701690674, + "learning_rate": 3.5713277066293006e-05, + "logits/chosen": 3.323099136352539, + "logits/rejected": 3.461528778076172, + "logps/chosen": -362.772705078125, + "logps/rejected": -336.4222106933594, + "loss": 0.5492, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6460235118865967, + "rewards/margins": 2.706815481185913, + "rewards/rejected": -5.35283899307251, + "step": 26320 + }, + { + "epoch": 0.8583229464525747, + "grad_norm": 8.191548347473145, + "learning_rate": 3.570241470329456e-05, + "logits/chosen": 3.417376756668091, + "logits/rejected": 3.393702268600464, + "logps/chosen": -369.73260498046875, + "logps/rejected": -341.55889892578125, + "loss": 0.5141, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6128482818603516, + "rewards/margins": 3.3898231983184814, + "rewards/rejected": -5.002671241760254, + "step": 26340 + }, + { + "epoch": 0.8589746723040953, + "grad_norm": 0.10622664541006088, + "learning_rate": 3.569155234029611e-05, + "logits/chosen": 3.409071445465088, + "logits/rejected": 3.4573638439178467, + "logps/chosen": -330.634033203125, + "logps/rejected": -321.22918701171875, + "loss": 0.6606, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.173731565475464, + "rewards/margins": 2.4548041820526123, + "rewards/rejected": -4.628535747528076, + "step": 26360 + }, + { + "epoch": 0.8596263981556158, + "grad_norm": 2.1222217082977295, + "learning_rate": 3.5680689977297665e-05, + "logits/chosen": 3.5618984699249268, + "logits/rejected": 3.7010726928710938, + "logps/chosen": -342.39715576171875, + "logps/rejected": -309.7198181152344, + "loss": 0.5025, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9263803958892822, + "rewards/margins": 2.7761754989624023, + "rewards/rejected": -4.7025556564331055, + "step": 26380 + }, + { + "epoch": 0.8602781240071364, + "grad_norm": 0.3746213912963867, + "learning_rate": 3.5669827614299216e-05, + "logits/chosen": 3.7036614418029785, + "logits/rejected": 3.599611282348633, + "logps/chosen": -370.2300720214844, + "logps/rejected": -332.8393859863281, + "loss": 0.3705, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8333740234375, + "rewards/margins": 3.0685603618621826, + "rewards/rejected": -4.901934623718262, + "step": 26400 + }, + { + "epoch": 0.860929849858657, + "grad_norm": 1.0852429866790771, + "learning_rate": 3.565896525130077e-05, + "logits/chosen": 3.483004331588745, + "logits/rejected": 3.6357407569885254, + "logps/chosen": -359.4261779785156, + "logps/rejected": -299.3288879394531, + "loss": 0.5932, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6798558235168457, + "rewards/margins": 2.446376323699951, + "rewards/rejected": -5.126232147216797, + "step": 26420 + }, + { + "epoch": 0.8615815757101775, + "grad_norm": 1.0105268955230713, + "learning_rate": 3.564810288830232e-05, + "logits/chosen": 3.582141160964966, + "logits/rejected": 3.704904556274414, + "logps/chosen": -334.8662109375, + "logps/rejected": -329.5475158691406, + "loss": 0.5853, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9730892181396484, + "rewards/margins": 2.965559482574463, + "rewards/rejected": -4.9386491775512695, + "step": 26440 + }, + { + "epoch": 0.8622333015616981, + "grad_norm": 1.8126294612884521, + "learning_rate": 3.5637240525303875e-05, + "logits/chosen": 3.656177520751953, + "logits/rejected": 3.7653796672821045, + "logps/chosen": -331.4509582519531, + "logps/rejected": -298.9779968261719, + "loss": 0.4828, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.105203866958618, + "rewards/margins": 2.642970561981201, + "rewards/rejected": -4.74817419052124, + "step": 26460 + }, + { + "epoch": 0.8628850274132186, + "grad_norm": 4.472099304199219, + "learning_rate": 3.562637816230543e-05, + "logits/chosen": 3.447995662689209, + "logits/rejected": 3.530885696411133, + "logps/chosen": -306.0140686035156, + "logps/rejected": -316.20404052734375, + "loss": 0.4445, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6060903072357178, + "rewards/margins": 3.0708606243133545, + "rewards/rejected": -4.676950931549072, + "step": 26480 + }, + { + "epoch": 0.8635367532647392, + "grad_norm": 0.1748809814453125, + "learning_rate": 3.561551579930698e-05, + "logits/chosen": 3.361232280731201, + "logits/rejected": 3.5926883220672607, + "logps/chosen": -330.2586975097656, + "logps/rejected": -288.2273254394531, + "loss": 0.4831, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2674323320388794, + "rewards/margins": 2.879624366760254, + "rewards/rejected": -4.1470561027526855, + "step": 26500 + }, + { + "epoch": 0.8641884791162597, + "grad_norm": 1.061169147491455, + "learning_rate": 3.560465343630854e-05, + "logits/chosen": 3.4242546558380127, + "logits/rejected": 3.4281845092773438, + "logps/chosen": -349.0876770019531, + "logps/rejected": -306.1029052734375, + "loss": 0.5713, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1106693744659424, + "rewards/margins": 2.391747236251831, + "rewards/rejected": -3.5024170875549316, + "step": 26520 + }, + { + "epoch": 0.8648402049677804, + "grad_norm": 1.2283775806427002, + "learning_rate": 3.559379107331009e-05, + "logits/chosen": 3.602476119995117, + "logits/rejected": 3.6020264625549316, + "logps/chosen": -372.6305236816406, + "logps/rejected": -331.2041320800781, + "loss": 0.3666, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2177817821502686, + "rewards/margins": 3.192758560180664, + "rewards/rejected": -5.4105401039123535, + "step": 26540 + }, + { + "epoch": 0.8654919308193009, + "grad_norm": 2.553429365158081, + "learning_rate": 3.558292871031164e-05, + "logits/chosen": 3.4960083961486816, + "logits/rejected": 3.5987770557403564, + "logps/chosen": -364.44549560546875, + "logps/rejected": -361.6658020019531, + "loss": 0.3409, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6382957696914673, + "rewards/margins": 3.9286818504333496, + "rewards/rejected": -5.566977500915527, + "step": 26560 + }, + { + "epoch": 0.8661436566708214, + "grad_norm": 6.9860520362854, + "learning_rate": 3.55720663473132e-05, + "logits/chosen": 3.34975004196167, + "logits/rejected": 3.4465999603271484, + "logps/chosen": -341.38958740234375, + "logps/rejected": -313.86474609375, + "loss": 0.5486, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7180843353271484, + "rewards/margins": 2.448681354522705, + "rewards/rejected": -4.1667656898498535, + "step": 26580 + }, + { + "epoch": 0.866795382522342, + "grad_norm": 2.7477827072143555, + "learning_rate": 3.556120398431475e-05, + "logits/chosen": 3.3869919776916504, + "logits/rejected": 3.406269073486328, + "logps/chosen": -307.2366027832031, + "logps/rejected": -309.1568908691406, + "loss": 0.5314, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4386990070343018, + "rewards/margins": 2.5961251258850098, + "rewards/rejected": -4.034823894500732, + "step": 26600 + }, + { + "epoch": 0.8674471083738625, + "grad_norm": 5.986055850982666, + "learning_rate": 3.55503416213163e-05, + "logits/chosen": 3.2842013835906982, + "logits/rejected": 3.333244800567627, + "logps/chosen": -314.31634521484375, + "logps/rejected": -281.13580322265625, + "loss": 0.4286, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7370927333831787, + "rewards/margins": 2.576901912689209, + "rewards/rejected": -4.313994407653809, + "step": 26620 + }, + { + "epoch": 0.8680988342253831, + "grad_norm": 3.992210865020752, + "learning_rate": 3.553947925831785e-05, + "logits/chosen": 3.3945655822753906, + "logits/rejected": 3.3027572631835938, + "logps/chosen": -311.5277404785156, + "logps/rejected": -285.576171875, + "loss": 0.4436, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6878725290298462, + "rewards/margins": 3.0695641040802, + "rewards/rejected": -4.757437229156494, + "step": 26640 + }, + { + "epoch": 0.8687505600769037, + "grad_norm": 1.4215872287750244, + "learning_rate": 3.552861689531941e-05, + "logits/chosen": 3.1105546951293945, + "logits/rejected": 3.25150728225708, + "logps/chosen": -305.45660400390625, + "logps/rejected": -303.33807373046875, + "loss": 0.4009, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7233089208602905, + "rewards/margins": 2.4929099082946777, + "rewards/rejected": -4.2162184715271, + "step": 26660 + }, + { + "epoch": 0.8694022859284242, + "grad_norm": 0.9383944272994995, + "learning_rate": 3.551775453232096e-05, + "logits/chosen": 3.271937608718872, + "logits/rejected": 3.2647252082824707, + "logps/chosen": -355.4483642578125, + "logps/rejected": -362.9366149902344, + "loss": 0.3701, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.313787817955017, + "rewards/margins": 2.6928210258483887, + "rewards/rejected": -4.006608486175537, + "step": 26680 + }, + { + "epoch": 0.8700540117799448, + "grad_norm": 2.6227664947509766, + "learning_rate": 3.550689216932251e-05, + "logits/chosen": 3.503441572189331, + "logits/rejected": 3.434415340423584, + "logps/chosen": -369.8568115234375, + "logps/rejected": -324.2278747558594, + "loss": 0.3931, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4410113096237183, + "rewards/margins": 3.2966017723083496, + "rewards/rejected": -4.737612724304199, + "step": 26700 + }, + { + "epoch": 0.8707057376314653, + "grad_norm": 4.93734073638916, + "learning_rate": 3.549602980632407e-05, + "logits/chosen": 2.983029842376709, + "logits/rejected": 3.0788493156433105, + "logps/chosen": -356.596923828125, + "logps/rejected": -335.806884765625, + "loss": 0.5093, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8702793121337891, + "rewards/margins": 2.9498982429504395, + "rewards/rejected": -3.8201775550842285, + "step": 26720 + }, + { + "epoch": 0.8713574634829859, + "grad_norm": 5.0690202713012695, + "learning_rate": 3.548516744332562e-05, + "logits/chosen": 3.4725348949432373, + "logits/rejected": 3.6351943016052246, + "logps/chosen": -366.5152893066406, + "logps/rejected": -291.3839111328125, + "loss": 0.3536, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.708548903465271, + "rewards/margins": 3.1516411304473877, + "rewards/rejected": -4.860189914703369, + "step": 26740 + }, + { + "epoch": 0.8720091893345064, + "grad_norm": 0.6777780652046204, + "learning_rate": 3.547430508032718e-05, + "logits/chosen": 3.2839393615722656, + "logits/rejected": 3.2358710765838623, + "logps/chosen": -324.88714599609375, + "logps/rejected": -308.77532958984375, + "loss": 0.5502, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.685202956199646, + "rewards/margins": 2.6444218158721924, + "rewards/rejected": -4.329624652862549, + "step": 26760 + }, + { + "epoch": 0.872660915186027, + "grad_norm": 15.57753849029541, + "learning_rate": 3.5463442717328735e-05, + "logits/chosen": 3.360567569732666, + "logits/rejected": 3.4486422538757324, + "logps/chosen": -329.8214111328125, + "logps/rejected": -311.9471435546875, + "loss": 0.5078, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.125692129135132, + "rewards/margins": 3.0748648643493652, + "rewards/rejected": -5.200557231903076, + "step": 26780 + }, + { + "epoch": 0.8733126410375476, + "grad_norm": 2.0315771102905273, + "learning_rate": 3.5452580354330286e-05, + "logits/chosen": 3.297714948654175, + "logits/rejected": 3.3997490406036377, + "logps/chosen": -382.62066650390625, + "logps/rejected": -369.57489013671875, + "loss": 0.4425, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3509387969970703, + "rewards/margins": 2.6364052295684814, + "rewards/rejected": -4.987344264984131, + "step": 26800 + }, + { + "epoch": 0.8739643668890681, + "grad_norm": 5.344809055328369, + "learning_rate": 3.5441717991331837e-05, + "logits/chosen": 3.0287978649139404, + "logits/rejected": 3.2646820545196533, + "logps/chosen": -344.96026611328125, + "logps/rejected": -330.3155517578125, + "loss": 0.5645, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5174293518066406, + "rewards/margins": 2.938215732574463, + "rewards/rejected": -5.455645561218262, + "step": 26820 + }, + { + "epoch": 0.8746160927405887, + "grad_norm": 1.8360121250152588, + "learning_rate": 3.543085562833339e-05, + "logits/chosen": 3.12422251701355, + "logits/rejected": 3.0814387798309326, + "logps/chosen": -327.66925048828125, + "logps/rejected": -310.6183166503906, + "loss": 0.4509, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2681801319122314, + "rewards/margins": 3.3912596702575684, + "rewards/rejected": -5.659440040588379, + "step": 26840 + }, + { + "epoch": 0.8752678185921092, + "grad_norm": 4.24968147277832, + "learning_rate": 3.5419993265334945e-05, + "logits/chosen": 3.1948418617248535, + "logits/rejected": 3.4281506538391113, + "logps/chosen": -378.39862060546875, + "logps/rejected": -316.944580078125, + "loss": 0.3516, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1137712001800537, + "rewards/margins": 3.378830671310425, + "rewards/rejected": -5.4926018714904785, + "step": 26860 + }, + { + "epoch": 0.8759195444436297, + "grad_norm": 2.312868356704712, + "learning_rate": 3.5409130902336496e-05, + "logits/chosen": 3.180588960647583, + "logits/rejected": 3.1519980430603027, + "logps/chosen": -363.886962890625, + "logps/rejected": -305.2408447265625, + "loss": 0.4531, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8039671182632446, + "rewards/margins": 3.5993409156799316, + "rewards/rejected": -5.403307914733887, + "step": 26880 + }, + { + "epoch": 0.8765712702951504, + "grad_norm": 2.0976622104644775, + "learning_rate": 3.5398268539338046e-05, + "logits/chosen": 2.9907193183898926, + "logits/rejected": 3.162926197052002, + "logps/chosen": -346.80596923828125, + "logps/rejected": -331.63519287109375, + "loss": 0.4291, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5973074436187744, + "rewards/margins": 3.240044355392456, + "rewards/rejected": -5.8373517990112305, + "step": 26900 + }, + { + "epoch": 0.8772229961466709, + "grad_norm": 7.431725978851318, + "learning_rate": 3.5387406176339604e-05, + "logits/chosen": 3.545381546020508, + "logits/rejected": 3.486555576324463, + "logps/chosen": -369.95574951171875, + "logps/rejected": -328.11029052734375, + "loss": 0.5512, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2343482971191406, + "rewards/margins": 2.93053936958313, + "rewards/rejected": -5.164887428283691, + "step": 26920 + }, + { + "epoch": 0.8778747219981915, + "grad_norm": 4.186372756958008, + "learning_rate": 3.5376543813341155e-05, + "logits/chosen": 3.3172707557678223, + "logits/rejected": 3.3885626792907715, + "logps/chosen": -342.7325744628906, + "logps/rejected": -308.380859375, + "loss": 0.4565, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9689414501190186, + "rewards/margins": 2.8091254234313965, + "rewards/rejected": -4.778067111968994, + "step": 26940 + }, + { + "epoch": 0.878526447849712, + "grad_norm": 0.8353143334388733, + "learning_rate": 3.5365681450342706e-05, + "logits/chosen": 3.0590462684631348, + "logits/rejected": 3.1413326263427734, + "logps/chosen": -327.89556884765625, + "logps/rejected": -302.93048095703125, + "loss": 0.3908, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3837662935256958, + "rewards/margins": 3.1364712715148926, + "rewards/rejected": -4.520237922668457, + "step": 26960 + }, + { + "epoch": 0.8791781737012325, + "grad_norm": 3.043062210083008, + "learning_rate": 3.535481908734426e-05, + "logits/chosen": 3.5706286430358887, + "logits/rejected": 3.5805015563964844, + "logps/chosen": -361.6725158691406, + "logps/rejected": -297.4393310546875, + "loss": 0.3891, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6460949182510376, + "rewards/margins": 3.356813430786133, + "rewards/rejected": -5.002908229827881, + "step": 26980 + }, + { + "epoch": 0.8798298995527531, + "grad_norm": 0.8989477753639221, + "learning_rate": 3.5343956724345814e-05, + "logits/chosen": 3.299938678741455, + "logits/rejected": 3.296217441558838, + "logps/chosen": -304.3896789550781, + "logps/rejected": -308.621337890625, + "loss": 0.2648, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3060157299041748, + "rewards/margins": 3.265836238861084, + "rewards/rejected": -4.57185173034668, + "step": 27000 + }, + { + "epoch": 0.8798298995527531, + "eval_logits/chosen": 3.445167064666748, + "eval_logits/rejected": 3.4387173652648926, + "eval_logps/chosen": -373.8082580566406, + "eval_logps/rejected": -350.93499755859375, + "eval_loss": 0.4419960677623749, + "eval_rewards/accuracies": 0.8293665051460266, + "eval_rewards/chosen": -1.9227646589279175, + "eval_rewards/margins": 3.547412633895874, + "eval_rewards/rejected": -5.470176696777344, + "eval_runtime": 3544.2612, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 3.153, + "step": 27000 + }, + { + "epoch": 0.8804816254042737, + "grad_norm": 6.789554595947266, + "learning_rate": 3.533309436134737e-05, + "logits/chosen": 3.3579814434051514, + "logits/rejected": 3.4469852447509766, + "logps/chosen": -351.2491760253906, + "logps/rejected": -311.24774169921875, + "loss": 0.4155, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6236549615859985, + "rewards/margins": 3.0217535495758057, + "rewards/rejected": -4.6454081535339355, + "step": 27020 + }, + { + "epoch": 0.8811333512557943, + "grad_norm": 1.1001877784729004, + "learning_rate": 3.532223199834892e-05, + "logits/chosen": 2.7787270545959473, + "logits/rejected": 2.821101427078247, + "logps/chosen": -296.27679443359375, + "logps/rejected": -291.08905029296875, + "loss": 0.5507, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7321414947509766, + "rewards/margins": 2.882765531539917, + "rewards/rejected": -4.6149067878723145, + "step": 27040 + }, + { + "epoch": 0.8817850771073148, + "grad_norm": 4.422394752502441, + "learning_rate": 3.531136963535048e-05, + "logits/chosen": 3.534209728240967, + "logits/rejected": 3.486354112625122, + "logps/chosen": -384.44268798828125, + "logps/rejected": -339.4278869628906, + "loss": 0.3502, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7637767791748047, + "rewards/margins": 3.531508684158325, + "rewards/rejected": -5.295285224914551, + "step": 27060 + }, + { + "epoch": 0.8824368029588354, + "grad_norm": 2.0672521591186523, + "learning_rate": 3.530050727235203e-05, + "logits/chosen": 3.612889528274536, + "logits/rejected": 3.6049740314483643, + "logps/chosen": -348.9222717285156, + "logps/rejected": -285.8466796875, + "loss": 0.5624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.517600655555725, + "rewards/margins": 2.668416738510132, + "rewards/rejected": -4.186017036437988, + "step": 27080 + }, + { + "epoch": 0.8830885288103559, + "grad_norm": 5.289847373962402, + "learning_rate": 3.528964490935358e-05, + "logits/chosen": 3.4389681816101074, + "logits/rejected": 3.504826784133911, + "logps/chosen": -370.6736755371094, + "logps/rejected": -351.27838134765625, + "loss": 0.3566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3514678478240967, + "rewards/margins": 3.5460731983184814, + "rewards/rejected": -4.897541046142578, + "step": 27100 + }, + { + "epoch": 0.8837402546618764, + "grad_norm": 3.5213584899902344, + "learning_rate": 3.527878254635514e-05, + "logits/chosen": 3.4570388793945312, + "logits/rejected": 3.6815810203552246, + "logps/chosen": -295.51226806640625, + "logps/rejected": -303.9058837890625, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1384365558624268, + "rewards/margins": 2.7175185680389404, + "rewards/rejected": -4.855954647064209, + "step": 27120 + }, + { + "epoch": 0.8843919805133971, + "grad_norm": 5.69990873336792, + "learning_rate": 3.526792018335669e-05, + "logits/chosen": 3.446594715118408, + "logits/rejected": 3.554222822189331, + "logps/chosen": -313.01141357421875, + "logps/rejected": -327.2387390136719, + "loss": 0.5172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0565619468688965, + "rewards/margins": 3.5709922313690186, + "rewards/rejected": -5.627554416656494, + "step": 27140 + }, + { + "epoch": 0.8850437063649176, + "grad_norm": 1.6270923614501953, + "learning_rate": 3.525705782035824e-05, + "logits/chosen": 3.437037229537964, + "logits/rejected": 3.5555293560028076, + "logps/chosen": -370.9471435546875, + "logps/rejected": -333.9298400878906, + "loss": 0.38, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.60597825050354, + "rewards/margins": 2.9113059043884277, + "rewards/rejected": -4.517284393310547, + "step": 27160 + }, + { + "epoch": 0.8856954322164382, + "grad_norm": 1.9746376276016235, + "learning_rate": 3.524673857550972e-05, + "logits/chosen": 3.3825125694274902, + "logits/rejected": 3.3561618328094482, + "logps/chosen": -347.9784240722656, + "logps/rejected": -320.7912292480469, + "loss": 0.6093, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8523151874542236, + "rewards/margins": 2.3261027336120605, + "rewards/rejected": -4.178418159484863, + "step": 27180 + }, + { + "epoch": 0.8863471580679587, + "grad_norm": 4.79823637008667, + "learning_rate": 3.523587621251127e-05, + "logits/chosen": 3.3507437705993652, + "logits/rejected": 3.393108367919922, + "logps/chosen": -345.62042236328125, + "logps/rejected": -310.37506103515625, + "loss": 0.4708, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5513383150100708, + "rewards/margins": 3.086245536804199, + "rewards/rejected": -4.6375837326049805, + "step": 27200 + }, + { + "epoch": 0.8869988839194792, + "grad_norm": 4.07036018371582, + "learning_rate": 3.522501384951282e-05, + "logits/chosen": 3.202735424041748, + "logits/rejected": 3.358494520187378, + "logps/chosen": -311.969970703125, + "logps/rejected": -324.9259948730469, + "loss": 0.2374, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7199798822402954, + "rewards/margins": 3.47888445854187, + "rewards/rejected": -5.198864459991455, + "step": 27220 + }, + { + "epoch": 0.8876506097709999, + "grad_norm": 2.1014063358306885, + "learning_rate": 3.521415148651438e-05, + "logits/chosen": 3.4843451976776123, + "logits/rejected": 3.455444812774658, + "logps/chosen": -320.906982421875, + "logps/rejected": -298.3660888671875, + "loss": 0.5114, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4213056564331055, + "rewards/margins": 1.9915482997894287, + "rewards/rejected": -3.412853956222534, + "step": 27240 + }, + { + "epoch": 0.8883023356225204, + "grad_norm": 2.9042861461639404, + "learning_rate": 3.520328912351593e-05, + "logits/chosen": 3.6359801292419434, + "logits/rejected": 3.6452019214630127, + "logps/chosen": -370.31280517578125, + "logps/rejected": -314.9769592285156, + "loss": 0.3995, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0684502124786377, + "rewards/margins": 2.7667014598846436, + "rewards/rejected": -4.835151195526123, + "step": 27260 + }, + { + "epoch": 0.888954061474041, + "grad_norm": 0.6326847076416016, + "learning_rate": 3.519242676051748e-05, + "logits/chosen": 3.5808258056640625, + "logits/rejected": 3.4833438396453857, + "logps/chosen": -379.09429931640625, + "logps/rejected": -335.1940002441406, + "loss": 0.5135, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.892622947692871, + "rewards/margins": 2.865530252456665, + "rewards/rejected": -4.758152961730957, + "step": 27280 + }, + { + "epoch": 0.8896057873255615, + "grad_norm": 2.3957059383392334, + "learning_rate": 3.518156439751904e-05, + "logits/chosen": 3.0785229206085205, + "logits/rejected": 3.2282309532165527, + "logps/chosen": -305.50811767578125, + "logps/rejected": -297.1010437011719, + "loss": 0.3729, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4832754135131836, + "rewards/margins": 3.207507610321045, + "rewards/rejected": -4.690783500671387, + "step": 27300 + }, + { + "epoch": 0.890257513177082, + "grad_norm": 9.857674598693848, + "learning_rate": 3.5170702034520595e-05, + "logits/chosen": 3.101640462875366, + "logits/rejected": 3.1123387813568115, + "logps/chosen": -328.50396728515625, + "logps/rejected": -327.0220642089844, + "loss": 0.7077, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2019336223602295, + "rewards/margins": 1.9753490686416626, + "rewards/rejected": -4.177282810211182, + "step": 27320 + }, + { + "epoch": 0.8909092390286026, + "grad_norm": 1.2174248695373535, + "learning_rate": 3.5159839671522145e-05, + "logits/chosen": 3.1349895000457764, + "logits/rejected": 3.2807979583740234, + "logps/chosen": -372.18682861328125, + "logps/rejected": -344.47454833984375, + "loss": 0.3466, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7832911014556885, + "rewards/margins": 3.966702699661255, + "rewards/rejected": -5.749993801116943, + "step": 27340 + }, + { + "epoch": 0.8915609648801232, + "grad_norm": 2.0346839427948, + "learning_rate": 3.51489773085237e-05, + "logits/chosen": 3.2401645183563232, + "logits/rejected": 3.321239471435547, + "logps/chosen": -324.15960693359375, + "logps/rejected": -307.6062927246094, + "loss": 0.3461, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.114205837249756, + "rewards/margins": 3.6297574043273926, + "rewards/rejected": -5.743963241577148, + "step": 27360 + }, + { + "epoch": 0.8922126907316438, + "grad_norm": 2.8215696811676025, + "learning_rate": 3.5138114945525254e-05, + "logits/chosen": 2.8235104084014893, + "logits/rejected": 3.1046645641326904, + "logps/chosen": -300.77166748046875, + "logps/rejected": -294.0504150390625, + "loss": 0.5737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3246009349823, + "rewards/margins": 2.0374741554260254, + "rewards/rejected": -4.362075328826904, + "step": 27380 + }, + { + "epoch": 0.8928644165831643, + "grad_norm": 4.267367362976074, + "learning_rate": 3.5127252582526805e-05, + "logits/chosen": 3.1367623805999756, + "logits/rejected": 3.096296787261963, + "logps/chosen": -322.0823669433594, + "logps/rejected": -389.5498352050781, + "loss": 0.7392, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6603660583496094, + "rewards/margins": 2.3130698204040527, + "rewards/rejected": -3.973435163497925, + "step": 27400 + }, + { + "epoch": 0.8935161424346848, + "grad_norm": 7.32771110534668, + "learning_rate": 3.5116390219528355e-05, + "logits/chosen": 3.1594693660736084, + "logits/rejected": 3.2636520862579346, + "logps/chosen": -277.64874267578125, + "logps/rejected": -261.11358642578125, + "loss": 0.5381, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1561083793640137, + "rewards/margins": 2.540419816970825, + "rewards/rejected": -4.69652795791626, + "step": 27420 + }, + { + "epoch": 0.8941678682862054, + "grad_norm": 2.232724189758301, + "learning_rate": 3.510552785652991e-05, + "logits/chosen": 3.3624377250671387, + "logits/rejected": 3.3968441486358643, + "logps/chosen": -330.06170654296875, + "logps/rejected": -319.5298767089844, + "loss": 0.4589, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9436111450195312, + "rewards/margins": 2.680065631866455, + "rewards/rejected": -4.6236772537231445, + "step": 27440 + }, + { + "epoch": 0.8948195941377259, + "grad_norm": 4.0336713790893555, + "learning_rate": 3.5094665493531464e-05, + "logits/chosen": 3.448960065841675, + "logits/rejected": 3.3816611766815186, + "logps/chosen": -386.85223388671875, + "logps/rejected": -378.6731262207031, + "loss": 0.376, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8938802480697632, + "rewards/margins": 3.8093490600585938, + "rewards/rejected": -5.7032294273376465, + "step": 27460 + }, + { + "epoch": 0.8954713199892466, + "grad_norm": 3.939181327819824, + "learning_rate": 3.5083803130533015e-05, + "logits/chosen": 3.091920852661133, + "logits/rejected": 3.2375950813293457, + "logps/chosen": -346.91351318359375, + "logps/rejected": -317.560791015625, + "loss": 0.6739, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.574434995651245, + "rewards/margins": 2.629479169845581, + "rewards/rejected": -5.203914165496826, + "step": 27480 + }, + { + "epoch": 0.8961230458407671, + "grad_norm": 2.3396222591400146, + "learning_rate": 3.507294076753457e-05, + "logits/chosen": 3.4426143169403076, + "logits/rejected": 3.4859395027160645, + "logps/chosen": -347.537109375, + "logps/rejected": -294.4842834472656, + "loss": 0.272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.878420114517212, + "rewards/margins": 3.5031306743621826, + "rewards/rejected": -5.3815507888793945, + "step": 27500 + }, + { + "epoch": 0.8967747716922876, + "grad_norm": 1.1260528564453125, + "learning_rate": 3.506207840453612e-05, + "logits/chosen": 3.4895596504211426, + "logits/rejected": 3.5106658935546875, + "logps/chosen": -348.8136291503906, + "logps/rejected": -329.8946533203125, + "loss": 0.6584, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3015410900115967, + "rewards/margins": 1.9957482814788818, + "rewards/rejected": -4.2972893714904785, + "step": 27520 + }, + { + "epoch": 0.8974264975438082, + "grad_norm": 1.5209555625915527, + "learning_rate": 3.5051216041537674e-05, + "logits/chosen": 3.5239837169647217, + "logits/rejected": 3.622121810913086, + "logps/chosen": -351.91644287109375, + "logps/rejected": -358.6653747558594, + "loss": 0.4093, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8473091125488281, + "rewards/margins": 2.946962833404541, + "rewards/rejected": -4.794272422790527, + "step": 27540 + }, + { + "epoch": 0.8980782233953287, + "grad_norm": 1.0083138942718506, + "learning_rate": 3.504035367853923e-05, + "logits/chosen": 3.3522772789001465, + "logits/rejected": 3.4127914905548096, + "logps/chosen": -372.7818908691406, + "logps/rejected": -394.35028076171875, + "loss": 0.5438, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.680410385131836, + "rewards/margins": 3.060673236846924, + "rewards/rejected": -4.741084098815918, + "step": 27560 + }, + { + "epoch": 0.8987299492468493, + "grad_norm": 0.7541515231132507, + "learning_rate": 3.502949131554079e-05, + "logits/chosen": 3.5698132514953613, + "logits/rejected": 3.734055757522583, + "logps/chosen": -345.3548583984375, + "logps/rejected": -329.6003112792969, + "loss": 0.6371, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.99551522731781, + "rewards/margins": 1.6780154705047607, + "rewards/rejected": -3.6735305786132812, + "step": 27580 + }, + { + "epoch": 0.8993816750983699, + "grad_norm": 3.8325700759887695, + "learning_rate": 3.501862895254234e-05, + "logits/chosen": 3.5361666679382324, + "logits/rejected": 3.477916717529297, + "logps/chosen": -368.5334167480469, + "logps/rejected": -357.35986328125, + "loss": 0.4268, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.092846393585205, + "rewards/margins": 2.9822473526000977, + "rewards/rejected": -5.075094223022461, + "step": 27600 + }, + { + "epoch": 0.9000334009498905, + "grad_norm": 0.9322967529296875, + "learning_rate": 3.500776658954389e-05, + "logits/chosen": 3.050901174545288, + "logits/rejected": 3.1477441787719727, + "logps/chosen": -313.73443603515625, + "logps/rejected": -309.3685607910156, + "loss": 0.2881, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6814229488372803, + "rewards/margins": 3.8149209022521973, + "rewards/rejected": -5.496344089508057, + "step": 27620 + }, + { + "epoch": 0.900685126801411, + "grad_norm": 1.5730409622192383, + "learning_rate": 3.499690422654545e-05, + "logits/chosen": 3.5055899620056152, + "logits/rejected": 3.5794901847839355, + "logps/chosen": -353.82427978515625, + "logps/rejected": -334.6461181640625, + "loss": 0.4609, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4390087127685547, + "rewards/margins": 3.139045238494873, + "rewards/rejected": -5.578053951263428, + "step": 27640 + }, + { + "epoch": 0.9013368526529315, + "grad_norm": 1.074455738067627, + "learning_rate": 3.4986041863547e-05, + "logits/chosen": 3.3261101245880127, + "logits/rejected": 3.4483273029327393, + "logps/chosen": -360.39324951171875, + "logps/rejected": -298.15631103515625, + "loss": 0.4257, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6775093078613281, + "rewards/margins": 2.7248706817626953, + "rewards/rejected": -4.402379989624023, + "step": 27660 + }, + { + "epoch": 0.9019885785044521, + "grad_norm": 5.2476420402526855, + "learning_rate": 3.497517950054855e-05, + "logits/chosen": 3.4205658435821533, + "logits/rejected": 3.422358274459839, + "logps/chosen": -370.0203857421875, + "logps/rejected": -347.2138671875, + "loss": 0.5427, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3339269161224365, + "rewards/margins": 2.848371982574463, + "rewards/rejected": -5.1822991371154785, + "step": 27680 + }, + { + "epoch": 0.9026403043559726, + "grad_norm": 1.6719751358032227, + "learning_rate": 3.496431713755011e-05, + "logits/chosen": 3.2641360759735107, + "logits/rejected": 3.3610477447509766, + "logps/chosen": -310.0126037597656, + "logps/rejected": -316.53558349609375, + "loss": 0.3637, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.363333225250244, + "rewards/margins": 3.409489154815674, + "rewards/rejected": -5.77282190322876, + "step": 27700 + }, + { + "epoch": 0.9032920302074933, + "grad_norm": 2.482840061187744, + "learning_rate": 3.495345477455166e-05, + "logits/chosen": 3.3712944984436035, + "logits/rejected": 3.3528988361358643, + "logps/chosen": -387.29266357421875, + "logps/rejected": -370.6688232421875, + "loss": 0.3417, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.493534803390503, + "rewards/margins": 3.611548900604248, + "rewards/rejected": -6.105084419250488, + "step": 27720 + }, + { + "epoch": 0.9039437560590138, + "grad_norm": 5.63017463684082, + "learning_rate": 3.494259241155321e-05, + "logits/chosen": 3.303831100463867, + "logits/rejected": 3.3235442638397217, + "logps/chosen": -359.80841064453125, + "logps/rejected": -311.6003723144531, + "loss": 0.4095, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.092423915863037, + "rewards/margins": 3.214667558670044, + "rewards/rejected": -5.307091236114502, + "step": 27740 + }, + { + "epoch": 0.9045954819105343, + "grad_norm": 4.983243942260742, + "learning_rate": 3.493173004855476e-05, + "logits/chosen": 3.440885066986084, + "logits/rejected": 3.345148801803589, + "logps/chosen": -351.1071472167969, + "logps/rejected": -302.1373596191406, + "loss": 0.7359, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.646601438522339, + "rewards/margins": 2.486126661300659, + "rewards/rejected": -5.132728099822998, + "step": 27760 + }, + { + "epoch": 0.9052472077620549, + "grad_norm": 0.4197905957698822, + "learning_rate": 3.492086768555632e-05, + "logits/chosen": 3.5111680030822754, + "logits/rejected": 3.5253195762634277, + "logps/chosen": -332.858642578125, + "logps/rejected": -333.16033935546875, + "loss": 0.3537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3535560369491577, + "rewards/margins": 3.222301959991455, + "rewards/rejected": -4.575857639312744, + "step": 27780 + }, + { + "epoch": 0.9058989336135754, + "grad_norm": 18.441638946533203, + "learning_rate": 3.491000532255787e-05, + "logits/chosen": 3.7813522815704346, + "logits/rejected": 3.7090084552764893, + "logps/chosen": -376.64984130859375, + "logps/rejected": -356.1200256347656, + "loss": 0.5269, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6293102502822876, + "rewards/margins": 2.7698326110839844, + "rewards/rejected": -4.399143218994141, + "step": 27800 + }, + { + "epoch": 0.906550659465096, + "grad_norm": 4.011420249938965, + "learning_rate": 3.4899142959559425e-05, + "logits/chosen": 3.500758409500122, + "logits/rejected": 3.6175620555877686, + "logps/chosen": -328.3623352050781, + "logps/rejected": -331.23541259765625, + "loss": 0.5177, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.114407539367676, + "rewards/margins": 2.953742027282715, + "rewards/rejected": -5.068149089813232, + "step": 27820 + }, + { + "epoch": 0.9072023853166166, + "grad_norm": 1.1495873928070068, + "learning_rate": 3.4888280596560976e-05, + "logits/chosen": 3.147688627243042, + "logits/rejected": 3.2147979736328125, + "logps/chosen": -326.2384338378906, + "logps/rejected": -306.38800048828125, + "loss": 0.4275, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9589588642120361, + "rewards/margins": 3.1102135181427, + "rewards/rejected": -5.069171905517578, + "step": 27840 + }, + { + "epoch": 0.9078541111681371, + "grad_norm": 3.5072851181030273, + "learning_rate": 3.4877418233562534e-05, + "logits/chosen": 3.3104019165039062, + "logits/rejected": 3.441350221633911, + "logps/chosen": -334.619384765625, + "logps/rejected": -277.9198913574219, + "loss": 0.3045, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.0144011974334717, + "rewards/margins": 3.261674165725708, + "rewards/rejected": -5.2760748863220215, + "step": 27860 + }, + { + "epoch": 0.9085058370196577, + "grad_norm": 4.355706214904785, + "learning_rate": 3.4866555870564084e-05, + "logits/chosen": 3.412299633026123, + "logits/rejected": 3.5361697673797607, + "logps/chosen": -298.87799072265625, + "logps/rejected": -332.74267578125, + "loss": 0.5114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7345778942108154, + "rewards/margins": 2.6953070163726807, + "rewards/rejected": -4.429884910583496, + "step": 27880 + }, + { + "epoch": 0.9091575628711782, + "grad_norm": 1.0606791973114014, + "learning_rate": 3.485569350756564e-05, + "logits/chosen": 3.476349353790283, + "logits/rejected": 3.398899793624878, + "logps/chosen": -363.552490234375, + "logps/rejected": -325.92510986328125, + "loss": 0.4631, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6682714223861694, + "rewards/margins": 3.538928508758545, + "rewards/rejected": -5.207200050354004, + "step": 27900 + }, + { + "epoch": 0.9098092887226988, + "grad_norm": 3.9178454875946045, + "learning_rate": 3.484483114456719e-05, + "logits/chosen": 3.3734824657440186, + "logits/rejected": 3.586475372314453, + "logps/chosen": -342.24603271484375, + "logps/rejected": -295.84783935546875, + "loss": 0.3653, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.774118423461914, + "rewards/margins": 2.9295737743377686, + "rewards/rejected": -4.7036919593811035, + "step": 27920 + }, + { + "epoch": 0.9104610145742194, + "grad_norm": 5.50822114944458, + "learning_rate": 3.4833968781568744e-05, + "logits/chosen": 2.861348867416382, + "logits/rejected": 3.1256728172302246, + "logps/chosen": -395.216796875, + "logps/rejected": -352.32489013671875, + "loss": 0.3896, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7324039936065674, + "rewards/margins": 3.492952823638916, + "rewards/rejected": -5.225356101989746, + "step": 27940 + }, + { + "epoch": 0.9111127404257399, + "grad_norm": 1.971116065979004, + "learning_rate": 3.4823106418570294e-05, + "logits/chosen": 3.1212074756622314, + "logits/rejected": 3.1406004428863525, + "logps/chosen": -295.22320556640625, + "logps/rejected": -272.80523681640625, + "loss": 0.4809, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3328604698181152, + "rewards/margins": 2.6392204761505127, + "rewards/rejected": -4.972081184387207, + "step": 27960 + }, + { + "epoch": 0.9117644662772605, + "grad_norm": 0.41140860319137573, + "learning_rate": 3.481224405557185e-05, + "logits/chosen": 2.909651279449463, + "logits/rejected": 3.090867042541504, + "logps/chosen": -307.9274597167969, + "logps/rejected": -290.5860290527344, + "loss": 0.3747, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2892580032348633, + "rewards/margins": 3.2373085021972656, + "rewards/rejected": -5.526566505432129, + "step": 27980 + }, + { + "epoch": 0.912416192128781, + "grad_norm": 6.223174095153809, + "learning_rate": 3.48013816925734e-05, + "logits/chosen": 3.304537534713745, + "logits/rejected": 3.346344470977783, + "logps/chosen": -348.9827880859375, + "logps/rejected": -338.49322509765625, + "loss": 0.2743, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8453490734100342, + "rewards/margins": 3.988779067993164, + "rewards/rejected": -5.834128379821777, + "step": 28000 + }, + { + "epoch": 0.9130679179803016, + "grad_norm": 1.1588751077651978, + "learning_rate": 3.4790519329574953e-05, + "logits/chosen": 3.306455612182617, + "logits/rejected": 3.379370927810669, + "logps/chosen": -328.5322570800781, + "logps/rejected": -326.7997741699219, + "loss": 0.747, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.463301181793213, + "rewards/margins": 2.202846050262451, + "rewards/rejected": -4.666146755218506, + "step": 28020 + }, + { + "epoch": 0.9137196438318221, + "grad_norm": 3.288214921951294, + "learning_rate": 3.477965696657651e-05, + "logits/chosen": 3.604034900665283, + "logits/rejected": 3.55659818649292, + "logps/chosen": -356.26617431640625, + "logps/rejected": -315.5042724609375, + "loss": 0.4573, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.2884936332702637, + "rewards/margins": 2.9341957569122314, + "rewards/rejected": -5.222689151763916, + "step": 28040 + }, + { + "epoch": 0.9143713696833426, + "grad_norm": 1.6663923263549805, + "learning_rate": 3.476879460357806e-05, + "logits/chosen": 3.564232587814331, + "logits/rejected": 3.571688175201416, + "logps/chosen": -375.9216613769531, + "logps/rejected": -369.6329650878906, + "loss": 0.5803, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.334402084350586, + "rewards/margins": 3.1730732917785645, + "rewards/rejected": -5.507475852966309, + "step": 28060 + }, + { + "epoch": 0.9150230955348633, + "grad_norm": 3.169771671295166, + "learning_rate": 3.475793224057962e-05, + "logits/chosen": 3.06345796585083, + "logits/rejected": 3.272836208343506, + "logps/chosen": -327.22894287109375, + "logps/rejected": -351.0149841308594, + "loss": 0.5514, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3281912803649902, + "rewards/margins": 3.352862596511841, + "rewards/rejected": -6.68105411529541, + "step": 28080 + }, + { + "epoch": 0.9156748213863838, + "grad_norm": 3.1853044033050537, + "learning_rate": 3.474706987758117e-05, + "logits/chosen": 2.9662013053894043, + "logits/rejected": 3.132633686065674, + "logps/chosen": -325.10906982421875, + "logps/rejected": -303.6214904785156, + "loss": 0.396, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3586666584014893, + "rewards/margins": 2.8654375076293945, + "rewards/rejected": -5.224104404449463, + "step": 28100 + }, + { + "epoch": 0.9163265472379044, + "grad_norm": 1.2843908071517944, + "learning_rate": 3.473620751458273e-05, + "logits/chosen": 3.2623772621154785, + "logits/rejected": 3.3935656547546387, + "logps/chosen": -366.0769958496094, + "logps/rejected": -302.7820739746094, + "loss": 0.5605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9495176076889038, + "rewards/margins": 3.058215618133545, + "rewards/rejected": -5.007733345031738, + "step": 28120 + }, + { + "epoch": 0.9169782730894249, + "grad_norm": 0.1649167537689209, + "learning_rate": 3.472534515158428e-05, + "logits/chosen": 3.3482601642608643, + "logits/rejected": 3.6160850524902344, + "logps/chosen": -369.2115173339844, + "logps/rejected": -330.1651916503906, + "loss": 0.4397, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7089512348175049, + "rewards/margins": 3.5267562866210938, + "rewards/rejected": -5.235707759857178, + "step": 28140 + }, + { + "epoch": 0.9176299989409455, + "grad_norm": 0.5152572393417358, + "learning_rate": 3.471448278858583e-05, + "logits/chosen": 3.6696815490722656, + "logits/rejected": 3.5902538299560547, + "logps/chosen": -340.905029296875, + "logps/rejected": -315.9188232421875, + "loss": 0.5242, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8083679676055908, + "rewards/margins": 3.1187689304351807, + "rewards/rejected": -4.9271368980407715, + "step": 28160 + }, + { + "epoch": 0.9182817247924661, + "grad_norm": 3.4761993885040283, + "learning_rate": 3.470362042558739e-05, + "logits/chosen": 3.2565131187438965, + "logits/rejected": 3.350341320037842, + "logps/chosen": -337.2800598144531, + "logps/rejected": -317.00274658203125, + "loss": 0.4768, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9697961807250977, + "rewards/margins": 3.1847667694091797, + "rewards/rejected": -5.154562950134277, + "step": 28180 + }, + { + "epoch": 0.9189334506439866, + "grad_norm": 0.4400092661380768, + "learning_rate": 3.469275806258894e-05, + "logits/chosen": 4.027118682861328, + "logits/rejected": 3.8433239459991455, + "logps/chosen": -385.7382507324219, + "logps/rejected": -353.15753173828125, + "loss": 0.6073, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.33436918258667, + "rewards/margins": 2.6946182250976562, + "rewards/rejected": -5.028986930847168, + "step": 28200 + }, + { + "epoch": 0.9195851764955072, + "grad_norm": 24.78778648376465, + "learning_rate": 3.468189569959049e-05, + "logits/chosen": 3.3077385425567627, + "logits/rejected": 3.3885111808776855, + "logps/chosen": -366.79412841796875, + "logps/rejected": -342.7254333496094, + "loss": 0.4307, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7102882862091064, + "rewards/margins": 3.6972594261169434, + "rewards/rejected": -5.407548427581787, + "step": 28220 + }, + { + "epoch": 0.9202369023470277, + "grad_norm": 3.1545913219451904, + "learning_rate": 3.4671033336592046e-05, + "logits/chosen": 3.1036336421966553, + "logits/rejected": 3.1254172325134277, + "logps/chosen": -316.01348876953125, + "logps/rejected": -322.62225341796875, + "loss": 0.5524, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2954838275909424, + "rewards/margins": 3.31077241897583, + "rewards/rejected": -5.606256008148193, + "step": 28240 + }, + { + "epoch": 0.9208886281985483, + "grad_norm": 5.739023685455322, + "learning_rate": 3.46601709735936e-05, + "logits/chosen": 3.233081102371216, + "logits/rejected": 3.532827377319336, + "logps/chosen": -323.65521240234375, + "logps/rejected": -277.45379638671875, + "loss": 0.4552, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.7782909870147705, + "rewards/margins": 2.3744242191314697, + "rewards/rejected": -5.152714729309082, + "step": 28260 + }, + { + "epoch": 0.9215403540500688, + "grad_norm": 3.8672142028808594, + "learning_rate": 3.464930861059515e-05, + "logits/chosen": 3.4511547088623047, + "logits/rejected": 3.5123322010040283, + "logps/chosen": -336.6278991699219, + "logps/rejected": -321.93072509765625, + "loss": 0.3621, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4675066471099854, + "rewards/margins": 3.492988109588623, + "rewards/rejected": -5.9604949951171875, + "step": 28280 + }, + { + "epoch": 0.9221920799015894, + "grad_norm": 1.962752103805542, + "learning_rate": 3.4638446247596705e-05, + "logits/chosen": 3.3953163623809814, + "logits/rejected": 3.527156352996826, + "logps/chosen": -361.3832092285156, + "logps/rejected": -335.1177978515625, + "loss": 0.5508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.634721517562866, + "rewards/margins": 3.077437162399292, + "rewards/rejected": -5.712159156799316, + "step": 28300 + }, + { + "epoch": 0.92284380575311, + "grad_norm": 3.52275013923645, + "learning_rate": 3.4627583884598256e-05, + "logits/chosen": 3.1427042484283447, + "logits/rejected": 3.3790462017059326, + "logps/chosen": -340.86529541015625, + "logps/rejected": -324.8191223144531, + "loss": 0.5952, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9531023502349854, + "rewards/margins": 2.510467052459717, + "rewards/rejected": -4.463569164276123, + "step": 28320 + }, + { + "epoch": 0.9234955316046305, + "grad_norm": 1.366566777229309, + "learning_rate": 3.461672152159981e-05, + "logits/chosen": 3.4621951580047607, + "logits/rejected": 3.6514477729797363, + "logps/chosen": -366.91119384765625, + "logps/rejected": -314.7201232910156, + "loss": 0.599, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.350287437438965, + "rewards/margins": 2.927229404449463, + "rewards/rejected": -5.277516841888428, + "step": 28340 + }, + { + "epoch": 0.9241472574561511, + "grad_norm": 6.648904323577881, + "learning_rate": 3.4605859158601364e-05, + "logits/chosen": 3.2784576416015625, + "logits/rejected": 3.4115028381347656, + "logps/chosen": -327.21600341796875, + "logps/rejected": -323.34283447265625, + "loss": 0.6074, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0372235774993896, + "rewards/margins": 2.7846426963806152, + "rewards/rejected": -4.821866035461426, + "step": 28360 + }, + { + "epoch": 0.9247989833076716, + "grad_norm": 1.7456791400909424, + "learning_rate": 3.459499679560292e-05, + "logits/chosen": 3.3176465034484863, + "logits/rejected": 3.5060181617736816, + "logps/chosen": -321.0713806152344, + "logps/rejected": -311.2961120605469, + "loss": 0.625, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.069330930709839, + "rewards/margins": 2.217869281768799, + "rewards/rejected": -4.287199974060059, + "step": 28380 + }, + { + "epoch": 0.9254507091591921, + "grad_norm": 0.3379690945148468, + "learning_rate": 3.458413443260447e-05, + "logits/chosen": 3.4967732429504395, + "logits/rejected": 3.5222904682159424, + "logps/chosen": -362.5572814941406, + "logps/rejected": -304.2539978027344, + "loss": 0.4577, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.127441883087158, + "rewards/margins": 2.7225501537323, + "rewards/rejected": -4.849991798400879, + "step": 28400 + }, + { + "epoch": 0.9261024350107128, + "grad_norm": 0.5957090258598328, + "learning_rate": 3.457327206960602e-05, + "logits/chosen": 3.3344523906707764, + "logits/rejected": 3.2857284545898438, + "logps/chosen": -345.5547790527344, + "logps/rejected": -348.2261047363281, + "loss": 0.6553, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.986067771911621, + "rewards/margins": 2.654109477996826, + "rewards/rejected": -4.640177249908447, + "step": 28420 + }, + { + "epoch": 0.9267541608622333, + "grad_norm": 2.0847880840301514, + "learning_rate": 3.456240970660758e-05, + "logits/chosen": 3.425367832183838, + "logits/rejected": 3.33337140083313, + "logps/chosen": -328.7441711425781, + "logps/rejected": -335.81842041015625, + "loss": 0.4754, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6827659606933594, + "rewards/margins": 2.4741642475128174, + "rewards/rejected": -4.156930446624756, + "step": 28440 + }, + { + "epoch": 0.9274058867137539, + "grad_norm": 0.10027115046977997, + "learning_rate": 3.455154734360913e-05, + "logits/chosen": 3.493077039718628, + "logits/rejected": 3.475426435470581, + "logps/chosen": -373.0557861328125, + "logps/rejected": -375.5699768066406, + "loss": 0.4171, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9194034337997437, + "rewards/margins": 3.0584213733673096, + "rewards/rejected": -4.977824687957764, + "step": 28460 + }, + { + "epoch": 0.9280576125652744, + "grad_norm": 8.452141761779785, + "learning_rate": 3.454068498061068e-05, + "logits/chosen": 3.6191534996032715, + "logits/rejected": 3.628460645675659, + "logps/chosen": -375.4318542480469, + "logps/rejected": -362.79168701171875, + "loss": 0.5431, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1206092834472656, + "rewards/margins": 3.0097432136535645, + "rewards/rejected": -5.130352973937988, + "step": 28480 + }, + { + "epoch": 0.9287093384167949, + "grad_norm": 0.8719784021377563, + "learning_rate": 3.452982261761224e-05, + "logits/chosen": 3.477512836456299, + "logits/rejected": 3.3468570709228516, + "logps/chosen": -319.5684509277344, + "logps/rejected": -319.72589111328125, + "loss": 0.4688, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.997014045715332, + "rewards/margins": 2.902061939239502, + "rewards/rejected": -4.899075984954834, + "step": 28500 + }, + { + "epoch": 0.9293610642683156, + "grad_norm": 6.576076507568359, + "learning_rate": 3.451896025461379e-05, + "logits/chosen": 3.210906982421875, + "logits/rejected": 3.211484909057617, + "logps/chosen": -333.90478515625, + "logps/rejected": -289.83709716796875, + "loss": 0.5324, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0765786170959473, + "rewards/margins": 2.607103109359741, + "rewards/rejected": -4.683681488037109, + "step": 28520 + }, + { + "epoch": 0.9300127901198361, + "grad_norm": 0.5618252754211426, + "learning_rate": 3.450809789161534e-05, + "logits/chosen": 3.790496349334717, + "logits/rejected": 3.910249710083008, + "logps/chosen": -374.2943420410156, + "logps/rejected": -337.6524963378906, + "loss": 0.4164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.136646270751953, + "rewards/margins": 3.0717039108276367, + "rewards/rejected": -5.20835018157959, + "step": 28540 + }, + { + "epoch": 0.9306645159713567, + "grad_norm": 2.358067274093628, + "learning_rate": 3.449723552861689e-05, + "logits/chosen": 3.3324007987976074, + "logits/rejected": 3.296143054962158, + "logps/chosen": -328.9806213378906, + "logps/rejected": -336.32513427734375, + "loss": 0.4609, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0427534580230713, + "rewards/margins": 2.875999927520752, + "rewards/rejected": -4.918753623962402, + "step": 28560 + }, + { + "epoch": 0.9313162418228772, + "grad_norm": 2.3989503383636475, + "learning_rate": 3.448637316561845e-05, + "logits/chosen": 3.3653693199157715, + "logits/rejected": 3.397900342941284, + "logps/chosen": -328.83221435546875, + "logps/rejected": -295.81982421875, + "loss": 0.6241, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.560115337371826, + "rewards/margins": 2.4308016300201416, + "rewards/rejected": -4.9909162521362305, + "step": 28580 + }, + { + "epoch": 0.9319679676743977, + "grad_norm": 1.255363941192627, + "learning_rate": 3.447551080262e-05, + "logits/chosen": 3.2208809852600098, + "logits/rejected": 3.264528274536133, + "logps/chosen": -346.72784423828125, + "logps/rejected": -296.24224853515625, + "loss": 0.5743, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7666187286376953, + "rewards/margins": 2.6415562629699707, + "rewards/rejected": -4.408175468444824, + "step": 28600 + }, + { + "epoch": 0.9326196935259183, + "grad_norm": 4.318169593811035, + "learning_rate": 3.446464843962156e-05, + "logits/chosen": 3.3190598487854004, + "logits/rejected": 3.4251809120178223, + "logps/chosen": -344.56854248046875, + "logps/rejected": -299.27374267578125, + "loss": 0.5439, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6298002004623413, + "rewards/margins": 1.738581895828247, + "rewards/rejected": -3.368381977081299, + "step": 28620 + }, + { + "epoch": 0.9332714193774388, + "grad_norm": 3.871297836303711, + "learning_rate": 3.445378607662311e-05, + "logits/chosen": 3.2242813110351562, + "logits/rejected": 3.2679877281188965, + "logps/chosen": -326.9549865722656, + "logps/rejected": -283.95123291015625, + "loss": 0.6105, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.698581337928772, + "rewards/margins": 2.4753060340881348, + "rewards/rejected": -4.173887729644775, + "step": 28640 + }, + { + "epoch": 0.9339231452289595, + "grad_norm": 3.5711302757263184, + "learning_rate": 3.4442923713624667e-05, + "logits/chosen": 3.222838878631592, + "logits/rejected": 3.401592254638672, + "logps/chosen": -324.01788330078125, + "logps/rejected": -289.6505126953125, + "loss": 0.4575, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.180034875869751, + "rewards/margins": 2.6749424934387207, + "rewards/rejected": -3.85497784614563, + "step": 28660 + }, + { + "epoch": 0.93457487108048, + "grad_norm": 4.992178440093994, + "learning_rate": 3.443206135062622e-05, + "logits/chosen": 3.625840663909912, + "logits/rejected": 3.7061705589294434, + "logps/chosen": -349.79315185546875, + "logps/rejected": -340.203857421875, + "loss": 0.721, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2608563899993896, + "rewards/margins": 2.346696138381958, + "rewards/rejected": -3.6075527667999268, + "step": 28680 + }, + { + "epoch": 0.9352265969320006, + "grad_norm": 3.7150847911834717, + "learning_rate": 3.4421198987627775e-05, + "logits/chosen": 3.685804843902588, + "logits/rejected": 3.810981035232544, + "logps/chosen": -382.22979736328125, + "logps/rejected": -304.766357421875, + "loss": 0.4735, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3971374034881592, + "rewards/margins": 2.6002933979034424, + "rewards/rejected": -3.9974312782287598, + "step": 28700 + }, + { + "epoch": 0.9358783227835211, + "grad_norm": 1.9041317701339722, + "learning_rate": 3.4410336624629326e-05, + "logits/chosen": 3.5943474769592285, + "logits/rejected": 3.6305344104766846, + "logps/chosen": -342.5042724609375, + "logps/rejected": -333.02557373046875, + "loss": 0.3726, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.46736741065979, + "rewards/margins": 3.2713840007781982, + "rewards/rejected": -4.738751411437988, + "step": 28720 + }, + { + "epoch": 0.9365300486350416, + "grad_norm": 3.867677927017212, + "learning_rate": 3.4399474261630877e-05, + "logits/chosen": 3.6472344398498535, + "logits/rejected": 3.622514247894287, + "logps/chosen": -321.9580993652344, + "logps/rejected": -339.59625244140625, + "loss": 0.3701, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8917932510375977, + "rewards/margins": 3.261157989501953, + "rewards/rejected": -5.152951717376709, + "step": 28740 + }, + { + "epoch": 0.9371817744865623, + "grad_norm": 9.307119369506836, + "learning_rate": 3.438861189863243e-05, + "logits/chosen": 3.7137465476989746, + "logits/rejected": 3.8247647285461426, + "logps/chosen": -399.4239807128906, + "logps/rejected": -324.4278869628906, + "loss": 0.559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7126487493515015, + "rewards/margins": 2.6525723934173584, + "rewards/rejected": -4.36522102355957, + "step": 28760 + }, + { + "epoch": 0.9378335003380828, + "grad_norm": 0.9274722337722778, + "learning_rate": 3.4377749535633985e-05, + "logits/chosen": 3.2929446697235107, + "logits/rejected": 3.482569456100464, + "logps/chosen": -354.246826171875, + "logps/rejected": -325.17156982421875, + "loss": 0.5475, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6044118404388428, + "rewards/margins": 3.0721230506896973, + "rewards/rejected": -4.676535129547119, + "step": 28780 + }, + { + "epoch": 0.9384852261896034, + "grad_norm": 3.9317870140075684, + "learning_rate": 3.4366887172635536e-05, + "logits/chosen": 3.8419265747070312, + "logits/rejected": 3.8387489318847656, + "logps/chosen": -421.9317321777344, + "logps/rejected": -321.0296630859375, + "loss": 0.4661, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7041479349136353, + "rewards/margins": 3.054544687271118, + "rewards/rejected": -4.758692741394043, + "step": 28800 + }, + { + "epoch": 0.9391369520411239, + "grad_norm": 2.4777493476867676, + "learning_rate": 3.4356024809637086e-05, + "logits/chosen": 3.6267218589782715, + "logits/rejected": 3.619422435760498, + "logps/chosen": -352.91571044921875, + "logps/rejected": -326.9364929199219, + "loss": 0.3651, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7615712881088257, + "rewards/margins": 2.452573537826538, + "rewards/rejected": -4.214145183563232, + "step": 28820 + }, + { + "epoch": 0.9397886778926444, + "grad_norm": 1.202634572982788, + "learning_rate": 3.4345162446638644e-05, + "logits/chosen": 3.3945419788360596, + "logits/rejected": 3.4647388458251953, + "logps/chosen": -340.6974182128906, + "logps/rejected": -338.3480224609375, + "loss": 0.4002, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1619932651519775, + "rewards/margins": 2.94966983795166, + "rewards/rejected": -5.111663341522217, + "step": 28840 + }, + { + "epoch": 0.940440403744165, + "grad_norm": 4.167840003967285, + "learning_rate": 3.4334300083640195e-05, + "logits/chosen": 2.8635451793670654, + "logits/rejected": 3.185560703277588, + "logps/chosen": -335.0335388183594, + "logps/rejected": -308.1698913574219, + "loss": 0.539, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.385801315307617, + "rewards/margins": 2.405503749847412, + "rewards/rejected": -4.791305065155029, + "step": 28860 + }, + { + "epoch": 0.9410921295956856, + "grad_norm": 4.764140605926514, + "learning_rate": 3.432343772064175e-05, + "logits/chosen": 3.4073309898376465, + "logits/rejected": 3.504981517791748, + "logps/chosen": -326.89105224609375, + "logps/rejected": -322.7445068359375, + "loss": 0.6735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3872199058532715, + "rewards/margins": 2.546234130859375, + "rewards/rejected": -4.933454990386963, + "step": 28880 + }, + { + "epoch": 0.9417438554472062, + "grad_norm": 2.5115323066711426, + "learning_rate": 3.43125753576433e-05, + "logits/chosen": 3.4365005493164062, + "logits/rejected": 3.434781551361084, + "logps/chosen": -336.712646484375, + "logps/rejected": -299.61297607421875, + "loss": 0.4564, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.159038543701172, + "rewards/margins": 2.484426975250244, + "rewards/rejected": -4.643465995788574, + "step": 28900 + }, + { + "epoch": 0.9423955812987267, + "grad_norm": 0.7181722521781921, + "learning_rate": 3.430171299464486e-05, + "logits/chosen": 3.3814024925231934, + "logits/rejected": 3.4179508686065674, + "logps/chosen": -342.0052795410156, + "logps/rejected": -278.7216796875, + "loss": 0.3878, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.700823187828064, + "rewards/margins": 3.4336915016174316, + "rewards/rejected": -5.134514808654785, + "step": 28920 + }, + { + "epoch": 0.9430473071502472, + "grad_norm": 4.770831108093262, + "learning_rate": 3.429085063164641e-05, + "logits/chosen": 3.402985095977783, + "logits/rejected": 3.420710802078247, + "logps/chosen": -354.60186767578125, + "logps/rejected": -327.25946044921875, + "loss": 0.4243, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9841028451919556, + "rewards/margins": 3.0425608158111572, + "rewards/rejected": -5.026663780212402, + "step": 28940 + }, + { + "epoch": 0.9436990330017678, + "grad_norm": 2.772846221923828, + "learning_rate": 3.427998826864796e-05, + "logits/chosen": 3.938098192214966, + "logits/rejected": 4.083984851837158, + "logps/chosen": -369.24298095703125, + "logps/rejected": -374.9916687011719, + "loss": 0.5563, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0518548488616943, + "rewards/margins": 3.110872507095337, + "rewards/rejected": -5.162726879119873, + "step": 28960 + }, + { + "epoch": 0.9443507588532883, + "grad_norm": 6.910282611846924, + "learning_rate": 3.426912590564952e-05, + "logits/chosen": 2.955049991607666, + "logits/rejected": 3.161536931991577, + "logps/chosen": -357.8158874511719, + "logps/rejected": -365.3817443847656, + "loss": 0.4426, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1056931018829346, + "rewards/margins": 3.483254909515381, + "rewards/rejected": -5.588948726654053, + "step": 28980 + }, + { + "epoch": 0.945002484704809, + "grad_norm": 16.182104110717773, + "learning_rate": 3.425826354265107e-05, + "logits/chosen": 3.377385377883911, + "logits/rejected": 3.3934013843536377, + "logps/chosen": -372.22003173828125, + "logps/rejected": -357.5921630859375, + "loss": 0.3117, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.5044912099838257, + "rewards/margins": 3.2843177318573, + "rewards/rejected": -4.788809299468994, + "step": 29000 + }, + { + "epoch": 0.9456542105563295, + "grad_norm": 4.876967430114746, + "learning_rate": 3.424740117965262e-05, + "logits/chosen": 3.3420162200927734, + "logits/rejected": 3.42205810546875, + "logps/chosen": -355.19207763671875, + "logps/rejected": -327.27301025390625, + "loss": 0.3369, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7957130670547485, + "rewards/margins": 2.9464125633239746, + "rewards/rejected": -4.742125511169434, + "step": 29020 + }, + { + "epoch": 0.94630593640785, + "grad_norm": 9.713497161865234, + "learning_rate": 3.423653881665418e-05, + "logits/chosen": 2.9949116706848145, + "logits/rejected": 3.095327377319336, + "logps/chosen": -314.43701171875, + "logps/rejected": -295.5093078613281, + "loss": 0.5117, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.388861894607544, + "rewards/margins": 3.004005193710327, + "rewards/rejected": -5.392867088317871, + "step": 29040 + }, + { + "epoch": 0.9469576622593706, + "grad_norm": 2.2967469692230225, + "learning_rate": 3.422567645365573e-05, + "logits/chosen": 3.3200900554656982, + "logits/rejected": 3.3476109504699707, + "logps/chosen": -348.5272521972656, + "logps/rejected": -301.556396484375, + "loss": 0.436, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8157918453216553, + "rewards/margins": 2.7994844913482666, + "rewards/rejected": -4.615276336669922, + "step": 29060 + }, + { + "epoch": 0.9476093881108911, + "grad_norm": 2.7159552574157715, + "learning_rate": 3.421481409065728e-05, + "logits/chosen": 3.249096393585205, + "logits/rejected": 3.2878928184509277, + "logps/chosen": -373.31243896484375, + "logps/rejected": -332.422119140625, + "loss": 0.4606, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8707494735717773, + "rewards/margins": 3.122931957244873, + "rewards/rejected": -4.993681907653809, + "step": 29080 + }, + { + "epoch": 0.9482611139624118, + "grad_norm": 0.8295297622680664, + "learning_rate": 3.420395172765883e-05, + "logits/chosen": 3.2345848083496094, + "logits/rejected": 3.2626793384552, + "logps/chosen": -342.242431640625, + "logps/rejected": -310.64471435546875, + "loss": 0.3917, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4619837999343872, + "rewards/margins": 2.8905210494995117, + "rewards/rejected": -4.352504730224609, + "step": 29100 + }, + { + "epoch": 0.9489128398139323, + "grad_norm": 0.7273170948028564, + "learning_rate": 3.419308936466039e-05, + "logits/chosen": 3.2590465545654297, + "logits/rejected": 3.4826018810272217, + "logps/chosen": -376.98577880859375, + "logps/rejected": -327.7928466796875, + "loss": 0.5393, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.864471435546875, + "rewards/margins": 3.8626370429992676, + "rewards/rejected": -5.727108955383301, + "step": 29120 + }, + { + "epoch": 0.9495645656654528, + "grad_norm": 0.8789936900138855, + "learning_rate": 3.418222700166194e-05, + "logits/chosen": 3.337165117263794, + "logits/rejected": 3.4403674602508545, + "logps/chosen": -356.0877990722656, + "logps/rejected": -306.14105224609375, + "loss": 0.4227, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5488113164901733, + "rewards/margins": 3.152416467666626, + "rewards/rejected": -4.701227188110352, + "step": 29140 + }, + { + "epoch": 0.9502162915169734, + "grad_norm": 2.8722915649414062, + "learning_rate": 3.41713646386635e-05, + "logits/chosen": 3.1268529891967773, + "logits/rejected": 3.1950631141662598, + "logps/chosen": -336.1680603027344, + "logps/rejected": -307.1205139160156, + "loss": 0.4719, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0343480110168457, + "rewards/margins": 3.1795589923858643, + "rewards/rejected": -5.213906764984131, + "step": 29160 + }, + { + "epoch": 0.9508680173684939, + "grad_norm": 2.1048970222473145, + "learning_rate": 3.4160502275665055e-05, + "logits/chosen": 3.096139907836914, + "logits/rejected": 3.2716610431671143, + "logps/chosen": -347.3212585449219, + "logps/rejected": -282.6916198730469, + "loss": 0.4739, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.468400478363037, + "rewards/margins": 2.8645777702331543, + "rewards/rejected": -5.332978248596191, + "step": 29180 + }, + { + "epoch": 0.9515197432200145, + "grad_norm": 0.9579587578773499, + "learning_rate": 3.4149639912666606e-05, + "logits/chosen": 3.348191022872925, + "logits/rejected": 3.4973816871643066, + "logps/chosen": -292.32513427734375, + "logps/rejected": -297.41607666015625, + "loss": 0.4848, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.517128825187683, + "rewards/margins": 3.0221476554870605, + "rewards/rejected": -4.539277076721191, + "step": 29200 + }, + { + "epoch": 0.952171469071535, + "grad_norm": 1.4564976692199707, + "learning_rate": 3.4138777549668156e-05, + "logits/chosen": 3.079204797744751, + "logits/rejected": 3.2190613746643066, + "logps/chosen": -356.43560791015625, + "logps/rejected": -328.7374267578125, + "loss": 0.3417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8834016919136047, + "rewards/margins": 4.159285068511963, + "rewards/rejected": -5.042686939239502, + "step": 29220 + }, + { + "epoch": 0.9528231949230557, + "grad_norm": 3.1240317821502686, + "learning_rate": 3.4127915186669714e-05, + "logits/chosen": 3.3914833068847656, + "logits/rejected": 3.463909864425659, + "logps/chosen": -381.0918273925781, + "logps/rejected": -341.0538024902344, + "loss": 0.4306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.285802125930786, + "rewards/margins": 3.1086416244506836, + "rewards/rejected": -5.394443988800049, + "step": 29240 + }, + { + "epoch": 0.9534749207745762, + "grad_norm": 3.9852240085601807, + "learning_rate": 3.4117052823671265e-05, + "logits/chosen": 3.427964687347412, + "logits/rejected": 3.558384418487549, + "logps/chosen": -311.61212158203125, + "logps/rejected": -308.53076171875, + "loss": 0.4769, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.958587646484375, + "rewards/margins": 2.3105664253234863, + "rewards/rejected": -4.269154071807861, + "step": 29260 + }, + { + "epoch": 0.9541266466260967, + "grad_norm": 9.587297439575195, + "learning_rate": 3.4106190460672815e-05, + "logits/chosen": 3.2424569129943848, + "logits/rejected": 3.4521918296813965, + "logps/chosen": -319.6648864746094, + "logps/rejected": -291.5690002441406, + "loss": 0.5236, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8244445323944092, + "rewards/margins": 2.330552101135254, + "rewards/rejected": -4.154996871948242, + "step": 29280 + }, + { + "epoch": 0.9547783724776173, + "grad_norm": 0.6489887833595276, + "learning_rate": 3.4095328097674366e-05, + "logits/chosen": 3.3532028198242188, + "logits/rejected": 3.432976245880127, + "logps/chosen": -334.3958435058594, + "logps/rejected": -330.35565185546875, + "loss": 0.4268, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8782587051391602, + "rewards/margins": 3.1416594982147217, + "rewards/rejected": -5.019918441772461, + "step": 29300 + }, + { + "epoch": 0.9554300983291378, + "grad_norm": 5.547811985015869, + "learning_rate": 3.4084465734675924e-05, + "logits/chosen": 3.495380401611328, + "logits/rejected": 3.5471115112304688, + "logps/chosen": -347.65704345703125, + "logps/rejected": -352.1741027832031, + "loss": 0.5673, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.474891185760498, + "rewards/margins": 2.694420099258423, + "rewards/rejected": -5.169311046600342, + "step": 29320 + }, + { + "epoch": 0.9560818241806585, + "grad_norm": 2.039820671081543, + "learning_rate": 3.4073603371677475e-05, + "logits/chosen": 2.710002899169922, + "logits/rejected": 3.162214756011963, + "logps/chosen": -342.93438720703125, + "logps/rejected": -303.7356262207031, + "loss": 0.5217, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.422079563140869, + "rewards/margins": 2.5651357173919678, + "rewards/rejected": -4.987215042114258, + "step": 29340 + }, + { + "epoch": 0.956733550032179, + "grad_norm": 0.283351331949234, + "learning_rate": 3.4062741008679025e-05, + "logits/chosen": 3.7109713554382324, + "logits/rejected": 3.4823107719421387, + "logps/chosen": -423.7511291503906, + "logps/rejected": -354.68939208984375, + "loss": 0.4094, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5567896366119385, + "rewards/margins": 2.993375062942505, + "rewards/rejected": -4.550164699554443, + "step": 29360 + }, + { + "epoch": 0.9573852758836995, + "grad_norm": 2.5597047805786133, + "learning_rate": 3.405187864568058e-05, + "logits/chosen": 3.375859022140503, + "logits/rejected": 3.3435020446777344, + "logps/chosen": -372.93743896484375, + "logps/rejected": -335.4704284667969, + "loss": 0.4831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.038407564163208, + "rewards/margins": 3.0183725357055664, + "rewards/rejected": -5.0567803382873535, + "step": 29380 + }, + { + "epoch": 0.9580370017352201, + "grad_norm": 5.172478199005127, + "learning_rate": 3.4041016282682134e-05, + "logits/chosen": 2.9097418785095215, + "logits/rejected": 2.942014694213867, + "logps/chosen": -349.81951904296875, + "logps/rejected": -344.7188415527344, + "loss": 0.4049, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.008673667907715, + "rewards/margins": 2.606020927429199, + "rewards/rejected": -4.614694595336914, + "step": 29400 + }, + { + "epoch": 0.9586887275867406, + "grad_norm": 1.602325677871704, + "learning_rate": 3.403015391968369e-05, + "logits/chosen": 3.772630214691162, + "logits/rejected": 3.6234283447265625, + "logps/chosen": -339.09808349609375, + "logps/rejected": -324.6116027832031, + "loss": 0.4087, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.215893268585205, + "rewards/margins": 2.971155881881714, + "rewards/rejected": -5.18704891204834, + "step": 29420 + }, + { + "epoch": 0.9593404534382612, + "grad_norm": 6.688640594482422, + "learning_rate": 3.401929155668525e-05, + "logits/chosen": 3.3642513751983643, + "logits/rejected": 3.2575011253356934, + "logps/chosen": -374.2980041503906, + "logps/rejected": -327.7897644042969, + "loss": 0.5251, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8116271495819092, + "rewards/margins": 2.6711809635162354, + "rewards/rejected": -4.4828081130981445, + "step": 29440 + }, + { + "epoch": 0.9599921792897818, + "grad_norm": 2.285893201828003, + "learning_rate": 3.40084291936868e-05, + "logits/chosen": 2.949157238006592, + "logits/rejected": 3.082937717437744, + "logps/chosen": -307.9759521484375, + "logps/rejected": -292.964111328125, + "loss": 0.5198, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.827652931213379, + "rewards/margins": 2.0124831199645996, + "rewards/rejected": -3.8401360511779785, + "step": 29460 + }, + { + "epoch": 0.9606439051413023, + "grad_norm": 3.439211130142212, + "learning_rate": 3.399756683068835e-05, + "logits/chosen": 3.427022933959961, + "logits/rejected": 3.499098539352417, + "logps/chosen": -373.7626037597656, + "logps/rejected": -347.31048583984375, + "loss": 0.4199, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.086125373840332, + "rewards/margins": 2.612905979156494, + "rewards/rejected": -4.699031829833984, + "step": 29480 + }, + { + "epoch": 0.9612956309928229, + "grad_norm": 3.9516940116882324, + "learning_rate": 3.39867044676899e-05, + "logits/chosen": 3.3374125957489014, + "logits/rejected": 3.2623603343963623, + "logps/chosen": -380.73358154296875, + "logps/rejected": -368.5245361328125, + "loss": 0.5244, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9894317388534546, + "rewards/margins": 3.4481329917907715, + "rewards/rejected": -5.437564849853516, + "step": 29500 + }, + { + "epoch": 0.9619473568443434, + "grad_norm": 0.6663318872451782, + "learning_rate": 3.397584210469146e-05, + "logits/chosen": 2.872525930404663, + "logits/rejected": 3.105090618133545, + "logps/chosen": -293.1208190917969, + "logps/rejected": -321.33197021484375, + "loss": 0.2893, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2070465087890625, + "rewards/margins": 3.376481294631958, + "rewards/rejected": -5.583528518676758, + "step": 29520 + }, + { + "epoch": 0.962599082695864, + "grad_norm": 2.0558416843414307, + "learning_rate": 3.396497974169301e-05, + "logits/chosen": 3.1482183933258057, + "logits/rejected": 3.174734115600586, + "logps/chosen": -300.46588134765625, + "logps/rejected": -304.82659912109375, + "loss": 0.4859, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8250617980957031, + "rewards/margins": 2.9115872383117676, + "rewards/rejected": -4.736649036407471, + "step": 29540 + }, + { + "epoch": 0.9632508085473845, + "grad_norm": 9.57752513885498, + "learning_rate": 3.395411737869456e-05, + "logits/chosen": 3.1141278743743896, + "logits/rejected": 2.844959259033203, + "logps/chosen": -339.52679443359375, + "logps/rejected": -304.49713134765625, + "loss": 0.4312, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.070359468460083, + "rewards/margins": 3.3358523845672607, + "rewards/rejected": -5.406211853027344, + "step": 29560 + }, + { + "epoch": 0.963902534398905, + "grad_norm": 2.2513349056243896, + "learning_rate": 3.394325501569612e-05, + "logits/chosen": 3.277080535888672, + "logits/rejected": 3.4297051429748535, + "logps/chosen": -322.0815734863281, + "logps/rejected": -332.28118896484375, + "loss": 0.4873, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0387673377990723, + "rewards/margins": 3.0018515586853027, + "rewards/rejected": -5.040618419647217, + "step": 29580 + }, + { + "epoch": 0.9645542602504257, + "grad_norm": 3.4353480339050293, + "learning_rate": 3.393239265269767e-05, + "logits/chosen": 3.271233320236206, + "logits/rejected": 3.3547794818878174, + "logps/chosen": -330.0671081542969, + "logps/rejected": -314.89501953125, + "loss": 0.379, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5390081405639648, + "rewards/margins": 2.391852855682373, + "rewards/rejected": -3.930861234664917, + "step": 29600 + }, + { + "epoch": 0.9652059861019462, + "grad_norm": 0.9731097221374512, + "learning_rate": 3.392153028969922e-05, + "logits/chosen": 2.807079315185547, + "logits/rejected": 2.819882869720459, + "logps/chosen": -374.261474609375, + "logps/rejected": -333.43939208984375, + "loss": 0.3968, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9626271724700928, + "rewards/margins": 2.9691689014434814, + "rewards/rejected": -4.931795597076416, + "step": 29620 + }, + { + "epoch": 0.9658577119534668, + "grad_norm": 11.218391418457031, + "learning_rate": 3.391066792670078e-05, + "logits/chosen": 3.241853713989258, + "logits/rejected": 3.4003589153289795, + "logps/chosen": -351.9326477050781, + "logps/rejected": -315.58856201171875, + "loss": 0.4296, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8803555965423584, + "rewards/margins": 3.221251964569092, + "rewards/rejected": -5.101607799530029, + "step": 29640 + }, + { + "epoch": 0.9665094378049873, + "grad_norm": 3.148081064224243, + "learning_rate": 3.389980556370233e-05, + "logits/chosen": 3.190120220184326, + "logits/rejected": 3.2630062103271484, + "logps/chosen": -335.69549560546875, + "logps/rejected": -282.5934753417969, + "loss": 0.4463, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.288196563720703, + "rewards/margins": 3.310438632965088, + "rewards/rejected": -5.598634719848633, + "step": 29660 + }, + { + "epoch": 0.9671611636565078, + "grad_norm": 4.645575523376465, + "learning_rate": 3.3888943200703885e-05, + "logits/chosen": 3.3429694175720215, + "logits/rejected": 3.5933945178985596, + "logps/chosen": -346.04998779296875, + "logps/rejected": -315.6741027832031, + "loss": 0.4366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0990138053894043, + "rewards/margins": 2.8395867347717285, + "rewards/rejected": -4.938601016998291, + "step": 29680 + }, + { + "epoch": 0.9678128895080285, + "grad_norm": 1.5344657897949219, + "learning_rate": 3.3878080837705436e-05, + "logits/chosen": 3.4449028968811035, + "logits/rejected": 3.46867299079895, + "logps/chosen": -344.771728515625, + "logps/rejected": -341.259521484375, + "loss": 0.342, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7196919918060303, + "rewards/margins": 3.3001551628112793, + "rewards/rejected": -5.0198469161987305, + "step": 29700 + }, + { + "epoch": 0.968464615359549, + "grad_norm": 1.2622402906417847, + "learning_rate": 3.3867218474706994e-05, + "logits/chosen": 3.185062885284424, + "logits/rejected": 3.185925245285034, + "logps/chosen": -324.1579895019531, + "logps/rejected": -314.9599914550781, + "loss": 0.5931, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8253488540649414, + "rewards/margins": 2.4258875846862793, + "rewards/rejected": -4.251236438751221, + "step": 29720 + }, + { + "epoch": 0.9691163412110696, + "grad_norm": 2.5706558227539062, + "learning_rate": 3.3856356111708544e-05, + "logits/chosen": 3.5014488697052, + "logits/rejected": 3.4794602394104004, + "logps/chosen": -354.59173583984375, + "logps/rejected": -376.9726867675781, + "loss": 0.5063, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5162699222564697, + "rewards/margins": 3.1740410327911377, + "rewards/rejected": -4.690310955047607, + "step": 29740 + }, + { + "epoch": 0.9697680670625901, + "grad_norm": 2.7434604167938232, + "learning_rate": 3.3845493748710095e-05, + "logits/chosen": 3.0073773860931396, + "logits/rejected": 3.1648454666137695, + "logps/chosen": -290.3002624511719, + "logps/rejected": -300.5801696777344, + "loss": 0.5315, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.313101053237915, + "rewards/margins": 2.7155921459198, + "rewards/rejected": -4.028693199157715, + "step": 29760 + }, + { + "epoch": 0.9704197929141107, + "grad_norm": 6.294442176818848, + "learning_rate": 3.383463138571165e-05, + "logits/chosen": 3.3429114818573, + "logits/rejected": 3.3707973957061768, + "logps/chosen": -354.12066650390625, + "logps/rejected": -319.3813171386719, + "loss": 0.3754, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9388885498046875, + "rewards/margins": 3.0968143939971924, + "rewards/rejected": -5.035702705383301, + "step": 29780 + }, + { + "epoch": 0.9710715187656312, + "grad_norm": 0.6438301801681519, + "learning_rate": 3.3823769022713204e-05, + "logits/chosen": 3.260214328765869, + "logits/rejected": 3.368124485015869, + "logps/chosen": -322.4534912109375, + "logps/rejected": -296.5779724121094, + "loss": 0.455, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8828808069229126, + "rewards/margins": 2.976409912109375, + "rewards/rejected": -4.859290599822998, + "step": 29800 + }, + { + "epoch": 0.9717232446171518, + "grad_norm": 5.869410514831543, + "learning_rate": 3.3812906659714754e-05, + "logits/chosen": 3.6142547130584717, + "logits/rejected": 3.4929637908935547, + "logps/chosen": -368.0857849121094, + "logps/rejected": -296.7102966308594, + "loss": 0.4647, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.670273780822754, + "rewards/margins": 2.542121648788452, + "rewards/rejected": -4.212395668029785, + "step": 29820 + }, + { + "epoch": 0.9723749704686724, + "grad_norm": 2.1455111503601074, + "learning_rate": 3.380204429671631e-05, + "logits/chosen": 3.2873387336730957, + "logits/rejected": 3.324495792388916, + "logps/chosen": -342.9508361816406, + "logps/rejected": -343.36444091796875, + "loss": 0.4066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2285709381103516, + "rewards/margins": 3.140474557876587, + "rewards/rejected": -5.369045257568359, + "step": 29840 + }, + { + "epoch": 0.9730266963201929, + "grad_norm": 11.31480598449707, + "learning_rate": 3.379118193371786e-05, + "logits/chosen": 3.1313228607177734, + "logits/rejected": 3.219092607498169, + "logps/chosen": -336.677001953125, + "logps/rejected": -336.41162109375, + "loss": 0.543, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.341547966003418, + "rewards/margins": 3.2554163932800293, + "rewards/rejected": -5.5969648361206055, + "step": 29860 + }, + { + "epoch": 0.9736784221717135, + "grad_norm": 3.540754795074463, + "learning_rate": 3.3780319570719413e-05, + "logits/chosen": 3.4360930919647217, + "logits/rejected": 3.432767152786255, + "logps/chosen": -376.5940856933594, + "logps/rejected": -299.198486328125, + "loss": 0.5413, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2970383167266846, + "rewards/margins": 3.553590774536133, + "rewards/rejected": -5.850628852844238, + "step": 29880 + }, + { + "epoch": 0.974330148023234, + "grad_norm": 2.127713680267334, + "learning_rate": 3.3769457207720964e-05, + "logits/chosen": 3.356735944747925, + "logits/rejected": 3.2958006858825684, + "logps/chosen": -312.4453125, + "logps/rejected": -328.4266052246094, + "loss": 0.6188, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3764610290527344, + "rewards/margins": 3.4216084480285645, + "rewards/rejected": -5.798069477081299, + "step": 29900 + }, + { + "epoch": 0.9749818738747545, + "grad_norm": 1.0076944828033447, + "learning_rate": 3.375859484472252e-05, + "logits/chosen": 3.3793492317199707, + "logits/rejected": 3.451597213745117, + "logps/chosen": -388.6403503417969, + "logps/rejected": -336.5260009765625, + "loss": 0.4171, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.0153634548187256, + "rewards/margins": 3.1971147060394287, + "rewards/rejected": -5.212477684020996, + "step": 29920 + }, + { + "epoch": 0.9756335997262752, + "grad_norm": 2.181013822555542, + "learning_rate": 3.374773248172407e-05, + "logits/chosen": 2.9669737815856934, + "logits/rejected": 2.9161581993103027, + "logps/chosen": -335.108154296875, + "logps/rejected": -344.7012634277344, + "loss": 0.4965, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9522926807403564, + "rewards/margins": 3.5413615703582764, + "rewards/rejected": -5.493654727935791, + "step": 29940 + }, + { + "epoch": 0.9762853255777957, + "grad_norm": 3.5752511024475098, + "learning_rate": 3.373687011872563e-05, + "logits/chosen": 2.97943115234375, + "logits/rejected": 2.9746737480163574, + "logps/chosen": -333.1907653808594, + "logps/rejected": -286.22772216796875, + "loss": 0.5451, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.203840970993042, + "rewards/margins": 2.6583571434020996, + "rewards/rejected": -4.862198829650879, + "step": 29960 + }, + { + "epoch": 0.9769370514293163, + "grad_norm": 0.984665036201477, + "learning_rate": 3.372600775572719e-05, + "logits/chosen": 2.978184461593628, + "logits/rejected": 3.1598658561706543, + "logps/chosen": -367.30328369140625, + "logps/rejected": -325.08807373046875, + "loss": 0.3762, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.493817925453186, + "rewards/margins": 3.9173130989074707, + "rewards/rejected": -5.411130905151367, + "step": 29980 + }, + { + "epoch": 0.9775887772808368, + "grad_norm": 0.7550521492958069, + "learning_rate": 3.371514539272874e-05, + "logits/chosen": 3.049884080886841, + "logits/rejected": 3.2187225818634033, + "logps/chosen": -336.088623046875, + "logps/rejected": -314.9981689453125, + "loss": 0.3343, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.986462354660034, + "rewards/margins": 3.4174270629882812, + "rewards/rejected": -6.4038896560668945, + "step": 30000 + }, + { + "epoch": 0.9782405031323573, + "grad_norm": 13.522953033447266, + "learning_rate": 3.370428302973029e-05, + "logits/chosen": 3.077348232269287, + "logits/rejected": 3.4174067974090576, + "logps/chosen": -392.7007141113281, + "logps/rejected": -382.8271484375, + "loss": 0.597, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1314780712127686, + "rewards/margins": 3.502567768096924, + "rewards/rejected": -5.634045600891113, + "step": 30020 + }, + { + "epoch": 0.978892228983878, + "grad_norm": 1.0661569833755493, + "learning_rate": 3.369342066673184e-05, + "logits/chosen": 3.1853816509246826, + "logits/rejected": 3.3157246112823486, + "logps/chosen": -359.3238830566406, + "logps/rejected": -367.37420654296875, + "loss": 0.7096, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.780897855758667, + "rewards/margins": 3.0894887447357178, + "rewards/rejected": -5.870386600494385, + "step": 30040 + }, + { + "epoch": 0.9795439548353985, + "grad_norm": 6.051328659057617, + "learning_rate": 3.36825583037334e-05, + "logits/chosen": 3.5364983081817627, + "logits/rejected": 3.452183246612549, + "logps/chosen": -376.60882568359375, + "logps/rejected": -347.89935302734375, + "loss": 0.4769, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.16662335395813, + "rewards/margins": 3.143105983734131, + "rewards/rejected": -6.30972957611084, + "step": 30060 + }, + { + "epoch": 0.9801956806869191, + "grad_norm": 11.043696403503418, + "learning_rate": 3.367169594073495e-05, + "logits/chosen": 3.07889986038208, + "logits/rejected": 3.3579883575439453, + "logps/chosen": -352.5848083496094, + "logps/rejected": -346.8067626953125, + "loss": 0.5683, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.795499324798584, + "rewards/margins": 2.8019680976867676, + "rewards/rejected": -6.59746789932251, + "step": 30080 + }, + { + "epoch": 0.9808474065384396, + "grad_norm": 4.814469814300537, + "learning_rate": 3.36608335777365e-05, + "logits/chosen": 3.0794413089752197, + "logits/rejected": 3.0191705226898193, + "logps/chosen": -342.7195739746094, + "logps/rejected": -338.61492919921875, + "loss": 0.6726, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2113037109375, + "rewards/margins": 2.7671561241149902, + "rewards/rejected": -5.97845983505249, + "step": 30100 + }, + { + "epoch": 0.9814991323899601, + "grad_norm": 7.374011993408203, + "learning_rate": 3.364997121473806e-05, + "logits/chosen": 2.7751214504241943, + "logits/rejected": 2.954129457473755, + "logps/chosen": -353.4765625, + "logps/rejected": -309.6745910644531, + "loss": 0.3731, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.704303026199341, + "rewards/margins": 3.236403703689575, + "rewards/rejected": -5.940706729888916, + "step": 30120 + }, + { + "epoch": 0.9821508582414807, + "grad_norm": 3.135708808898926, + "learning_rate": 3.363910885173961e-05, + "logits/chosen": 3.3010642528533936, + "logits/rejected": 3.248624324798584, + "logps/chosen": -366.3512878417969, + "logps/rejected": -353.7363586425781, + "loss": 0.3616, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.228586196899414, + "rewards/margins": 3.439286708831787, + "rewards/rejected": -6.667873382568359, + "step": 30140 + }, + { + "epoch": 0.9828025840930013, + "grad_norm": 2.059206485748291, + "learning_rate": 3.362824648874116e-05, + "logits/chosen": 3.276870012283325, + "logits/rejected": 3.3826420307159424, + "logps/chosen": -399.75079345703125, + "logps/rejected": -340.73223876953125, + "loss": 0.4798, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.487715244293213, + "rewards/margins": 3.339195966720581, + "rewards/rejected": -5.826911926269531, + "step": 30160 + }, + { + "epoch": 0.9834543099445219, + "grad_norm": 0.8339039087295532, + "learning_rate": 3.3617384125742716e-05, + "logits/chosen": 2.8573102951049805, + "logits/rejected": 3.147505044937134, + "logps/chosen": -334.3879699707031, + "logps/rejected": -292.7530212402344, + "loss": 0.4783, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3977715969085693, + "rewards/margins": 3.428443431854248, + "rewards/rejected": -5.8262152671813965, + "step": 30180 + }, + { + "epoch": 0.9841060357960424, + "grad_norm": 2.76772403717041, + "learning_rate": 3.360652176274427e-05, + "logits/chosen": 3.3943238258361816, + "logits/rejected": 3.5753540992736816, + "logps/chosen": -346.0272521972656, + "logps/rejected": -342.57012939453125, + "loss": 0.4883, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7713258266448975, + "rewards/margins": 2.808532476425171, + "rewards/rejected": -5.579858303070068, + "step": 30200 + }, + { + "epoch": 0.9847577616475629, + "grad_norm": 1.3948928117752075, + "learning_rate": 3.3595659399745824e-05, + "logits/chosen": 3.405855655670166, + "logits/rejected": 3.464427947998047, + "logps/chosen": -347.0652160644531, + "logps/rejected": -300.0810241699219, + "loss": 0.3184, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.130223035812378, + "rewards/margins": 3.3739540576934814, + "rewards/rejected": -5.504177570343018, + "step": 30220 + }, + { + "epoch": 0.9854094874990835, + "grad_norm": 2.3895304203033447, + "learning_rate": 3.3584797036747375e-05, + "logits/chosen": 3.277308940887451, + "logits/rejected": 3.137511730194092, + "logps/chosen": -323.56146240234375, + "logps/rejected": -335.84735107421875, + "loss": 0.3647, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.797067403793335, + "rewards/margins": 3.1476287841796875, + "rewards/rejected": -4.94469690322876, + "step": 30240 + }, + { + "epoch": 0.986061213350604, + "grad_norm": 3.7129600048065186, + "learning_rate": 3.357393467374893e-05, + "logits/chosen": 3.1351122856140137, + "logits/rejected": 3.1588666439056396, + "logps/chosen": -327.989990234375, + "logps/rejected": -357.4604797363281, + "loss": 0.3815, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1021342277526855, + "rewards/margins": 3.722696304321289, + "rewards/rejected": -5.824830532073975, + "step": 30260 + }, + { + "epoch": 0.9867129392021247, + "grad_norm": 0.5093979239463806, + "learning_rate": 3.356307231075048e-05, + "logits/chosen": 3.0108816623687744, + "logits/rejected": 3.09507155418396, + "logps/chosen": -346.207763671875, + "logps/rejected": -319.5750732421875, + "loss": 0.4699, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.604102373123169, + "rewards/margins": 3.6887810230255127, + "rewards/rejected": -6.292883396148682, + "step": 30280 + }, + { + "epoch": 0.9873646650536452, + "grad_norm": 1.746913194656372, + "learning_rate": 3.3552209947752034e-05, + "logits/chosen": 3.6144447326660156, + "logits/rejected": 3.5091660022735596, + "logps/chosen": -370.12652587890625, + "logps/rejected": -319.26678466796875, + "loss": 0.4543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1394009590148926, + "rewards/margins": 3.1397171020507812, + "rewards/rejected": -5.279117584228516, + "step": 30300 + }, + { + "epoch": 0.9880163909051657, + "grad_norm": 1.1881473064422607, + "learning_rate": 3.354134758475359e-05, + "logits/chosen": 3.473018169403076, + "logits/rejected": 3.4846954345703125, + "logps/chosen": -380.5660400390625, + "logps/rejected": -343.75457763671875, + "loss": 0.2812, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2258827686309814, + "rewards/margins": 3.6662018299102783, + "rewards/rejected": -5.89208459854126, + "step": 30320 + }, + { + "epoch": 0.9886681167566863, + "grad_norm": 11.9409818649292, + "learning_rate": 3.353048522175514e-05, + "logits/chosen": 3.166771650314331, + "logits/rejected": 3.162294387817383, + "logps/chosen": -316.6907958984375, + "logps/rejected": -326.98089599609375, + "loss": 0.5823, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1929142475128174, + "rewards/margins": 3.2773277759552, + "rewards/rejected": -5.470242500305176, + "step": 30340 + }, + { + "epoch": 0.9893198426082068, + "grad_norm": 0.6546741127967834, + "learning_rate": 3.351962285875669e-05, + "logits/chosen": 3.486769914627075, + "logits/rejected": 3.418902635574341, + "logps/chosen": -377.38714599609375, + "logps/rejected": -311.35345458984375, + "loss": 0.3912, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8504855632781982, + "rewards/margins": 3.3373475074768066, + "rewards/rejected": -5.187832832336426, + "step": 30360 + }, + { + "epoch": 0.9899715684597274, + "grad_norm": 1.1085526943206787, + "learning_rate": 3.350876049575825e-05, + "logits/chosen": 3.2654483318328857, + "logits/rejected": 3.473875045776367, + "logps/chosen": -388.51666259765625, + "logps/rejected": -346.02294921875, + "loss": 0.426, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.263887405395508, + "rewards/margins": 3.15932297706604, + "rewards/rejected": -5.423210620880127, + "step": 30380 + }, + { + "epoch": 0.990623294311248, + "grad_norm": 2.2314183712005615, + "learning_rate": 3.34978981327598e-05, + "logits/chosen": 3.3748397827148438, + "logits/rejected": 3.4762930870056152, + "logps/chosen": -365.04840087890625, + "logps/rejected": -347.90985107421875, + "loss": 0.3207, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.341988205909729, + "rewards/margins": 3.3885722160339355, + "rewards/rejected": -4.730559825897217, + "step": 30400 + }, + { + "epoch": 0.9912750201627686, + "grad_norm": 2.0121231079101562, + "learning_rate": 3.348703576976135e-05, + "logits/chosen": 3.5600051879882812, + "logits/rejected": 3.5430169105529785, + "logps/chosen": -319.27191162109375, + "logps/rejected": -336.0518798828125, + "loss": 0.5221, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.458444595336914, + "rewards/margins": 2.492673397064209, + "rewards/rejected": -4.951118469238281, + "step": 30420 + }, + { + "epoch": 0.9919267460142891, + "grad_norm": 3.6529369354248047, + "learning_rate": 3.34761734067629e-05, + "logits/chosen": 3.164428234100342, + "logits/rejected": 3.472052812576294, + "logps/chosen": -400.2348327636719, + "logps/rejected": -352.2412414550781, + "loss": 0.4134, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8745973110198975, + "rewards/margins": 3.6266121864318848, + "rewards/rejected": -5.5012102127075195, + "step": 30440 + }, + { + "epoch": 0.9925784718658096, + "grad_norm": 2.4713916778564453, + "learning_rate": 3.346531104376446e-05, + "logits/chosen": 3.3298802375793457, + "logits/rejected": 3.3520236015319824, + "logps/chosen": -342.015625, + "logps/rejected": -288.23089599609375, + "loss": 0.5413, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6313555240631104, + "rewards/margins": 2.2656664848327637, + "rewards/rejected": -4.897021770477295, + "step": 30460 + }, + { + "epoch": 0.9932301977173302, + "grad_norm": 1.5942399501800537, + "learning_rate": 3.345444868076602e-05, + "logits/chosen": 3.352914810180664, + "logits/rejected": 3.4447696208953857, + "logps/chosen": -336.6816101074219, + "logps/rejected": -320.1922607421875, + "loss": 0.5319, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.536515712738037, + "rewards/margins": 2.448772430419922, + "rewards/rejected": -4.985288143157959, + "step": 30480 + }, + { + "epoch": 0.9938819235688507, + "grad_norm": 1.5490524768829346, + "learning_rate": 3.344358631776757e-05, + "logits/chosen": 3.3861172199249268, + "logits/rejected": 3.4189133644104004, + "logps/chosen": -332.8922424316406, + "logps/rejected": -334.22216796875, + "loss": 0.392, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7318603992462158, + "rewards/margins": 2.891231060028076, + "rewards/rejected": -4.623091220855713, + "step": 30500 + }, + { + "epoch": 0.9945336494203714, + "grad_norm": 0.5112868547439575, + "learning_rate": 3.343272395476913e-05, + "logits/chosen": 3.4136099815368652, + "logits/rejected": 3.4150631427764893, + "logps/chosen": -318.08734130859375, + "logps/rejected": -323.7646179199219, + "loss": 0.46, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.032365322113037, + "rewards/margins": 3.075023889541626, + "rewards/rejected": -5.107388973236084, + "step": 30520 + }, + { + "epoch": 0.9951853752718919, + "grad_norm": 2.55098295211792, + "learning_rate": 3.342186159177068e-05, + "logits/chosen": 3.001533269882202, + "logits/rejected": 3.024885892868042, + "logps/chosen": -349.66986083984375, + "logps/rejected": -323.9239807128906, + "loss": 0.6206, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0992279052734375, + "rewards/margins": 2.2395386695861816, + "rewards/rejected": -4.338766574859619, + "step": 30540 + }, + { + "epoch": 0.9958371011234124, + "grad_norm": 2.473917245864868, + "learning_rate": 3.341099922877223e-05, + "logits/chosen": 3.3624274730682373, + "logits/rejected": 3.399639844894409, + "logps/chosen": -382.83001708984375, + "logps/rejected": -314.43255615234375, + "loss": 0.4458, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.205016851425171, + "rewards/margins": 2.644479274749756, + "rewards/rejected": -4.849496364593506, + "step": 30560 + }, + { + "epoch": 0.996488826974933, + "grad_norm": 2.4726967811584473, + "learning_rate": 3.3400136865773786e-05, + "logits/chosen": 3.397879123687744, + "logits/rejected": 3.1486880779266357, + "logps/chosen": -356.2794494628906, + "logps/rejected": -315.2786560058594, + "loss": 0.4844, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6652543544769287, + "rewards/margins": 2.5065267086029053, + "rewards/rejected": -4.171780586242676, + "step": 30580 + }, + { + "epoch": 0.9971405528264535, + "grad_norm": 2.061185836791992, + "learning_rate": 3.3389274502775337e-05, + "logits/chosen": 3.4826064109802246, + "logits/rejected": 3.3631644248962402, + "logps/chosen": -359.1213073730469, + "logps/rejected": -290.7142333984375, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.853049635887146, + "rewards/margins": 2.6013801097869873, + "rewards/rejected": -4.454429626464844, + "step": 30600 + }, + { + "epoch": 0.9977922786779742, + "grad_norm": 1.09378182888031, + "learning_rate": 3.337841213977689e-05, + "logits/chosen": 3.133481502532959, + "logits/rejected": 3.264781951904297, + "logps/chosen": -338.6067199707031, + "logps/rejected": -325.25079345703125, + "loss": 0.3953, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8043735027313232, + "rewards/margins": 3.2847695350646973, + "rewards/rejected": -5.089142799377441, + "step": 30620 + }, + { + "epoch": 0.9984440045294947, + "grad_norm": 5.029660701751709, + "learning_rate": 3.336754977677844e-05, + "logits/chosen": 3.451188325881958, + "logits/rejected": 3.3195743560791016, + "logps/chosen": -337.01715087890625, + "logps/rejected": -332.62103271484375, + "loss": 0.4354, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.203275442123413, + "rewards/margins": 2.8069560527801514, + "rewards/rejected": -5.010231971740723, + "step": 30640 + }, + { + "epoch": 0.9990957303810152, + "grad_norm": 2.0784645080566406, + "learning_rate": 3.3356687413779996e-05, + "logits/chosen": 3.272204637527466, + "logits/rejected": 3.4715800285339355, + "logps/chosen": -348.2787170410156, + "logps/rejected": -358.3988342285156, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8002405166625977, + "rewards/margins": 3.0052170753479004, + "rewards/rejected": -5.805457592010498, + "step": 30660 + }, + { + "epoch": 0.9997474562325358, + "grad_norm": 7.111709117889404, + "learning_rate": 3.3345825050781546e-05, + "logits/chosen": 3.442639112472534, + "logits/rejected": 3.560947895050049, + "logps/chosen": -348.1722717285156, + "logps/rejected": -333.6714782714844, + "loss": 0.5515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.005985975265503, + "rewards/margins": 3.373289108276367, + "rewards/rejected": -5.379274845123291, + "step": 30680 + }, + { + "epoch": 1.0003991820840563, + "grad_norm": 1.074591040611267, + "learning_rate": 3.33349626877831e-05, + "logits/chosen": 3.428051710128784, + "logits/rejected": 3.648878812789917, + "logps/chosen": -372.50982666015625, + "logps/rejected": -292.89080810546875, + "loss": 0.3416, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.475926160812378, + "rewards/margins": 3.3908817768096924, + "rewards/rejected": -4.866807460784912, + "step": 30700 + }, + { + "epoch": 1.001050907935577, + "grad_norm": 1.0542329549789429, + "learning_rate": 3.3324100324784655e-05, + "logits/chosen": 3.3806731700897217, + "logits/rejected": 3.186990737915039, + "logps/chosen": -349.8481750488281, + "logps/rejected": -344.3046569824219, + "loss": 0.4117, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8732187747955322, + "rewards/margins": 2.4133265018463135, + "rewards/rejected": -4.2865447998046875, + "step": 30720 + }, + { + "epoch": 1.0017026337870976, + "grad_norm": 1.0377331972122192, + "learning_rate": 3.3313237961786206e-05, + "logits/chosen": 3.444506883621216, + "logits/rejected": 3.3112549781799316, + "logps/chosen": -326.6030578613281, + "logps/rejected": -304.0701904296875, + "loss": 0.2584, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7289178371429443, + "rewards/margins": 3.643841505050659, + "rewards/rejected": -5.3727593421936035, + "step": 30740 + }, + { + "epoch": 1.002354359638618, + "grad_norm": 2.772258758544922, + "learning_rate": 3.330237559878776e-05, + "logits/chosen": 3.266049861907959, + "logits/rejected": 3.458425521850586, + "logps/chosen": -315.1651306152344, + "logps/rejected": -289.7227478027344, + "loss": 0.3345, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.507586717605591, + "rewards/margins": 3.188950300216675, + "rewards/rejected": -5.696537971496582, + "step": 30760 + }, + { + "epoch": 1.0030060854901386, + "grad_norm": 1.1155312061309814, + "learning_rate": 3.329151323578932e-05, + "logits/chosen": 3.174635171890259, + "logits/rejected": 2.958359718322754, + "logps/chosen": -325.75152587890625, + "logps/rejected": -300.9131774902344, + "loss": 0.328, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.349187135696411, + "rewards/margins": 2.7102255821228027, + "rewards/rejected": -5.059412956237793, + "step": 30780 + }, + { + "epoch": 1.0036578113416592, + "grad_norm": 3.0652260780334473, + "learning_rate": 3.328065087279087e-05, + "logits/chosen": 3.4361209869384766, + "logits/rejected": 3.5030148029327393, + "logps/chosen": -349.4901123046875, + "logps/rejected": -388.70196533203125, + "loss": 0.2765, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4329192638397217, + "rewards/margins": 3.9396705627441406, + "rewards/rejected": -5.372590065002441, + "step": 30800 + }, + { + "epoch": 1.0043095371931796, + "grad_norm": 5.1675028800964355, + "learning_rate": 3.326978850979242e-05, + "logits/chosen": 3.220440626144409, + "logits/rejected": 3.2452335357666016, + "logps/chosen": -391.6290588378906, + "logps/rejected": -334.2843933105469, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.388649344444275, + "rewards/margins": 3.7524447441101074, + "rewards/rejected": -5.141093730926514, + "step": 30820 + }, + { + "epoch": 1.0049612630447002, + "grad_norm": 1.8238393068313599, + "learning_rate": 3.325892614679397e-05, + "logits/chosen": 3.3322558403015137, + "logits/rejected": 3.55566668510437, + "logps/chosen": -358.9780578613281, + "logps/rejected": -321.5817565917969, + "loss": 0.253, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.5077861547470093, + "rewards/margins": 3.2449488639831543, + "rewards/rejected": -4.752735137939453, + "step": 30840 + }, + { + "epoch": 1.0056129888962209, + "grad_norm": 2.092055559158325, + "learning_rate": 3.324806378379553e-05, + "logits/chosen": 3.211439609527588, + "logits/rejected": 3.160414934158325, + "logps/chosen": -301.76690673828125, + "logps/rejected": -314.78082275390625, + "loss": 0.2348, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.8723087310791016, + "rewards/margins": 3.2063961029052734, + "rewards/rejected": -5.078704833984375, + "step": 30860 + }, + { + "epoch": 1.0062647147477413, + "grad_norm": 1.1446260213851929, + "learning_rate": 3.323720142079708e-05, + "logits/chosen": 3.271027088165283, + "logits/rejected": 3.3410708904266357, + "logps/chosen": -341.5811462402344, + "logps/rejected": -295.9261779785156, + "loss": 0.222, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.2773380279541016, + "rewards/margins": 4.080620765686035, + "rewards/rejected": -6.357958793640137, + "step": 30880 + }, + { + "epoch": 1.0069164405992619, + "grad_norm": 0.16238276660442352, + "learning_rate": 3.322633905779863e-05, + "logits/chosen": 3.472748279571533, + "logits/rejected": 3.2621593475341797, + "logps/chosen": -368.07415771484375, + "logps/rejected": -343.18707275390625, + "loss": 0.3987, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.279836416244507, + "rewards/margins": 3.399648666381836, + "rewards/rejected": -5.679485321044922, + "step": 30900 + }, + { + "epoch": 1.0075681664507825, + "grad_norm": 1.5055025815963745, + "learning_rate": 3.321547669480019e-05, + "logits/chosen": 3.259937286376953, + "logits/rejected": 3.3021912574768066, + "logps/chosen": -418.33892822265625, + "logps/rejected": -347.6068420410156, + "loss": 0.331, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1657216548919678, + "rewards/margins": 3.5580532550811768, + "rewards/rejected": -5.723775386810303, + "step": 30920 + }, + { + "epoch": 1.0082198923023031, + "grad_norm": 0.9887646436691284, + "learning_rate": 3.320461433180174e-05, + "logits/chosen": 3.2716457843780518, + "logits/rejected": 3.1340768337249756, + "logps/chosen": -355.9222106933594, + "logps/rejected": -308.06549072265625, + "loss": 0.2937, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7826541662216187, + "rewards/margins": 3.9301986694335938, + "rewards/rejected": -5.712852954864502, + "step": 30940 + }, + { + "epoch": 1.0088716181538235, + "grad_norm": 3.470557451248169, + "learning_rate": 3.319375196880329e-05, + "logits/chosen": 3.0492687225341797, + "logits/rejected": 3.0938313007354736, + "logps/chosen": -333.40972900390625, + "logps/rejected": -352.8801574707031, + "loss": 0.33, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9227575063705444, + "rewards/margins": 3.1646695137023926, + "rewards/rejected": -5.087427616119385, + "step": 30960 + }, + { + "epoch": 1.0095233440053442, + "grad_norm": 0.3572205901145935, + "learning_rate": 3.318288960580485e-05, + "logits/chosen": 3.0809197425842285, + "logits/rejected": 3.117067337036133, + "logps/chosen": -337.5281677246094, + "logps/rejected": -297.7648010253906, + "loss": 0.1354, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.0277185440063477, + "rewards/margins": 3.704662322998047, + "rewards/rejected": -5.7323808670043945, + "step": 30980 + }, + { + "epoch": 1.0101750698568648, + "grad_norm": 1.8100296258926392, + "learning_rate": 3.31720272428064e-05, + "logits/chosen": 2.9449660778045654, + "logits/rejected": 3.193847417831421, + "logps/chosen": -355.77520751953125, + "logps/rejected": -337.3812561035156, + "loss": 0.2738, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.643650770187378, + "rewards/margins": 3.7373549938201904, + "rewards/rejected": -5.381005764007568, + "step": 31000 + }, + { + "epoch": 1.0108267957083852, + "grad_norm": 3.7159669399261475, + "learning_rate": 3.316116487980796e-05, + "logits/chosen": 3.135540246963501, + "logits/rejected": 3.3195056915283203, + "logps/chosen": -366.4015197753906, + "logps/rejected": -349.47979736328125, + "loss": 0.2885, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.057676076889038, + "rewards/margins": 3.506985902786255, + "rewards/rejected": -6.564661979675293, + "step": 31020 + }, + { + "epoch": 1.0114785215599058, + "grad_norm": 3.7522075176239014, + "learning_rate": 3.315030251680951e-05, + "logits/chosen": 3.268831729888916, + "logits/rejected": 3.3573241233825684, + "logps/chosen": -358.9319152832031, + "logps/rejected": -290.679931640625, + "loss": 0.3862, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9285904169082642, + "rewards/margins": 3.1716790199279785, + "rewards/rejected": -5.100269317626953, + "step": 31040 + }, + { + "epoch": 1.0121302474114264, + "grad_norm": 4.949117660522461, + "learning_rate": 3.3139440153811066e-05, + "logits/chosen": 2.9066920280456543, + "logits/rejected": 2.9981563091278076, + "logps/chosen": -318.89385986328125, + "logps/rejected": -324.1242370605469, + "loss": 0.2785, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.738542914390564, + "rewards/margins": 3.181380033493042, + "rewards/rejected": -4.919923305511475, + "step": 31060 + }, + { + "epoch": 1.0127819732629468, + "grad_norm": 1.0594877004623413, + "learning_rate": 3.3128577790812616e-05, + "logits/chosen": 2.923233985900879, + "logits/rejected": 3.0951945781707764, + "logps/chosen": -327.3102722167969, + "logps/rejected": -278.166259765625, + "loss": 0.2707, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.267763614654541, + "rewards/margins": 3.511237621307373, + "rewards/rejected": -5.7790021896362305, + "step": 31080 + }, + { + "epoch": 1.0134336991144675, + "grad_norm": 4.50054407119751, + "learning_rate": 3.311771542781417e-05, + "logits/chosen": 3.247253894805908, + "logits/rejected": 3.383859634399414, + "logps/chosen": -342.8865051269531, + "logps/rejected": -308.85809326171875, + "loss": 0.2173, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.788161516189575, + "rewards/margins": 3.8072619438171387, + "rewards/rejected": -6.595423221588135, + "step": 31100 + }, + { + "epoch": 1.014085424965988, + "grad_norm": 3.516953468322754, + "learning_rate": 3.3106853064815725e-05, + "logits/chosen": 3.1654350757598877, + "logits/rejected": 3.4134361743927, + "logps/chosen": -322.40240478515625, + "logps/rejected": -338.76678466796875, + "loss": 0.3382, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.4778313636779785, + "rewards/margins": 3.744813919067383, + "rewards/rejected": -6.222645282745361, + "step": 31120 + }, + { + "epoch": 1.0147371508175087, + "grad_norm": 0.2118988335132599, + "learning_rate": 3.3095990701817275e-05, + "logits/chosen": 2.8199961185455322, + "logits/rejected": 3.1488189697265625, + "logps/chosen": -354.8619689941406, + "logps/rejected": -372.5149841308594, + "loss": 0.24, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.5955588817596436, + "rewards/margins": 3.5046277046203613, + "rewards/rejected": -5.100186347961426, + "step": 31140 + }, + { + "epoch": 1.015388876669029, + "grad_norm": 1.0458229780197144, + "learning_rate": 3.3085128338818826e-05, + "logits/chosen": 3.317523956298828, + "logits/rejected": 3.5691781044006348, + "logps/chosen": -335.0774230957031, + "logps/rejected": -322.0765686035156, + "loss": 0.3097, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.2843589782714844, + "rewards/margins": 2.8790881633758545, + "rewards/rejected": -5.163447380065918, + "step": 31160 + }, + { + "epoch": 1.0160406025205497, + "grad_norm": 5.370657920837402, + "learning_rate": 3.307426597582038e-05, + "logits/chosen": 2.8140928745269775, + "logits/rejected": 2.8850624561309814, + "logps/chosen": -351.78802490234375, + "logps/rejected": -327.12213134765625, + "loss": 0.2385, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.2270846366882324, + "rewards/margins": 4.935652732849121, + "rewards/rejected": -7.162737846374512, + "step": 31180 + }, + { + "epoch": 1.0166923283720704, + "grad_norm": 0.6322416663169861, + "learning_rate": 3.3063403612821935e-05, + "logits/chosen": 3.210420608520508, + "logits/rejected": 3.3397650718688965, + "logps/chosen": -368.16046142578125, + "logps/rejected": -344.08599853515625, + "loss": 0.273, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3873677253723145, + "rewards/margins": 4.053188800811768, + "rewards/rejected": -6.440556526184082, + "step": 31200 + }, + { + "epoch": 1.0173440542235908, + "grad_norm": 2.377168655395508, + "learning_rate": 3.3052541249823485e-05, + "logits/chosen": 2.9682469367980957, + "logits/rejected": 3.0865519046783447, + "logps/chosen": -335.143310546875, + "logps/rejected": -328.16448974609375, + "loss": 0.1893, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.407500743865967, + "rewards/margins": 3.721867084503174, + "rewards/rejected": -6.129366874694824, + "step": 31220 + }, + { + "epoch": 1.0179957800751114, + "grad_norm": 3.5353760719299316, + "learning_rate": 3.3041678886825036e-05, + "logits/chosen": 3.337400436401367, + "logits/rejected": 3.442906618118286, + "logps/chosen": -371.9844055175781, + "logps/rejected": -343.9320068359375, + "loss": 0.2942, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.390408992767334, + "rewards/margins": 4.082391738891602, + "rewards/rejected": -6.472799777984619, + "step": 31240 + }, + { + "epoch": 1.018647505926632, + "grad_norm": 2.222280263900757, + "learning_rate": 3.3030816523826594e-05, + "logits/chosen": 3.0184149742126465, + "logits/rejected": 3.105802536010742, + "logps/chosen": -335.8362121582031, + "logps/rejected": -335.3475036621094, + "loss": 0.4243, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.6201071739196777, + "rewards/margins": 3.0113720893859863, + "rewards/rejected": -5.631479263305664, + "step": 31260 + }, + { + "epoch": 1.0192992317781526, + "grad_norm": 3.9235551357269287, + "learning_rate": 3.301995416082815e-05, + "logits/chosen": 3.260164976119995, + "logits/rejected": 3.4011642932891846, + "logps/chosen": -345.288330078125, + "logps/rejected": -313.30218505859375, + "loss": 0.2756, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.199317693710327, + "rewards/margins": 4.389957904815674, + "rewards/rejected": -6.589275360107422, + "step": 31280 + }, + { + "epoch": 1.019950957629673, + "grad_norm": 0.6691313982009888, + "learning_rate": 3.30090917978297e-05, + "logits/chosen": 3.6155014038085938, + "logits/rejected": 3.616960048675537, + "logps/chosen": -412.9004821777344, + "logps/rejected": -384.1370544433594, + "loss": 0.2667, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6151869297027588, + "rewards/margins": 3.9175262451171875, + "rewards/rejected": -5.532713890075684, + "step": 31300 + }, + { + "epoch": 1.0206026834811937, + "grad_norm": 0.6876260042190552, + "learning_rate": 3.299822943483126e-05, + "logits/chosen": 2.6167714595794678, + "logits/rejected": 2.980222225189209, + "logps/chosen": -332.675537109375, + "logps/rejected": -339.16632080078125, + "loss": 0.2816, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.8400137424468994, + "rewards/margins": 3.958585739135742, + "rewards/rejected": -6.798600196838379, + "step": 31320 + }, + { + "epoch": 1.0212544093327143, + "grad_norm": 1.8548707962036133, + "learning_rate": 3.298736707183281e-05, + "logits/chosen": 3.322596311569214, + "logits/rejected": 3.366405487060547, + "logps/chosen": -363.05535888671875, + "logps/rejected": -359.6102600097656, + "loss": 0.2589, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.934292197227478, + "rewards/margins": 3.87666392326355, + "rewards/rejected": -5.810956001281738, + "step": 31340 + }, + { + "epoch": 1.0219061351842347, + "grad_norm": 0.1825685352087021, + "learning_rate": 3.297650470883436e-05, + "logits/chosen": 3.127986431121826, + "logits/rejected": 3.133280038833618, + "logps/chosen": -376.3436584472656, + "logps/rejected": -363.5606689453125, + "loss": 0.3148, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9927847385406494, + "rewards/margins": 3.88069486618042, + "rewards/rejected": -5.873479843139648, + "step": 31360 + }, + { + "epoch": 1.0225578610357553, + "grad_norm": 1.1971383094787598, + "learning_rate": 3.296564234583591e-05, + "logits/chosen": 3.3153538703918457, + "logits/rejected": 3.3512721061706543, + "logps/chosen": -367.6849060058594, + "logps/rejected": -349.813232421875, + "loss": 0.2572, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8098561763763428, + "rewards/margins": 4.136944770812988, + "rewards/rejected": -5.94680118560791, + "step": 31380 + }, + { + "epoch": 1.023209586887276, + "grad_norm": 1.4248380661010742, + "learning_rate": 3.295477998283747e-05, + "logits/chosen": 3.4821720123291016, + "logits/rejected": 3.457152843475342, + "logps/chosen": -343.6454162597656, + "logps/rejected": -347.5382385253906, + "loss": 0.3896, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.4242053031921387, + "rewards/margins": 3.524089813232422, + "rewards/rejected": -5.948294639587402, + "step": 31400 + }, + { + "epoch": 1.0238613127387963, + "grad_norm": 6.966257572174072, + "learning_rate": 3.294391761983902e-05, + "logits/chosen": 3.355026960372925, + "logits/rejected": 3.4076180458068848, + "logps/chosen": -367.8028869628906, + "logps/rejected": -353.405029296875, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8395068645477295, + "rewards/margins": 4.198147773742676, + "rewards/rejected": -7.037654876708984, + "step": 31420 + }, + { + "epoch": 1.024513038590317, + "grad_norm": 3.2078402042388916, + "learning_rate": 3.293305525684057e-05, + "logits/chosen": 3.0415446758270264, + "logits/rejected": 3.0557198524475098, + "logps/chosen": -355.89031982421875, + "logps/rejected": -354.41595458984375, + "loss": 0.3287, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.792330503463745, + "rewards/margins": 4.080689430236816, + "rewards/rejected": -6.873020172119141, + "step": 31440 + }, + { + "epoch": 1.0251647644418376, + "grad_norm": 1.1752219200134277, + "learning_rate": 3.292219289384213e-05, + "logits/chosen": 3.120605230331421, + "logits/rejected": 3.16351056098938, + "logps/chosen": -342.9989929199219, + "logps/rejected": -319.5901184082031, + "loss": 0.2761, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.620972156524658, + "rewards/margins": 3.9818978309631348, + "rewards/rejected": -6.602869987487793, + "step": 31460 + }, + { + "epoch": 1.0258164902933582, + "grad_norm": 0.18570081889629364, + "learning_rate": 3.291133053084368e-05, + "logits/chosen": 3.359018325805664, + "logits/rejected": 3.3463263511657715, + "logps/chosen": -423.78857421875, + "logps/rejected": -369.99737548828125, + "loss": 0.2705, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8546924591064453, + "rewards/margins": 4.140061855316162, + "rewards/rejected": -6.994753837585449, + "step": 31480 + }, + { + "epoch": 1.0264682161448786, + "grad_norm": 2.89898681640625, + "learning_rate": 3.290046816784523e-05, + "logits/chosen": 3.317455291748047, + "logits/rejected": 3.3660454750061035, + "logps/chosen": -365.5255432128906, + "logps/rejected": -343.6577453613281, + "loss": 0.1769, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7191804647445679, + "rewards/margins": 4.261567115783691, + "rewards/rejected": -5.980748176574707, + "step": 31500 + }, + { + "epoch": 1.0271199419963992, + "grad_norm": 2.891650676727295, + "learning_rate": 3.288960580484679e-05, + "logits/chosen": 3.0927541255950928, + "logits/rejected": 3.296412706375122, + "logps/chosen": -347.92999267578125, + "logps/rejected": -362.6603088378906, + "loss": 0.2148, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8389530181884766, + "rewards/margins": 3.8635737895965576, + "rewards/rejected": -6.702526092529297, + "step": 31520 + }, + { + "epoch": 1.0277716678479198, + "grad_norm": 1.8670357465744019, + "learning_rate": 3.2878743441848345e-05, + "logits/chosen": 3.0779969692230225, + "logits/rejected": 3.15032958984375, + "logps/chosen": -339.3960876464844, + "logps/rejected": -316.16650390625, + "loss": 0.2875, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2529561519622803, + "rewards/margins": 4.445372104644775, + "rewards/rejected": -6.698327541351318, + "step": 31540 + }, + { + "epoch": 1.0284233936994402, + "grad_norm": 30.380611419677734, + "learning_rate": 3.2867881078849896e-05, + "logits/chosen": 2.5927646160125732, + "logits/rejected": 2.755613088607788, + "logps/chosen": -323.3199157714844, + "logps/rejected": -330.0204772949219, + "loss": 0.2544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.942300796508789, + "rewards/margins": 3.81451416015625, + "rewards/rejected": -5.756814479827881, + "step": 31560 + }, + { + "epoch": 1.0290751195509609, + "grad_norm": 0.2924047112464905, + "learning_rate": 3.2857561834001374e-05, + "logits/chosen": 3.05615234375, + "logits/rejected": 3.4517829418182373, + "logps/chosen": -371.25714111328125, + "logps/rejected": -357.0452575683594, + "loss": 0.4525, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.109483242034912, + "rewards/margins": 4.190521717071533, + "rewards/rejected": -6.300004959106445, + "step": 31580 + }, + { + "epoch": 1.0297268454024815, + "grad_norm": 2.1778132915496826, + "learning_rate": 3.2846699471002925e-05, + "logits/chosen": 3.1856906414031982, + "logits/rejected": 3.1875009536743164, + "logps/chosen": -314.0682067871094, + "logps/rejected": -339.8470764160156, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4078783988952637, + "rewards/margins": 3.802730083465576, + "rewards/rejected": -6.21060848236084, + "step": 31600 + }, + { + "epoch": 1.030378571254002, + "grad_norm": 1.2367883920669556, + "learning_rate": 3.2835837108004476e-05, + "logits/chosen": 3.197033405303955, + "logits/rejected": 3.2437350749969482, + "logps/chosen": -386.5335998535156, + "logps/rejected": -329.1663818359375, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2182669639587402, + "rewards/margins": 4.367469787597656, + "rewards/rejected": -6.5857367515563965, + "step": 31620 + }, + { + "epoch": 1.0310302971055225, + "grad_norm": 5.085755348205566, + "learning_rate": 3.2824974745006034e-05, + "logits/chosen": 3.1161370277404785, + "logits/rejected": 3.1065125465393066, + "logps/chosen": -322.17486572265625, + "logps/rejected": -340.5460205078125, + "loss": 0.4832, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.620079517364502, + "rewards/margins": 2.966508388519287, + "rewards/rejected": -5.586587905883789, + "step": 31640 + }, + { + "epoch": 1.0316820229570431, + "grad_norm": 6.086770534515381, + "learning_rate": 3.2814112382007584e-05, + "logits/chosen": 3.3167529106140137, + "logits/rejected": 3.3572750091552734, + "logps/chosen": -352.5821228027344, + "logps/rejected": -364.7077941894531, + "loss": 0.1786, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.20666766166687, + "rewards/margins": 4.50130558013916, + "rewards/rejected": -6.707973480224609, + "step": 31660 + }, + { + "epoch": 1.0323337488085638, + "grad_norm": 2.2976443767547607, + "learning_rate": 3.2803250019009135e-05, + "logits/chosen": 2.986971616744995, + "logits/rejected": 3.1330785751342773, + "logps/chosen": -357.68658447265625, + "logps/rejected": -342.3680114746094, + "loss": 0.1734, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4208502769470215, + "rewards/margins": 3.9441421031951904, + "rewards/rejected": -6.364992141723633, + "step": 31680 + }, + { + "epoch": 1.0329854746600842, + "grad_norm": 1.693296194076538, + "learning_rate": 3.279238765601069e-05, + "logits/chosen": 3.3979709148406982, + "logits/rejected": 3.395521640777588, + "logps/chosen": -394.25262451171875, + "logps/rejected": -323.55853271484375, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.285306215286255, + "rewards/margins": 3.746976137161255, + "rewards/rejected": -6.03228235244751, + "step": 31700 + }, + { + "epoch": 1.0336372005116048, + "grad_norm": 4.4423017501831055, + "learning_rate": 3.2781525293012243e-05, + "logits/chosen": 3.2488956451416016, + "logits/rejected": 3.2667698860168457, + "logps/chosen": -373.1899108886719, + "logps/rejected": -331.14959716796875, + "loss": 0.2228, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.716806411743164, + "rewards/margins": 4.514590263366699, + "rewards/rejected": -7.231396675109863, + "step": 31720 + }, + { + "epoch": 1.0342889263631254, + "grad_norm": 3.3477392196655273, + "learning_rate": 3.2770662930013794e-05, + "logits/chosen": 2.92942214012146, + "logits/rejected": 3.0013134479522705, + "logps/chosen": -330.332763671875, + "logps/rejected": -354.7757873535156, + "loss": 0.2846, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.8722259998321533, + "rewards/margins": 4.022871971130371, + "rewards/rejected": -6.895097255706787, + "step": 31740 + }, + { + "epoch": 1.0349406522146458, + "grad_norm": 0.47064465284347534, + "learning_rate": 3.2759800567015345e-05, + "logits/chosen": 2.8213772773742676, + "logits/rejected": 3.022695302963257, + "logps/chosen": -374.54022216796875, + "logps/rejected": -374.9316101074219, + "loss": 0.2765, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2917110919952393, + "rewards/margins": 4.261187553405762, + "rewards/rejected": -7.552898406982422, + "step": 31760 + }, + { + "epoch": 1.0355923780661664, + "grad_norm": 0.5751989483833313, + "learning_rate": 3.27489382040169e-05, + "logits/chosen": 3.018174648284912, + "logits/rejected": 3.173509359359741, + "logps/chosen": -373.3923034667969, + "logps/rejected": -333.4388427734375, + "loss": 0.3458, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3077099323272705, + "rewards/margins": 3.978456497192383, + "rewards/rejected": -7.286166191101074, + "step": 31780 + }, + { + "epoch": 1.036244103917687, + "grad_norm": 1.4200456142425537, + "learning_rate": 3.2738075841018453e-05, + "logits/chosen": 3.1456663608551025, + "logits/rejected": 3.161090612411499, + "logps/chosen": -356.0911865234375, + "logps/rejected": -381.1825256347656, + "loss": 0.3378, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.978144884109497, + "rewards/margins": 3.6333327293395996, + "rewards/rejected": -7.611476898193359, + "step": 31800 + }, + { + "epoch": 1.0368958297692077, + "grad_norm": 0.6719117760658264, + "learning_rate": 3.272721347802001e-05, + "logits/chosen": 2.8980493545532227, + "logits/rejected": 3.11385440826416, + "logps/chosen": -370.8499450683594, + "logps/rejected": -329.43243408203125, + "loss": 0.2213, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6411807537078857, + "rewards/margins": 4.207257270812988, + "rewards/rejected": -7.8484392166137695, + "step": 31820 + }, + { + "epoch": 1.037547555620728, + "grad_norm": 0.13081787526607513, + "learning_rate": 3.271635111502157e-05, + "logits/chosen": 3.030278444290161, + "logits/rejected": 3.0454931259155273, + "logps/chosen": -356.560302734375, + "logps/rejected": -345.10552978515625, + "loss": 0.1769, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3438498973846436, + "rewards/margins": 4.920258045196533, + "rewards/rejected": -8.264108657836914, + "step": 31840 + }, + { + "epoch": 1.0381992814722487, + "grad_norm": 0.0119861401617527, + "learning_rate": 3.270548875202312e-05, + "logits/chosen": 3.0526316165924072, + "logits/rejected": 2.9831573963165283, + "logps/chosen": -320.9473876953125, + "logps/rejected": -347.5840148925781, + "loss": 0.4081, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3438949584960938, + "rewards/margins": 3.791682720184326, + "rewards/rejected": -7.135578155517578, + "step": 31860 + }, + { + "epoch": 1.0388510073237693, + "grad_norm": 2.8026773929595947, + "learning_rate": 3.269462638902467e-05, + "logits/chosen": 2.983581066131592, + "logits/rejected": 3.130539894104004, + "logps/chosen": -377.12750244140625, + "logps/rejected": -357.6585388183594, + "loss": 0.4071, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9729371070861816, + "rewards/margins": 3.9581031799316406, + "rewards/rejected": -6.9310407638549805, + "step": 31880 + }, + { + "epoch": 1.0395027331752897, + "grad_norm": 0.9922619462013245, + "learning_rate": 3.268376402602623e-05, + "logits/chosen": 2.934352397918701, + "logits/rejected": 3.156519889831543, + "logps/chosen": -383.4858093261719, + "logps/rejected": -343.74786376953125, + "loss": 0.2251, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.333858013153076, + "rewards/margins": 4.132508754730225, + "rewards/rejected": -7.466366767883301, + "step": 31900 + }, + { + "epoch": 1.0401544590268104, + "grad_norm": 3.4722254276275635, + "learning_rate": 3.267290166302778e-05, + "logits/chosen": 3.0400521755218506, + "logits/rejected": 3.202108860015869, + "logps/chosen": -323.94390869140625, + "logps/rejected": -363.57159423828125, + "loss": 0.3426, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.462897777557373, + "rewards/margins": 3.4863038063049316, + "rewards/rejected": -5.949202060699463, + "step": 31920 + }, + { + "epoch": 1.040806184878331, + "grad_norm": 0.7388986945152283, + "learning_rate": 3.266203930002933e-05, + "logits/chosen": 2.8560850620269775, + "logits/rejected": 2.7838594913482666, + "logps/chosen": -335.2443542480469, + "logps/rejected": -302.95379638671875, + "loss": 0.4023, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.93139910697937, + "rewards/margins": 3.2790894508361816, + "rewards/rejected": -6.2104878425598145, + "step": 31940 + }, + { + "epoch": 1.0414579107298514, + "grad_norm": 2.4111440181732178, + "learning_rate": 3.265117693703088e-05, + "logits/chosen": 3.037639617919922, + "logits/rejected": 3.1951346397399902, + "logps/chosen": -350.2577819824219, + "logps/rejected": -296.57427978515625, + "loss": 0.3762, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.964735984802246, + "rewards/margins": 3.105273485183716, + "rewards/rejected": -6.070009708404541, + "step": 31960 + }, + { + "epoch": 1.042109636581372, + "grad_norm": 4.408575534820557, + "learning_rate": 3.264031457403244e-05, + "logits/chosen": 3.0367696285247803, + "logits/rejected": 3.0874783992767334, + "logps/chosen": -347.1123962402344, + "logps/rejected": -289.9499816894531, + "loss": 0.2506, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.532938241958618, + "rewards/margins": 3.800011157989502, + "rewards/rejected": -6.332949161529541, + "step": 31980 + }, + { + "epoch": 1.0427613624328926, + "grad_norm": 1.1516841650009155, + "learning_rate": 3.262945221103399e-05, + "logits/chosen": 3.080725908279419, + "logits/rejected": 3.1881537437438965, + "logps/chosen": -364.08880615234375, + "logps/rejected": -345.4632568359375, + "loss": 0.2624, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.405890464782715, + "rewards/margins": 4.209997653961182, + "rewards/rejected": -6.615887641906738, + "step": 32000 + }, + { + "epoch": 1.0434130882844133, + "grad_norm": 4.180842876434326, + "learning_rate": 3.261858984803554e-05, + "logits/chosen": 2.7228245735168457, + "logits/rejected": 2.910079002380371, + "logps/chosen": -354.87738037109375, + "logps/rejected": -315.8990478515625, + "loss": 0.2083, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.9739539623260498, + "rewards/margins": 4.360935211181641, + "rewards/rejected": -6.3348894119262695, + "step": 32020 + }, + { + "epoch": 1.0440648141359337, + "grad_norm": 0.6304757595062256, + "learning_rate": 3.26077274850371e-05, + "logits/chosen": 3.274738311767578, + "logits/rejected": 3.2455108165740967, + "logps/chosen": -309.43511962890625, + "logps/rejected": -304.3037109375, + "loss": 0.4432, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.692728042602539, + "rewards/margins": 2.9386367797851562, + "rewards/rejected": -5.631364822387695, + "step": 32040 + }, + { + "epoch": 1.0447165399874543, + "grad_norm": 2.0885117053985596, + "learning_rate": 3.259686512203865e-05, + "logits/chosen": 3.3127503395080566, + "logits/rejected": 3.548640489578247, + "logps/chosen": -357.96734619140625, + "logps/rejected": -383.3288269042969, + "loss": 0.3352, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0054690837860107, + "rewards/margins": 4.065767765045166, + "rewards/rejected": -7.071238040924072, + "step": 32060 + }, + { + "epoch": 1.045368265838975, + "grad_norm": 2.300396680831909, + "learning_rate": 3.2586002759040205e-05, + "logits/chosen": 2.956033229827881, + "logits/rejected": 3.2783608436584473, + "logps/chosen": -316.1154479980469, + "logps/rejected": -308.9698181152344, + "loss": 0.346, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.952763319015503, + "rewards/margins": 3.8754703998565674, + "rewards/rejected": -5.82823371887207, + "step": 32080 + }, + { + "epoch": 1.0460199916904953, + "grad_norm": 0.902328610420227, + "learning_rate": 3.2575140396041756e-05, + "logits/chosen": 3.499948501586914, + "logits/rejected": 3.5698390007019043, + "logps/chosen": -332.36700439453125, + "logps/rejected": -325.1605224609375, + "loss": 0.4157, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8761522769927979, + "rewards/margins": 2.643202304840088, + "rewards/rejected": -4.519354343414307, + "step": 32100 + }, + { + "epoch": 1.046671717542016, + "grad_norm": 4.755712032318115, + "learning_rate": 3.256427803304331e-05, + "logits/chosen": 3.341796875, + "logits/rejected": 3.3641815185546875, + "logps/chosen": -364.0575256347656, + "logps/rejected": -338.70953369140625, + "loss": 0.2441, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.416917085647583, + "rewards/margins": 3.9199211597442627, + "rewards/rejected": -6.336838722229004, + "step": 32120 + }, + { + "epoch": 1.0473234433935366, + "grad_norm": 3.078765392303467, + "learning_rate": 3.2553415670044864e-05, + "logits/chosen": 3.5735068321228027, + "logits/rejected": 3.710880994796753, + "logps/chosen": -383.7872009277344, + "logps/rejected": -344.2131652832031, + "loss": 0.2013, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.261828660964966, + "rewards/margins": 4.029403209686279, + "rewards/rejected": -6.291232109069824, + "step": 32140 + }, + { + "epoch": 1.0479751692450572, + "grad_norm": 0.27556896209716797, + "learning_rate": 3.2542553307046415e-05, + "logits/chosen": 3.200392484664917, + "logits/rejected": 3.2759976387023926, + "logps/chosen": -377.3845520019531, + "logps/rejected": -369.90020751953125, + "loss": 0.2957, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.9352357387542725, + "rewards/margins": 3.971684694290161, + "rewards/rejected": -6.906920433044434, + "step": 32160 + }, + { + "epoch": 1.0486268950965776, + "grad_norm": 0.3764527440071106, + "learning_rate": 3.253169094404797e-05, + "logits/chosen": 3.2429473400115967, + "logits/rejected": 3.423099994659424, + "logps/chosen": -333.07080078125, + "logps/rejected": -329.1881408691406, + "loss": 0.2296, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.723175525665283, + "rewards/margins": 3.9536938667297363, + "rewards/rejected": -6.6768693923950195, + "step": 32180 + }, + { + "epoch": 1.0492786209480982, + "grad_norm": 2.6627871990203857, + "learning_rate": 3.252082858104952e-05, + "logits/chosen": 3.099884033203125, + "logits/rejected": 3.3801143169403076, + "logps/chosen": -316.9541931152344, + "logps/rejected": -321.32135009765625, + "loss": 0.2422, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5849270820617676, + "rewards/margins": 4.077270984649658, + "rewards/rejected": -6.662198066711426, + "step": 32200 + }, + { + "epoch": 1.0499303467996188, + "grad_norm": 2.3268799781799316, + "learning_rate": 3.2509966218051074e-05, + "logits/chosen": 3.270188808441162, + "logits/rejected": 3.3397090435028076, + "logps/chosen": -359.2665100097656, + "logps/rejected": -326.13458251953125, + "loss": 0.3934, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.8970489501953125, + "rewards/margins": 3.327131986618042, + "rewards/rejected": -6.224181175231934, + "step": 32220 + }, + { + "epoch": 1.0505820726511392, + "grad_norm": 0.01642039604485035, + "learning_rate": 3.249910385505263e-05, + "logits/chosen": 3.3851001262664795, + "logits/rejected": 3.5238165855407715, + "logps/chosen": -354.07635498046875, + "logps/rejected": -345.5018615722656, + "loss": 0.334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.155308485031128, + "rewards/margins": 4.432682991027832, + "rewards/rejected": -6.587990760803223, + "step": 32240 + }, + { + "epoch": 1.0512337985026599, + "grad_norm": 3.225761890411377, + "learning_rate": 3.248824149205418e-05, + "logits/chosen": 3.5733115673065186, + "logits/rejected": 3.580371141433716, + "logps/chosen": -363.369140625, + "logps/rejected": -344.016357421875, + "loss": 0.4567, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.318470001220703, + "rewards/margins": 3.7096505165100098, + "rewards/rejected": -6.028120994567871, + "step": 32260 + }, + { + "epoch": 1.0518855243541805, + "grad_norm": 3.0177419185638428, + "learning_rate": 3.247737912905573e-05, + "logits/chosen": 3.3927242755889893, + "logits/rejected": 3.439061403274536, + "logps/chosen": -364.99090576171875, + "logps/rejected": -309.08294677734375, + "loss": 0.3411, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9454336166381836, + "rewards/margins": 3.392503023147583, + "rewards/rejected": -5.337937355041504, + "step": 32280 + }, + { + "epoch": 1.0525372502057009, + "grad_norm": 0.8248674273490906, + "learning_rate": 3.246651676605729e-05, + "logits/chosen": 3.119027853012085, + "logits/rejected": 3.181180477142334, + "logps/chosen": -353.6136779785156, + "logps/rejected": -322.18145751953125, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7572778463363647, + "rewards/margins": 3.4960639476776123, + "rewards/rejected": -5.2533416748046875, + "step": 32300 + }, + { + "epoch": 1.0531889760572215, + "grad_norm": 2.169062614440918, + "learning_rate": 3.245565440305884e-05, + "logits/chosen": 3.2800440788269043, + "logits/rejected": 3.4227375984191895, + "logps/chosen": -304.5511779785156, + "logps/rejected": -265.6996154785156, + "loss": 0.1727, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.3901734352111816, + "rewards/margins": 4.053071975708008, + "rewards/rejected": -5.443244934082031, + "step": 32320 + }, + { + "epoch": 1.0538407019087421, + "grad_norm": 2.056321144104004, + "learning_rate": 3.244479204006039e-05, + "logits/chosen": 3.984064817428589, + "logits/rejected": 4.02722692489624, + "logps/chosen": -392.7979431152344, + "logps/rejected": -335.302001953125, + "loss": 0.2783, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.1101171970367432, + "rewards/margins": 4.626636981964111, + "rewards/rejected": -5.736753940582275, + "step": 32340 + }, + { + "epoch": 1.0544924277602628, + "grad_norm": 1.0678999423980713, + "learning_rate": 3.243392967706195e-05, + "logits/chosen": 3.722771406173706, + "logits/rejected": 3.7870049476623535, + "logps/chosen": -344.49444580078125, + "logps/rejected": -326.28033447265625, + "loss": 0.2959, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.61526358127594, + "rewards/margins": 3.6672630310058594, + "rewards/rejected": -5.282527446746826, + "step": 32360 + }, + { + "epoch": 1.0551441536117832, + "grad_norm": 3.321441650390625, + "learning_rate": 3.242306731406351e-05, + "logits/chosen": 3.3729472160339355, + "logits/rejected": 3.518719434738159, + "logps/chosen": -339.10400390625, + "logps/rejected": -295.0781555175781, + "loss": 0.3776, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.096189022064209, + "rewards/margins": 3.6813857555389404, + "rewards/rejected": -5.7775750160217285, + "step": 32380 + }, + { + "epoch": 1.0557958794633038, + "grad_norm": 2.3305563926696777, + "learning_rate": 3.241220495106506e-05, + "logits/chosen": 3.3298115730285645, + "logits/rejected": 3.3834259510040283, + "logps/chosen": -363.83624267578125, + "logps/rejected": -344.3626403808594, + "loss": 0.4234, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2358174324035645, + "rewards/margins": 3.40403413772583, + "rewards/rejected": -5.639851093292236, + "step": 32400 + }, + { + "epoch": 1.0564476053148244, + "grad_norm": 0.47415515780448914, + "learning_rate": 3.240134258806661e-05, + "logits/chosen": 2.6728804111480713, + "logits/rejected": 2.9629123210906982, + "logps/chosen": -321.4997863769531, + "logps/rejected": -357.16326904296875, + "loss": 0.2649, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.032886505126953, + "rewards/margins": 3.86413836479187, + "rewards/rejected": -5.897025108337402, + "step": 32420 + }, + { + "epoch": 1.0570993311663448, + "grad_norm": 3.289975166320801, + "learning_rate": 3.2390480225068167e-05, + "logits/chosen": 3.1307895183563232, + "logits/rejected": 3.1623692512512207, + "logps/chosen": -326.0572814941406, + "logps/rejected": -284.6923522949219, + "loss": 0.2271, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.282402992248535, + "rewards/margins": 3.85949444770813, + "rewards/rejected": -6.141897201538086, + "step": 32440 + }, + { + "epoch": 1.0577510570178654, + "grad_norm": 1.4898706674575806, + "learning_rate": 3.237961786206972e-05, + "logits/chosen": 2.9731228351593018, + "logits/rejected": 3.3676369190216064, + "logps/chosen": -397.6000061035156, + "logps/rejected": -355.19561767578125, + "loss": 0.2466, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.109541654586792, + "rewards/margins": 4.170680999755859, + "rewards/rejected": -6.280222415924072, + "step": 32460 + }, + { + "epoch": 1.058402782869386, + "grad_norm": 0.5536209344863892, + "learning_rate": 3.236875549907127e-05, + "logits/chosen": 3.3327739238739014, + "logits/rejected": 3.4480767250061035, + "logps/chosen": -395.73699951171875, + "logps/rejected": -345.22271728515625, + "loss": 0.4849, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0427680015563965, + "rewards/margins": 3.494807720184326, + "rewards/rejected": -6.537575721740723, + "step": 32480 + }, + { + "epoch": 1.0590545087209065, + "grad_norm": 0.8034530282020569, + "learning_rate": 3.2357893136072826e-05, + "logits/chosen": 3.4409401416778564, + "logits/rejected": 3.493689775466919, + "logps/chosen": -349.6027526855469, + "logps/rejected": -350.62200927734375, + "loss": 0.4457, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0333688259124756, + "rewards/margins": 3.7725729942321777, + "rewards/rejected": -5.805942058563232, + "step": 32500 + }, + { + "epoch": 1.059706234572427, + "grad_norm": 3.729334831237793, + "learning_rate": 3.2347030773074376e-05, + "logits/chosen": 3.6650092601776123, + "logits/rejected": 3.8991973400115967, + "logps/chosen": -389.00006103515625, + "logps/rejected": -349.2099914550781, + "loss": 0.1799, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4326997995376587, + "rewards/margins": 4.514864444732666, + "rewards/rejected": -5.947564125061035, + "step": 32520 + }, + { + "epoch": 1.0603579604239477, + "grad_norm": 2.285386323928833, + "learning_rate": 3.233616841007593e-05, + "logits/chosen": 3.458075761795044, + "logits/rejected": 3.6657214164733887, + "logps/chosen": -333.86199951171875, + "logps/rejected": -324.9666442871094, + "loss": 0.4016, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8043320178985596, + "rewards/margins": 3.1593356132507324, + "rewards/rejected": -4.963667392730713, + "step": 32540 + }, + { + "epoch": 1.0610096862754683, + "grad_norm": 7.179728984832764, + "learning_rate": 3.232530604707748e-05, + "logits/chosen": 3.305936813354492, + "logits/rejected": 3.3561034202575684, + "logps/chosen": -371.1749267578125, + "logps/rejected": -310.7608947753906, + "loss": 0.2685, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.72540283203125, + "rewards/margins": 3.2559494972229004, + "rewards/rejected": -4.981351852416992, + "step": 32560 + }, + { + "epoch": 1.0616614121269887, + "grad_norm": 1.3266469240188599, + "learning_rate": 3.2314443684079036e-05, + "logits/chosen": 3.404832363128662, + "logits/rejected": 3.4764580726623535, + "logps/chosen": -347.43890380859375, + "logps/rejected": -299.05010986328125, + "loss": 0.1925, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6484483480453491, + "rewards/margins": 3.936234951019287, + "rewards/rejected": -5.584683418273926, + "step": 32580 + }, + { + "epoch": 1.0623131379785093, + "grad_norm": 0.17426566779613495, + "learning_rate": 3.2303581321080586e-05, + "logits/chosen": 3.1476452350616455, + "logits/rejected": 3.2403297424316406, + "logps/chosen": -362.6288757324219, + "logps/rejected": -342.67742919921875, + "loss": 0.1925, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7237523794174194, + "rewards/margins": 3.7644824981689453, + "rewards/rejected": -5.488234519958496, + "step": 32600 + }, + { + "epoch": 1.06296486383003, + "grad_norm": 4.643777847290039, + "learning_rate": 3.2292718958082144e-05, + "logits/chosen": 3.704949140548706, + "logits/rejected": 3.5954513549804688, + "logps/chosen": -382.2177734375, + "logps/rejected": -323.97198486328125, + "loss": 0.3119, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3258228302001953, + "rewards/margins": 3.6067955493927, + "rewards/rejected": -5.932618141174316, + "step": 32620 + }, + { + "epoch": 1.0636165896815504, + "grad_norm": 2.3187897205352783, + "learning_rate": 3.22818565950837e-05, + "logits/chosen": 3.324749708175659, + "logits/rejected": 3.4061367511749268, + "logps/chosen": -351.1152648925781, + "logps/rejected": -359.9755859375, + "loss": 0.3379, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9033960103988647, + "rewards/margins": 4.508838653564453, + "rewards/rejected": -6.412235260009766, + "step": 32640 + }, + { + "epoch": 1.064268315533071, + "grad_norm": 6.075270175933838, + "learning_rate": 3.227099423208525e-05, + "logits/chosen": 3.6870789527893066, + "logits/rejected": 3.7252354621887207, + "logps/chosen": -371.77471923828125, + "logps/rejected": -314.95831298828125, + "loss": 0.382, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8669676780700684, + "rewards/margins": 3.412940263748169, + "rewards/rejected": -6.279908180236816, + "step": 32660 + }, + { + "epoch": 1.0649200413845916, + "grad_norm": 3.0315353870391846, + "learning_rate": 3.22601318690868e-05, + "logits/chosen": 3.0675806999206543, + "logits/rejected": 3.050295352935791, + "logps/chosen": -342.479248046875, + "logps/rejected": -367.0733947753906, + "loss": 0.188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5715181827545166, + "rewards/margins": 5.114248275756836, + "rewards/rejected": -6.685766696929932, + "step": 32680 + }, + { + "epoch": 1.0655717672361122, + "grad_norm": 2.6424074172973633, + "learning_rate": 3.224926950608836e-05, + "logits/chosen": 3.149056911468506, + "logits/rejected": 3.185819149017334, + "logps/chosen": -365.74322509765625, + "logps/rejected": -358.70452880859375, + "loss": 0.1808, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9880969524383545, + "rewards/margins": 4.39165735244751, + "rewards/rejected": -6.379754543304443, + "step": 32700 + }, + { + "epoch": 1.0662234930876326, + "grad_norm": 0.9634497761726379, + "learning_rate": 3.223840714308991e-05, + "logits/chosen": 3.4208176136016846, + "logits/rejected": 3.505516767501831, + "logps/chosen": -363.2804870605469, + "logps/rejected": -344.2446594238281, + "loss": 0.2763, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.0097150802612305, + "rewards/margins": 3.7644734382629395, + "rewards/rejected": -5.774188041687012, + "step": 32720 + }, + { + "epoch": 1.0668752189391533, + "grad_norm": 1.2597726583480835, + "learning_rate": 3.222754478009146e-05, + "logits/chosen": 3.1843295097351074, + "logits/rejected": 3.1843698024749756, + "logps/chosen": -380.51751708984375, + "logps/rejected": -333.78778076171875, + "loss": 0.2267, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.8640809059143066, + "rewards/margins": 3.8741493225097656, + "rewards/rejected": -6.738229274749756, + "step": 32740 + }, + { + "epoch": 1.067526944790674, + "grad_norm": 0.3200264275074005, + "learning_rate": 3.221668241709301e-05, + "logits/chosen": 3.1836161613464355, + "logits/rejected": 3.303508758544922, + "logps/chosen": -348.00396728515625, + "logps/rejected": -305.3224792480469, + "loss": 0.1797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1372854709625244, + "rewards/margins": 3.795283079147339, + "rewards/rejected": -5.932568073272705, + "step": 32760 + }, + { + "epoch": 1.0681786706421943, + "grad_norm": 3.458355188369751, + "learning_rate": 3.220582005409457e-05, + "logits/chosen": 3.147268772125244, + "logits/rejected": 3.0892577171325684, + "logps/chosen": -350.88995361328125, + "logps/rejected": -323.80224609375, + "loss": 0.1545, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.188310146331787, + "rewards/margins": 4.3824782371521, + "rewards/rejected": -6.570788383483887, + "step": 32780 + }, + { + "epoch": 1.068830396493715, + "grad_norm": 2.1683971881866455, + "learning_rate": 3.219495769109612e-05, + "logits/chosen": 3.4170730113983154, + "logits/rejected": 3.3682332038879395, + "logps/chosen": -403.35772705078125, + "logps/rejected": -387.6561584472656, + "loss": 0.2972, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.354405641555786, + "rewards/margins": 4.318927764892578, + "rewards/rejected": -6.673333644866943, + "step": 32800 + }, + { + "epoch": 1.0694821223452355, + "grad_norm": 0.06361764669418335, + "learning_rate": 3.218409532809767e-05, + "logits/chosen": 3.323692798614502, + "logits/rejected": 3.3635382652282715, + "logps/chosen": -387.2239074707031, + "logps/rejected": -369.09735107421875, + "loss": 0.2657, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.006308078765869, + "rewards/margins": 4.552167892456055, + "rewards/rejected": -6.558476448059082, + "step": 32820 + }, + { + "epoch": 1.070133848196756, + "grad_norm": 6.015855312347412, + "learning_rate": 3.217323296509923e-05, + "logits/chosen": 3.5036189556121826, + "logits/rejected": 3.432309627532959, + "logps/chosen": -367.708251953125, + "logps/rejected": -344.6845397949219, + "loss": 0.3202, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.3185622692108154, + "rewards/margins": 4.0794830322265625, + "rewards/rejected": -6.398045539855957, + "step": 32840 + }, + { + "epoch": 1.0707855740482766, + "grad_norm": 2.8076510429382324, + "learning_rate": 3.216237060210078e-05, + "logits/chosen": 3.0788094997406006, + "logits/rejected": 3.3430113792419434, + "logps/chosen": -337.47052001953125, + "logps/rejected": -305.0273742675781, + "loss": 0.4485, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.208899974822998, + "rewards/margins": 3.164031505584717, + "rewards/rejected": -5.372931480407715, + "step": 32860 + }, + { + "epoch": 1.0714372998997972, + "grad_norm": 0.823993980884552, + "learning_rate": 3.215150823910234e-05, + "logits/chosen": 3.1885197162628174, + "logits/rejected": 3.1567447185516357, + "logps/chosen": -417.70458984375, + "logps/rejected": -371.2560119628906, + "loss": 0.2945, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.6270124912261963, + "rewards/margins": 3.9112937450408936, + "rewards/rejected": -6.53830623626709, + "step": 32880 + }, + { + "epoch": 1.0720890257513176, + "grad_norm": 3.2349255084991455, + "learning_rate": 3.214064587610389e-05, + "logits/chosen": 3.342254161834717, + "logits/rejected": 3.3000216484069824, + "logps/chosen": -368.23583984375, + "logps/rejected": -335.69915771484375, + "loss": 0.2416, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.3719847202301025, + "rewards/margins": 4.233086585998535, + "rewards/rejected": -6.605071067810059, + "step": 32900 + }, + { + "epoch": 1.0727407516028382, + "grad_norm": 0.8437020182609558, + "learning_rate": 3.2129783513105446e-05, + "logits/chosen": 3.5553245544433594, + "logits/rejected": 3.6500446796417236, + "logps/chosen": -382.76043701171875, + "logps/rejected": -355.40631103515625, + "loss": 0.3696, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9264177083969116, + "rewards/margins": 4.366955280303955, + "rewards/rejected": -6.293372631072998, + "step": 32920 + }, + { + "epoch": 1.0733924774543588, + "grad_norm": 0.6143779754638672, + "learning_rate": 3.2118921150107e-05, + "logits/chosen": 3.2571685314178467, + "logits/rejected": 3.333824872970581, + "logps/chosen": -351.58892822265625, + "logps/rejected": -361.0576477050781, + "loss": 0.3311, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5340194702148438, + "rewards/margins": 4.291917324066162, + "rewards/rejected": -5.825936794281006, + "step": 32940 + }, + { + "epoch": 1.0740442033058795, + "grad_norm": 1.1735484600067139, + "learning_rate": 3.210805878710855e-05, + "logits/chosen": 3.2642734050750732, + "logits/rejected": 3.354067325592041, + "logps/chosen": -399.89715576171875, + "logps/rejected": -345.7653503417969, + "loss": 0.3174, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9308793544769287, + "rewards/margins": 4.155165672302246, + "rewards/rejected": -6.086045265197754, + "step": 32960 + }, + { + "epoch": 1.0746959291573999, + "grad_norm": 5.910482883453369, + "learning_rate": 3.2097196424110105e-05, + "logits/chosen": 3.501976490020752, + "logits/rejected": 3.6269378662109375, + "logps/chosen": -378.5753173828125, + "logps/rejected": -349.34161376953125, + "loss": 0.2643, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.266730785369873, + "rewards/margins": 4.042552947998047, + "rewards/rejected": -6.309284210205078, + "step": 32980 + }, + { + "epoch": 1.0753476550089205, + "grad_norm": 1.6781542301177979, + "learning_rate": 3.2086334061111656e-05, + "logits/chosen": 3.3125813007354736, + "logits/rejected": 3.4756622314453125, + "logps/chosen": -302.0391540527344, + "logps/rejected": -281.24371337890625, + "loss": 0.2807, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.0752265453338623, + "rewards/margins": 3.072707414627075, + "rewards/rejected": -5.1479339599609375, + "step": 33000 + }, + { + "epoch": 1.0759993808604411, + "grad_norm": 6.784409999847412, + "learning_rate": 3.207547169811321e-05, + "logits/chosen": 3.2877261638641357, + "logits/rejected": 3.3979859352111816, + "logps/chosen": -367.53839111328125, + "logps/rejected": -370.53466796875, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.48725962638855, + "rewards/margins": 3.7632575035095215, + "rewards/rejected": -6.25051736831665, + "step": 33020 + }, + { + "epoch": 1.0766511067119615, + "grad_norm": 1.2592697143554688, + "learning_rate": 3.2064609335114765e-05, + "logits/chosen": 3.1062538623809814, + "logits/rejected": 3.206164836883545, + "logps/chosen": -323.169677734375, + "logps/rejected": -315.17938232421875, + "loss": 0.4164, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.749332904815674, + "rewards/margins": 3.293476104736328, + "rewards/rejected": -6.042808532714844, + "step": 33040 + }, + { + "epoch": 1.0773028325634821, + "grad_norm": 0.8864524364471436, + "learning_rate": 3.2053746972116315e-05, + "logits/chosen": 3.4053356647491455, + "logits/rejected": 3.4919979572296143, + "logps/chosen": -375.2286071777344, + "logps/rejected": -332.7674865722656, + "loss": 0.3002, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9860203266143799, + "rewards/margins": 3.946744441986084, + "rewards/rejected": -5.932765007019043, + "step": 33060 + }, + { + "epoch": 1.0779545584150028, + "grad_norm": 0.3971797823905945, + "learning_rate": 3.2042884609117866e-05, + "logits/chosen": 2.9113781452178955, + "logits/rejected": 3.3176522254943848, + "logps/chosen": -349.97161865234375, + "logps/rejected": -325.46136474609375, + "loss": 0.1536, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6946885585784912, + "rewards/margins": 4.403069496154785, + "rewards/rejected": -6.0977582931518555, + "step": 33080 + }, + { + "epoch": 1.0786062842665234, + "grad_norm": 6.9818572998046875, + "learning_rate": 3.203202224611942e-05, + "logits/chosen": 3.3974814414978027, + "logits/rejected": 3.35834002494812, + "logps/chosen": -400.0635986328125, + "logps/rejected": -340.399658203125, + "loss": 0.2553, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.7458823919296265, + "rewards/margins": 3.889500856399536, + "rewards/rejected": -5.635382652282715, + "step": 33100 + }, + { + "epoch": 1.0792580101180438, + "grad_norm": 5.697571277618408, + "learning_rate": 3.2021159883120975e-05, + "logits/chosen": 3.0476481914520264, + "logits/rejected": 3.2578024864196777, + "logps/chosen": -343.77056884765625, + "logps/rejected": -286.0328369140625, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2312674522399902, + "rewards/margins": 3.2536227703094482, + "rewards/rejected": -5.484890937805176, + "step": 33120 + }, + { + "epoch": 1.0799097359695644, + "grad_norm": 9.823641777038574, + "learning_rate": 3.2010297520122525e-05, + "logits/chosen": 3.103921890258789, + "logits/rejected": 3.1188855171203613, + "logps/chosen": -313.35821533203125, + "logps/rejected": -331.3065490722656, + "loss": 0.3518, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.249566078186035, + "rewards/margins": 2.9587621688842773, + "rewards/rejected": -5.208327770233154, + "step": 33140 + }, + { + "epoch": 1.080561461821085, + "grad_norm": 5.145406246185303, + "learning_rate": 3.199943515712408e-05, + "logits/chosen": 3.350620746612549, + "logits/rejected": 3.4894981384277344, + "logps/chosen": -362.4195861816406, + "logps/rejected": -374.7322082519531, + "loss": 0.1885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7797836065292358, + "rewards/margins": 4.1820969581604, + "rewards/rejected": -5.961880207061768, + "step": 33160 + }, + { + "epoch": 1.0812131876726054, + "grad_norm": 2.6972177028656006, + "learning_rate": 3.198857279412564e-05, + "logits/chosen": 3.2478814125061035, + "logits/rejected": 3.3221383094787598, + "logps/chosen": -323.14666748046875, + "logps/rejected": -357.284912109375, + "loss": 0.2516, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.0554401874542236, + "rewards/margins": 4.305346488952637, + "rewards/rejected": -6.3607869148254395, + "step": 33180 + }, + { + "epoch": 1.081864913524126, + "grad_norm": 1.0366191864013672, + "learning_rate": 3.197771043112719e-05, + "logits/chosen": 3.0452091693878174, + "logits/rejected": 3.346562623977661, + "logps/chosen": -333.507080078125, + "logps/rejected": -326.9417724609375, + "loss": 0.1534, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.045668125152588, + "rewards/margins": 4.186034202575684, + "rewards/rejected": -6.2317023277282715, + "step": 33200 + }, + { + "epoch": 1.0825166393756467, + "grad_norm": 0.8757151961326599, + "learning_rate": 3.196684806812874e-05, + "logits/chosen": 3.644109010696411, + "logits/rejected": 3.752816677093506, + "logps/chosen": -344.36151123046875, + "logps/rejected": -376.9736328125, + "loss": 0.1425, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.893967390060425, + "rewards/margins": 4.805129051208496, + "rewards/rejected": -7.6990966796875, + "step": 33220 + }, + { + "epoch": 1.0831683652271673, + "grad_norm": 3.441262722015381, + "learning_rate": 3.19559857051303e-05, + "logits/chosen": 3.0535101890563965, + "logits/rejected": 3.1531107425689697, + "logps/chosen": -333.4349670410156, + "logps/rejected": -358.5475158691406, + "loss": 0.3669, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.6973087787628174, + "rewards/margins": 3.786492109298706, + "rewards/rejected": -7.483800411224365, + "step": 33240 + }, + { + "epoch": 1.0838200910786877, + "grad_norm": 0.0831654742360115, + "learning_rate": 3.194512334213185e-05, + "logits/chosen": 3.3306827545166016, + "logits/rejected": 3.4634792804718018, + "logps/chosen": -356.9676208496094, + "logps/rejected": -349.1873779296875, + "loss": 0.2905, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1602654457092285, + "rewards/margins": 4.210018634796143, + "rewards/rejected": -7.370283603668213, + "step": 33260 + }, + { + "epoch": 1.0844718169302083, + "grad_norm": 2.149723529815674, + "learning_rate": 3.19342609791334e-05, + "logits/chosen": 3.1293230056762695, + "logits/rejected": 3.2229926586151123, + "logps/chosen": -366.41845703125, + "logps/rejected": -364.91900634765625, + "loss": 0.3989, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4761595726013184, + "rewards/margins": 3.581744432449341, + "rewards/rejected": -6.057904243469238, + "step": 33280 + }, + { + "epoch": 1.085123542781729, + "grad_norm": 1.0781009197235107, + "learning_rate": 3.192339861613495e-05, + "logits/chosen": 3.4323196411132812, + "logits/rejected": 3.405174732208252, + "logps/chosen": -399.3955383300781, + "logps/rejected": -332.19720458984375, + "loss": 0.2102, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.698568344116211, + "rewards/margins": 3.9607272148132324, + "rewards/rejected": -6.65929651260376, + "step": 33300 + }, + { + "epoch": 1.0857752686332494, + "grad_norm": 0.024063492193818092, + "learning_rate": 3.191253625313651e-05, + "logits/chosen": 2.999730110168457, + "logits/rejected": 3.3150105476379395, + "logps/chosen": -358.9717712402344, + "logps/rejected": -333.312744140625, + "loss": 0.2392, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.953217029571533, + "rewards/margins": 3.9856293201446533, + "rewards/rejected": -6.938845634460449, + "step": 33320 + }, + { + "epoch": 1.08642699448477, + "grad_norm": 0.28845134377479553, + "learning_rate": 3.190167389013806e-05, + "logits/chosen": 3.306591510772705, + "logits/rejected": 3.397582530975342, + "logps/chosen": -353.0913391113281, + "logps/rejected": -348.0680236816406, + "loss": 0.2825, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.828937292098999, + "rewards/margins": 4.32065486907959, + "rewards/rejected": -7.149592399597168, + "step": 33340 + }, + { + "epoch": 1.0870787203362906, + "grad_norm": 2.9980685710906982, + "learning_rate": 3.189081152713961e-05, + "logits/chosen": 2.910337448120117, + "logits/rejected": 3.1406338214874268, + "logps/chosen": -355.82525634765625, + "logps/rejected": -366.03717041015625, + "loss": 0.4681, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.7069435119628906, + "rewards/margins": 3.9239087104797363, + "rewards/rejected": -7.630851745605469, + "step": 33360 + }, + { + "epoch": 1.087730446187811, + "grad_norm": 0.5469492673873901, + "learning_rate": 3.187994916414117e-05, + "logits/chosen": 3.0936672687530518, + "logits/rejected": 3.1934823989868164, + "logps/chosen": -361.8411865234375, + "logps/rejected": -313.46295166015625, + "loss": 0.2332, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6009299755096436, + "rewards/margins": 3.9910659790039062, + "rewards/rejected": -6.591996192932129, + "step": 33380 + }, + { + "epoch": 1.0883821720393316, + "grad_norm": 1.8174318075180054, + "learning_rate": 3.186908680114272e-05, + "logits/chosen": 2.9981164932250977, + "logits/rejected": 3.262179136276245, + "logps/chosen": -363.2374572753906, + "logps/rejected": -308.6988525390625, + "loss": 0.3033, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.985642910003662, + "rewards/margins": 3.19091796875, + "rewards/rejected": -6.17656135559082, + "step": 33400 + }, + { + "epoch": 1.0890338978908523, + "grad_norm": 1.3241711854934692, + "learning_rate": 3.185822443814428e-05, + "logits/chosen": 3.0240797996520996, + "logits/rejected": 3.132256269454956, + "logps/chosen": -325.45269775390625, + "logps/rejected": -336.3328857421875, + "loss": 0.2545, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.047510862350464, + "rewards/margins": 3.950181245803833, + "rewards/rejected": -6.997693061828613, + "step": 33420 + }, + { + "epoch": 1.0896856237423727, + "grad_norm": 1.6088576316833496, + "learning_rate": 3.1847362075145834e-05, + "logits/chosen": 3.185009479522705, + "logits/rejected": 3.339560031890869, + "logps/chosen": -321.71441650390625, + "logps/rejected": -327.6979064941406, + "loss": 0.288, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.4925925731658936, + "rewards/margins": 4.0677690505981445, + "rewards/rejected": -7.560361385345459, + "step": 33440 + }, + { + "epoch": 1.0903373495938933, + "grad_norm": 1.6457812786102295, + "learning_rate": 3.1836499712147385e-05, + "logits/chosen": 3.1509368419647217, + "logits/rejected": 3.1807217597961426, + "logps/chosen": -346.55987548828125, + "logps/rejected": -360.0465087890625, + "loss": 0.2105, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.6696090698242188, + "rewards/margins": 4.393975257873535, + "rewards/rejected": -7.063584327697754, + "step": 33460 + }, + { + "epoch": 1.090989075445414, + "grad_norm": 0.5734410881996155, + "learning_rate": 3.1825637349148936e-05, + "logits/chosen": 3.321554660797119, + "logits/rejected": 3.211735963821411, + "logps/chosen": -377.2806701660156, + "logps/rejected": -331.33648681640625, + "loss": 0.2289, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6452255249023438, + "rewards/margins": 4.550453186035156, + "rewards/rejected": -8.1956787109375, + "step": 33480 + }, + { + "epoch": 1.0916408012969345, + "grad_norm": 5.01320219039917, + "learning_rate": 3.181477498615049e-05, + "logits/chosen": 2.361034631729126, + "logits/rejected": 2.5729291439056396, + "logps/chosen": -329.2546691894531, + "logps/rejected": -337.0495910644531, + "loss": 0.4059, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.199906349182129, + "rewards/margins": 4.2299580574035645, + "rewards/rejected": -8.429864883422852, + "step": 33500 + }, + { + "epoch": 1.092292527148455, + "grad_norm": 1.4035032987594604, + "learning_rate": 3.1803912623152044e-05, + "logits/chosen": 3.2722232341766357, + "logits/rejected": 3.3067409992218018, + "logps/chosen": -376.0247497558594, + "logps/rejected": -380.7710876464844, + "loss": 0.1553, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.341508388519287, + "rewards/margins": 4.572329044342041, + "rewards/rejected": -7.9138383865356445, + "step": 33520 + }, + { + "epoch": 1.0929442529999756, + "grad_norm": 3.118572950363159, + "learning_rate": 3.1793050260153595e-05, + "logits/chosen": 3.2713913917541504, + "logits/rejected": 3.304354429244995, + "logps/chosen": -346.00390625, + "logps/rejected": -322.10528564453125, + "loss": 0.2844, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.96669340133667, + "rewards/margins": 3.651611328125, + "rewards/rejected": -6.618304252624512, + "step": 33540 + }, + { + "epoch": 1.0935959788514962, + "grad_norm": 0.7617296576499939, + "learning_rate": 3.1782187897155146e-05, + "logits/chosen": 3.1323914527893066, + "logits/rejected": 3.178974151611328, + "logps/chosen": -379.6805114746094, + "logps/rejected": -402.2134094238281, + "loss": 0.2458, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7235872745513916, + "rewards/margins": 4.789143085479736, + "rewards/rejected": -7.512729644775391, + "step": 33560 + }, + { + "epoch": 1.0942477047030166, + "grad_norm": 1.934569001197815, + "learning_rate": 3.1771325534156704e-05, + "logits/chosen": 2.7855892181396484, + "logits/rejected": 2.8744895458221436, + "logps/chosen": -296.51318359375, + "logps/rejected": -327.4674377441406, + "loss": 0.2826, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.407532215118408, + "rewards/margins": 4.082362174987793, + "rewards/rejected": -6.489893913269043, + "step": 33580 + }, + { + "epoch": 1.0948994305545372, + "grad_norm": 8.591286659240723, + "learning_rate": 3.1760463171158254e-05, + "logits/chosen": 2.6106975078582764, + "logits/rejected": 2.7780566215515137, + "logps/chosen": -347.65545654296875, + "logps/rejected": -342.4414367675781, + "loss": 0.2398, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5986034870147705, + "rewards/margins": 4.7869157791137695, + "rewards/rejected": -8.385519027709961, + "step": 33600 + }, + { + "epoch": 1.0955511564060578, + "grad_norm": 3.7479586601257324, + "learning_rate": 3.1749600808159805e-05, + "logits/chosen": 2.807281970977783, + "logits/rejected": 2.769597053527832, + "logps/chosen": -361.0981750488281, + "logps/rejected": -367.0751953125, + "loss": 0.208, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1601881980895996, + "rewards/margins": 4.632073402404785, + "rewards/rejected": -7.792261600494385, + "step": 33620 + }, + { + "epoch": 1.0962028822575784, + "grad_norm": 0.24373789131641388, + "learning_rate": 3.173873844516136e-05, + "logits/chosen": 2.9447569847106934, + "logits/rejected": 3.1188457012176514, + "logps/chosen": -369.92584228515625, + "logps/rejected": -347.48541259765625, + "loss": 0.3857, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.168647289276123, + "rewards/margins": 4.106196880340576, + "rewards/rejected": -7.274844169616699, + "step": 33640 + }, + { + "epoch": 1.0968546081090988, + "grad_norm": 0.5519270896911621, + "learning_rate": 3.1727876082162913e-05, + "logits/chosen": 2.769221305847168, + "logits/rejected": 2.931643009185791, + "logps/chosen": -311.7629089355469, + "logps/rejected": -330.54974365234375, + "loss": 0.2667, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8584353923797607, + "rewards/margins": 3.8724758625030518, + "rewards/rejected": -6.7309112548828125, + "step": 33660 + }, + { + "epoch": 1.0975063339606195, + "grad_norm": 3.4785077571868896, + "learning_rate": 3.171701371916447e-05, + "logits/chosen": 3.2955169677734375, + "logits/rejected": 3.5083985328674316, + "logps/chosen": -364.0898742675781, + "logps/rejected": -379.32989501953125, + "loss": 0.2695, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.889425754547119, + "rewards/margins": 3.9731154441833496, + "rewards/rejected": -7.862541198730469, + "step": 33680 + }, + { + "epoch": 1.09815805981214, + "grad_norm": 6.109076499938965, + "learning_rate": 3.170615135616602e-05, + "logits/chosen": 2.9273159503936768, + "logits/rejected": 3.0251784324645996, + "logps/chosen": -387.2181091308594, + "logps/rejected": -359.0142822265625, + "loss": 0.2503, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.4268417358398438, + "rewards/margins": 5.344286918640137, + "rewards/rejected": -8.771127700805664, + "step": 33700 + }, + { + "epoch": 1.0988097856636605, + "grad_norm": 8.581245422363281, + "learning_rate": 3.169528899316758e-05, + "logits/chosen": 2.91397762298584, + "logits/rejected": 2.7906644344329834, + "logps/chosen": -354.6789245605469, + "logps/rejected": -350.5526428222656, + "loss": 0.2735, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.5519180297851562, + "rewards/margins": 4.364335060119629, + "rewards/rejected": -7.916253089904785, + "step": 33720 + }, + { + "epoch": 1.0994615115151811, + "grad_norm": 8.489725112915039, + "learning_rate": 3.168442663016913e-05, + "logits/chosen": 2.987903118133545, + "logits/rejected": 2.9252469539642334, + "logps/chosen": -365.7404479980469, + "logps/rejected": -376.0201110839844, + "loss": 0.2431, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8721187114715576, + "rewards/margins": 4.041693687438965, + "rewards/rejected": -7.913812160491943, + "step": 33740 + }, + { + "epoch": 1.1001132373667017, + "grad_norm": 1.940566897392273, + "learning_rate": 3.167356426717068e-05, + "logits/chosen": 2.6883578300476074, + "logits/rejected": 2.865595579147339, + "logps/chosen": -343.1073913574219, + "logps/rejected": -377.99151611328125, + "loss": 0.2421, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.2557995319366455, + "rewards/margins": 4.160060882568359, + "rewards/rejected": -7.415860652923584, + "step": 33760 + }, + { + "epoch": 1.1007649632182224, + "grad_norm": 0.604231595993042, + "learning_rate": 3.166270190417224e-05, + "logits/chosen": 3.0776772499084473, + "logits/rejected": 3.066377639770508, + "logps/chosen": -353.2207946777344, + "logps/rejected": -387.3573303222656, + "loss": 0.2492, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.0389835834503174, + "rewards/margins": 4.258456230163574, + "rewards/rejected": -7.2974395751953125, + "step": 33780 + }, + { + "epoch": 1.1014166890697428, + "grad_norm": 0.03961041942238808, + "learning_rate": 3.165183954117379e-05, + "logits/chosen": 2.9589524269104004, + "logits/rejected": 3.121094226837158, + "logps/chosen": -373.9268798828125, + "logps/rejected": -367.5953063964844, + "loss": 0.4014, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.160959243774414, + "rewards/margins": 4.49851655960083, + "rewards/rejected": -7.659475803375244, + "step": 33800 + }, + { + "epoch": 1.1020684149212634, + "grad_norm": 0.41982412338256836, + "learning_rate": 3.164097717817534e-05, + "logits/chosen": 3.1324992179870605, + "logits/rejected": 3.124950408935547, + "logps/chosen": -397.2090148925781, + "logps/rejected": -361.63372802734375, + "loss": 0.2522, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1162805557250977, + "rewards/margins": 4.682436943054199, + "rewards/rejected": -7.798717498779297, + "step": 33820 + }, + { + "epoch": 1.102720140772784, + "grad_norm": 0.23991015553474426, + "learning_rate": 3.16301148151769e-05, + "logits/chosen": 2.8351898193359375, + "logits/rejected": 3.03517484664917, + "logps/chosen": -351.294921875, + "logps/rejected": -341.2694396972656, + "loss": 0.4792, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.441119432449341, + "rewards/margins": 3.5397884845733643, + "rewards/rejected": -6.980908393859863, + "step": 33840 + }, + { + "epoch": 1.1033718666243044, + "grad_norm": 2.5299158096313477, + "learning_rate": 3.161925245217845e-05, + "logits/chosen": 3.026526927947998, + "logits/rejected": 3.2014732360839844, + "logps/chosen": -409.42120361328125, + "logps/rejected": -384.262939453125, + "loss": 0.204, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.643568754196167, + "rewards/margins": 5.51914119720459, + "rewards/rejected": -8.162711143493652, + "step": 33860 + }, + { + "epoch": 1.104023592475825, + "grad_norm": 2.424398183822632, + "learning_rate": 3.160839008918e-05, + "logits/chosen": 3.1356804370880127, + "logits/rejected": 3.2105040550231934, + "logps/chosen": -321.29449462890625, + "logps/rejected": -331.5135803222656, + "loss": 0.2348, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0033745765686035, + "rewards/margins": 4.030724048614502, + "rewards/rejected": -6.034098148345947, + "step": 33880 + }, + { + "epoch": 1.1046753183273457, + "grad_norm": 5.4423980712890625, + "learning_rate": 3.159752772618155e-05, + "logits/chosen": 2.864041805267334, + "logits/rejected": 3.0224640369415283, + "logps/chosen": -337.95806884765625, + "logps/rejected": -331.7654113769531, + "loss": 0.3962, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.839423179626465, + "rewards/margins": 3.783830165863037, + "rewards/rejected": -6.62325382232666, + "step": 33900 + }, + { + "epoch": 1.105327044178866, + "grad_norm": 2.7187066078186035, + "learning_rate": 3.158666536318311e-05, + "logits/chosen": 2.7177600860595703, + "logits/rejected": 2.7550649642944336, + "logps/chosen": -345.78082275390625, + "logps/rejected": -314.7359619140625, + "loss": 0.3693, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7923598289489746, + "rewards/margins": 4.144471645355225, + "rewards/rejected": -6.936831474304199, + "step": 33920 + }, + { + "epoch": 1.1059787700303867, + "grad_norm": 0.8913408517837524, + "learning_rate": 3.157580300018466e-05, + "logits/chosen": 3.259089946746826, + "logits/rejected": 3.3009848594665527, + "logps/chosen": -382.01373291015625, + "logps/rejected": -334.8896789550781, + "loss": 0.3568, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2376163005828857, + "rewards/margins": 4.07568883895874, + "rewards/rejected": -6.313305377960205, + "step": 33940 + }, + { + "epoch": 1.1066304958819073, + "grad_norm": 3.6776673793792725, + "learning_rate": 3.1564940637186216e-05, + "logits/chosen": 3.1646487712860107, + "logits/rejected": 3.252290725708008, + "logps/chosen": -403.5401306152344, + "logps/rejected": -343.11761474609375, + "loss": 0.2035, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.363614797592163, + "rewards/margins": 3.9283652305603027, + "rewards/rejected": -6.291980266571045, + "step": 33960 + }, + { + "epoch": 1.1072822217334277, + "grad_norm": 5.8808722496032715, + "learning_rate": 3.1554078274187773e-05, + "logits/chosen": 3.388521909713745, + "logits/rejected": 3.494328022003174, + "logps/chosen": -376.57952880859375, + "logps/rejected": -335.46063232421875, + "loss": 0.3677, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2395942211151123, + "rewards/margins": 3.544799327850342, + "rewards/rejected": -5.784393787384033, + "step": 33980 + }, + { + "epoch": 1.1079339475849483, + "grad_norm": 7.765500068664551, + "learning_rate": 3.1543215911189324e-05, + "logits/chosen": 3.071599245071411, + "logits/rejected": 3.1025662422180176, + "logps/chosen": -348.062744140625, + "logps/rejected": -342.47210693359375, + "loss": 0.3748, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.62155818939209, + "rewards/margins": 3.8680386543273926, + "rewards/rejected": -6.489596366882324, + "step": 34000 + }, + { + "epoch": 1.108585673436469, + "grad_norm": 2.03092098236084, + "learning_rate": 3.1532353548190875e-05, + "logits/chosen": 2.9119575023651123, + "logits/rejected": 3.1517419815063477, + "logps/chosen": -332.6952209472656, + "logps/rejected": -320.27740478515625, + "loss": 0.2144, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.086266040802002, + "rewards/margins": 4.404753684997559, + "rewards/rejected": -6.491020202636719, + "step": 34020 + }, + { + "epoch": 1.1092373992879896, + "grad_norm": 0.8028056025505066, + "learning_rate": 3.1521491185192426e-05, + "logits/chosen": 3.1345021724700928, + "logits/rejected": 3.190929412841797, + "logps/chosen": -353.27294921875, + "logps/rejected": -336.1080017089844, + "loss": 0.185, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7256760597229004, + "rewards/margins": 5.060001373291016, + "rewards/rejected": -7.785677433013916, + "step": 34040 + }, + { + "epoch": 1.10988912513951, + "grad_norm": 6.648626804351807, + "learning_rate": 3.151062882219398e-05, + "logits/chosen": 2.9288525581359863, + "logits/rejected": 3.0693318843841553, + "logps/chosen": -346.0721130371094, + "logps/rejected": -342.0542297363281, + "loss": 0.4001, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.180490255355835, + "rewards/margins": 4.063368797302246, + "rewards/rejected": -7.24385929107666, + "step": 34060 + }, + { + "epoch": 1.1105408509910306, + "grad_norm": 6.812672138214111, + "learning_rate": 3.1499766459195534e-05, + "logits/chosen": 2.9519753456115723, + "logits/rejected": 3.101034641265869, + "logps/chosen": -338.40704345703125, + "logps/rejected": -347.868408203125, + "loss": 0.2195, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7930703163146973, + "rewards/margins": 4.721873760223389, + "rewards/rejected": -7.514944553375244, + "step": 34080 + }, + { + "epoch": 1.1111925768425512, + "grad_norm": 6.6524977684021, + "learning_rate": 3.1488904096197085e-05, + "logits/chosen": 3.2698655128479004, + "logits/rejected": 3.1162195205688477, + "logps/chosen": -353.9150085449219, + "logps/rejected": -344.5353088378906, + "loss": 0.3062, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.8241894245147705, + "rewards/margins": 4.742028713226318, + "rewards/rejected": -7.566218376159668, + "step": 34100 + }, + { + "epoch": 1.1118443026940716, + "grad_norm": 3.30126690864563, + "learning_rate": 3.147804173319864e-05, + "logits/chosen": 2.91245698928833, + "logits/rejected": 3.1828441619873047, + "logps/chosen": -324.23162841796875, + "logps/rejected": -357.7253723144531, + "loss": 0.4786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2190887928009033, + "rewards/margins": 3.7781131267547607, + "rewards/rejected": -6.997202396392822, + "step": 34120 + }, + { + "epoch": 1.1124960285455923, + "grad_norm": 1.626505970954895, + "learning_rate": 3.146717937020019e-05, + "logits/chosen": 3.172595500946045, + "logits/rejected": 3.1913866996765137, + "logps/chosen": -324.5098571777344, + "logps/rejected": -309.31622314453125, + "loss": 0.2349, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.913517475128174, + "rewards/margins": 3.3395683765411377, + "rewards/rejected": -6.253086090087891, + "step": 34140 + }, + { + "epoch": 1.1131477543971129, + "grad_norm": 4.772274017333984, + "learning_rate": 3.1456317007201744e-05, + "logits/chosen": 3.623737335205078, + "logits/rejected": 3.5083413124084473, + "logps/chosen": -392.92034912109375, + "logps/rejected": -367.8506774902344, + "loss": 0.2201, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1195764541625977, + "rewards/margins": 4.39517879486084, + "rewards/rejected": -7.5147552490234375, + "step": 34160 + }, + { + "epoch": 1.1137994802486335, + "grad_norm": 5.0322394371032715, + "learning_rate": 3.14454546442033e-05, + "logits/chosen": 2.768670082092285, + "logits/rejected": 2.959669589996338, + "logps/chosen": -316.91436767578125, + "logps/rejected": -339.03338623046875, + "loss": 0.4165, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2238895893096924, + "rewards/margins": 3.4552340507507324, + "rewards/rejected": -6.679124355316162, + "step": 34180 + }, + { + "epoch": 1.114451206100154, + "grad_norm": 0.6687820553779602, + "learning_rate": 3.143459228120485e-05, + "logits/chosen": 3.194380044937134, + "logits/rejected": 3.283275604248047, + "logps/chosen": -320.8129577636719, + "logps/rejected": -333.825439453125, + "loss": 0.3154, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8508665561676025, + "rewards/margins": 4.283237457275391, + "rewards/rejected": -7.134103298187256, + "step": 34200 + }, + { + "epoch": 1.1151029319516745, + "grad_norm": 0.4111202657222748, + "learning_rate": 3.142372991820641e-05, + "logits/chosen": 3.0495707988739014, + "logits/rejected": 3.116722822189331, + "logps/chosen": -330.7998962402344, + "logps/rejected": -347.4542236328125, + "loss": 0.2661, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4015355110168457, + "rewards/margins": 4.049644470214844, + "rewards/rejected": -6.451179504394531, + "step": 34220 + }, + { + "epoch": 1.1157546578031952, + "grad_norm": 0.8580633401870728, + "learning_rate": 3.141286755520796e-05, + "logits/chosen": 2.9387807846069336, + "logits/rejected": 3.1731181144714355, + "logps/chosen": -346.5454406738281, + "logps/rejected": -324.9747619628906, + "loss": 0.3556, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9577343463897705, + "rewards/margins": 3.840312957763672, + "rewards/rejected": -6.7980475425720215, + "step": 34240 + }, + { + "epoch": 1.1164063836547156, + "grad_norm": 1.5921409130096436, + "learning_rate": 3.140200519220952e-05, + "logits/chosen": 3.2274532318115234, + "logits/rejected": 3.2835044860839844, + "logps/chosen": -384.4151916503906, + "logps/rejected": -344.17510986328125, + "loss": 0.2902, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.498866319656372, + "rewards/margins": 3.4821548461914062, + "rewards/rejected": -6.981020927429199, + "step": 34260 + }, + { + "epoch": 1.1170581095062362, + "grad_norm": 6.056510925292969, + "learning_rate": 3.139114282921107e-05, + "logits/chosen": 2.958836793899536, + "logits/rejected": 2.9370410442352295, + "logps/chosen": -351.366943359375, + "logps/rejected": -324.29705810546875, + "loss": 0.3405, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.7066493034362793, + "rewards/margins": 3.674790143966675, + "rewards/rejected": -6.381439685821533, + "step": 34280 + }, + { + "epoch": 1.1177098353577568, + "grad_norm": 1.4339096546173096, + "learning_rate": 3.138028046621262e-05, + "logits/chosen": 3.1433675289154053, + "logits/rejected": 3.291868209838867, + "logps/chosen": -363.0215759277344, + "logps/rejected": -343.90118408203125, + "loss": 0.2625, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.392151355743408, + "rewards/margins": 4.484349250793457, + "rewards/rejected": -7.876500606536865, + "step": 34300 + }, + { + "epoch": 1.1183615612092774, + "grad_norm": 5.162640571594238, + "learning_rate": 3.136941810321418e-05, + "logits/chosen": 3.0144710540771484, + "logits/rejected": 2.899437189102173, + "logps/chosen": -381.25067138671875, + "logps/rejected": -338.53521728515625, + "loss": 0.2034, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.812044382095337, + "rewards/margins": 4.662322044372559, + "rewards/rejected": -7.474366664886475, + "step": 34320 + }, + { + "epoch": 1.1190132870607978, + "grad_norm": 3.8278968334198, + "learning_rate": 3.135855574021573e-05, + "logits/chosen": 3.0354206562042236, + "logits/rejected": 3.1702144145965576, + "logps/chosen": -329.7952575683594, + "logps/rejected": -361.8499755859375, + "loss": 0.3442, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.801806688308716, + "rewards/margins": 3.8912129402160645, + "rewards/rejected": -7.693018913269043, + "step": 34340 + }, + { + "epoch": 1.1196650129123185, + "grad_norm": 0.11758338660001755, + "learning_rate": 3.134769337721728e-05, + "logits/chosen": 3.41265869140625, + "logits/rejected": 3.5062599182128906, + "logps/chosen": -414.85919189453125, + "logps/rejected": -339.31280517578125, + "loss": 0.2362, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0242676734924316, + "rewards/margins": 5.330282211303711, + "rewards/rejected": -8.3545503616333, + "step": 34360 + }, + { + "epoch": 1.120316738763839, + "grad_norm": 2.9822540283203125, + "learning_rate": 3.1336831014218836e-05, + "logits/chosen": 2.957486152648926, + "logits/rejected": 3.171515703201294, + "logps/chosen": -352.1917724609375, + "logps/rejected": -312.1553039550781, + "loss": 0.3496, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8917744159698486, + "rewards/margins": 3.992274522781372, + "rewards/rejected": -6.884049415588379, + "step": 34380 + }, + { + "epoch": 1.1209684646153595, + "grad_norm": 9.561629295349121, + "learning_rate": 3.132596865122039e-05, + "logits/chosen": 3.0549604892730713, + "logits/rejected": 3.111377239227295, + "logps/chosen": -346.882080078125, + "logps/rejected": -360.318359375, + "loss": 0.4213, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8402211666107178, + "rewards/margins": 3.7089996337890625, + "rewards/rejected": -6.549221992492676, + "step": 34400 + }, + { + "epoch": 1.12162019046688, + "grad_norm": 0.08411363512277603, + "learning_rate": 3.131510628822194e-05, + "logits/chosen": 2.658567190170288, + "logits/rejected": 2.849452018737793, + "logps/chosen": -332.1090393066406, + "logps/rejected": -325.4173889160156, + "loss": 0.2801, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.941108465194702, + "rewards/margins": 4.274949073791504, + "rewards/rejected": -7.216057777404785, + "step": 34420 + }, + { + "epoch": 1.1222719163184007, + "grad_norm": 4.49995231628418, + "learning_rate": 3.130424392522349e-05, + "logits/chosen": 3.087923526763916, + "logits/rejected": 3.101837635040283, + "logps/chosen": -328.1824951171875, + "logps/rejected": -337.81866455078125, + "loss": 0.2496, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8932483196258545, + "rewards/margins": 3.704219102859497, + "rewards/rejected": -6.597466945648193, + "step": 34440 + }, + { + "epoch": 1.1229236421699211, + "grad_norm": 1.0321828126907349, + "learning_rate": 3.1293381562225046e-05, + "logits/chosen": 3.0254273414611816, + "logits/rejected": 3.221487045288086, + "logps/chosen": -362.49346923828125, + "logps/rejected": -338.71429443359375, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4599456787109375, + "rewards/margins": 3.9168200492858887, + "rewards/rejected": -7.376765251159668, + "step": 34460 + }, + { + "epoch": 1.1235753680214418, + "grad_norm": 2.4879114627838135, + "learning_rate": 3.1282519199226604e-05, + "logits/chosen": 3.16432523727417, + "logits/rejected": 3.428316593170166, + "logps/chosen": -366.0587158203125, + "logps/rejected": -345.04974365234375, + "loss": 0.1735, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.641237735748291, + "rewards/margins": 3.984079360961914, + "rewards/rejected": -6.625316619873047, + "step": 34480 + }, + { + "epoch": 1.1242270938729624, + "grad_norm": 2.2509522438049316, + "learning_rate": 3.1271656836228155e-05, + "logits/chosen": 3.159298896789551, + "logits/rejected": 3.1732304096221924, + "logps/chosen": -360.76605224609375, + "logps/rejected": -359.5025329589844, + "loss": 0.4919, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.3724377155303955, + "rewards/margins": 3.915151596069336, + "rewards/rejected": -7.287588596343994, + "step": 34500 + }, + { + "epoch": 1.1248788197244828, + "grad_norm": 3.2121458053588867, + "learning_rate": 3.126079447322971e-05, + "logits/chosen": 2.9576773643493652, + "logits/rejected": 2.895653247833252, + "logps/chosen": -349.46783447265625, + "logps/rejected": -352.458251953125, + "loss": 0.3622, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7129600048065186, + "rewards/margins": 3.8591506481170654, + "rewards/rejected": -6.572110176086426, + "step": 34520 + }, + { + "epoch": 1.1255305455760034, + "grad_norm": 2.7843713760375977, + "learning_rate": 3.124993211023126e-05, + "logits/chosen": 3.202232837677002, + "logits/rejected": 3.2251667976379395, + "logps/chosen": -381.28765869140625, + "logps/rejected": -320.23687744140625, + "loss": 0.3553, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0992586612701416, + "rewards/margins": 3.8588414192199707, + "rewards/rejected": -6.958099365234375, + "step": 34540 + }, + { + "epoch": 1.126182271427524, + "grad_norm": 1.160057544708252, + "learning_rate": 3.1239069747232814e-05, + "logits/chosen": 3.0137476921081543, + "logits/rejected": 3.264437198638916, + "logps/chosen": -337.1767272949219, + "logps/rejected": -350.8103332519531, + "loss": 0.1323, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.7069497108459473, + "rewards/margins": 4.589138031005859, + "rewards/rejected": -7.296087741851807, + "step": 34560 + }, + { + "epoch": 1.1268339972790447, + "grad_norm": 12.671175003051758, + "learning_rate": 3.122875050238429e-05, + "logits/chosen": 3.1135711669921875, + "logits/rejected": 3.1361582279205322, + "logps/chosen": -375.0445251464844, + "logps/rejected": -396.816162109375, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.414142608642578, + "rewards/margins": 4.288693428039551, + "rewards/rejected": -7.702836036682129, + "step": 34580 + }, + { + "epoch": 1.127485723130565, + "grad_norm": 0.062364254146814346, + "learning_rate": 3.121788813938584e-05, + "logits/chosen": 3.0994303226470947, + "logits/rejected": 3.184217929840088, + "logps/chosen": -362.1825866699219, + "logps/rejected": -373.45196533203125, + "loss": 0.3071, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.380953073501587, + "rewards/margins": 5.03388786315918, + "rewards/rejected": -7.4148406982421875, + "step": 34600 + }, + { + "epoch": 1.1281374489820857, + "grad_norm": 0.710823118686676, + "learning_rate": 3.1207025776387394e-05, + "logits/chosen": 2.7514936923980713, + "logits/rejected": 3.0617218017578125, + "logps/chosen": -348.4432067871094, + "logps/rejected": -355.9071960449219, + "loss": 0.2503, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7976112365722656, + "rewards/margins": 4.741830825805664, + "rewards/rejected": -7.539442539215088, + "step": 34620 + }, + { + "epoch": 1.1287891748336063, + "grad_norm": 0.030340474098920822, + "learning_rate": 3.119616341338895e-05, + "logits/chosen": 3.2224292755126953, + "logits/rejected": 3.383004665374756, + "logps/chosen": -382.61572265625, + "logps/rejected": -312.2403564453125, + "loss": 0.3217, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0007081031799316, + "rewards/margins": 5.012217044830322, + "rewards/rejected": -8.012925148010254, + "step": 34640 + }, + { + "epoch": 1.1294409006851267, + "grad_norm": 5.78650426864624, + "learning_rate": 3.11853010503905e-05, + "logits/chosen": 3.1620898246765137, + "logits/rejected": 3.0963172912597656, + "logps/chosen": -360.3285827636719, + "logps/rejected": -354.18865966796875, + "loss": 0.474, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.4417991638183594, + "rewards/margins": 3.601431369781494, + "rewards/rejected": -7.0432305335998535, + "step": 34660 + }, + { + "epoch": 1.1300926265366473, + "grad_norm": 2.834744691848755, + "learning_rate": 3.117443868739205e-05, + "logits/chosen": 3.0958495140075684, + "logits/rejected": 3.342940092086792, + "logps/chosen": -342.4507751464844, + "logps/rejected": -358.6170654296875, + "loss": 0.2989, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.946873188018799, + "rewards/margins": 4.16260290145874, + "rewards/rejected": -7.109476566314697, + "step": 34680 + }, + { + "epoch": 1.130744352388168, + "grad_norm": 3.2753407955169678, + "learning_rate": 3.116357632439361e-05, + "logits/chosen": 3.276146650314331, + "logits/rejected": 3.4185893535614014, + "logps/chosen": -322.42401123046875, + "logps/rejected": -349.016357421875, + "loss": 0.386, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5933902263641357, + "rewards/margins": 3.50372576713562, + "rewards/rejected": -7.097115993499756, + "step": 34700 + }, + { + "epoch": 1.1313960782396886, + "grad_norm": 3.5607972145080566, + "learning_rate": 3.115271396139516e-05, + "logits/chosen": 2.7008020877838135, + "logits/rejected": 2.866689682006836, + "logps/chosen": -333.0386657714844, + "logps/rejected": -318.0419006347656, + "loss": 0.2652, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.636077404022217, + "rewards/margins": 3.855846881866455, + "rewards/rejected": -6.491925239562988, + "step": 34720 + }, + { + "epoch": 1.132047804091209, + "grad_norm": 0.16460639238357544, + "learning_rate": 3.114185159839671e-05, + "logits/chosen": 3.0925495624542236, + "logits/rejected": 3.2637264728546143, + "logps/chosen": -357.15142822265625, + "logps/rejected": -381.72674560546875, + "loss": 0.2521, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.6767044067382812, + "rewards/margins": 5.134768009185791, + "rewards/rejected": -7.8114728927612305, + "step": 34740 + }, + { + "epoch": 1.1326995299427296, + "grad_norm": 1.7385427951812744, + "learning_rate": 3.113098923539827e-05, + "logits/chosen": 2.957892656326294, + "logits/rejected": 3.1539487838745117, + "logps/chosen": -340.572265625, + "logps/rejected": -320.57440185546875, + "loss": 0.1588, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.9023871421813965, + "rewards/margins": 4.2761454582214355, + "rewards/rejected": -7.178532600402832, + "step": 34760 + }, + { + "epoch": 1.1333512557942502, + "grad_norm": 0.1457894891500473, + "learning_rate": 3.112012687239983e-05, + "logits/chosen": 3.0943872928619385, + "logits/rejected": 3.2112555503845215, + "logps/chosen": -355.630859375, + "logps/rejected": -343.9024963378906, + "loss": 0.2455, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.9299099445343018, + "rewards/margins": 4.012988090515137, + "rewards/rejected": -6.942897796630859, + "step": 34780 + }, + { + "epoch": 1.1340029816457706, + "grad_norm": 1.8145661354064941, + "learning_rate": 3.110926450940138e-05, + "logits/chosen": 3.2523093223571777, + "logits/rejected": 3.427030563354492, + "logps/chosen": -344.3304138183594, + "logps/rejected": -316.7019958496094, + "loss": 0.2561, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.2177822589874268, + "rewards/margins": 3.8536109924316406, + "rewards/rejected": -7.071393013000488, + "step": 34800 + }, + { + "epoch": 1.1346547074972912, + "grad_norm": 4.630349159240723, + "learning_rate": 3.109840214640293e-05, + "logits/chosen": 3.279172420501709, + "logits/rejected": 3.308610200881958, + "logps/chosen": -337.5331115722656, + "logps/rejected": -325.22808837890625, + "loss": 0.1935, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3226661682128906, + "rewards/margins": 4.426630020141602, + "rewards/rejected": -6.74929666519165, + "step": 34820 + }, + { + "epoch": 1.1353064333488119, + "grad_norm": 7.006641387939453, + "learning_rate": 3.1087539783404486e-05, + "logits/chosen": 3.2895126342773438, + "logits/rejected": 3.187121868133545, + "logps/chosen": -357.8849792480469, + "logps/rejected": -370.04986572265625, + "loss": 0.3105, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.026130199432373, + "rewards/margins": 3.5141777992248535, + "rewards/rejected": -6.540307521820068, + "step": 34840 + }, + { + "epoch": 1.1359581592003325, + "grad_norm": 11.025217056274414, + "learning_rate": 3.107667742040604e-05, + "logits/chosen": 3.0013718605041504, + "logits/rejected": 3.0559182167053223, + "logps/chosen": -344.8870849609375, + "logps/rejected": -348.17730712890625, + "loss": 0.3638, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4167799949645996, + "rewards/margins": 3.156862258911133, + "rewards/rejected": -6.573641777038574, + "step": 34860 + }, + { + "epoch": 1.136609885051853, + "grad_norm": 2.106438636779785, + "learning_rate": 3.106581505740759e-05, + "logits/chosen": 2.842963695526123, + "logits/rejected": 3.0217270851135254, + "logps/chosen": -327.4586486816406, + "logps/rejected": -324.5084228515625, + "loss": 0.3924, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1644482612609863, + "rewards/margins": 3.5569252967834473, + "rewards/rejected": -6.721373081207275, + "step": 34880 + }, + { + "epoch": 1.1372616109033735, + "grad_norm": 0.5875534415245056, + "learning_rate": 3.1054952694409145e-05, + "logits/chosen": 3.1096551418304443, + "logits/rejected": 3.114617347717285, + "logps/chosen": -369.25469970703125, + "logps/rejected": -436.5530700683594, + "loss": 0.3815, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.501293897628784, + "rewards/margins": 4.03704309463501, + "rewards/rejected": -7.538336753845215, + "step": 34900 + }, + { + "epoch": 1.1379133367548941, + "grad_norm": 1.1263806819915771, + "learning_rate": 3.1044090331410696e-05, + "logits/chosen": 2.7350964546203613, + "logits/rejected": 2.7540860176086426, + "logps/chosen": -343.18023681640625, + "logps/rejected": -319.71380615234375, + "loss": 0.2268, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.6240038871765137, + "rewards/margins": 4.100113391876221, + "rewards/rejected": -6.724117279052734, + "step": 34920 + }, + { + "epoch": 1.1385650626064145, + "grad_norm": 2.0869925022125244, + "learning_rate": 3.103322796841225e-05, + "logits/chosen": 2.7910664081573486, + "logits/rejected": 3.023456335067749, + "logps/chosen": -332.369873046875, + "logps/rejected": -322.36236572265625, + "loss": 0.3138, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0088467597961426, + "rewards/margins": 3.7872543334960938, + "rewards/rejected": -6.796100616455078, + "step": 34940 + }, + { + "epoch": 1.1392167884579352, + "grad_norm": 1.3525803089141846, + "learning_rate": 3.1022365605413805e-05, + "logits/chosen": 3.1876089572906494, + "logits/rejected": 3.290208101272583, + "logps/chosen": -342.79437255859375, + "logps/rejected": -377.9436340332031, + "loss": 0.3043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.177163600921631, + "rewards/margins": 4.391488552093506, + "rewards/rejected": -7.568652153015137, + "step": 34960 + }, + { + "epoch": 1.1398685143094558, + "grad_norm": 8.150525093078613, + "learning_rate": 3.1011503242415355e-05, + "logits/chosen": 3.357003688812256, + "logits/rejected": 3.308884859085083, + "logps/chosen": -398.0416259765625, + "logps/rejected": -380.86968994140625, + "loss": 0.4093, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.86519193649292, + "rewards/margins": 4.029577255249023, + "rewards/rejected": -6.894769191741943, + "step": 34980 + }, + { + "epoch": 1.1405202401609762, + "grad_norm": 1.8260202407836914, + "learning_rate": 3.1000640879416906e-05, + "logits/chosen": 3.1352381706237793, + "logits/rejected": 3.1685538291931152, + "logps/chosen": -373.25579833984375, + "logps/rejected": -354.92022705078125, + "loss": 0.3092, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.746968984603882, + "rewards/margins": 4.801129341125488, + "rewards/rejected": -7.548098564147949, + "step": 35000 + }, + { + "epoch": 1.1411719660124968, + "grad_norm": 1.1859629154205322, + "learning_rate": 3.0989778516418464e-05, + "logits/chosen": 3.1756651401519775, + "logits/rejected": 3.2367541790008545, + "logps/chosen": -337.2320556640625, + "logps/rejected": -344.8565368652344, + "loss": 0.2841, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.6964430809020996, + "rewards/margins": 4.019021511077881, + "rewards/rejected": -7.715465545654297, + "step": 35020 + }, + { + "epoch": 1.1418236918640174, + "grad_norm": 7.592705726623535, + "learning_rate": 3.097891615342002e-05, + "logits/chosen": 3.4712486267089844, + "logits/rejected": 3.4952709674835205, + "logps/chosen": -381.7475280761719, + "logps/rejected": -319.4245910644531, + "loss": 0.2725, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1175734996795654, + "rewards/margins": 4.381688117980957, + "rewards/rejected": -7.49926233291626, + "step": 35040 + }, + { + "epoch": 1.1424754177155378, + "grad_norm": 2.1400680541992188, + "learning_rate": 3.096805379042157e-05, + "logits/chosen": 3.023790121078491, + "logits/rejected": 3.062779188156128, + "logps/chosen": -320.6142883300781, + "logps/rejected": -312.2714538574219, + "loss": 0.2478, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3456714153289795, + "rewards/margins": 4.527140140533447, + "rewards/rejected": -7.872811317443848, + "step": 35060 + }, + { + "epoch": 1.1431271435670585, + "grad_norm": 0.2780522108078003, + "learning_rate": 3.095719142742312e-05, + "logits/chosen": 2.972752571105957, + "logits/rejected": 3.143254041671753, + "logps/chosen": -430.70751953125, + "logps/rejected": -397.636962890625, + "loss": 0.1438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.25661039352417, + "rewards/margins": 5.37701940536499, + "rewards/rejected": -8.63362979888916, + "step": 35080 + }, + { + "epoch": 1.143778869418579, + "grad_norm": 0.28239983320236206, + "learning_rate": 3.094632906442468e-05, + "logits/chosen": 3.011671543121338, + "logits/rejected": 2.9381580352783203, + "logps/chosen": -361.65203857421875, + "logps/rejected": -346.2157897949219, + "loss": 0.3226, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.257183074951172, + "rewards/margins": 4.018234729766846, + "rewards/rejected": -7.275418281555176, + "step": 35100 + }, + { + "epoch": 1.1444305952700997, + "grad_norm": 3.448291063308716, + "learning_rate": 3.093546670142623e-05, + "logits/chosen": 3.0785324573516846, + "logits/rejected": 3.168267011642456, + "logps/chosen": -332.05841064453125, + "logps/rejected": -367.3563537597656, + "loss": 0.2096, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5445594787597656, + "rewards/margins": 4.382562637329102, + "rewards/rejected": -7.927121639251709, + "step": 35120 + }, + { + "epoch": 1.1450823211216201, + "grad_norm": 1.7338567972183228, + "learning_rate": 3.092460433842778e-05, + "logits/chosen": 2.8898634910583496, + "logits/rejected": 3.0455989837646484, + "logps/chosen": -358.20672607421875, + "logps/rejected": -326.3901062011719, + "loss": 0.27, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0484607219696045, + "rewards/margins": 4.328792095184326, + "rewards/rejected": -7.37725305557251, + "step": 35140 + }, + { + "epoch": 1.1457340469731407, + "grad_norm": 0.7223158478736877, + "learning_rate": 3.091374197542934e-05, + "logits/chosen": 3.211202621459961, + "logits/rejected": 3.2227139472961426, + "logps/chosen": -342.5216369628906, + "logps/rejected": -303.1143493652344, + "loss": 0.297, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1113762855529785, + "rewards/margins": 4.035075664520264, + "rewards/rejected": -7.1464524269104, + "step": 35160 + }, + { + "epoch": 1.1463857728246614, + "grad_norm": 0.5648542642593384, + "learning_rate": 3.090287961243089e-05, + "logits/chosen": 3.3361594676971436, + "logits/rejected": 3.5203022956848145, + "logps/chosen": -332.0790100097656, + "logps/rejected": -387.22222900390625, + "loss": 0.2772, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.885014533996582, + "rewards/margins": 4.477782249450684, + "rewards/rejected": -7.362797737121582, + "step": 35180 + }, + { + "epoch": 1.1470374986761818, + "grad_norm": 4.198761940002441, + "learning_rate": 3.089256036758236e-05, + "logits/chosen": 3.1152119636535645, + "logits/rejected": 3.3398990631103516, + "logps/chosen": -379.7227478027344, + "logps/rejected": -351.3172607421875, + "loss": 0.4704, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.878091812133789, + "rewards/margins": 4.328062057495117, + "rewards/rejected": -7.206153869628906, + "step": 35200 + }, + { + "epoch": 1.1476892245277024, + "grad_norm": 4.245848178863525, + "learning_rate": 3.088169800458392e-05, + "logits/chosen": 3.159088134765625, + "logits/rejected": 3.303269147872925, + "logps/chosen": -404.7352600097656, + "logps/rejected": -355.3196105957031, + "loss": 0.3134, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5825154781341553, + "rewards/margins": 3.8987133502960205, + "rewards/rejected": -6.481228828430176, + "step": 35220 + }, + { + "epoch": 1.148340950379223, + "grad_norm": 2.5293729305267334, + "learning_rate": 3.087083564158547e-05, + "logits/chosen": 3.4196925163269043, + "logits/rejected": 3.4683284759521484, + "logps/chosen": -326.78717041015625, + "logps/rejected": -315.5588684082031, + "loss": 0.3504, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.816007137298584, + "rewards/margins": 3.8654651641845703, + "rewards/rejected": -6.681471824645996, + "step": 35240 + }, + { + "epoch": 1.1489926762307436, + "grad_norm": 4.529097557067871, + "learning_rate": 3.085997327858702e-05, + "logits/chosen": 3.1862125396728516, + "logits/rejected": 3.1964688301086426, + "logps/chosen": -364.4066162109375, + "logps/rejected": -373.8343811035156, + "loss": 0.2289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9039371013641357, + "rewards/margins": 4.316462516784668, + "rewards/rejected": -7.220399379730225, + "step": 35260 + }, + { + "epoch": 1.149644402082264, + "grad_norm": 0.05907747521996498, + "learning_rate": 3.084911091558858e-05, + "logits/chosen": 3.1289267539978027, + "logits/rejected": 3.145500421524048, + "logps/chosen": -329.9520263671875, + "logps/rejected": -358.6315002441406, + "loss": 0.2507, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.785738468170166, + "rewards/margins": 4.302941799163818, + "rewards/rejected": -7.088680267333984, + "step": 35280 + }, + { + "epoch": 1.1502961279337847, + "grad_norm": 4.617221832275391, + "learning_rate": 3.083824855259013e-05, + "logits/chosen": 3.304598569869995, + "logits/rejected": 3.3712151050567627, + "logps/chosen": -347.70379638671875, + "logps/rejected": -321.7219543457031, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7910544872283936, + "rewards/margins": 4.02209997177124, + "rewards/rejected": -6.813154697418213, + "step": 35300 + }, + { + "epoch": 1.1509478537853053, + "grad_norm": 0.37125569581985474, + "learning_rate": 3.082738618959169e-05, + "logits/chosen": 3.398890733718872, + "logits/rejected": 3.4098026752471924, + "logps/chosen": -379.81793212890625, + "logps/rejected": -384.6663818359375, + "loss": 0.1538, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8117995262145996, + "rewards/margins": 5.336245536804199, + "rewards/rejected": -8.148045539855957, + "step": 35320 + }, + { + "epoch": 1.1515995796368257, + "grad_norm": 2.7248318195343018, + "learning_rate": 3.081652382659324e-05, + "logits/chosen": 2.9252352714538574, + "logits/rejected": 2.8889100551605225, + "logps/chosen": -373.5787353515625, + "logps/rejected": -322.93634033203125, + "loss": 0.317, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4858016967773438, + "rewards/margins": 3.774445056915283, + "rewards/rejected": -7.260246276855469, + "step": 35340 + }, + { + "epoch": 1.1522513054883463, + "grad_norm": 4.551990509033203, + "learning_rate": 3.0805661463594795e-05, + "logits/chosen": 3.5514767169952393, + "logits/rejected": 3.291461229324341, + "logps/chosen": -348.9503173828125, + "logps/rejected": -381.4579772949219, + "loss": 0.239, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.33502197265625, + "rewards/margins": 4.41613245010376, + "rewards/rejected": -7.75115442276001, + "step": 35360 + }, + { + "epoch": 1.152903031339867, + "grad_norm": 3.788947343826294, + "learning_rate": 3.0794799100596346e-05, + "logits/chosen": 3.1507816314697266, + "logits/rejected": 3.116849422454834, + "logps/chosen": -384.42730712890625, + "logps/rejected": -357.78118896484375, + "loss": 0.3284, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.339033603668213, + "rewards/margins": 4.905080318450928, + "rewards/rejected": -8.24411392211914, + "step": 35380 + }, + { + "epoch": 1.1535547571913876, + "grad_norm": 1.6317415237426758, + "learning_rate": 3.07839367375979e-05, + "logits/chosen": 3.286372423171997, + "logits/rejected": 3.3651528358459473, + "logps/chosen": -372.761474609375, + "logps/rejected": -340.91253662109375, + "loss": 0.2569, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.044373035430908, + "rewards/margins": 3.7318215370178223, + "rewards/rejected": -6.7761945724487305, + "step": 35400 + }, + { + "epoch": 1.154206483042908, + "grad_norm": 2.6547787189483643, + "learning_rate": 3.0773074374599454e-05, + "logits/chosen": 2.845984935760498, + "logits/rejected": 3.0406620502471924, + "logps/chosen": -342.4168395996094, + "logps/rejected": -340.1210021972656, + "loss": 0.231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3238277435302734, + "rewards/margins": 4.8068647384643555, + "rewards/rejected": -7.130692958831787, + "step": 35420 + }, + { + "epoch": 1.1548582088944286, + "grad_norm": 9.78886604309082, + "learning_rate": 3.0762212011601005e-05, + "logits/chosen": 3.0890235900878906, + "logits/rejected": 2.923475503921509, + "logps/chosen": -347.1639099121094, + "logps/rejected": -331.4012145996094, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0749058723449707, + "rewards/margins": 4.141808986663818, + "rewards/rejected": -7.216714382171631, + "step": 35440 + }, + { + "epoch": 1.1555099347459492, + "grad_norm": 6.178435802459717, + "learning_rate": 3.0751349648602556e-05, + "logits/chosen": 3.073936700820923, + "logits/rejected": 3.2000770568847656, + "logps/chosen": -343.3409423828125, + "logps/rejected": -306.2254638671875, + "loss": 0.322, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.6568870544433594, + "rewards/margins": 3.1184802055358887, + "rewards/rejected": -5.77536678314209, + "step": 35460 + }, + { + "epoch": 1.1561616605974696, + "grad_norm": 0.5186936259269714, + "learning_rate": 3.0741030403754034e-05, + "logits/chosen": 3.1233859062194824, + "logits/rejected": 3.356430768966675, + "logps/chosen": -368.4728088378906, + "logps/rejected": -383.636962890625, + "loss": 0.3711, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.553579807281494, + "rewards/margins": 4.46019172668457, + "rewards/rejected": -7.013771057128906, + "step": 35480 + }, + { + "epoch": 1.1568133864489902, + "grad_norm": 0.31475114822387695, + "learning_rate": 3.0730168040755585e-05, + "logits/chosen": 3.042513847351074, + "logits/rejected": 3.199760675430298, + "logps/chosen": -381.2278747558594, + "logps/rejected": -375.7433776855469, + "loss": 0.2666, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.6388628482818604, + "rewards/margins": 4.971614360809326, + "rewards/rejected": -7.610477447509766, + "step": 35500 + }, + { + "epoch": 1.1574651123005109, + "grad_norm": 4.139030456542969, + "learning_rate": 3.071930567775714e-05, + "logits/chosen": 3.125375986099243, + "logits/rejected": 3.2716965675354004, + "logps/chosen": -330.40423583984375, + "logps/rejected": -342.2942810058594, + "loss": 0.2392, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.273843765258789, + "rewards/margins": 4.965765476226807, + "rewards/rejected": -8.239608764648438, + "step": 35520 + }, + { + "epoch": 1.1581168381520313, + "grad_norm": 1.4237984418869019, + "learning_rate": 3.070844331475869e-05, + "logits/chosen": 3.554515838623047, + "logits/rejected": 3.54675030708313, + "logps/chosen": -393.5665588378906, + "logps/rejected": -373.18939208984375, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8548264503479004, + "rewards/margins": 3.9333183765411377, + "rewards/rejected": -6.788145542144775, + "step": 35540 + }, + { + "epoch": 1.1587685640035519, + "grad_norm": 0.7712887525558472, + "learning_rate": 3.0697580951760244e-05, + "logits/chosen": 3.3016624450683594, + "logits/rejected": 3.2744338512420654, + "logps/chosen": -371.6776428222656, + "logps/rejected": -374.7511291503906, + "loss": 0.3516, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.734440565109253, + "rewards/margins": 3.9009182453155518, + "rewards/rejected": -6.635359287261963, + "step": 35560 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.9191162586212158, + "learning_rate": 3.0686718588761795e-05, + "logits/chosen": 3.2464187145233154, + "logits/rejected": 3.2748234272003174, + "logps/chosen": -368.14752197265625, + "logps/rejected": -328.0365295410156, + "loss": 0.3396, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.558617353439331, + "rewards/margins": 3.7654826641082764, + "rewards/rejected": -6.324099540710449, + "step": 35580 + }, + { + "epoch": 1.160072015706593, + "grad_norm": 0.6972732543945312, + "learning_rate": 3.067585622576335e-05, + "logits/chosen": 3.246889591217041, + "logits/rejected": 3.343036651611328, + "logps/chosen": -396.63848876953125, + "logps/rejected": -399.2658386230469, + "loss": 0.2451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.878821849822998, + "rewards/margins": 4.90296745300293, + "rewards/rejected": -7.7817888259887695, + "step": 35600 + }, + { + "epoch": 1.1607237415581135, + "grad_norm": 10.53720760345459, + "learning_rate": 3.066499386276491e-05, + "logits/chosen": 3.1421754360198975, + "logits/rejected": 2.9150538444519043, + "logps/chosen": -351.7892761230469, + "logps/rejected": -398.12152099609375, + "loss": 0.3295, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0771470069885254, + "rewards/margins": 3.677602767944336, + "rewards/rejected": -6.7547502517700195, + "step": 35620 + }, + { + "epoch": 1.1613754674096342, + "grad_norm": 1.9326423406600952, + "learning_rate": 3.065413149976646e-05, + "logits/chosen": 3.016547441482544, + "logits/rejected": 2.8338966369628906, + "logps/chosen": -363.4442443847656, + "logps/rejected": -411.06646728515625, + "loss": 0.2486, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.286104202270508, + "rewards/margins": 5.123827934265137, + "rewards/rejected": -8.409932136535645, + "step": 35640 + }, + { + "epoch": 1.1620271932611548, + "grad_norm": 1.711816668510437, + "learning_rate": 3.064326913676802e-05, + "logits/chosen": 3.306116819381714, + "logits/rejected": 3.1065638065338135, + "logps/chosen": -395.46759033203125, + "logps/rejected": -334.60809326171875, + "loss": 0.2018, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.5354347229003906, + "rewards/margins": 4.680716514587402, + "rewards/rejected": -8.216151237487793, + "step": 35660 + }, + { + "epoch": 1.1626789191126752, + "grad_norm": 1.8201544284820557, + "learning_rate": 3.063240677376957e-05, + "logits/chosen": 2.8474574089050293, + "logits/rejected": 3.218977451324463, + "logps/chosen": -358.7422790527344, + "logps/rejected": -369.23687744140625, + "loss": 0.428, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0467684268951416, + "rewards/margins": 4.156871795654297, + "rewards/rejected": -7.203640937805176, + "step": 35680 + }, + { + "epoch": 1.1633306449641958, + "grad_norm": 0.28039392828941345, + "learning_rate": 3.062154441077112e-05, + "logits/chosen": 2.7611050605773926, + "logits/rejected": 2.8413197994232178, + "logps/chosen": -326.5291442871094, + "logps/rejected": -326.9194030761719, + "loss": 0.3675, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.4318184852600098, + "rewards/margins": 3.4142394065856934, + "rewards/rejected": -6.846057891845703, + "step": 35700 + }, + { + "epoch": 1.1639823708157164, + "grad_norm": 5.771555423736572, + "learning_rate": 3.061068204777268e-05, + "logits/chosen": 3.1455745697021484, + "logits/rejected": 3.311068058013916, + "logps/chosen": -346.61260986328125, + "logps/rejected": -335.9582824707031, + "loss": 0.3517, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.452594041824341, + "rewards/margins": 4.0531816482543945, + "rewards/rejected": -7.505776882171631, + "step": 35720 + }, + { + "epoch": 1.1646340966672368, + "grad_norm": 1.8337277173995972, + "learning_rate": 3.059981968477423e-05, + "logits/chosen": 2.9209814071655273, + "logits/rejected": 3.0338573455810547, + "logps/chosen": -388.91510009765625, + "logps/rejected": -331.18988037109375, + "loss": 0.1835, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.0370616912841797, + "rewards/margins": 4.365303993225098, + "rewards/rejected": -6.402365207672119, + "step": 35740 + }, + { + "epoch": 1.1652858225187575, + "grad_norm": 0.4092133939266205, + "learning_rate": 3.058895732177578e-05, + "logits/chosen": 3.2038700580596924, + "logits/rejected": 3.358901262283325, + "logps/chosen": -374.50238037109375, + "logps/rejected": -358.83966064453125, + "loss": 0.1279, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.687655448913574, + "rewards/margins": 4.915209770202637, + "rewards/rejected": -7.602865695953369, + "step": 35760 + }, + { + "epoch": 1.165937548370278, + "grad_norm": 0.4847314953804016, + "learning_rate": 3.057809495877733e-05, + "logits/chosen": 3.1197640895843506, + "logits/rejected": 3.006714344024658, + "logps/chosen": -381.7506103515625, + "logps/rejected": -359.51214599609375, + "loss": 0.2263, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7985732555389404, + "rewards/margins": 4.231475830078125, + "rewards/rejected": -8.030050277709961, + "step": 35780 + }, + { + "epoch": 1.1665892742217987, + "grad_norm": 0.7860205769538879, + "learning_rate": 3.056723259577889e-05, + "logits/chosen": 3.1671833992004395, + "logits/rejected": 3.20475435256958, + "logps/chosen": -370.4530334472656, + "logps/rejected": -335.34588623046875, + "loss": 0.3834, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4020748138427734, + "rewards/margins": 3.7189860343933105, + "rewards/rejected": -7.1210618019104, + "step": 35800 + }, + { + "epoch": 1.167241000073319, + "grad_norm": 3.248612642288208, + "learning_rate": 3.055637023278044e-05, + "logits/chosen": 2.72033429145813, + "logits/rejected": 2.9197685718536377, + "logps/chosen": -353.054931640625, + "logps/rejected": -372.8656921386719, + "loss": 0.3203, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.6624011993408203, + "rewards/margins": 3.9134202003479004, + "rewards/rejected": -7.575822353363037, + "step": 35820 + }, + { + "epoch": 1.1678927259248397, + "grad_norm": 6.610428333282471, + "learning_rate": 3.054550786978199e-05, + "logits/chosen": 2.9959475994110107, + "logits/rejected": 3.2736167907714844, + "logps/chosen": -368.41815185546875, + "logps/rejected": -302.4336242675781, + "loss": 0.2666, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.876913547515869, + "rewards/margins": 4.677968502044678, + "rewards/rejected": -7.554883003234863, + "step": 35840 + }, + { + "epoch": 1.1685444517763603, + "grad_norm": 2.7183115482330322, + "learning_rate": 3.0534645506783547e-05, + "logits/chosen": 3.1552891731262207, + "logits/rejected": 3.039559841156006, + "logps/chosen": -374.93414306640625, + "logps/rejected": -347.58795166015625, + "loss": 0.1997, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0780091285705566, + "rewards/margins": 4.28035831451416, + "rewards/rejected": -7.358367919921875, + "step": 35860 + }, + { + "epoch": 1.1691961776278808, + "grad_norm": 5.401954650878906, + "learning_rate": 3.0523783143785104e-05, + "logits/chosen": 2.8488388061523438, + "logits/rejected": 3.002432346343994, + "logps/chosen": -337.21710205078125, + "logps/rejected": -329.4134521484375, + "loss": 0.2639, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.7935972213745117, + "rewards/margins": 4.142205238342285, + "rewards/rejected": -6.935802459716797, + "step": 35880 + }, + { + "epoch": 1.1698479034794014, + "grad_norm": 2.427314519882202, + "learning_rate": 3.051292078078665e-05, + "logits/chosen": 2.9525935649871826, + "logits/rejected": 2.9522273540496826, + "logps/chosen": -379.3227233886719, + "logps/rejected": -364.9892578125, + "loss": 0.3281, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.290332078933716, + "rewards/margins": 4.399946212768555, + "rewards/rejected": -7.690278053283691, + "step": 35900 + }, + { + "epoch": 1.170499629330922, + "grad_norm": 0.19064892828464508, + "learning_rate": 3.050205841778821e-05, + "logits/chosen": 3.241867780685425, + "logits/rejected": 3.3395466804504395, + "logps/chosen": -361.99176025390625, + "logps/rejected": -348.1388244628906, + "loss": 0.2479, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.196932315826416, + "rewards/margins": 4.437159538269043, + "rewards/rejected": -6.634092807769775, + "step": 35920 + }, + { + "epoch": 1.1711513551824426, + "grad_norm": 0.879212498664856, + "learning_rate": 3.049119605478976e-05, + "logits/chosen": 3.259395122528076, + "logits/rejected": 3.267143726348877, + "logps/chosen": -371.25848388671875, + "logps/rejected": -335.99603271484375, + "loss": 0.3243, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.3039937019348145, + "rewards/margins": 4.237624168395996, + "rewards/rejected": -6.541618347167969, + "step": 35940 + }, + { + "epoch": 1.171803081033963, + "grad_norm": 2.5266621112823486, + "learning_rate": 3.0480333691791314e-05, + "logits/chosen": 3.215946912765503, + "logits/rejected": 3.3283050060272217, + "logps/chosen": -328.7294921875, + "logps/rejected": -315.20855712890625, + "loss": 0.2376, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7546329498291016, + "rewards/margins": 3.8301796913146973, + "rewards/rejected": -6.584812164306641, + "step": 35960 + }, + { + "epoch": 1.1724548068854836, + "grad_norm": 0.956320583820343, + "learning_rate": 3.0469471328792865e-05, + "logits/chosen": 2.9465761184692383, + "logits/rejected": 3.0488598346710205, + "logps/chosen": -402.035888671875, + "logps/rejected": -372.7070007324219, + "loss": 0.2833, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.3043278455734253, + "rewards/margins": 5.855583190917969, + "rewards/rejected": -7.159911155700684, + "step": 35980 + }, + { + "epoch": 1.1731065327370043, + "grad_norm": 2.1146445274353027, + "learning_rate": 3.0458608965794422e-05, + "logits/chosen": 3.184321880340576, + "logits/rejected": 3.357285737991333, + "logps/chosen": -357.3111877441406, + "logps/rejected": -334.48944091796875, + "loss": 0.2417, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.296539783477783, + "rewards/margins": 4.631216526031494, + "rewards/rejected": -7.927756309509277, + "step": 36000 + }, + { + "epoch": 1.1731065327370043, + "eval_logits/chosen": 3.164564371109009, + "eval_logits/rejected": 3.1777284145355225, + "eval_logps/chosen": -394.46783447265625, + "eval_logps/rejected": -378.8555908203125, + "eval_loss": 0.4787750244140625, + "eval_rewards/accuracies": 0.832140326499939, + "eval_rewards/chosen": -3.988715410232544, + "eval_rewards/margins": 4.273524761199951, + "eval_rewards/rejected": -8.262240409851074, + "eval_runtime": 3546.131, + "eval_samples_per_second": 3.152, + "eval_steps_per_second": 3.152, + "step": 36000 + }, + { + "epoch": 1.1737582585885247, + "grad_norm": 3.3639492988586426, + "learning_rate": 3.0447746602795973e-05, + "logits/chosen": 3.115433931350708, + "logits/rejected": 3.1284496784210205, + "logps/chosen": -358.1566467285156, + "logps/rejected": -333.9770202636719, + "loss": 0.3643, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3323159217834473, + "rewards/margins": 3.9577019214630127, + "rewards/rejected": -7.290017604827881, + "step": 36020 + }, + { + "epoch": 1.1744099844400453, + "grad_norm": 1.0814636945724487, + "learning_rate": 3.0436884239797524e-05, + "logits/chosen": 2.924020528793335, + "logits/rejected": 2.8821935653686523, + "logps/chosen": -315.60791015625, + "logps/rejected": -311.7601623535156, + "loss": 0.4024, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0742061138153076, + "rewards/margins": 4.088019371032715, + "rewards/rejected": -7.162225246429443, + "step": 36040 + }, + { + "epoch": 1.175061710291566, + "grad_norm": 3.4478847980499268, + "learning_rate": 3.042602187679908e-05, + "logits/chosen": 3.0931601524353027, + "logits/rejected": 3.1490139961242676, + "logps/chosen": -359.797119140625, + "logps/rejected": -365.45428466796875, + "loss": 0.1376, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.956174373626709, + "rewards/margins": 4.431668281555176, + "rewards/rejected": -7.387842655181885, + "step": 36060 + }, + { + "epoch": 1.1757134361430863, + "grad_norm": 0.1382720172405243, + "learning_rate": 3.0415159513800636e-05, + "logits/chosen": 3.223672389984131, + "logits/rejected": 3.361487627029419, + "logps/chosen": -437.34783935546875, + "logps/rejected": -404.8932800292969, + "loss": 0.1113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4925265312194824, + "rewards/margins": 5.888213157653809, + "rewards/rejected": -8.38074016571045, + "step": 36080 + }, + { + "epoch": 1.176365161994607, + "grad_norm": 4.780825614929199, + "learning_rate": 3.0404297150802186e-05, + "logits/chosen": 2.6404130458831787, + "logits/rejected": 2.779118061065674, + "logps/chosen": -329.86663818359375, + "logps/rejected": -322.6595153808594, + "loss": 0.4143, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0484678745269775, + "rewards/margins": 3.3942513465881348, + "rewards/rejected": -6.442718505859375, + "step": 36100 + }, + { + "epoch": 1.1770168878461276, + "grad_norm": 0.29347100853919983, + "learning_rate": 3.0393434787803744e-05, + "logits/chosen": 2.9102683067321777, + "logits/rejected": 3.285167694091797, + "logps/chosen": -378.03265380859375, + "logps/rejected": -348.53558349609375, + "loss": 0.2739, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.151165246963501, + "rewards/margins": 5.3955841064453125, + "rewards/rejected": -8.546748161315918, + "step": 36120 + }, + { + "epoch": 1.177668613697648, + "grad_norm": 2.453143358230591, + "learning_rate": 3.0382572424805295e-05, + "logits/chosen": 3.072930097579956, + "logits/rejected": 3.2258212566375732, + "logps/chosen": -350.01519775390625, + "logps/rejected": -355.39495849609375, + "loss": 0.2004, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8514761924743652, + "rewards/margins": 4.1879048347473145, + "rewards/rejected": -7.039381504058838, + "step": 36140 + }, + { + "epoch": 1.1783203395491686, + "grad_norm": 3.889885902404785, + "learning_rate": 3.0371710061806846e-05, + "logits/chosen": 2.927055835723877, + "logits/rejected": 2.9250476360321045, + "logps/chosen": -347.6831359863281, + "logps/rejected": -366.94085693359375, + "loss": 0.3671, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.316230058670044, + "rewards/margins": 3.874346971511841, + "rewards/rejected": -7.190577030181885, + "step": 36160 + }, + { + "epoch": 1.1789720654006892, + "grad_norm": 6.245812892913818, + "learning_rate": 3.0360847698808396e-05, + "logits/chosen": 2.642296075820923, + "logits/rejected": 2.9879891872406006, + "logps/chosen": -349.5757141113281, + "logps/rejected": -365.90789794921875, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.933948040008545, + "rewards/margins": 4.591615676879883, + "rewards/rejected": -8.525564193725586, + "step": 36180 + }, + { + "epoch": 1.1796237912522098, + "grad_norm": 4.080329895019531, + "learning_rate": 3.0349985335809954e-05, + "logits/chosen": 3.0579235553741455, + "logits/rejected": 3.1478939056396484, + "logps/chosen": -349.96588134765625, + "logps/rejected": -324.6232604980469, + "loss": 0.2751, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2459633350372314, + "rewards/margins": 4.255251884460449, + "rewards/rejected": -7.50121545791626, + "step": 36200 + }, + { + "epoch": 1.1802755171037302, + "grad_norm": 0.07051248103380203, + "learning_rate": 3.0339122972811508e-05, + "logits/chosen": 2.8688037395477295, + "logits/rejected": 2.9756696224212646, + "logps/chosen": -334.03997802734375, + "logps/rejected": -347.96893310546875, + "loss": 0.2349, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2038815021514893, + "rewards/margins": 4.402427673339844, + "rewards/rejected": -7.606309413909912, + "step": 36220 + }, + { + "epoch": 1.1809272429552509, + "grad_norm": 1.4659721851348877, + "learning_rate": 3.032826060981306e-05, + "logits/chosen": 3.1948812007904053, + "logits/rejected": 3.4417920112609863, + "logps/chosen": -354.62762451171875, + "logps/rejected": -375.2703857421875, + "loss": 0.2592, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.0074386596679688, + "rewards/margins": 4.835148811340332, + "rewards/rejected": -6.842587471008301, + "step": 36240 + }, + { + "epoch": 1.1815789688067715, + "grad_norm": 1.3398725986480713, + "learning_rate": 3.0317398246814616e-05, + "logits/chosen": 3.4211974143981934, + "logits/rejected": 3.3440029621124268, + "logps/chosen": -363.66241455078125, + "logps/rejected": -325.23602294921875, + "loss": 0.2619, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.84926438331604, + "rewards/margins": 4.178003787994385, + "rewards/rejected": -7.0272674560546875, + "step": 36260 + }, + { + "epoch": 1.182230694658292, + "grad_norm": 6.118765354156494, + "learning_rate": 3.0306535883816167e-05, + "logits/chosen": 3.2006118297576904, + "logits/rejected": 3.3019015789031982, + "logps/chosen": -320.45355224609375, + "logps/rejected": -267.81964111328125, + "loss": 0.4068, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.082428216934204, + "rewards/margins": 3.3327724933624268, + "rewards/rejected": -6.415200710296631, + "step": 36280 + }, + { + "epoch": 1.1828824205098125, + "grad_norm": 0.08269942551851273, + "learning_rate": 3.0295673520817718e-05, + "logits/chosen": 2.6019787788391113, + "logits/rejected": 2.8155055046081543, + "logps/chosen": -314.481689453125, + "logps/rejected": -321.4422607421875, + "loss": 0.3678, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.970221996307373, + "rewards/margins": 4.143068313598633, + "rewards/rejected": -7.113290309906006, + "step": 36300 + }, + { + "epoch": 1.1835341463613331, + "grad_norm": 1.9132206439971924, + "learning_rate": 3.0284811157819276e-05, + "logits/chosen": 3.37770414352417, + "logits/rejected": 3.5384280681610107, + "logps/chosen": -362.7208251953125, + "logps/rejected": -359.23284912109375, + "loss": 0.3403, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.95246958732605, + "rewards/margins": 4.638733863830566, + "rewards/rejected": -7.591204643249512, + "step": 36320 + }, + { + "epoch": 1.1841858722128538, + "grad_norm": 1.0839935541152954, + "learning_rate": 3.0273948794820826e-05, + "logits/chosen": 2.823150157928467, + "logits/rejected": 2.9771368503570557, + "logps/chosen": -300.35272216796875, + "logps/rejected": -337.66741943359375, + "loss": 0.3416, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0398435592651367, + "rewards/margins": 5.094751834869385, + "rewards/rejected": -8.13459587097168, + "step": 36340 + }, + { + "epoch": 1.1848375980643742, + "grad_norm": 0.9375237822532654, + "learning_rate": 3.026308643182238e-05, + "logits/chosen": 3.1162917613983154, + "logits/rejected": 3.036151885986328, + "logps/chosen": -365.23822021484375, + "logps/rejected": -389.15142822265625, + "loss": 0.2631, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3671302795410156, + "rewards/margins": 5.222300052642822, + "rewards/rejected": -8.589430809020996, + "step": 36360 + }, + { + "epoch": 1.1854893239158948, + "grad_norm": 0.8735193014144897, + "learning_rate": 3.025222406882393e-05, + "logits/chosen": 3.0098819732666016, + "logits/rejected": 3.1670875549316406, + "logps/chosen": -337.99884033203125, + "logps/rejected": -339.0718994140625, + "loss": 0.3294, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.3531463146209717, + "rewards/margins": 3.9745991230010986, + "rewards/rejected": -7.327746391296387, + "step": 36380 + }, + { + "epoch": 1.1861410497674154, + "grad_norm": 8.179872512817383, + "learning_rate": 3.024136170582549e-05, + "logits/chosen": 3.248208522796631, + "logits/rejected": 3.284550189971924, + "logps/chosen": -396.8111877441406, + "logps/rejected": -370.245849609375, + "loss": 0.3858, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.512533664703369, + "rewards/margins": 4.080694675445557, + "rewards/rejected": -7.593228340148926, + "step": 36400 + }, + { + "epoch": 1.1867927756189358, + "grad_norm": 1.7453020811080933, + "learning_rate": 3.023049934282704e-05, + "logits/chosen": 3.3772284984588623, + "logits/rejected": 3.439396381378174, + "logps/chosen": -374.25531005859375, + "logps/rejected": -384.30841064453125, + "loss": 0.2148, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.133815288543701, + "rewards/margins": 5.233504295349121, + "rewards/rejected": -8.367319107055664, + "step": 36420 + }, + { + "epoch": 1.1874445014704564, + "grad_norm": 4.073692321777344, + "learning_rate": 3.021963697982859e-05, + "logits/chosen": 2.7801930904388428, + "logits/rejected": 2.954946517944336, + "logps/chosen": -324.5502624511719, + "logps/rejected": -358.7742004394531, + "loss": 0.218, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4635841846466064, + "rewards/margins": 4.641333103179932, + "rewards/rejected": -8.104917526245117, + "step": 36440 + }, + { + "epoch": 1.188096227321977, + "grad_norm": 0.2234710305929184, + "learning_rate": 3.0208774616830148e-05, + "logits/chosen": 3.11673903465271, + "logits/rejected": 3.2919507026672363, + "logps/chosen": -384.57159423828125, + "logps/rejected": -382.853515625, + "loss": 0.2065, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.685314655303955, + "rewards/margins": 5.0228166580200195, + "rewards/rejected": -8.708131790161133, + "step": 36460 + }, + { + "epoch": 1.1887479531734977, + "grad_norm": 0.32320573925971985, + "learning_rate": 3.0197912253831702e-05, + "logits/chosen": 3.2863478660583496, + "logits/rejected": 3.218744993209839, + "logps/chosen": -378.66912841796875, + "logps/rejected": -373.06927490234375, + "loss": 0.3134, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.6503396034240723, + "rewards/margins": 4.599379062652588, + "rewards/rejected": -8.249717712402344, + "step": 36480 + }, + { + "epoch": 1.189399679025018, + "grad_norm": 5.6014723777771, + "learning_rate": 3.0187049890833253e-05, + "logits/chosen": 2.9069106578826904, + "logits/rejected": 3.2126965522766113, + "logps/chosen": -356.61822509765625, + "logps/rejected": -391.61016845703125, + "loss": 0.27, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.500063419342041, + "rewards/margins": 4.447470188140869, + "rewards/rejected": -7.947534084320068, + "step": 36500 + }, + { + "epoch": 1.1900514048765387, + "grad_norm": 2.0604212284088135, + "learning_rate": 3.017618752783481e-05, + "logits/chosen": 3.100238800048828, + "logits/rejected": 3.0327229499816895, + "logps/chosen": -390.05938720703125, + "logps/rejected": -334.9447326660156, + "loss": 0.2129, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.2592148780822754, + "rewards/margins": 4.414196014404297, + "rewards/rejected": -7.673410892486572, + "step": 36520 + }, + { + "epoch": 1.1907031307280593, + "grad_norm": 0.36860141158103943, + "learning_rate": 3.016532516483636e-05, + "logits/chosen": 3.229217529296875, + "logits/rejected": 3.2326998710632324, + "logps/chosen": -363.8191833496094, + "logps/rejected": -360.14385986328125, + "loss": 0.1931, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2480826377868652, + "rewards/margins": 4.536104679107666, + "rewards/rejected": -7.784187316894531, + "step": 36540 + }, + { + "epoch": 1.1913548565795797, + "grad_norm": 4.400070667266846, + "learning_rate": 3.0154462801837912e-05, + "logits/chosen": 3.4309380054473877, + "logits/rejected": 3.4856982231140137, + "logps/chosen": -390.8559265136719, + "logps/rejected": -381.86273193359375, + "loss": 0.2911, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8950068950653076, + "rewards/margins": 5.044528007507324, + "rewards/rejected": -8.939535140991211, + "step": 36560 + }, + { + "epoch": 1.1920065824311004, + "grad_norm": 2.078432321548462, + "learning_rate": 3.0143600438839463e-05, + "logits/chosen": 3.0300285816192627, + "logits/rejected": 3.1445372104644775, + "logps/chosen": -370.891845703125, + "logps/rejected": -333.4796447753906, + "loss": 0.263, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8486826419830322, + "rewards/margins": 4.1177077293396, + "rewards/rejected": -6.966391086578369, + "step": 36580 + }, + { + "epoch": 1.192658308282621, + "grad_norm": 0.6299524307250977, + "learning_rate": 3.013273807584102e-05, + "logits/chosen": 3.073944568634033, + "logits/rejected": 3.21331787109375, + "logps/chosen": -336.47943115234375, + "logps/rejected": -362.0628967285156, + "loss": 0.3018, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.923543930053711, + "rewards/margins": 4.372643947601318, + "rewards/rejected": -7.2961883544921875, + "step": 36600 + }, + { + "epoch": 1.1933100341341414, + "grad_norm": 4.901512145996094, + "learning_rate": 3.0121875712842575e-05, + "logits/chosen": 3.2932686805725098, + "logits/rejected": 3.2520625591278076, + "logps/chosen": -360.52362060546875, + "logps/rejected": -357.4234313964844, + "loss": 0.3172, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4037246704101562, + "rewards/margins": 4.16815710067749, + "rewards/rejected": -7.5718817710876465, + "step": 36620 + }, + { + "epoch": 1.193961759985662, + "grad_norm": 1.7484551668167114, + "learning_rate": 3.0111013349844125e-05, + "logits/chosen": 3.1403841972351074, + "logits/rejected": 3.0560431480407715, + "logps/chosen": -373.07513427734375, + "logps/rejected": -337.89703369140625, + "loss": 0.2485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8262312412261963, + "rewards/margins": 4.27649450302124, + "rewards/rejected": -7.102725982666016, + "step": 36640 + }, + { + "epoch": 1.1946134858371826, + "grad_norm": 2.7569618225097656, + "learning_rate": 3.0100150986845683e-05, + "logits/chosen": 3.051298141479492, + "logits/rejected": 3.171523332595825, + "logps/chosen": -373.2265625, + "logps/rejected": -340.3183288574219, + "loss": 0.2609, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.574448585510254, + "rewards/margins": 4.3807220458984375, + "rewards/rejected": -6.955170631408691, + "step": 36660 + }, + { + "epoch": 1.195265211688703, + "grad_norm": 0.7159741520881653, + "learning_rate": 3.0089288623847234e-05, + "logits/chosen": 2.8737902641296387, + "logits/rejected": 2.9778671264648438, + "logps/chosen": -370.7472229003906, + "logps/rejected": -326.8832092285156, + "loss": 0.1811, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.24322509765625, + "rewards/margins": 4.4199538230896, + "rewards/rejected": -7.66317892074585, + "step": 36680 + }, + { + "epoch": 1.1959169375402237, + "grad_norm": 0.13653285801410675, + "learning_rate": 3.0078426260848784e-05, + "logits/chosen": 2.9832499027252197, + "logits/rejected": 2.9707512855529785, + "logps/chosen": -326.056884765625, + "logps/rejected": -333.9803771972656, + "loss": 0.2844, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.06416654586792, + "rewards/margins": 4.249629020690918, + "rewards/rejected": -7.313795566558838, + "step": 36700 + }, + { + "epoch": 1.1965686633917443, + "grad_norm": 1.8570958375930786, + "learning_rate": 3.006756389785034e-05, + "logits/chosen": 2.9019088745117188, + "logits/rejected": 2.9880077838897705, + "logps/chosen": -342.896484375, + "logps/rejected": -376.57177734375, + "loss": 0.1527, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.834148406982422, + "rewards/margins": 4.5722784996032715, + "rewards/rejected": -8.406427383422852, + "step": 36720 + }, + { + "epoch": 1.197220389243265, + "grad_norm": 0.5446605086326599, + "learning_rate": 3.0056701534851893e-05, + "logits/chosen": 2.9295122623443604, + "logits/rejected": 3.1540188789367676, + "logps/chosen": -360.3033142089844, + "logps/rejected": -387.3160705566406, + "loss": 0.2694, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.8270297050476074, + "rewards/margins": 5.494095802307129, + "rewards/rejected": -9.321125984191895, + "step": 36740 + }, + { + "epoch": 1.1978721150947853, + "grad_norm": 0.623228132724762, + "learning_rate": 3.0045839171853447e-05, + "logits/chosen": 2.7141213417053223, + "logits/rejected": 2.8323302268981934, + "logps/chosen": -374.7201232910156, + "logps/rejected": -387.7843322753906, + "loss": 0.1856, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.5147857666015625, + "rewards/margins": 5.202258110046387, + "rewards/rejected": -9.71704387664795, + "step": 36760 + }, + { + "epoch": 1.198523840946306, + "grad_norm": 0.7826179265975952, + "learning_rate": 3.0034976808854998e-05, + "logits/chosen": 2.8661279678344727, + "logits/rejected": 2.917668342590332, + "logps/chosen": -383.6527099609375, + "logps/rejected": -387.4280090332031, + "loss": 0.1483, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.1863274574279785, + "rewards/margins": 4.902829647064209, + "rewards/rejected": -9.089157104492188, + "step": 36780 + }, + { + "epoch": 1.1991755667978266, + "grad_norm": 4.776628017425537, + "learning_rate": 3.0024114445856555e-05, + "logits/chosen": 3.047312021255493, + "logits/rejected": 3.067708730697632, + "logps/chosen": -348.5093688964844, + "logps/rejected": -387.3565368652344, + "loss": 0.3117, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.091419696807861, + "rewards/margins": 4.554335594177246, + "rewards/rejected": -8.64575481414795, + "step": 36800 + }, + { + "epoch": 1.199827292649347, + "grad_norm": 1.5552394390106201, + "learning_rate": 3.0013252082858106e-05, + "logits/chosen": 2.561112403869629, + "logits/rejected": 2.6756789684295654, + "logps/chosen": -353.24517822265625, + "logps/rejected": -370.54595947265625, + "loss": 0.342, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.418117046356201, + "rewards/margins": 4.805074214935303, + "rewards/rejected": -8.223191261291504, + "step": 36820 + }, + { + "epoch": 1.2004790185008676, + "grad_norm": 2.8963253498077393, + "learning_rate": 3.0002389719859657e-05, + "logits/chosen": 2.7811946868896484, + "logits/rejected": 2.907073736190796, + "logps/chosen": -372.90802001953125, + "logps/rejected": -385.38885498046875, + "loss": 0.2879, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.9043655395507812, + "rewards/margins": 4.821051597595215, + "rewards/rejected": -8.72541618347168, + "step": 36840 + }, + { + "epoch": 1.2011307443523882, + "grad_norm": 4.610888957977295, + "learning_rate": 2.9991527356861214e-05, + "logits/chosen": 2.7842812538146973, + "logits/rejected": 2.9319519996643066, + "logps/chosen": -327.3922424316406, + "logps/rejected": -335.6694030761719, + "loss": 0.235, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.88550066947937, + "rewards/margins": 4.657528400421143, + "rewards/rejected": -7.543028831481934, + "step": 36860 + }, + { + "epoch": 1.2017824702039088, + "grad_norm": 3.023463726043701, + "learning_rate": 2.998066499386277e-05, + "logits/chosen": 3.0492496490478516, + "logits/rejected": 3.0578484535217285, + "logps/chosen": -333.92236328125, + "logps/rejected": -360.665771484375, + "loss": 0.2421, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.619053363800049, + "rewards/margins": 4.313559532165527, + "rewards/rejected": -7.932614326477051, + "step": 36880 + }, + { + "epoch": 1.2024341960554292, + "grad_norm": 4.142569541931152, + "learning_rate": 2.996980263086432e-05, + "logits/chosen": 3.0680525302886963, + "logits/rejected": 2.8966927528381348, + "logps/chosen": -368.3769836425781, + "logps/rejected": -329.25970458984375, + "loss": 0.4807, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.8944594860076904, + "rewards/margins": 3.9883389472961426, + "rewards/rejected": -7.882798194885254, + "step": 36900 + }, + { + "epoch": 1.2030859219069499, + "grad_norm": 5.264869213104248, + "learning_rate": 2.995894026786587e-05, + "logits/chosen": 2.8334667682647705, + "logits/rejected": 2.8926010131835938, + "logps/chosen": -323.2110290527344, + "logps/rejected": -341.65850830078125, + "loss": 0.1983, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3550121784210205, + "rewards/margins": 4.384033203125, + "rewards/rejected": -7.7390456199646, + "step": 36920 + }, + { + "epoch": 1.2037376477584705, + "grad_norm": 0.09724772721529007, + "learning_rate": 2.9948077904867428e-05, + "logits/chosen": 2.9915926456451416, + "logits/rejected": 3.029475450515747, + "logps/chosen": -398.7234191894531, + "logps/rejected": -345.69683837890625, + "loss": 0.2006, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.420281171798706, + "rewards/margins": 5.322068214416504, + "rewards/rejected": -8.742349624633789, + "step": 36940 + }, + { + "epoch": 1.2043893736099909, + "grad_norm": 4.592825412750244, + "learning_rate": 2.993721554186898e-05, + "logits/chosen": 2.9524357318878174, + "logits/rejected": 2.9652817249298096, + "logps/chosen": -338.1237487792969, + "logps/rejected": -338.27777099609375, + "loss": 0.4681, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.092881202697754, + "rewards/margins": 4.285431861877441, + "rewards/rejected": -8.378313064575195, + "step": 36960 + }, + { + "epoch": 1.2050410994615115, + "grad_norm": 5.022922039031982, + "learning_rate": 2.992635317887053e-05, + "logits/chosen": 3.151581287384033, + "logits/rejected": 3.161903142929077, + "logps/chosen": -370.05108642578125, + "logps/rejected": -369.0189514160156, + "loss": 0.2932, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.717613935470581, + "rewards/margins": 4.049964427947998, + "rewards/rejected": -7.767579078674316, + "step": 36980 + }, + { + "epoch": 1.2056928253130321, + "grad_norm": 2.040235996246338, + "learning_rate": 2.9915490815872087e-05, + "logits/chosen": 2.9081687927246094, + "logits/rejected": 2.865206241607666, + "logps/chosen": -327.0540771484375, + "logps/rejected": -345.0115966796875, + "loss": 0.2644, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.417975664138794, + "rewards/margins": 4.601924896240234, + "rewards/rejected": -8.01990032196045, + "step": 37000 + }, + { + "epoch": 1.2063445511645527, + "grad_norm": 1.2451454401016235, + "learning_rate": 2.990462845287364e-05, + "logits/chosen": 3.036083698272705, + "logits/rejected": 3.0426125526428223, + "logps/chosen": -392.3791809082031, + "logps/rejected": -390.2558898925781, + "loss": 0.207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6875176429748535, + "rewards/margins": 5.276442050933838, + "rewards/rejected": -8.963959693908691, + "step": 37020 + }, + { + "epoch": 1.2069962770160731, + "grad_norm": 3.5384771823883057, + "learning_rate": 2.9893766089875192e-05, + "logits/chosen": 3.089247226715088, + "logits/rejected": 3.1516871452331543, + "logps/chosen": -377.61468505859375, + "logps/rejected": -340.6239318847656, + "loss": 0.4376, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2632739543914795, + "rewards/margins": 3.6137287616729736, + "rewards/rejected": -6.877002716064453, + "step": 37040 + }, + { + "epoch": 1.2076480028675938, + "grad_norm": 5.042431831359863, + "learning_rate": 2.988290372687675e-05, + "logits/chosen": 3.156548023223877, + "logits/rejected": 3.3879313468933105, + "logps/chosen": -363.2475891113281, + "logps/rejected": -343.2138366699219, + "loss": 0.3025, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.3686704635620117, + "rewards/margins": 4.697813510894775, + "rewards/rejected": -7.066483974456787, + "step": 37060 + }, + { + "epoch": 1.2082997287191144, + "grad_norm": 4.660069465637207, + "learning_rate": 2.98720413638783e-05, + "logits/chosen": 3.088726758956909, + "logits/rejected": 3.103800058364868, + "logps/chosen": -341.99560546875, + "logps/rejected": -304.6501770019531, + "loss": 0.2998, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.9563541412353516, + "rewards/margins": 4.1119585037231445, + "rewards/rejected": -7.068312644958496, + "step": 37080 + }, + { + "epoch": 1.2089514545706348, + "grad_norm": 0.7474579811096191, + "learning_rate": 2.986117900087985e-05, + "logits/chosen": 3.282585620880127, + "logits/rejected": 3.3552260398864746, + "logps/chosen": -374.8750915527344, + "logps/rejected": -378.97296142578125, + "loss": 0.3248, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.3554527759552, + "rewards/margins": 5.200990200042725, + "rewards/rejected": -7.5564422607421875, + "step": 37100 + }, + { + "epoch": 1.2096031804221554, + "grad_norm": 3.869107246398926, + "learning_rate": 2.9850316637881405e-05, + "logits/chosen": 3.2429535388946533, + "logits/rejected": 3.3333022594451904, + "logps/chosen": -354.71697998046875, + "logps/rejected": -321.2247009277344, + "loss": 0.3116, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.5493645668029785, + "rewards/margins": 4.8606767654418945, + "rewards/rejected": -7.410041809082031, + "step": 37120 + }, + { + "epoch": 1.210254906273676, + "grad_norm": 5.53140926361084, + "learning_rate": 2.983945427488296e-05, + "logits/chosen": 3.2903976440429688, + "logits/rejected": 3.237541913986206, + "logps/chosen": -346.7254333496094, + "logps/rejected": -346.31988525390625, + "loss": 0.3628, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.1162378787994385, + "rewards/margins": 3.2903809547424316, + "rewards/rejected": -6.406619071960449, + "step": 37140 + }, + { + "epoch": 1.2109066321251964, + "grad_norm": 8.253570556640625, + "learning_rate": 2.9828591911884513e-05, + "logits/chosen": 2.9502272605895996, + "logits/rejected": 3.05395245552063, + "logps/chosen": -324.22503662109375, + "logps/rejected": -345.11224365234375, + "loss": 0.2546, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.069894790649414, + "rewards/margins": 3.9384868144989014, + "rewards/rejected": -7.0083818435668945, + "step": 37160 + }, + { + "epoch": 1.211558357976717, + "grad_norm": 2.937363386154175, + "learning_rate": 2.9817729548886064e-05, + "logits/chosen": 3.2642970085144043, + "logits/rejected": 3.0543174743652344, + "logps/chosen": -396.98785400390625, + "logps/rejected": -364.4139709472656, + "loss": 0.2681, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.331763505935669, + "rewards/margins": 4.5812788009643555, + "rewards/rejected": -7.9130425453186035, + "step": 37180 + }, + { + "epoch": 1.2122100838282377, + "grad_norm": 0.06653409451246262, + "learning_rate": 2.9806867185887622e-05, + "logits/chosen": 2.9009172916412354, + "logits/rejected": 3.0828325748443604, + "logps/chosen": -393.8069763183594, + "logps/rejected": -341.9633483886719, + "loss": 0.3303, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0380699634552, + "rewards/margins": 3.4887192249298096, + "rewards/rejected": -6.52678918838501, + "step": 37200 + }, + { + "epoch": 1.212861809679758, + "grad_norm": 3.924586296081543, + "learning_rate": 2.9796004822889173e-05, + "logits/chosen": 3.0568654537200928, + "logits/rejected": 3.3286850452423096, + "logps/chosen": -344.1852111816406, + "logps/rejected": -298.50042724609375, + "loss": 0.3648, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4695911407470703, + "rewards/margins": 3.499962329864502, + "rewards/rejected": -5.969553470611572, + "step": 37220 + }, + { + "epoch": 1.2135135355312787, + "grad_norm": 4.090430736541748, + "learning_rate": 2.9785142459890723e-05, + "logits/chosen": 3.505760908126831, + "logits/rejected": 3.6455485820770264, + "logps/chosen": -358.3623962402344, + "logps/rejected": -333.61328125, + "loss": 0.378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6755473613739014, + "rewards/margins": 3.5298781394958496, + "rewards/rejected": -6.205426216125488, + "step": 37240 + }, + { + "epoch": 1.2141652613827993, + "grad_norm": 1.5190284252166748, + "learning_rate": 2.977428009689228e-05, + "logits/chosen": 3.0897791385650635, + "logits/rejected": 3.143099546432495, + "logps/chosen": -361.44757080078125, + "logps/rejected": -358.5282287597656, + "loss": 0.2945, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.2188806533813477, + "rewards/margins": 4.00559663772583, + "rewards/rejected": -6.224477767944336, + "step": 37260 + }, + { + "epoch": 1.21481698723432, + "grad_norm": 0.46509990096092224, + "learning_rate": 2.9763417733893835e-05, + "logits/chosen": 3.175039291381836, + "logits/rejected": 3.1554696559906006, + "logps/chosen": -349.68609619140625, + "logps/rejected": -312.30975341796875, + "loss": 0.4115, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9859306812286377, + "rewards/margins": 3.4957737922668457, + "rewards/rejected": -5.481704235076904, + "step": 37280 + }, + { + "epoch": 1.2154687130858404, + "grad_norm": 9.40445613861084, + "learning_rate": 2.9752555370895386e-05, + "logits/chosen": 3.002037286758423, + "logits/rejected": 3.272731304168701, + "logps/chosen": -317.92169189453125, + "logps/rejected": -343.75799560546875, + "loss": 0.3412, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.324246883392334, + "rewards/margins": 3.1764721870422363, + "rewards/rejected": -5.5007195472717285, + "step": 37300 + }, + { + "epoch": 1.216120438937361, + "grad_norm": 2.0243306159973145, + "learning_rate": 2.9741693007896937e-05, + "logits/chosen": 3.0793099403381348, + "logits/rejected": 3.1903440952301025, + "logps/chosen": -340.5123291015625, + "logps/rejected": -324.28411865234375, + "loss": 0.3257, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.2388696670532227, + "rewards/margins": 3.6082420349121094, + "rewards/rejected": -5.847111701965332, + "step": 37320 + }, + { + "epoch": 1.2167721647888816, + "grad_norm": 0.15222738683223724, + "learning_rate": 2.9730830644898494e-05, + "logits/chosen": 3.4200756549835205, + "logits/rejected": 3.3529632091522217, + "logps/chosen": -356.47503662109375, + "logps/rejected": -375.11956787109375, + "loss": 0.4704, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6223487854003906, + "rewards/margins": 3.4189136028289795, + "rewards/rejected": -6.041261672973633, + "step": 37340 + }, + { + "epoch": 1.217423890640402, + "grad_norm": 4.39137601852417, + "learning_rate": 2.9719968281900045e-05, + "logits/chosen": 3.2128701210021973, + "logits/rejected": 3.0983786582946777, + "logps/chosen": -390.0033264160156, + "logps/rejected": -322.769775390625, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0539355278015137, + "rewards/margins": 3.687967300415039, + "rewards/rejected": -6.7419023513793945, + "step": 37360 + }, + { + "epoch": 1.2180756164919226, + "grad_norm": 0.18680033087730408, + "learning_rate": 2.9709105918901596e-05, + "logits/chosen": 3.289238691329956, + "logits/rejected": 3.3212947845458984, + "logps/chosen": -360.6325378417969, + "logps/rejected": -347.08612060546875, + "loss": 0.186, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3545186519622803, + "rewards/margins": 4.729369640350342, + "rewards/rejected": -7.083888053894043, + "step": 37380 + }, + { + "epoch": 1.2187273423434433, + "grad_norm": 3.6254799365997314, + "learning_rate": 2.9698243555903153e-05, + "logits/chosen": 2.912107229232788, + "logits/rejected": 2.993224859237671, + "logps/chosen": -327.16656494140625, + "logps/rejected": -305.99407958984375, + "loss": 0.3511, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.366215705871582, + "rewards/margins": 4.1318769454956055, + "rewards/rejected": -6.4980926513671875, + "step": 37400 + }, + { + "epoch": 1.2193790681949639, + "grad_norm": 6.62436056137085, + "learning_rate": 2.9687381192904708e-05, + "logits/chosen": 3.3254952430725098, + "logits/rejected": 3.2889816761016846, + "logps/chosen": -394.69696044921875, + "logps/rejected": -353.4488220214844, + "loss": 0.3407, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1600799560546875, + "rewards/margins": 3.7953009605407715, + "rewards/rejected": -6.955381870269775, + "step": 37420 + }, + { + "epoch": 1.2200307940464843, + "grad_norm": 0.43892815709114075, + "learning_rate": 2.967651882990626e-05, + "logits/chosen": 2.5793211460113525, + "logits/rejected": 2.7087886333465576, + "logps/chosen": -289.1258544921875, + "logps/rejected": -296.94573974609375, + "loss": 0.2735, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.654370069503784, + "rewards/margins": 4.07970666885376, + "rewards/rejected": -6.734076499938965, + "step": 37440 + }, + { + "epoch": 1.220682519898005, + "grad_norm": 3.8956634998321533, + "learning_rate": 2.9665656466907816e-05, + "logits/chosen": 3.343158006668091, + "logits/rejected": 3.4985222816467285, + "logps/chosen": -358.8296813964844, + "logps/rejected": -368.79656982421875, + "loss": 0.382, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7884931564331055, + "rewards/margins": 3.606330156326294, + "rewards/rejected": -6.3948235511779785, + "step": 37460 + }, + { + "epoch": 1.2213342457495255, + "grad_norm": 7.483689785003662, + "learning_rate": 2.9654794103909367e-05, + "logits/chosen": 3.1973628997802734, + "logits/rejected": 3.3812193870544434, + "logps/chosen": -361.6527099609375, + "logps/rejected": -326.0745849609375, + "loss": 0.3577, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.5116238594055176, + "rewards/margins": 3.7155089378356934, + "rewards/rejected": -6.227132797241211, + "step": 37480 + }, + { + "epoch": 1.221985971601046, + "grad_norm": 6.291428565979004, + "learning_rate": 2.9643931740910917e-05, + "logits/chosen": 2.99175763130188, + "logits/rejected": 3.218898057937622, + "logps/chosen": -334.7767333984375, + "logps/rejected": -336.912353515625, + "loss": 0.2276, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.569927215576172, + "rewards/margins": 4.566346645355225, + "rewards/rejected": -7.1362738609313965, + "step": 37500 + }, + { + "epoch": 1.2226376974525666, + "grad_norm": 4.356775283813477, + "learning_rate": 2.963306937791247e-05, + "logits/chosen": 2.8361217975616455, + "logits/rejected": 2.8821072578430176, + "logps/chosen": -344.0143737792969, + "logps/rejected": -329.55572509765625, + "loss": 0.3121, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3988614082336426, + "rewards/margins": 3.677208423614502, + "rewards/rejected": -6.0760698318481445, + "step": 37520 + }, + { + "epoch": 1.2232894233040872, + "grad_norm": 1.7775264978408813, + "learning_rate": 2.9622207014914026e-05, + "logits/chosen": 3.0005202293395996, + "logits/rejected": 2.957458257675171, + "logps/chosen": -356.2666931152344, + "logps/rejected": -360.7987365722656, + "loss": 0.2755, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2107224464416504, + "rewards/margins": 4.466235637664795, + "rewards/rejected": -7.676957607269287, + "step": 37540 + }, + { + "epoch": 1.2239411491556078, + "grad_norm": 0.21189439296722412, + "learning_rate": 2.961134465191558e-05, + "logits/chosen": 2.890847682952881, + "logits/rejected": 3.1536500453948975, + "logps/chosen": -350.89324951171875, + "logps/rejected": -315.6048278808594, + "loss": 0.3419, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9751510620117188, + "rewards/margins": 4.173426151275635, + "rewards/rejected": -7.148577690124512, + "step": 37560 + }, + { + "epoch": 1.2245928750071282, + "grad_norm": 2.3574912548065186, + "learning_rate": 2.960048228891713e-05, + "logits/chosen": 3.056978940963745, + "logits/rejected": 3.117018461227417, + "logps/chosen": -381.4471740722656, + "logps/rejected": -336.33624267578125, + "loss": 0.3874, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5593507289886475, + "rewards/margins": 3.926109790802002, + "rewards/rejected": -6.485459804534912, + "step": 37580 + }, + { + "epoch": 1.2252446008586488, + "grad_norm": 1.2397891283035278, + "learning_rate": 2.9589619925918688e-05, + "logits/chosen": 2.920255184173584, + "logits/rejected": 3.004603862762451, + "logps/chosen": -371.14910888671875, + "logps/rejected": -321.2538146972656, + "loss": 0.1808, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.411888599395752, + "rewards/margins": 4.687169075012207, + "rewards/rejected": -7.099057197570801, + "step": 37600 + }, + { + "epoch": 1.2258963267101695, + "grad_norm": 3.3956284523010254, + "learning_rate": 2.957875756292024e-05, + "logits/chosen": 3.0361170768737793, + "logits/rejected": 3.279668092727661, + "logps/chosen": -390.75115966796875, + "logps/rejected": -367.98980712890625, + "loss": 0.2711, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3461060523986816, + "rewards/margins": 4.527472019195557, + "rewards/rejected": -7.873578071594238, + "step": 37620 + }, + { + "epoch": 1.2265480525616899, + "grad_norm": 1.7580889463424683, + "learning_rate": 2.956789519992179e-05, + "logits/chosen": 2.7728397846221924, + "logits/rejected": 2.9239659309387207, + "logps/chosen": -320.2571105957031, + "logps/rejected": -328.1617431640625, + "loss": 0.2679, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0233492851257324, + "rewards/margins": 3.9069602489471436, + "rewards/rejected": -6.9303083419799805, + "step": 37640 + }, + { + "epoch": 1.2271997784132105, + "grad_norm": 1.0234088897705078, + "learning_rate": 2.9557032836923347e-05, + "logits/chosen": 2.7409377098083496, + "logits/rejected": 2.9366443157196045, + "logps/chosen": -348.22479248046875, + "logps/rejected": -321.4629821777344, + "loss": 0.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.478327989578247, + "rewards/margins": 4.081462383270264, + "rewards/rejected": -7.559790134429932, + "step": 37660 + }, + { + "epoch": 1.227851504264731, + "grad_norm": 0.5762377977371216, + "learning_rate": 2.95461704739249e-05, + "logits/chosen": 2.9030792713165283, + "logits/rejected": 3.094449758529663, + "logps/chosen": -408.1871337890625, + "logps/rejected": -380.47601318359375, + "loss": 0.2086, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2185745239257812, + "rewards/margins": 4.568880081176758, + "rewards/rejected": -7.787454128265381, + "step": 37680 + }, + { + "epoch": 1.2285032301162515, + "grad_norm": 1.0456223487854004, + "learning_rate": 2.9535308110926452e-05, + "logits/chosen": 2.67653751373291, + "logits/rejected": 2.7626144886016846, + "logps/chosen": -335.49627685546875, + "logps/rejected": -345.1145935058594, + "loss": 0.2426, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1449389457702637, + "rewards/margins": 3.8350086212158203, + "rewards/rejected": -6.979947566986084, + "step": 37700 + }, + { + "epoch": 1.2291549559677721, + "grad_norm": 3.788674831390381, + "learning_rate": 2.9524445747928003e-05, + "logits/chosen": 2.8022525310516357, + "logits/rejected": 2.9473555088043213, + "logps/chosen": -384.70458984375, + "logps/rejected": -347.3315734863281, + "loss": 0.2521, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1142923831939697, + "rewards/margins": 5.07034969329834, + "rewards/rejected": -8.18464183807373, + "step": 37720 + }, + { + "epoch": 1.2298066818192928, + "grad_norm": 5.113173007965088, + "learning_rate": 2.951358338492956e-05, + "logits/chosen": 2.9067912101745605, + "logits/rejected": 3.005748748779297, + "logps/chosen": -366.77215576171875, + "logps/rejected": -337.89520263671875, + "loss": 0.2422, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.694242000579834, + "rewards/margins": 4.5049543380737305, + "rewards/rejected": -8.199197769165039, + "step": 37740 + }, + { + "epoch": 1.2304584076708132, + "grad_norm": 0.3738352954387665, + "learning_rate": 2.950272102193111e-05, + "logits/chosen": 2.987410068511963, + "logits/rejected": 3.037220001220703, + "logps/chosen": -388.57794189453125, + "logps/rejected": -372.9002990722656, + "loss": 0.1145, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.7813804149627686, + "rewards/margins": 6.038888454437256, + "rewards/rejected": -9.820268630981445, + "step": 37760 + }, + { + "epoch": 1.2311101335223338, + "grad_norm": 0.8338063359260559, + "learning_rate": 2.9491858658932662e-05, + "logits/chosen": 2.7621703147888184, + "logits/rejected": 2.7663280963897705, + "logps/chosen": -416.19622802734375, + "logps/rejected": -394.3783874511719, + "loss": 0.1287, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7475123405456543, + "rewards/margins": 5.528482913970947, + "rewards/rejected": -9.275994300842285, + "step": 37780 + }, + { + "epoch": 1.2317618593738544, + "grad_norm": 9.29624080657959, + "learning_rate": 2.948099629593422e-05, + "logits/chosen": 2.9794137477874756, + "logits/rejected": 3.1608595848083496, + "logps/chosen": -379.2586669921875, + "logps/rejected": -336.5682373046875, + "loss": 0.3214, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.7466092109680176, + "rewards/margins": 3.8252949714660645, + "rewards/rejected": -7.571904182434082, + "step": 37800 + }, + { + "epoch": 1.232413585225375, + "grad_norm": 1.440016508102417, + "learning_rate": 2.9470133932935774e-05, + "logits/chosen": 2.9149391651153564, + "logits/rejected": 2.9046568870544434, + "logps/chosen": -386.05426025390625, + "logps/rejected": -417.758056640625, + "loss": 0.3203, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.173007965087891, + "rewards/margins": 5.819919109344482, + "rewards/rejected": -9.992927551269531, + "step": 37820 + }, + { + "epoch": 1.2330653110768954, + "grad_norm": 0.6146507263183594, + "learning_rate": 2.9459271569937325e-05, + "logits/chosen": 3.1103527545928955, + "logits/rejected": 3.188586950302124, + "logps/chosen": -427.6720275878906, + "logps/rejected": -399.9214172363281, + "loss": 0.2448, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.0052595138549805, + "rewards/margins": 4.802219390869141, + "rewards/rejected": -8.807478904724121, + "step": 37840 + }, + { + "epoch": 1.233717036928416, + "grad_norm": 1.4715890884399414, + "learning_rate": 2.9448409206938876e-05, + "logits/chosen": 3.0023386478424072, + "logits/rejected": 2.90616774559021, + "logps/chosen": -403.5404052734375, + "logps/rejected": -369.494873046875, + "loss": 0.4759, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.008244514465332, + "rewards/margins": 4.409018039703369, + "rewards/rejected": -8.417262077331543, + "step": 37860 + }, + { + "epoch": 1.2343687627799367, + "grad_norm": 0.14684805274009705, + "learning_rate": 2.9437546843940433e-05, + "logits/chosen": 3.176288604736328, + "logits/rejected": 3.21287202835083, + "logps/chosen": -375.35174560546875, + "logps/rejected": -341.600830078125, + "loss": 0.378, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3911678791046143, + "rewards/margins": 4.50095272064209, + "rewards/rejected": -7.892121315002441, + "step": 37880 + }, + { + "epoch": 1.235020488631457, + "grad_norm": 2.660266399383545, + "learning_rate": 2.9426684480941984e-05, + "logits/chosen": 3.1914474964141846, + "logits/rejected": 3.1917688846588135, + "logps/chosen": -370.520263671875, + "logps/rejected": -351.6856689453125, + "loss": 0.3933, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.272623538970947, + "rewards/margins": 3.916208267211914, + "rewards/rejected": -8.18883228302002, + "step": 37900 + }, + { + "epoch": 1.2356722144829777, + "grad_norm": 1.9309942722320557, + "learning_rate": 2.9415822117943538e-05, + "logits/chosen": 3.3569226264953613, + "logits/rejected": 3.3008625507354736, + "logps/chosen": -389.2123107910156, + "logps/rejected": -389.19708251953125, + "loss": 0.321, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.698256015777588, + "rewards/margins": 4.382898330688477, + "rewards/rejected": -8.081153869628906, + "step": 37920 + }, + { + "epoch": 1.2363239403344983, + "grad_norm": 0.41428616642951965, + "learning_rate": 2.9404959754945092e-05, + "logits/chosen": 2.7603707313537598, + "logits/rejected": 2.671426296234131, + "logps/chosen": -320.8472595214844, + "logps/rejected": -328.9213562011719, + "loss": 0.4438, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.6459975242614746, + "rewards/margins": 4.534782409667969, + "rewards/rejected": -8.180780410766602, + "step": 37940 + }, + { + "epoch": 1.236975666186019, + "grad_norm": 0.5167787671089172, + "learning_rate": 2.9394097391946646e-05, + "logits/chosen": 2.7693538665771484, + "logits/rejected": 2.8866772651672363, + "logps/chosen": -335.9930114746094, + "logps/rejected": -350.5677795410156, + "loss": 0.2951, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.687096118927002, + "rewards/margins": 4.090375900268555, + "rewards/rejected": -7.777472019195557, + "step": 37960 + }, + { + "epoch": 1.2376273920375394, + "grad_norm": 4.9334025382995605, + "learning_rate": 2.9383235028948197e-05, + "logits/chosen": 2.774523973464966, + "logits/rejected": 2.903698205947876, + "logps/chosen": -348.2037048339844, + "logps/rejected": -323.295166015625, + "loss": 0.4332, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.835692882537842, + "rewards/margins": 3.8684017658233643, + "rewards/rejected": -7.704094886779785, + "step": 37980 + }, + { + "epoch": 1.23827911788906, + "grad_norm": 1.3601964712142944, + "learning_rate": 2.9372372665949755e-05, + "logits/chosen": 3.156487464904785, + "logits/rejected": 3.220773220062256, + "logps/chosen": -368.916259765625, + "logps/rejected": -331.0813293457031, + "loss": 0.2627, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0863585472106934, + "rewards/margins": 4.328455924987793, + "rewards/rejected": -7.4148149490356445, + "step": 38000 + }, + { + "epoch": 1.2389308437405806, + "grad_norm": 0.03758377209305763, + "learning_rate": 2.9361510302951306e-05, + "logits/chosen": 3.126601457595825, + "logits/rejected": 3.226405382156372, + "logps/chosen": -371.77203369140625, + "logps/rejected": -350.2288513183594, + "loss": 0.4183, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.795628309249878, + "rewards/margins": 4.506763935089111, + "rewards/rejected": -8.30239200592041, + "step": 38020 + }, + { + "epoch": 1.239582569592101, + "grad_norm": 8.347885131835938, + "learning_rate": 2.9350647939952856e-05, + "logits/chosen": 3.241886615753174, + "logits/rejected": 3.3001701831817627, + "logps/chosen": -382.3407287597656, + "logps/rejected": -377.1053771972656, + "loss": 0.3631, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2114779949188232, + "rewards/margins": 4.834640979766846, + "rewards/rejected": -8.04611873626709, + "step": 38040 + }, + { + "epoch": 1.2402342954436216, + "grad_norm": 6.846003532409668, + "learning_rate": 2.933978557695441e-05, + "logits/chosen": 3.298644542694092, + "logits/rejected": 3.447418212890625, + "logps/chosen": -362.5412292480469, + "logps/rejected": -344.14593505859375, + "loss": 0.4217, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1615612506866455, + "rewards/margins": 3.7598538398742676, + "rewards/rejected": -6.921414852142334, + "step": 38060 + }, + { + "epoch": 1.2408860212951422, + "grad_norm": 5.474699020385742, + "learning_rate": 2.9328923213955968e-05, + "logits/chosen": 3.013645648956299, + "logits/rejected": 3.217203140258789, + "logps/chosen": -348.7642517089844, + "logps/rejected": -319.43389892578125, + "loss": 0.1992, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.987055540084839, + "rewards/margins": 4.384732723236084, + "rewards/rejected": -7.371788024902344, + "step": 38080 + }, + { + "epoch": 1.2415377471466629, + "grad_norm": 3.470130205154419, + "learning_rate": 2.931806085095752e-05, + "logits/chosen": 2.8517470359802246, + "logits/rejected": 3.0037038326263428, + "logps/chosen": -289.7176818847656, + "logps/rejected": -315.7960205078125, + "loss": 0.2583, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.294538974761963, + "rewards/margins": 3.8723881244659424, + "rewards/rejected": -7.166927337646484, + "step": 38100 + }, + { + "epoch": 1.2421894729981833, + "grad_norm": 3.8875656127929688, + "learning_rate": 2.930719848795907e-05, + "logits/chosen": 2.977975606918335, + "logits/rejected": 3.205152988433838, + "logps/chosen": -375.8282165527344, + "logps/rejected": -318.45172119140625, + "loss": 0.2592, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8250174522399902, + "rewards/margins": 4.647731781005859, + "rewards/rejected": -7.47274923324585, + "step": 38120 + }, + { + "epoch": 1.242841198849704, + "grad_norm": 8.227211952209473, + "learning_rate": 2.9296336124960627e-05, + "logits/chosen": 3.0056774616241455, + "logits/rejected": 2.8551223278045654, + "logps/chosen": -394.4847717285156, + "logps/rejected": -412.4497985839844, + "loss": 0.4277, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.4202942848205566, + "rewards/margins": 4.9957170486450195, + "rewards/rejected": -8.416011810302734, + "step": 38140 + }, + { + "epoch": 1.2434929247012245, + "grad_norm": 12.429779052734375, + "learning_rate": 2.9285473761962178e-05, + "logits/chosen": 3.025831937789917, + "logits/rejected": 3.030703067779541, + "logps/chosen": -346.313232421875, + "logps/rejected": -381.66204833984375, + "loss": 0.1555, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.6373131275177, + "rewards/margins": 5.825366020202637, + "rewards/rejected": -9.462678909301758, + "step": 38160 + }, + { + "epoch": 1.244144650552745, + "grad_norm": 0.5484778881072998, + "learning_rate": 2.9274611398963732e-05, + "logits/chosen": 2.7439208030700684, + "logits/rejected": 3.029402256011963, + "logps/chosen": -321.9343566894531, + "logps/rejected": -344.3016357421875, + "loss": 0.4348, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.33943247795105, + "rewards/margins": 4.900424003601074, + "rewards/rejected": -8.239855766296387, + "step": 38180 + }, + { + "epoch": 1.2447963764042655, + "grad_norm": 25.065799713134766, + "learning_rate": 2.9263749035965286e-05, + "logits/chosen": 2.883186101913452, + "logits/rejected": 2.998232364654541, + "logps/chosen": -391.1300354003906, + "logps/rejected": -354.5445251464844, + "loss": 0.4067, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.28721284866333, + "rewards/margins": 5.140786170959473, + "rewards/rejected": -8.427998542785645, + "step": 38200 + }, + { + "epoch": 1.2454481022557862, + "grad_norm": 10.453767776489258, + "learning_rate": 2.925288667296684e-05, + "logits/chosen": 2.442253828048706, + "logits/rejected": 2.5924415588378906, + "logps/chosen": -315.63604736328125, + "logps/rejected": -304.18267822265625, + "loss": 0.4207, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.91540789604187, + "rewards/margins": 4.343897342681885, + "rewards/rejected": -8.259305000305176, + "step": 38220 + }, + { + "epoch": 1.2460998281073066, + "grad_norm": 1.1260613203048706, + "learning_rate": 2.924202430996839e-05, + "logits/chosen": 2.866738796234131, + "logits/rejected": 3.104982852935791, + "logps/chosen": -349.521728515625, + "logps/rejected": -383.12298583984375, + "loss": 0.4231, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.6146862506866455, + "rewards/margins": 4.998887538909912, + "rewards/rejected": -8.61357307434082, + "step": 38240 + }, + { + "epoch": 1.2467515539588272, + "grad_norm": 0.6498889327049255, + "learning_rate": 2.9231161946969942e-05, + "logits/chosen": 3.1866393089294434, + "logits/rejected": 3.202868938446045, + "logps/chosen": -371.0865783691406, + "logps/rejected": -345.0054626464844, + "loss": 0.3214, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.619688034057617, + "rewards/margins": 4.514227867126465, + "rewards/rejected": -7.133915901184082, + "step": 38260 + }, + { + "epoch": 1.2474032798103478, + "grad_norm": 0.503179132938385, + "learning_rate": 2.92202995839715e-05, + "logits/chosen": 3.404170274734497, + "logits/rejected": 3.3014063835144043, + "logps/chosen": -345.979248046875, + "logps/rejected": -375.56451416015625, + "loss": 0.235, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.812744140625, + "rewards/margins": 5.0094804763793945, + "rewards/rejected": -7.822225093841553, + "step": 38280 + }, + { + "epoch": 1.2480550056618682, + "grad_norm": 5.424173355102539, + "learning_rate": 2.920943722097305e-05, + "logits/chosen": 3.053921937942505, + "logits/rejected": 3.1306252479553223, + "logps/chosen": -357.6407775878906, + "logps/rejected": -345.49615478515625, + "loss": 0.2245, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.6720290184020996, + "rewards/margins": 4.93041467666626, + "rewards/rejected": -7.602443695068359, + "step": 38300 + }, + { + "epoch": 1.2487067315133888, + "grad_norm": 0.719596803188324, + "learning_rate": 2.9198574857974605e-05, + "logits/chosen": 3.0608630180358887, + "logits/rejected": 2.927359104156494, + "logps/chosen": -350.2784118652344, + "logps/rejected": -334.102294921875, + "loss": 0.3189, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.00129771232605, + "rewards/margins": 4.151650428771973, + "rewards/rejected": -7.152947902679443, + "step": 38320 + }, + { + "epoch": 1.2493584573649095, + "grad_norm": 1.811889886856079, + "learning_rate": 2.918771249497616e-05, + "logits/chosen": 2.753364324569702, + "logits/rejected": 3.088681697845459, + "logps/chosen": -336.9158020019531, + "logps/rejected": -311.74444580078125, + "loss": 0.3488, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.990159273147583, + "rewards/margins": 3.8335490226745605, + "rewards/rejected": -6.823708534240723, + "step": 38340 + }, + { + "epoch": 1.25001018321643, + "grad_norm": 2.9914817810058594, + "learning_rate": 2.9176850131977713e-05, + "logits/chosen": 3.192396640777588, + "logits/rejected": 3.1364283561706543, + "logps/chosen": -340.98876953125, + "logps/rejected": -354.7937316894531, + "loss": 0.2538, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.589871883392334, + "rewards/margins": 4.135735511779785, + "rewards/rejected": -6.725607872009277, + "step": 38360 + }, + { + "epoch": 1.2506619090679505, + "grad_norm": 0.6603862643241882, + "learning_rate": 2.9165987768979264e-05, + "logits/chosen": 2.958233118057251, + "logits/rejected": 3.0732076168060303, + "logps/chosen": -347.1717529296875, + "logps/rejected": -379.98114013671875, + "loss": 0.2578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3675270080566406, + "rewards/margins": 4.5548810958862305, + "rewards/rejected": -7.922407627105713, + "step": 38380 + }, + { + "epoch": 1.2513136349194711, + "grad_norm": 7.462243556976318, + "learning_rate": 2.915512540598082e-05, + "logits/chosen": 3.0613207817077637, + "logits/rejected": 3.003857374191284, + "logps/chosen": -351.0692443847656, + "logps/rejected": -320.71875, + "loss": 0.4618, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.269831895828247, + "rewards/margins": 3.5178139209747314, + "rewards/rejected": -6.7876458168029785, + "step": 38400 + }, + { + "epoch": 1.2519653607709917, + "grad_norm": 4.7133307456970215, + "learning_rate": 2.9144263042982372e-05, + "logits/chosen": 3.1811885833740234, + "logits/rejected": 3.041858673095703, + "logps/chosen": -361.75469970703125, + "logps/rejected": -348.43438720703125, + "loss": 0.3335, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7127842903137207, + "rewards/margins": 4.175289154052734, + "rewards/rejected": -6.888072967529297, + "step": 38420 + }, + { + "epoch": 1.2526170866225121, + "grad_norm": 5.759817123413086, + "learning_rate": 2.9133400679983923e-05, + "logits/chosen": 2.899167060852051, + "logits/rejected": 2.9029488563537598, + "logps/chosen": -330.982666015625, + "logps/rejected": -341.5336608886719, + "loss": 0.2025, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.399779796600342, + "rewards/margins": 4.329167366027832, + "rewards/rejected": -6.728947639465332, + "step": 38440 + }, + { + "epoch": 1.2532688124740328, + "grad_norm": 0.7723680734634399, + "learning_rate": 2.9122538316985477e-05, + "logits/chosen": 2.7385125160217285, + "logits/rejected": 2.7265918254852295, + "logps/chosen": -356.32061767578125, + "logps/rejected": -331.00433349609375, + "loss": 0.2926, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.733018398284912, + "rewards/margins": 4.578530788421631, + "rewards/rejected": -7.311549186706543, + "step": 38460 + }, + { + "epoch": 1.2539205383255534, + "grad_norm": 2.550680637359619, + "learning_rate": 2.9111675953987035e-05, + "logits/chosen": 2.923170328140259, + "logits/rejected": 3.121267557144165, + "logps/chosen": -348.82513427734375, + "logps/rejected": -329.62738037109375, + "loss": 0.2384, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.885467290878296, + "rewards/margins": 4.364028453826904, + "rewards/rejected": -7.2494964599609375, + "step": 38480 + }, + { + "epoch": 1.254572264177074, + "grad_norm": 0.07340823858976364, + "learning_rate": 2.9100813590988585e-05, + "logits/chosen": 2.647847890853882, + "logits/rejected": 2.842857837677002, + "logps/chosen": -362.8467102050781, + "logps/rejected": -385.18560791015625, + "loss": 0.1631, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6706676483154297, + "rewards/margins": 5.12375545501709, + "rewards/rejected": -7.7944231033325195, + "step": 38500 + }, + { + "epoch": 1.2552239900285944, + "grad_norm": 0.2716512382030487, + "learning_rate": 2.9089951227990136e-05, + "logits/chosen": 2.9246649742126465, + "logits/rejected": 3.1518867015838623, + "logps/chosen": -354.4378967285156, + "logps/rejected": -350.91778564453125, + "loss": 0.3944, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2320938110351562, + "rewards/margins": 5.078822135925293, + "rewards/rejected": -8.31091594696045, + "step": 38520 + }, + { + "epoch": 1.255875715880115, + "grad_norm": 5.945369720458984, + "learning_rate": 2.9079088864991694e-05, + "logits/chosen": 3.0240659713745117, + "logits/rejected": 3.1790194511413574, + "logps/chosen": -397.4180603027344, + "logps/rejected": -347.41796875, + "loss": 0.4337, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8979554176330566, + "rewards/margins": 3.656073808670044, + "rewards/rejected": -6.5540289878845215, + "step": 38540 + }, + { + "epoch": 1.2565274417316357, + "grad_norm": 3.5379061698913574, + "learning_rate": 2.9068226501993244e-05, + "logits/chosen": 3.0327744483947754, + "logits/rejected": 2.9842898845672607, + "logps/chosen": -341.8306579589844, + "logps/rejected": -356.4181213378906, + "loss": 0.2364, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6204044818878174, + "rewards/margins": 5.155320167541504, + "rewards/rejected": -7.7757248878479, + "step": 38560 + }, + { + "epoch": 1.257179167583156, + "grad_norm": 0.16666004061698914, + "learning_rate": 2.90573641389948e-05, + "logits/chosen": 2.8635761737823486, + "logits/rejected": 2.939099073410034, + "logps/chosen": -363.3790283203125, + "logps/rejected": -361.083251953125, + "loss": 0.2744, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.663278102874756, + "rewards/margins": 4.805392742156982, + "rewards/rejected": -7.468670845031738, + "step": 38580 + }, + { + "epoch": 1.2578308934346767, + "grad_norm": 0.3115670084953308, + "learning_rate": 2.9046501775996353e-05, + "logits/chosen": 2.7830798625946045, + "logits/rejected": 2.8404381275177, + "logps/chosen": -334.7019348144531, + "logps/rejected": -372.75421142578125, + "loss": 0.2733, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.7230567932128906, + "rewards/margins": 4.460601806640625, + "rewards/rejected": -8.1836576461792, + "step": 38600 + }, + { + "epoch": 1.2584826192861973, + "grad_norm": 8.968711853027344, + "learning_rate": 2.9035639412997907e-05, + "logits/chosen": 2.8555026054382324, + "logits/rejected": 3.0711445808410645, + "logps/chosen": -343.84637451171875, + "logps/rejected": -342.1683349609375, + "loss": 0.3356, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.097738742828369, + "rewards/margins": 4.530218601226807, + "rewards/rejected": -7.627956390380859, + "step": 38620 + }, + { + "epoch": 1.259134345137718, + "grad_norm": 2.7115111351013184, + "learning_rate": 2.9024777049999458e-05, + "logits/chosen": 2.5871129035949707, + "logits/rejected": 2.717207431793213, + "logps/chosen": -346.2048645019531, + "logps/rejected": -343.74163818359375, + "loss": 0.2217, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.961465358734131, + "rewards/margins": 5.096529960632324, + "rewards/rejected": -8.05799674987793, + "step": 38640 + }, + { + "epoch": 1.2597860709892383, + "grad_norm": 9.483362197875977, + "learning_rate": 2.901391468700101e-05, + "logits/chosen": 3.2836976051330566, + "logits/rejected": 3.3469181060791016, + "logps/chosen": -389.91143798828125, + "logps/rejected": -381.85235595703125, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.187173843383789, + "rewards/margins": 4.66082763671875, + "rewards/rejected": -7.848001003265381, + "step": 38660 + }, + { + "epoch": 1.260437796840759, + "grad_norm": 0.2936383783817291, + "learning_rate": 2.9003052324002566e-05, + "logits/chosen": 3.0076091289520264, + "logits/rejected": 3.0067994594573975, + "logps/chosen": -382.08746337890625, + "logps/rejected": -396.91766357421875, + "loss": 0.2959, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1148858070373535, + "rewards/margins": 4.68201208114624, + "rewards/rejected": -7.796897888183594, + "step": 38680 + }, + { + "epoch": 1.2610895226922794, + "grad_norm": 3.3093934059143066, + "learning_rate": 2.8992189961004117e-05, + "logits/chosen": 2.6964352130889893, + "logits/rejected": 2.7593448162078857, + "logps/chosen": -360.76898193359375, + "logps/rejected": -333.90093994140625, + "loss": 0.5236, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.117205619812012, + "rewards/margins": 3.925342082977295, + "rewards/rejected": -8.042547225952148, + "step": 38700 + }, + { + "epoch": 1.2617412485438, + "grad_norm": 2.8148481845855713, + "learning_rate": 2.898132759800567e-05, + "logits/chosen": 2.99534010887146, + "logits/rejected": 3.087855815887451, + "logps/chosen": -392.92303466796875, + "logps/rejected": -376.72576904296875, + "loss": 0.2566, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.001624584197998, + "rewards/margins": 5.139468193054199, + "rewards/rejected": -8.141092300415039, + "step": 38720 + }, + { + "epoch": 1.2623929743953206, + "grad_norm": 3.9441215991973877, + "learning_rate": 2.897046523500723e-05, + "logits/chosen": 2.4008657932281494, + "logits/rejected": 2.527418613433838, + "logps/chosen": -302.85040283203125, + "logps/rejected": -329.6764221191406, + "loss": 0.4076, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3848772048950195, + "rewards/margins": 3.8903796672821045, + "rewards/rejected": -8.275256156921387, + "step": 38740 + }, + { + "epoch": 1.2630447002468412, + "grad_norm": 0.19617553055286407, + "learning_rate": 2.895960287200878e-05, + "logits/chosen": 2.767601490020752, + "logits/rejected": 2.988434314727783, + "logps/chosen": -386.0408020019531, + "logps/rejected": -387.81146240234375, + "loss": 0.1906, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.502338409423828, + "rewards/margins": 5.122842311859131, + "rewards/rejected": -8.625181198120117, + "step": 38760 + }, + { + "epoch": 1.2636964260983619, + "grad_norm": 3.2699708938598633, + "learning_rate": 2.894874050901033e-05, + "logits/chosen": 2.8415446281433105, + "logits/rejected": 2.8578734397888184, + "logps/chosen": -349.50579833984375, + "logps/rejected": -354.46783447265625, + "loss": 0.2373, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.6943516731262207, + "rewards/margins": 4.771458625793457, + "rewards/rejected": -8.46580982208252, + "step": 38780 + }, + { + "epoch": 1.2643481519498823, + "grad_norm": 3.5396108627319336, + "learning_rate": 2.8937878146011888e-05, + "logits/chosen": 2.6964669227600098, + "logits/rejected": 3.0011181831359863, + "logps/chosen": -338.70562744140625, + "logps/rejected": -321.8588562011719, + "loss": 0.2793, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.5518429279327393, + "rewards/margins": 4.2949113845825195, + "rewards/rejected": -7.846755027770996, + "step": 38800 + }, + { + "epoch": 1.2649998778014029, + "grad_norm": 1.3512378931045532, + "learning_rate": 2.892701578301344e-05, + "logits/chosen": 2.658557415008545, + "logits/rejected": 2.71169376373291, + "logps/chosen": -359.494873046875, + "logps/rejected": -338.481201171875, + "loss": 0.1748, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.398494005203247, + "rewards/margins": 5.167198657989502, + "rewards/rejected": -8.565690994262695, + "step": 38820 + }, + { + "epoch": 1.2656516036529233, + "grad_norm": 0.11368968337774277, + "learning_rate": 2.891615342001499e-05, + "logits/chosen": 2.778355121612549, + "logits/rejected": 2.9449830055236816, + "logps/chosen": -357.09906005859375, + "logps/rejected": -371.1062927246094, + "loss": 0.2786, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.76567006111145, + "rewards/margins": 4.796786308288574, + "rewards/rejected": -8.562456130981445, + "step": 38840 + }, + { + "epoch": 1.266303329504444, + "grad_norm": 4.173718452453613, + "learning_rate": 2.8905291057016544e-05, + "logits/chosen": 3.3182907104492188, + "logits/rejected": 3.376065492630005, + "logps/chosen": -431.52508544921875, + "logps/rejected": -358.18603515625, + "loss": 0.1803, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.214592456817627, + "rewards/margins": 4.917912483215332, + "rewards/rejected": -8.1325044631958, + "step": 38860 + }, + { + "epoch": 1.2669550553559645, + "grad_norm": 3.2465317249298096, + "learning_rate": 2.88944286940181e-05, + "logits/chosen": 2.7257840633392334, + "logits/rejected": 2.992659091949463, + "logps/chosen": -332.4230041503906, + "logps/rejected": -348.76861572265625, + "loss": 0.3224, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7525811195373535, + "rewards/margins": 4.939417839050293, + "rewards/rejected": -8.691999435424805, + "step": 38880 + }, + { + "epoch": 1.2676067812074852, + "grad_norm": 1.3444796800613403, + "learning_rate": 2.8883566331019652e-05, + "logits/chosen": 2.855996608734131, + "logits/rejected": 2.9240448474884033, + "logps/chosen": -375.1543273925781, + "logps/rejected": -358.1072692871094, + "loss": 0.2333, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.623591899871826, + "rewards/margins": 5.097198486328125, + "rewards/rejected": -8.72079086303711, + "step": 38900 + }, + { + "epoch": 1.2682585070590056, + "grad_norm": 2.907748222351074, + "learning_rate": 2.8872703968021203e-05, + "logits/chosen": 3.1518523693084717, + "logits/rejected": 3.370378017425537, + "logps/chosen": -378.31317138671875, + "logps/rejected": -373.47161865234375, + "loss": 0.194, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.5342960357666016, + "rewards/margins": 4.774020195007324, + "rewards/rejected": -8.308317184448242, + "step": 38920 + }, + { + "epoch": 1.2689102329105262, + "grad_norm": 2.618046283721924, + "learning_rate": 2.886184160502276e-05, + "logits/chosen": 2.7471938133239746, + "logits/rejected": 2.9607462882995605, + "logps/chosen": -322.2836608886719, + "logps/rejected": -343.3858337402344, + "loss": 0.1545, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.462092876434326, + "rewards/margins": 4.192048072814941, + "rewards/rejected": -7.654141426086426, + "step": 38940 + }, + { + "epoch": 1.2695619587620468, + "grad_norm": 2.403822660446167, + "learning_rate": 2.885097924202431e-05, + "logits/chosen": 2.917825222015381, + "logits/rejected": 2.91159725189209, + "logps/chosen": -341.5159912109375, + "logps/rejected": -334.47882080078125, + "loss": 0.2189, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.8934261798858643, + "rewards/margins": 4.585991859436035, + "rewards/rejected": -7.479417324066162, + "step": 38960 + }, + { + "epoch": 1.2702136846135672, + "grad_norm": 5.182380199432373, + "learning_rate": 2.8840116879025865e-05, + "logits/chosen": 2.794076442718506, + "logits/rejected": 3.112276554107666, + "logps/chosen": -352.96295166015625, + "logps/rejected": -327.55194091796875, + "loss": 0.2913, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5210907459259033, + "rewards/margins": 4.657331943511963, + "rewards/rejected": -8.178422927856445, + "step": 38980 + }, + { + "epoch": 1.2708654104650878, + "grad_norm": 7.904381275177002, + "learning_rate": 2.8829254516027416e-05, + "logits/chosen": 3.1614463329315186, + "logits/rejected": 3.2893729209899902, + "logps/chosen": -405.5785827636719, + "logps/rejected": -402.1766662597656, + "loss": 0.3477, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2515437602996826, + "rewards/margins": 4.780577659606934, + "rewards/rejected": -8.032119750976562, + "step": 39000 + }, + { + "epoch": 1.2715171363166085, + "grad_norm": 1.419650673866272, + "learning_rate": 2.8818392153028973e-05, + "logits/chosen": 3.0581510066986084, + "logits/rejected": 3.043383836746216, + "logps/chosen": -348.5039978027344, + "logps/rejected": -331.28076171875, + "loss": 0.2178, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.983170509338379, + "rewards/margins": 4.853863716125488, + "rewards/rejected": -7.837033748626709, + "step": 39020 + }, + { + "epoch": 1.272168862168129, + "grad_norm": 1.8286609649658203, + "learning_rate": 2.8807529790030524e-05, + "logits/chosen": 2.7338502407073975, + "logits/rejected": 2.7730870246887207, + "logps/chosen": -387.5224914550781, + "logps/rejected": -410.7662048339844, + "loss": 0.2942, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.384312391281128, + "rewards/margins": 4.900865077972412, + "rewards/rejected": -8.285179138183594, + "step": 39040 + }, + { + "epoch": 1.2728205880196495, + "grad_norm": 1.0360934734344482, + "learning_rate": 2.8796667427032075e-05, + "logits/chosen": 3.0154380798339844, + "logits/rejected": 3.0657553672790527, + "logps/chosen": -308.1293640136719, + "logps/rejected": -308.73956298828125, + "loss": 0.4453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.94939923286438, + "rewards/margins": 3.78765869140625, + "rewards/rejected": -7.737058162689209, + "step": 39060 + }, + { + "epoch": 1.27347231387117, + "grad_norm": 5.233984470367432, + "learning_rate": 2.8785805064033633e-05, + "logits/chosen": 2.743514060974121, + "logits/rejected": 2.8903183937072754, + "logps/chosen": -373.6652526855469, + "logps/rejected": -397.91278076171875, + "loss": 0.1906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.132380962371826, + "rewards/margins": 5.153519630432129, + "rewards/rejected": -8.285901069641113, + "step": 39080 + }, + { + "epoch": 1.2741240397226907, + "grad_norm": 2.699550151824951, + "learning_rate": 2.8774942701035183e-05, + "logits/chosen": 3.052898406982422, + "logits/rejected": 3.0993399620056152, + "logps/chosen": -400.5385437011719, + "logps/rejected": -373.1784973144531, + "loss": 0.2756, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.385119676589966, + "rewards/margins": 4.852551460266113, + "rewards/rejected": -8.237671852111816, + "step": 39100 + }, + { + "epoch": 1.2747757655742111, + "grad_norm": 1.1240736246109009, + "learning_rate": 2.8764080338036738e-05, + "logits/chosen": 3.019599199295044, + "logits/rejected": 3.1143128871917725, + "logps/chosen": -400.3301696777344, + "logps/rejected": -375.3173828125, + "loss": 0.3786, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2248668670654297, + "rewards/margins": 4.7686262130737305, + "rewards/rejected": -7.993493556976318, + "step": 39120 + }, + { + "epoch": 1.2754274914257318, + "grad_norm": 2.531259059906006, + "learning_rate": 2.8753217975038295e-05, + "logits/chosen": 2.804663896560669, + "logits/rejected": 2.9478871822357178, + "logps/chosen": -346.89300537109375, + "logps/rejected": -324.8660583496094, + "loss": 0.391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.884155750274658, + "rewards/margins": 4.518618583679199, + "rewards/rejected": -7.402773857116699, + "step": 39140 + }, + { + "epoch": 1.2760792172772524, + "grad_norm": 5.284489631652832, + "learning_rate": 2.8742355612039846e-05, + "logits/chosen": 2.877856731414795, + "logits/rejected": 3.05855393409729, + "logps/chosen": -373.5476989746094, + "logps/rejected": -316.72650146484375, + "loss": 0.3064, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2944984436035156, + "rewards/margins": 4.5151262283325195, + "rewards/rejected": -7.809624671936035, + "step": 39160 + }, + { + "epoch": 1.276730943128773, + "grad_norm": 2.5262935161590576, + "learning_rate": 2.8731493249041397e-05, + "logits/chosen": 3.078671932220459, + "logits/rejected": 3.089184045791626, + "logps/chosen": -363.955810546875, + "logps/rejected": -343.96221923828125, + "loss": 0.1603, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.8386118412017822, + "rewards/margins": 5.7502641677856445, + "rewards/rejected": -7.588876247406006, + "step": 39180 + }, + { + "epoch": 1.2773826689802934, + "grad_norm": 0.19227083027362823, + "learning_rate": 2.8720630886042947e-05, + "logits/chosen": 3.1352405548095703, + "logits/rejected": 3.2756950855255127, + "logps/chosen": -366.84600830078125, + "logps/rejected": -386.9676818847656, + "loss": 0.3826, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1136715412139893, + "rewards/margins": 4.180805206298828, + "rewards/rejected": -7.2944769859313965, + "step": 39200 + }, + { + "epoch": 1.278034394831814, + "grad_norm": 3.445228099822998, + "learning_rate": 2.8709768523044505e-05, + "logits/chosen": 2.7593941688537598, + "logits/rejected": 2.9700613021850586, + "logps/chosen": -300.1949768066406, + "logps/rejected": -337.985107421875, + "loss": 0.3788, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6473305225372314, + "rewards/margins": 4.017708778381348, + "rewards/rejected": -7.6650390625, + "step": 39220 + }, + { + "epoch": 1.2786861206833344, + "grad_norm": 2.859114646911621, + "learning_rate": 2.8698906160046056e-05, + "logits/chosen": 3.212994337081909, + "logits/rejected": 3.162837505340576, + "logps/chosen": -391.76513671875, + "logps/rejected": -322.85064697265625, + "loss": 0.3603, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.399585723876953, + "rewards/margins": 4.250263214111328, + "rewards/rejected": -7.649848937988281, + "step": 39240 + }, + { + "epoch": 1.279337846534855, + "grad_norm": 0.47527098655700684, + "learning_rate": 2.868804379704761e-05, + "logits/chosen": 2.9603514671325684, + "logits/rejected": 3.2081871032714844, + "logps/chosen": -399.6717834472656, + "logps/rejected": -356.0782775878906, + "loss": 0.2401, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.385622024536133, + "rewards/margins": 4.568413257598877, + "rewards/rejected": -7.954035758972168, + "step": 39260 + }, + { + "epoch": 1.2799895723863757, + "grad_norm": 1.1873834133148193, + "learning_rate": 2.8677181434049168e-05, + "logits/chosen": 2.9820122718811035, + "logits/rejected": 2.9814226627349854, + "logps/chosen": -340.46490478515625, + "logps/rejected": -332.4761657714844, + "loss": 0.2009, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.0433382987976074, + "rewards/margins": 5.271481513977051, + "rewards/rejected": -7.314820289611816, + "step": 39280 + }, + { + "epoch": 1.2806412982378963, + "grad_norm": 2.2325594425201416, + "learning_rate": 2.866631907105072e-05, + "logits/chosen": 3.137418031692505, + "logits/rejected": 3.2385661602020264, + "logps/chosen": -404.6410827636719, + "logps/rejected": -364.09490966796875, + "loss": 0.2218, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6650338172912598, + "rewards/margins": 4.815578937530518, + "rewards/rejected": -7.480612277984619, + "step": 39300 + }, + { + "epoch": 1.281293024089417, + "grad_norm": 2.3482351303100586, + "learning_rate": 2.865545670805227e-05, + "logits/chosen": 3.200300693511963, + "logits/rejected": 3.364957809448242, + "logps/chosen": -401.00286865234375, + "logps/rejected": -345.16668701171875, + "loss": 0.1721, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.45133113861084, + "rewards/margins": 4.878209114074707, + "rewards/rejected": -7.329540252685547, + "step": 39320 + }, + { + "epoch": 1.2819447499409373, + "grad_norm": 2.8379979133605957, + "learning_rate": 2.8644594345053827e-05, + "logits/chosen": 3.3523738384246826, + "logits/rejected": 3.2519474029541016, + "logps/chosen": -369.92864990234375, + "logps/rejected": -336.4187316894531, + "loss": 0.3844, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.7464871406555176, + "rewards/margins": 4.0548906326293945, + "rewards/rejected": -6.801377773284912, + "step": 39340 + }, + { + "epoch": 1.282596475792458, + "grad_norm": 1.47157621383667, + "learning_rate": 2.8633731982055377e-05, + "logits/chosen": 3.3894801139831543, + "logits/rejected": 3.285923480987549, + "logps/chosen": -328.99267578125, + "logps/rejected": -334.0780029296875, + "loss": 0.2075, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6434671878814697, + "rewards/margins": 4.241604328155518, + "rewards/rejected": -6.885071754455566, + "step": 39360 + }, + { + "epoch": 1.2832482016439783, + "grad_norm": 2.8582725524902344, + "learning_rate": 2.862286961905693e-05, + "logits/chosen": 2.863081455230713, + "logits/rejected": 2.8136963844299316, + "logps/chosen": -369.9722595214844, + "logps/rejected": -387.753662109375, + "loss": 0.2054, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.643803596496582, + "rewards/margins": 4.890944480895996, + "rewards/rejected": -7.534747123718262, + "step": 39380 + }, + { + "epoch": 1.283899927495499, + "grad_norm": 5.108292102813721, + "learning_rate": 2.8612007256058482e-05, + "logits/chosen": 2.6402153968811035, + "logits/rejected": 2.724862813949585, + "logps/chosen": -320.2696533203125, + "logps/rejected": -329.70037841796875, + "loss": 0.2792, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1248373985290527, + "rewards/margins": 4.623147010803223, + "rewards/rejected": -7.74798583984375, + "step": 39400 + }, + { + "epoch": 1.2845516533470196, + "grad_norm": 3.3714606761932373, + "learning_rate": 2.860114489306004e-05, + "logits/chosen": 2.967125177383423, + "logits/rejected": 3.0094196796417236, + "logps/chosen": -358.9912109375, + "logps/rejected": -330.21197509765625, + "loss": 0.2766, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.032898426055908, + "rewards/margins": 4.898059368133545, + "rewards/rejected": -7.930956840515137, + "step": 39420 + }, + { + "epoch": 1.2852033791985402, + "grad_norm": 7.543229579925537, + "learning_rate": 2.859028253006159e-05, + "logits/chosen": 2.832017660140991, + "logits/rejected": 2.7373805046081543, + "logps/chosen": -360.2361755371094, + "logps/rejected": -319.8377380371094, + "loss": 0.26, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1946723461151123, + "rewards/margins": 4.286242961883545, + "rewards/rejected": -7.4809160232543945, + "step": 39440 + }, + { + "epoch": 1.2858551050500606, + "grad_norm": 62.42949295043945, + "learning_rate": 2.857942016706314e-05, + "logits/chosen": 2.7047390937805176, + "logits/rejected": 2.7156484127044678, + "logps/chosen": -344.9250793457031, + "logps/rejected": -345.3624267578125, + "loss": 0.458, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.6582298278808594, + "rewards/margins": 3.4010353088378906, + "rewards/rejected": -7.05926513671875, + "step": 39460 + }, + { + "epoch": 1.2865068309015812, + "grad_norm": 2.7345385551452637, + "learning_rate": 2.85685578040647e-05, + "logits/chosen": 3.0935089588165283, + "logits/rejected": 3.2761497497558594, + "logps/chosen": -372.1888732910156, + "logps/rejected": -347.15692138671875, + "loss": 0.2224, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7466447353363037, + "rewards/margins": 4.466960906982422, + "rewards/rejected": -7.213606357574463, + "step": 39480 + }, + { + "epoch": 1.2871585567531019, + "grad_norm": 4.470441818237305, + "learning_rate": 2.855769544106625e-05, + "logits/chosen": 3.1367478370666504, + "logits/rejected": 3.0641798973083496, + "logps/chosen": -353.77752685546875, + "logps/rejected": -355.659912109375, + "loss": 0.4062, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.01194429397583, + "rewards/margins": 3.646892547607422, + "rewards/rejected": -6.65883731842041, + "step": 39500 + }, + { + "epoch": 1.2878102826046223, + "grad_norm": 2.3909971714019775, + "learning_rate": 2.8546833078067804e-05, + "logits/chosen": 2.8546667098999023, + "logits/rejected": 3.098576784133911, + "logps/chosen": -353.3459777832031, + "logps/rejected": -341.8504638671875, + "loss": 0.4087, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1932730674743652, + "rewards/margins": 4.316581726074219, + "rewards/rejected": -7.5098557472229, + "step": 39520 + }, + { + "epoch": 1.288462008456143, + "grad_norm": 7.22975492477417, + "learning_rate": 2.853597071506936e-05, + "logits/chosen": 2.7781410217285156, + "logits/rejected": 2.8533682823181152, + "logps/chosen": -356.88714599609375, + "logps/rejected": -326.76019287109375, + "loss": 0.2538, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9754174947738647, + "rewards/margins": 5.68056583404541, + "rewards/rejected": -7.655982971191406, + "step": 39540 + }, + { + "epoch": 1.2891137343076635, + "grad_norm": 0.19762203097343445, + "learning_rate": 2.8525108352070912e-05, + "logits/chosen": 2.595390796661377, + "logits/rejected": 2.791670083999634, + "logps/chosen": -333.1112365722656, + "logps/rejected": -336.7330627441406, + "loss": 0.2655, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7941770553588867, + "rewards/margins": 3.8518173694610596, + "rewards/rejected": -6.645994663238525, + "step": 39560 + }, + { + "epoch": 1.2897654601591841, + "grad_norm": 0.8054193258285522, + "learning_rate": 2.8514245989072463e-05, + "logits/chosen": 2.914340019226074, + "logits/rejected": 2.942333698272705, + "logps/chosen": -336.21051025390625, + "logps/rejected": -351.03497314453125, + "loss": 0.1701, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.345146894454956, + "rewards/margins": 4.308511257171631, + "rewards/rejected": -6.653658390045166, + "step": 39580 + }, + { + "epoch": 1.2904171860107045, + "grad_norm": 5.319876670837402, + "learning_rate": 2.8503383626074014e-05, + "logits/chosen": 2.9397010803222656, + "logits/rejected": 2.964608669281006, + "logps/chosen": -325.33795166015625, + "logps/rejected": -359.4188232421875, + "loss": 0.356, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.4959423542022705, + "rewards/margins": 4.240969657897949, + "rewards/rejected": -6.736911773681641, + "step": 39600 + }, + { + "epoch": 1.2910689118622252, + "grad_norm": 10.504396438598633, + "learning_rate": 2.849252126307557e-05, + "logits/chosen": 2.809077739715576, + "logits/rejected": 2.627314329147339, + "logps/chosen": -361.7268371582031, + "logps/rejected": -355.10400390625, + "loss": 0.2596, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8436444997787476, + "rewards/margins": 4.545200824737549, + "rewards/rejected": -6.388845920562744, + "step": 39620 + }, + { + "epoch": 1.2917206377137458, + "grad_norm": 2.8454833030700684, + "learning_rate": 2.8481658900077122e-05, + "logits/chosen": 3.3545982837677, + "logits/rejected": 3.349088668823242, + "logps/chosen": -375.6394958496094, + "logps/rejected": -332.37835693359375, + "loss": 0.2541, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6395888328552246, + "rewards/margins": 4.859615802764893, + "rewards/rejected": -7.499204158782959, + "step": 39640 + }, + { + "epoch": 1.2923723635652662, + "grad_norm": 3.576442241668701, + "learning_rate": 2.8470796537078676e-05, + "logits/chosen": 2.819162607192993, + "logits/rejected": 2.9763379096984863, + "logps/chosen": -381.47308349609375, + "logps/rejected": -337.0021667480469, + "loss": 0.298, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.555135726928711, + "rewards/margins": 4.896793842315674, + "rewards/rejected": -7.451929569244385, + "step": 39660 + }, + { + "epoch": 1.2930240894167868, + "grad_norm": 0.9392933249473572, + "learning_rate": 2.8459934174080234e-05, + "logits/chosen": 2.990187168121338, + "logits/rejected": 2.7921998500823975, + "logps/chosen": -361.6982116699219, + "logps/rejected": -363.57806396484375, + "loss": 0.3425, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3011608123779297, + "rewards/margins": 4.447274208068848, + "rewards/rejected": -7.748434543609619, + "step": 39680 + }, + { + "epoch": 1.2936758152683074, + "grad_norm": 0.297637015581131, + "learning_rate": 2.8449071811081785e-05, + "logits/chosen": 3.0461440086364746, + "logits/rejected": 3.143397092819214, + "logps/chosen": -364.7994689941406, + "logps/rejected": -318.1226806640625, + "loss": 0.1737, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1070754528045654, + "rewards/margins": 4.534703731536865, + "rewards/rejected": -7.641779899597168, + "step": 39700 + }, + { + "epoch": 1.294327541119828, + "grad_norm": 3.259164571762085, + "learning_rate": 2.8438752566233263e-05, + "logits/chosen": 2.7770321369171143, + "logits/rejected": 2.868302583694458, + "logps/chosen": -345.42401123046875, + "logps/rejected": -288.4171142578125, + "loss": 0.3897, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.597529888153076, + "rewards/margins": 4.135787010192871, + "rewards/rejected": -6.733316898345947, + "step": 39720 + }, + { + "epoch": 1.2949792669713485, + "grad_norm": 3.7166240215301514, + "learning_rate": 2.8427890203234814e-05, + "logits/chosen": 3.2561416625976562, + "logits/rejected": 3.148522138595581, + "logps/chosen": -385.2004089355469, + "logps/rejected": -349.0193786621094, + "loss": 0.2563, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7457339763641357, + "rewards/margins": 4.636038780212402, + "rewards/rejected": -7.381772041320801, + "step": 39740 + }, + { + "epoch": 1.295630992822869, + "grad_norm": 6.9160075187683105, + "learning_rate": 2.8417027840236365e-05, + "logits/chosen": 3.2386081218719482, + "logits/rejected": 3.0475947856903076, + "logps/chosen": -356.5245361328125, + "logps/rejected": -337.50762939453125, + "loss": 0.2573, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.459836959838867, + "rewards/margins": 4.955075263977051, + "rewards/rejected": -7.41491174697876, + "step": 39760 + }, + { + "epoch": 1.2962827186743895, + "grad_norm": 2.1825098991394043, + "learning_rate": 2.8406165477237916e-05, + "logits/chosen": 2.995940685272217, + "logits/rejected": 3.0814173221588135, + "logps/chosen": -350.17645263671875, + "logps/rejected": -369.6962890625, + "loss": 0.1916, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.28973388671875, + "rewards/margins": 5.433933258056641, + "rewards/rejected": -7.723667144775391, + "step": 39780 + }, + { + "epoch": 1.2969344445259101, + "grad_norm": 0.1717275083065033, + "learning_rate": 2.8395303114239473e-05, + "logits/chosen": 2.852304697036743, + "logits/rejected": 2.991739273071289, + "logps/chosen": -355.229248046875, + "logps/rejected": -363.5116882324219, + "loss": 0.405, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.194913625717163, + "rewards/margins": 4.499933242797852, + "rewards/rejected": -7.694847106933594, + "step": 39800 + }, + { + "epoch": 1.2975861703774307, + "grad_norm": 0.6007089018821716, + "learning_rate": 2.8384440751241027e-05, + "logits/chosen": 3.0476877689361572, + "logits/rejected": 3.062844753265381, + "logps/chosen": -361.4922790527344, + "logps/rejected": -362.9143371582031, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9241766929626465, + "rewards/margins": 4.721526622772217, + "rewards/rejected": -7.6457037925720215, + "step": 39820 + }, + { + "epoch": 1.2982378962289514, + "grad_norm": 1.5537328720092773, + "learning_rate": 2.8373578388242578e-05, + "logits/chosen": 2.4134716987609863, + "logits/rejected": 2.6478140354156494, + "logps/chosen": -347.02362060546875, + "logps/rejected": -342.37176513671875, + "loss": 0.3633, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0489912033081055, + "rewards/margins": 4.668906211853027, + "rewards/rejected": -7.717896938323975, + "step": 39840 + }, + { + "epoch": 1.298889622080472, + "grad_norm": 6.181808948516846, + "learning_rate": 2.8362716025244136e-05, + "logits/chosen": 2.756727933883667, + "logits/rejected": 2.912684917449951, + "logps/chosen": -341.9676513671875, + "logps/rejected": -368.7874450683594, + "loss": 0.2884, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.836463212966919, + "rewards/margins": 4.85014009475708, + "rewards/rejected": -7.686603546142578, + "step": 39860 + }, + { + "epoch": 1.2995413479319924, + "grad_norm": 6.018069267272949, + "learning_rate": 2.8351853662245686e-05, + "logits/chosen": 3.396461009979248, + "logits/rejected": 3.2592549324035645, + "logps/chosen": -362.9842834472656, + "logps/rejected": -358.0769958496094, + "loss": 0.2963, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.256343126296997, + "rewards/margins": 4.945796012878418, + "rewards/rejected": -8.202138900756836, + "step": 39880 + }, + { + "epoch": 1.300193073783513, + "grad_norm": 2.4137370586395264, + "learning_rate": 2.8340991299247237e-05, + "logits/chosen": 3.014970064163208, + "logits/rejected": 3.033156633377075, + "logps/chosen": -373.9560546875, + "logps/rejected": -363.5077209472656, + "loss": 0.3451, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.160040855407715, + "rewards/margins": 4.117456436157227, + "rewards/rejected": -7.277496337890625, + "step": 39900 + }, + { + "epoch": 1.3008447996350334, + "grad_norm": 1.297493577003479, + "learning_rate": 2.8330128936248795e-05, + "logits/chosen": 2.738081455230713, + "logits/rejected": 2.9589130878448486, + "logps/chosen": -307.6200866699219, + "logps/rejected": -322.36578369140625, + "loss": 0.2417, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.963371992111206, + "rewards/margins": 4.406216144561768, + "rewards/rejected": -7.369588375091553, + "step": 39920 + }, + { + "epoch": 1.301496525486554, + "grad_norm": 4.3018059730529785, + "learning_rate": 2.8319266573250345e-05, + "logits/chosen": 2.925894260406494, + "logits/rejected": 3.0071399211883545, + "logps/chosen": -352.8347473144531, + "logps/rejected": -338.3848876953125, + "loss": 0.2373, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9487862586975098, + "rewards/margins": 4.457152366638184, + "rewards/rejected": -7.405938625335693, + "step": 39940 + }, + { + "epoch": 1.3021482513380747, + "grad_norm": 0.01820971816778183, + "learning_rate": 2.83084042102519e-05, + "logits/chosen": 3.0263800621032715, + "logits/rejected": 3.078989028930664, + "logps/chosen": -393.7946472167969, + "logps/rejected": -337.225830078125, + "loss": 0.2969, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.867924690246582, + "rewards/margins": 4.899261474609375, + "rewards/rejected": -7.767186641693115, + "step": 39960 + }, + { + "epoch": 1.3027999771895953, + "grad_norm": 0.622330904006958, + "learning_rate": 2.829754184725345e-05, + "logits/chosen": 2.880746364593506, + "logits/rejected": 3.1491808891296387, + "logps/chosen": -351.72265625, + "logps/rejected": -320.53912353515625, + "loss": 0.2464, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.750811815261841, + "rewards/margins": 4.762175559997559, + "rewards/rejected": -7.5129876136779785, + "step": 39980 + }, + { + "epoch": 1.3034517030411157, + "grad_norm": 3.1483891010284424, + "learning_rate": 2.8286679484255008e-05, + "logits/chosen": 3.2083098888397217, + "logits/rejected": 3.1547188758850098, + "logps/chosen": -366.90179443359375, + "logps/rejected": -382.16949462890625, + "loss": 0.3078, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.4626259803771973, + "rewards/margins": 4.24493408203125, + "rewards/rejected": -7.707559108734131, + "step": 40000 + }, + { + "epoch": 1.3041034288926363, + "grad_norm": 6.377371788024902, + "learning_rate": 2.827581712125656e-05, + "logits/chosen": 3.1092658042907715, + "logits/rejected": 3.096174716949463, + "logps/chosen": -341.87371826171875, + "logps/rejected": -315.26702880859375, + "loss": 0.2255, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.475673198699951, + "rewards/margins": 4.308002471923828, + "rewards/rejected": -7.7836761474609375, + "step": 40020 + }, + { + "epoch": 1.304755154744157, + "grad_norm": 1.8647098541259766, + "learning_rate": 2.826495475825811e-05, + "logits/chosen": 2.966322660446167, + "logits/rejected": 2.9698925018310547, + "logps/chosen": -371.0279541015625, + "logps/rejected": -336.2210998535156, + "loss": 0.3909, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.9243645668029785, + "rewards/margins": 4.072884559631348, + "rewards/rejected": -6.997249603271484, + "step": 40040 + }, + { + "epoch": 1.3054068805956773, + "grad_norm": 2.068396806716919, + "learning_rate": 2.8254092395259667e-05, + "logits/chosen": 3.040269374847412, + "logits/rejected": 3.1662516593933105, + "logps/chosen": -349.86456298828125, + "logps/rejected": -327.59197998046875, + "loss": 0.5003, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1147289276123047, + "rewards/margins": 4.616648197174072, + "rewards/rejected": -7.731377601623535, + "step": 40060 + }, + { + "epoch": 1.306058606447198, + "grad_norm": 1.2001621723175049, + "learning_rate": 2.824323003226122e-05, + "logits/chosen": 3.017120838165283, + "logits/rejected": 3.079986572265625, + "logps/chosen": -367.0091857910156, + "logps/rejected": -356.2591857910156, + "loss": 0.2732, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.4606597423553467, + "rewards/margins": 4.163954734802246, + "rewards/rejected": -6.6246137619018555, + "step": 40080 + }, + { + "epoch": 1.3067103322987186, + "grad_norm": 3.780017852783203, + "learning_rate": 2.8232367669262772e-05, + "logits/chosen": 2.991868495941162, + "logits/rejected": 3.0272679328918457, + "logps/chosen": -358.5476989746094, + "logps/rejected": -346.9438781738281, + "loss": 0.2272, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.215886354446411, + "rewards/margins": 4.554275035858154, + "rewards/rejected": -7.7701616287231445, + "step": 40100 + }, + { + "epoch": 1.3073620581502392, + "grad_norm": 4.70280647277832, + "learning_rate": 2.822150530626433e-05, + "logits/chosen": 3.107456922531128, + "logits/rejected": 3.2728805541992188, + "logps/chosen": -334.0258483886719, + "logps/rejected": -331.0494079589844, + "loss": 0.1974, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6107234954833984, + "rewards/margins": 4.769770622253418, + "rewards/rejected": -7.380494117736816, + "step": 40120 + }, + { + "epoch": 1.3080137840017596, + "grad_norm": 9.000246047973633, + "learning_rate": 2.821064294326588e-05, + "logits/chosen": 3.04085111618042, + "logits/rejected": 3.1418395042419434, + "logps/chosen": -370.57928466796875, + "logps/rejected": -369.90020751953125, + "loss": 0.1555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.866827964782715, + "rewards/margins": 5.335010528564453, + "rewards/rejected": -8.201838493347168, + "step": 40140 + }, + { + "epoch": 1.3086655098532802, + "grad_norm": 2.280258893966675, + "learning_rate": 2.819978058026743e-05, + "logits/chosen": 2.853182077407837, + "logits/rejected": 2.9997611045837402, + "logps/chosen": -319.4559326171875, + "logps/rejected": -309.7649230957031, + "loss": 0.4044, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6221251487731934, + "rewards/margins": 3.966308116912842, + "rewards/rejected": -6.588433742523193, + "step": 40160 + }, + { + "epoch": 1.3093172357048009, + "grad_norm": 2.1411380767822266, + "learning_rate": 2.8188918217268982e-05, + "logits/chosen": 3.2737762928009033, + "logits/rejected": 3.301731586456299, + "logps/chosen": -379.7537841796875, + "logps/rejected": -354.9706115722656, + "loss": 0.3623, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.10077166557312, + "rewards/margins": 4.6023149490356445, + "rewards/rejected": -7.703086853027344, + "step": 40180 + }, + { + "epoch": 1.3099689615563213, + "grad_norm": 9.098488807678223, + "learning_rate": 2.817805585427054e-05, + "logits/chosen": 2.978977680206299, + "logits/rejected": 3.335749864578247, + "logps/chosen": -354.8219299316406, + "logps/rejected": -345.4627380371094, + "loss": 0.296, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.600487232208252, + "rewards/margins": 3.8423123359680176, + "rewards/rejected": -6.4427995681762695, + "step": 40200 + }, + { + "epoch": 1.3106206874078419, + "grad_norm": 8.488588333129883, + "learning_rate": 2.8167193491272094e-05, + "logits/chosen": 3.149914264678955, + "logits/rejected": 3.200421094894409, + "logps/chosen": -383.89935302734375, + "logps/rejected": -363.67034912109375, + "loss": 0.4276, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9233176708221436, + "rewards/margins": 4.386472702026367, + "rewards/rejected": -7.30979061126709, + "step": 40220 + }, + { + "epoch": 1.3112724132593625, + "grad_norm": 0.5160138607025146, + "learning_rate": 2.8156331128273645e-05, + "logits/chosen": 2.9229400157928467, + "logits/rejected": 3.1931228637695312, + "logps/chosen": -374.69891357421875, + "logps/rejected": -359.2183837890625, + "loss": 0.2375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.603961229324341, + "rewards/margins": 4.099138259887695, + "rewards/rejected": -6.703099727630615, + "step": 40240 + }, + { + "epoch": 1.3119241391108831, + "grad_norm": 0.0958750769495964, + "learning_rate": 2.8145468765275202e-05, + "logits/chosen": 3.0825412273406982, + "logits/rejected": 3.1181106567382812, + "logps/chosen": -373.5565490722656, + "logps/rejected": -350.17535400390625, + "loss": 0.3218, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.930208206176758, + "rewards/margins": 5.27905797958374, + "rewards/rejected": -8.209266662597656, + "step": 40260 + }, + { + "epoch": 1.3125758649624035, + "grad_norm": 1.7550573348999023, + "learning_rate": 2.8134606402276753e-05, + "logits/chosen": 3.207322597503662, + "logits/rejected": 3.2355198860168457, + "logps/chosen": -388.7909240722656, + "logps/rejected": -382.5318298339844, + "loss": 0.2764, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.393998622894287, + "rewards/margins": 4.121576309204102, + "rewards/rejected": -6.5155744552612305, + "step": 40280 + }, + { + "epoch": 1.3132275908139242, + "grad_norm": 0.059675779193639755, + "learning_rate": 2.8123744039278304e-05, + "logits/chosen": 3.1281065940856934, + "logits/rejected": 3.018996000289917, + "logps/chosen": -364.18304443359375, + "logps/rejected": -321.11920166015625, + "loss": 0.387, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2324891090393066, + "rewards/margins": 3.661998748779297, + "rewards/rejected": -6.8944878578186035, + "step": 40300 + }, + { + "epoch": 1.3138793166654446, + "grad_norm": 8.088929176330566, + "learning_rate": 2.811288167627986e-05, + "logits/chosen": 3.2364399433135986, + "logits/rejected": 3.1998839378356934, + "logps/chosen": -374.93328857421875, + "logps/rejected": -354.1737060546875, + "loss": 0.2494, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.6421617269515991, + "rewards/margins": 5.084616661071777, + "rewards/rejected": -6.726778507232666, + "step": 40320 + }, + { + "epoch": 1.3145310425169652, + "grad_norm": 5.0946502685546875, + "learning_rate": 2.8102019313281412e-05, + "logits/chosen": 3.016416311264038, + "logits/rejected": 3.042985677719116, + "logps/chosen": -358.5484924316406, + "logps/rejected": -330.51904296875, + "loss": 0.2843, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7991771697998047, + "rewards/margins": 4.104367256164551, + "rewards/rejected": -7.9035444259643555, + "step": 40340 + }, + { + "epoch": 1.3151827683684858, + "grad_norm": 1.656697392463684, + "learning_rate": 2.8091156950282966e-05, + "logits/chosen": 2.783447265625, + "logits/rejected": 3.083613872528076, + "logps/chosen": -352.43731689453125, + "logps/rejected": -360.0465393066406, + "loss": 0.3315, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.875154972076416, + "rewards/margins": 4.084954261779785, + "rewards/rejected": -6.960108757019043, + "step": 40360 + }, + { + "epoch": 1.3158344942200064, + "grad_norm": 7.835024833679199, + "learning_rate": 2.8080294587284517e-05, + "logits/chosen": 3.0390541553497314, + "logits/rejected": 3.0448880195617676, + "logps/chosen": -375.33984375, + "logps/rejected": -362.2414855957031, + "loss": 0.2923, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.481131076812744, + "rewards/margins": 4.152714729309082, + "rewards/rejected": -7.633846282958984, + "step": 40380 + }, + { + "epoch": 1.316486220071527, + "grad_norm": 3.005847215652466, + "learning_rate": 2.8069432224286075e-05, + "logits/chosen": 2.8914780616760254, + "logits/rejected": 2.9603488445281982, + "logps/chosen": -358.39764404296875, + "logps/rejected": -365.69940185546875, + "loss": 0.2885, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.849144458770752, + "rewards/margins": 4.812201499938965, + "rewards/rejected": -7.661346435546875, + "step": 40400 + }, + { + "epoch": 1.3171379459230474, + "grad_norm": 2.6800882816314697, + "learning_rate": 2.8058569861287625e-05, + "logits/chosen": 2.7103238105773926, + "logits/rejected": 2.799264907836914, + "logps/chosen": -354.9295959472656, + "logps/rejected": -320.63067626953125, + "loss": 0.2435, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.153273820877075, + "rewards/margins": 4.695878505706787, + "rewards/rejected": -7.849152565002441, + "step": 40420 + }, + { + "epoch": 1.317789671774568, + "grad_norm": 3.5696635246276855, + "learning_rate": 2.8047707498289176e-05, + "logits/chosen": 3.0260348320007324, + "logits/rejected": 3.254439115524292, + "logps/chosen": -330.26580810546875, + "logps/rejected": -369.06353759765625, + "loss": 0.3091, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2655715942382812, + "rewards/margins": 3.8891186714172363, + "rewards/rejected": -7.154690742492676, + "step": 40440 + }, + { + "epoch": 1.3184413976260885, + "grad_norm": 6.5467939376831055, + "learning_rate": 2.8036845135290734e-05, + "logits/chosen": 2.807978868484497, + "logits/rejected": 3.001674175262451, + "logps/chosen": -355.5603942871094, + "logps/rejected": -370.74261474609375, + "loss": 0.3029, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.393153667449951, + "rewards/margins": 4.793345928192139, + "rewards/rejected": -8.186498641967773, + "step": 40460 + }, + { + "epoch": 1.319093123477609, + "grad_norm": 3.2517812252044678, + "learning_rate": 2.8025982772292288e-05, + "logits/chosen": 3.235708236694336, + "logits/rejected": 3.2123970985412598, + "logps/chosen": -327.94134521484375, + "logps/rejected": -353.469970703125, + "loss": 0.3273, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.437953233718872, + "rewards/margins": 3.8746352195739746, + "rewards/rejected": -7.312589168548584, + "step": 40480 + }, + { + "epoch": 1.3197448493291297, + "grad_norm": 1.138706922531128, + "learning_rate": 2.801512040929384e-05, + "logits/chosen": 3.245647430419922, + "logits/rejected": 3.351475954055786, + "logps/chosen": -371.32843017578125, + "logps/rejected": -350.5782775878906, + "loss": 0.1778, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.091081142425537, + "rewards/margins": 4.681506156921387, + "rewards/rejected": -7.772587776184082, + "step": 40500 + }, + { + "epoch": 1.3203965751806503, + "grad_norm": 2.8608763217926025, + "learning_rate": 2.8004258046295396e-05, + "logits/chosen": 3.0135676860809326, + "logits/rejected": 3.342933177947998, + "logps/chosen": -369.6342468261719, + "logps/rejected": -376.0373840332031, + "loss": 0.1751, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.351121187210083, + "rewards/margins": 4.991515636444092, + "rewards/rejected": -8.342637062072754, + "step": 40520 + }, + { + "epoch": 1.3210483010321707, + "grad_norm": 2.0671398639678955, + "learning_rate": 2.7993395683296947e-05, + "logits/chosen": 2.822144031524658, + "logits/rejected": 2.818068742752075, + "logps/chosen": -353.3376770019531, + "logps/rejected": -370.88323974609375, + "loss": 0.3669, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4896316528320312, + "rewards/margins": 4.3933424949646, + "rewards/rejected": -7.882974147796631, + "step": 40540 + }, + { + "epoch": 1.3217000268836914, + "grad_norm": 6.599484443664551, + "learning_rate": 2.7982533320298498e-05, + "logits/chosen": 2.9927256107330322, + "logits/rejected": 3.232419490814209, + "logps/chosen": -363.68743896484375, + "logps/rejected": -353.02459716796875, + "loss": 0.3198, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2933640480041504, + "rewards/margins": 4.417301177978516, + "rewards/rejected": -7.710665225982666, + "step": 40560 + }, + { + "epoch": 1.322351752735212, + "grad_norm": 1.8370550870895386, + "learning_rate": 2.797167095730005e-05, + "logits/chosen": 2.957577705383301, + "logits/rejected": 3.168661117553711, + "logps/chosen": -358.48712158203125, + "logps/rejected": -340.9774169921875, + "loss": 0.2421, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6639797687530518, + "rewards/margins": 4.581272125244141, + "rewards/rejected": -8.245251655578613, + "step": 40580 + }, + { + "epoch": 1.3230034785867324, + "grad_norm": 1.922440528869629, + "learning_rate": 2.7960808594301606e-05, + "logits/chosen": 2.980095624923706, + "logits/rejected": 3.165741443634033, + "logps/chosen": -360.35302734375, + "logps/rejected": -353.3277893066406, + "loss": 0.3242, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.9805305004119873, + "rewards/margins": 4.489892482757568, + "rewards/rejected": -8.470422744750977, + "step": 40600 + }, + { + "epoch": 1.323655204438253, + "grad_norm": 0.7803003787994385, + "learning_rate": 2.794994623130316e-05, + "logits/chosen": 2.9751534461975098, + "logits/rejected": 2.9324798583984375, + "logps/chosen": -376.6629943847656, + "logps/rejected": -356.95867919921875, + "loss": 0.1832, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.566023349761963, + "rewards/margins": 4.9369215965271, + "rewards/rejected": -8.502943992614746, + "step": 40620 + }, + { + "epoch": 1.3243069302897736, + "grad_norm": 0.9725908637046814, + "learning_rate": 2.793908386830471e-05, + "logits/chosen": 3.1739203929901123, + "logits/rejected": 3.1897130012512207, + "logps/chosen": -335.8782958984375, + "logps/rejected": -328.1597595214844, + "loss": 0.2492, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.930744171142578, + "rewards/margins": 4.581047534942627, + "rewards/rejected": -7.511791229248047, + "step": 40640 + }, + { + "epoch": 1.3249586561412943, + "grad_norm": 0.193658247590065, + "learning_rate": 2.792822150530627e-05, + "logits/chosen": 3.103778600692749, + "logits/rejected": 2.9909985065460205, + "logps/chosen": -412.28912353515625, + "logps/rejected": -347.57647705078125, + "loss": 0.3342, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.5225117206573486, + "rewards/margins": 5.085416793823242, + "rewards/rejected": -8.607928276062012, + "step": 40660 + }, + { + "epoch": 1.3256103819928147, + "grad_norm": 6.917929172515869, + "learning_rate": 2.791735914230782e-05, + "logits/chosen": 3.5167129039764404, + "logits/rejected": 3.4998507499694824, + "logps/chosen": -399.11846923828125, + "logps/rejected": -345.297607421875, + "loss": 0.4153, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7922253608703613, + "rewards/margins": 4.672554016113281, + "rewards/rejected": -7.464779853820801, + "step": 40680 + }, + { + "epoch": 1.3262621078443353, + "grad_norm": 5.646677017211914, + "learning_rate": 2.790649677930937e-05, + "logits/chosen": 3.404322862625122, + "logits/rejected": 3.429084300994873, + "logps/chosen": -342.1352844238281, + "logps/rejected": -317.3875427246094, + "loss": 0.289, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0670418739318848, + "rewards/margins": 3.8634536266326904, + "rewards/rejected": -6.930495262145996, + "step": 40700 + }, + { + "epoch": 1.326913833695856, + "grad_norm": 2.5879769325256348, + "learning_rate": 2.7895634416310924e-05, + "logits/chosen": 3.128272533416748, + "logits/rejected": 3.283323287963867, + "logps/chosen": -376.3011169433594, + "logps/rejected": -368.7590637207031, + "loss": 0.1793, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9816932678222656, + "rewards/margins": 5.099911689758301, + "rewards/rejected": -8.081605911254883, + "step": 40720 + }, + { + "epoch": 1.3275655595473763, + "grad_norm": 4.338809490203857, + "learning_rate": 2.788477205331248e-05, + "logits/chosen": 3.161386728286743, + "logits/rejected": 3.150158405303955, + "logps/chosen": -385.8524169921875, + "logps/rejected": -414.55682373046875, + "loss": 0.2276, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.281731367111206, + "rewards/margins": 5.741702556610107, + "rewards/rejected": -9.02343463897705, + "step": 40740 + }, + { + "epoch": 1.328217285398897, + "grad_norm": 1.6657506227493286, + "learning_rate": 2.7873909690314033e-05, + "logits/chosen": 3.0940659046173096, + "logits/rejected": 3.270146608352661, + "logps/chosen": -377.55255126953125, + "logps/rejected": -355.08428955078125, + "loss": 0.257, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3581345081329346, + "rewards/margins": 4.145219326019287, + "rewards/rejected": -7.503354072570801, + "step": 40760 + }, + { + "epoch": 1.3288690112504176, + "grad_norm": 1.9503017663955688, + "learning_rate": 2.7863047327315583e-05, + "logits/chosen": 2.9611103534698486, + "logits/rejected": 2.8915934562683105, + "logps/chosen": -343.75927734375, + "logps/rejected": -362.6438903808594, + "loss": 0.136, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3818836212158203, + "rewards/margins": 4.745660305023193, + "rewards/rejected": -8.127543449401855, + "step": 40780 + }, + { + "epoch": 1.3295207371019382, + "grad_norm": 1.128928542137146, + "learning_rate": 2.785218496431714e-05, + "logits/chosen": 3.0042920112609863, + "logits/rejected": 3.1563172340393066, + "logps/chosen": -329.3778381347656, + "logps/rejected": -343.4700927734375, + "loss": 0.2318, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7789735794067383, + "rewards/margins": 5.066990852355957, + "rewards/rejected": -7.845963954925537, + "step": 40800 + }, + { + "epoch": 1.3301724629534586, + "grad_norm": 1.2602039575576782, + "learning_rate": 2.7841322601318692e-05, + "logits/chosen": 2.864191770553589, + "logits/rejected": 3.146336078643799, + "logps/chosen": -344.72686767578125, + "logps/rejected": -353.44293212890625, + "loss": 0.2467, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.152226686477661, + "rewards/margins": 4.82366418838501, + "rewards/rejected": -7.975890159606934, + "step": 40820 + }, + { + "epoch": 1.3308241888049792, + "grad_norm": 6.194849967956543, + "learning_rate": 2.7830460238320243e-05, + "logits/chosen": 2.6844277381896973, + "logits/rejected": 2.8122897148132324, + "logps/chosen": -351.733154296875, + "logps/rejected": -335.72198486328125, + "loss": 0.2827, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.9312243461608887, + "rewards/margins": 4.373991966247559, + "rewards/rejected": -7.305216312408447, + "step": 40840 + }, + { + "epoch": 1.3314759146564996, + "grad_norm": 0.13243260979652405, + "learning_rate": 2.782014099347172e-05, + "logits/chosen": 2.4933762550354004, + "logits/rejected": 2.7623343467712402, + "logps/chosen": -327.6114807128906, + "logps/rejected": -336.36517333984375, + "loss": 0.4027, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.651078701019287, + "rewards/margins": 4.76001501083374, + "rewards/rejected": -8.411093711853027, + "step": 40860 + }, + { + "epoch": 1.3321276405080202, + "grad_norm": 0.39418646693229675, + "learning_rate": 2.780927863047327e-05, + "logits/chosen": 2.775278091430664, + "logits/rejected": 2.8721203804016113, + "logps/chosen": -359.6840515136719, + "logps/rejected": -375.21624755859375, + "loss": 0.1715, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.747351884841919, + "rewards/margins": 4.664284706115723, + "rewards/rejected": -7.411635398864746, + "step": 40880 + }, + { + "epoch": 1.3327793663595409, + "grad_norm": 1.3592230081558228, + "learning_rate": 2.779841626747483e-05, + "logits/chosen": 2.9649555683135986, + "logits/rejected": 2.990971326828003, + "logps/chosen": -403.91912841796875, + "logps/rejected": -344.66229248046875, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2738945484161377, + "rewards/margins": 3.817993640899658, + "rewards/rejected": -7.091888427734375, + "step": 40900 + }, + { + "epoch": 1.3334310922110615, + "grad_norm": 3.2031497955322266, + "learning_rate": 2.7787553904476383e-05, + "logits/chosen": 2.8968794345855713, + "logits/rejected": 2.8880696296691895, + "logps/chosen": -352.2283630371094, + "logps/rejected": -360.6944885253906, + "loss": 0.2301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.66400146484375, + "rewards/margins": 4.967472076416016, + "rewards/rejected": -8.631474494934082, + "step": 40920 + }, + { + "epoch": 1.334082818062582, + "grad_norm": 5.060016632080078, + "learning_rate": 2.7776691541477934e-05, + "logits/chosen": 3.0373737812042236, + "logits/rejected": 3.005910634994507, + "logps/chosen": -410.27520751953125, + "logps/rejected": -383.64459228515625, + "loss": 0.3117, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.103039741516113, + "rewards/margins": 4.5865373611450195, + "rewards/rejected": -8.689577102661133, + "step": 40940 + }, + { + "epoch": 1.3347345439141025, + "grad_norm": 0.12078012526035309, + "learning_rate": 2.7765829178479485e-05, + "logits/chosen": 2.6410253047943115, + "logits/rejected": 2.851996421813965, + "logps/chosen": -328.7090148925781, + "logps/rejected": -373.46282958984375, + "loss": 0.2754, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.0000319480896, + "rewards/margins": 4.694328308105469, + "rewards/rejected": -8.694360733032227, + "step": 40960 + }, + { + "epoch": 1.3353862697656231, + "grad_norm": 2.8700790405273438, + "learning_rate": 2.7754966815481043e-05, + "logits/chosen": 3.0088868141174316, + "logits/rejected": 2.9102165699005127, + "logps/chosen": -371.1498107910156, + "logps/rejected": -393.2100524902344, + "loss": 0.2757, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.178208827972412, + "rewards/margins": 4.806981086730957, + "rewards/rejected": -8.985190391540527, + "step": 40980 + }, + { + "epoch": 1.3360379956171435, + "grad_norm": 1.2361217737197876, + "learning_rate": 2.7744104452482593e-05, + "logits/chosen": 2.846691846847534, + "logits/rejected": 2.749922513961792, + "logps/chosen": -354.9847412109375, + "logps/rejected": -377.8529357910156, + "loss": 0.3629, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.561770439147949, + "rewards/margins": 3.7627155780792236, + "rewards/rejected": -8.324485778808594, + "step": 41000 + }, + { + "epoch": 1.3366897214686642, + "grad_norm": 0.3625097870826721, + "learning_rate": 2.7733242089484147e-05, + "logits/chosen": 2.713379383087158, + "logits/rejected": 3.019130229949951, + "logps/chosen": -383.2427673339844, + "logps/rejected": -353.2222900390625, + "loss": 0.3408, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.272407531738281, + "rewards/margins": 4.405542850494385, + "rewards/rejected": -8.677949905395508, + "step": 41020 + }, + { + "epoch": 1.3373414473201848, + "grad_norm": 4.198266506195068, + "learning_rate": 2.77223797264857e-05, + "logits/chosen": 2.526608467102051, + "logits/rejected": 2.5632307529449463, + "logps/chosen": -287.81219482421875, + "logps/rejected": -290.9226989746094, + "loss": 0.3221, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.306044101715088, + "rewards/margins": 3.9771618843078613, + "rewards/rejected": -7.283205986022949, + "step": 41040 + }, + { + "epoch": 1.3379931731717054, + "grad_norm": 1.8121780157089233, + "learning_rate": 2.7711517363487256e-05, + "logits/chosen": 2.776364326477051, + "logits/rejected": 2.9479079246520996, + "logps/chosen": -354.19622802734375, + "logps/rejected": -329.9374084472656, + "loss": 0.2552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.728738307952881, + "rewards/margins": 4.867438793182373, + "rewards/rejected": -8.59617805480957, + "step": 41060 + }, + { + "epoch": 1.3386448990232258, + "grad_norm": 6.064487934112549, + "learning_rate": 2.7700655000488807e-05, + "logits/chosen": 3.0803399085998535, + "logits/rejected": 3.2412147521972656, + "logps/chosen": -383.745849609375, + "logps/rejected": -358.17047119140625, + "loss": 0.3643, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5009052753448486, + "rewards/margins": 4.719305515289307, + "rewards/rejected": -8.220210075378418, + "step": 41080 + }, + { + "epoch": 1.3392966248747464, + "grad_norm": 0.7382254004478455, + "learning_rate": 2.7689792637490357e-05, + "logits/chosen": 2.921053409576416, + "logits/rejected": 2.921210765838623, + "logps/chosen": -337.3469543457031, + "logps/rejected": -396.3471374511719, + "loss": 0.5077, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.7460811138153076, + "rewards/margins": 4.132368087768555, + "rewards/rejected": -7.878449440002441, + "step": 41100 + }, + { + "epoch": 1.339948350726267, + "grad_norm": 0.22742827236652374, + "learning_rate": 2.7678930274491915e-05, + "logits/chosen": 2.636559247970581, + "logits/rejected": 2.8374850749969482, + "logps/chosen": -338.341796875, + "logps/rejected": -365.95123291015625, + "loss": 0.2772, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3338465690612793, + "rewards/margins": 4.06803035736084, + "rewards/rejected": -7.401876926422119, + "step": 41120 + }, + { + "epoch": 1.3406000765777875, + "grad_norm": 2.164768934249878, + "learning_rate": 2.7668067911493466e-05, + "logits/chosen": 3.002100944519043, + "logits/rejected": 3.1672604084014893, + "logps/chosen": -410.5416564941406, + "logps/rejected": -382.50421142578125, + "loss": 0.2713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4311294555664062, + "rewards/margins": 4.762333869934082, + "rewards/rejected": -8.193463325500488, + "step": 41140 + }, + { + "epoch": 1.341251802429308, + "grad_norm": 0.8957540988922119, + "learning_rate": 2.765720554849502e-05, + "logits/chosen": 3.0395712852478027, + "logits/rejected": 3.1167359352111816, + "logps/chosen": -389.43804931640625, + "logps/rejected": -399.7729187011719, + "loss": 0.1209, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.181568145751953, + "rewards/margins": 4.947844505310059, + "rewards/rejected": -8.129411697387695, + "step": 41160 + }, + { + "epoch": 1.3419035282808287, + "grad_norm": 0.13415423035621643, + "learning_rate": 2.7646343185496577e-05, + "logits/chosen": 3.123321294784546, + "logits/rejected": 3.1273632049560547, + "logps/chosen": -387.0546875, + "logps/rejected": -372.92694091796875, + "loss": 0.2485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9359874725341797, + "rewards/margins": 4.417372703552246, + "rewards/rejected": -7.353359222412109, + "step": 41180 + }, + { + "epoch": 1.3425552541323493, + "grad_norm": 1.5591673851013184, + "learning_rate": 2.7635480822498128e-05, + "logits/chosen": 2.3329546451568604, + "logits/rejected": 2.558499336242676, + "logps/chosen": -305.4666442871094, + "logps/rejected": -328.4232177734375, + "loss": 0.235, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3983798027038574, + "rewards/margins": 4.382620334625244, + "rewards/rejected": -7.780999660491943, + "step": 41200 + }, + { + "epoch": 1.3432069799838697, + "grad_norm": 2.1749300956726074, + "learning_rate": 2.762461845949968e-05, + "logits/chosen": 2.6358513832092285, + "logits/rejected": 2.6878418922424316, + "logps/chosen": -339.06402587890625, + "logps/rejected": -325.87890625, + "loss": 0.3933, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.503736972808838, + "rewards/margins": 3.8236701488494873, + "rewards/rejected": -7.3274078369140625, + "step": 41220 + }, + { + "epoch": 1.3438587058353904, + "grad_norm": 0.5528323650360107, + "learning_rate": 2.7613756096501237e-05, + "logits/chosen": 3.0677857398986816, + "logits/rejected": 3.224860668182373, + "logps/chosen": -397.2816162109375, + "logps/rejected": -365.7948303222656, + "loss": 0.1747, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1644110679626465, + "rewards/margins": 4.729699611663818, + "rewards/rejected": -7.894110679626465, + "step": 41240 + }, + { + "epoch": 1.344510431686911, + "grad_norm": 6.614677906036377, + "learning_rate": 2.7602893733502787e-05, + "logits/chosen": 2.7263052463531494, + "logits/rejected": 2.6809182167053223, + "logps/chosen": -335.0800476074219, + "logps/rejected": -368.3699645996094, + "loss": 0.1622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.612136125564575, + "rewards/margins": 4.810025691986084, + "rewards/rejected": -8.422162055969238, + "step": 41260 + }, + { + "epoch": 1.3451621575384314, + "grad_norm": 5.0287957191467285, + "learning_rate": 2.7592031370504338e-05, + "logits/chosen": 3.2603659629821777, + "logits/rejected": 3.3501839637756348, + "logps/chosen": -391.7828063964844, + "logps/rejected": -383.04364013671875, + "loss": 0.2356, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.6311447620391846, + "rewards/margins": 4.780065536499023, + "rewards/rejected": -8.411211013793945, + "step": 41280 + }, + { + "epoch": 1.345813883389952, + "grad_norm": 1.2771127223968506, + "learning_rate": 2.7581169007505892e-05, + "logits/chosen": 3.108579635620117, + "logits/rejected": 2.9887821674346924, + "logps/chosen": -397.39251708984375, + "logps/rejected": -396.4506530761719, + "loss": 0.1888, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.459815263748169, + "rewards/margins": 5.009576797485352, + "rewards/rejected": -8.469392776489258, + "step": 41300 + }, + { + "epoch": 1.3464656092414726, + "grad_norm": 0.26128044724464417, + "learning_rate": 2.757030664450745e-05, + "logits/chosen": 2.8534934520721436, + "logits/rejected": 2.9090054035186768, + "logps/chosen": -309.0020751953125, + "logps/rejected": -329.0263977050781, + "loss": 0.4158, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5750362873077393, + "rewards/margins": 3.4345946311950684, + "rewards/rejected": -7.0096306800842285, + "step": 41320 + }, + { + "epoch": 1.3471173350929933, + "grad_norm": 2.505251169204712, + "learning_rate": 2.7559444281509e-05, + "logits/chosen": 3.0811827182769775, + "logits/rejected": 3.0076799392700195, + "logps/chosen": -409.2415466308594, + "logps/rejected": -383.0291442871094, + "loss": 0.4297, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.5064339637756348, + "rewards/margins": 5.000086784362793, + "rewards/rejected": -8.50652027130127, + "step": 41340 + }, + { + "epoch": 1.3477690609445137, + "grad_norm": 0.5470070838928223, + "learning_rate": 2.754858191851055e-05, + "logits/chosen": 2.871912956237793, + "logits/rejected": 2.8303399085998535, + "logps/chosen": -325.07672119140625, + "logps/rejected": -353.49517822265625, + "loss": 0.2777, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3520236015319824, + "rewards/margins": 4.836073875427246, + "rewards/rejected": -8.18809700012207, + "step": 41360 + }, + { + "epoch": 1.3484207867960343, + "grad_norm": 0.7581974267959595, + "learning_rate": 2.753771955551211e-05, + "logits/chosen": 2.939462184906006, + "logits/rejected": 2.9615044593811035, + "logps/chosen": -375.709716796875, + "logps/rejected": -344.73797607421875, + "loss": 0.4168, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6849803924560547, + "rewards/margins": 4.830596923828125, + "rewards/rejected": -8.51557731628418, + "step": 41380 + }, + { + "epoch": 1.3490725126475547, + "grad_norm": 6.700002670288086, + "learning_rate": 2.752685719251366e-05, + "logits/chosen": 2.7415833473205566, + "logits/rejected": 2.83575439453125, + "logps/chosen": -338.8346252441406, + "logps/rejected": -336.6610107421875, + "loss": 0.3699, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.7002692222595215, + "rewards/margins": 4.185259819030762, + "rewards/rejected": -7.885529518127441, + "step": 41400 + }, + { + "epoch": 1.3497242384990753, + "grad_norm": 5.212071895599365, + "learning_rate": 2.7515994829515214e-05, + "logits/chosen": 3.090916395187378, + "logits/rejected": 3.051856517791748, + "logps/chosen": -405.22088623046875, + "logps/rejected": -380.11614990234375, + "loss": 0.4396, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0391459465026855, + "rewards/margins": 3.574734926223755, + "rewards/rejected": -6.6138811111450195, + "step": 41420 + }, + { + "epoch": 1.350375964350596, + "grad_norm": 0.5975571274757385, + "learning_rate": 2.7505132466516768e-05, + "logits/chosen": 2.948279619216919, + "logits/rejected": 2.7182021141052246, + "logps/chosen": -312.4766540527344, + "logps/rejected": -338.79876708984375, + "loss": 0.2786, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5877087116241455, + "rewards/margins": 4.233839511871338, + "rewards/rejected": -6.821547508239746, + "step": 41440 + }, + { + "epoch": 1.3510276902021165, + "grad_norm": 0.011383896693587303, + "learning_rate": 2.7494270103518322e-05, + "logits/chosen": 2.9498493671417236, + "logits/rejected": 3.071035623550415, + "logps/chosen": -324.8909912109375, + "logps/rejected": -360.47259521484375, + "loss": 0.2034, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.646422863006592, + "rewards/margins": 5.047552108764648, + "rewards/rejected": -7.693975925445557, + "step": 41460 + }, + { + "epoch": 1.3516794160536372, + "grad_norm": 1.0308399200439453, + "learning_rate": 2.7483407740519873e-05, + "logits/chosen": 3.0801162719726562, + "logits/rejected": 3.1337666511535645, + "logps/chosen": -326.7474365234375, + "logps/rejected": -323.98028564453125, + "loss": 0.3919, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.176401376724243, + "rewards/margins": 4.1783037185668945, + "rewards/rejected": -7.354704856872559, + "step": 41480 + }, + { + "epoch": 1.3523311419051576, + "grad_norm": 3.2000982761383057, + "learning_rate": 2.7472545377521424e-05, + "logits/chosen": 3.0648350715637207, + "logits/rejected": 3.0841667652130127, + "logps/chosen": -341.22991943359375, + "logps/rejected": -331.75238037109375, + "loss": 0.2016, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.9218478202819824, + "rewards/margins": 4.5360002517700195, + "rewards/rejected": -7.457847595214844, + "step": 41500 + }, + { + "epoch": 1.3529828677566782, + "grad_norm": 2.075136661529541, + "learning_rate": 2.746168301452298e-05, + "logits/chosen": 3.1177639961242676, + "logits/rejected": 3.2162654399871826, + "logps/chosen": -353.83648681640625, + "logps/rejected": -325.277587890625, + "loss": 0.1838, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8256258964538574, + "rewards/margins": 4.572648525238037, + "rewards/rejected": -7.3982744216918945, + "step": 41520 + }, + { + "epoch": 1.3536345936081986, + "grad_norm": 2.1902828216552734, + "learning_rate": 2.7450820651524532e-05, + "logits/chosen": 2.8470699787139893, + "logits/rejected": 2.978854179382324, + "logps/chosen": -339.31768798828125, + "logps/rejected": -316.53369140625, + "loss": 0.2364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.082859754562378, + "rewards/margins": 4.403314113616943, + "rewards/rejected": -7.4861741065979, + "step": 41540 + }, + { + "epoch": 1.3542863194597192, + "grad_norm": 16.79722023010254, + "learning_rate": 2.7439958288526086e-05, + "logits/chosen": 2.6731464862823486, + "logits/rejected": 2.932197093963623, + "logps/chosen": -313.0052795410156, + "logps/rejected": -319.26617431640625, + "loss": 0.3471, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.014261245727539, + "rewards/margins": 3.6495883464813232, + "rewards/rejected": -6.663848876953125, + "step": 41560 + }, + { + "epoch": 1.3549380453112398, + "grad_norm": 2.869222640991211, + "learning_rate": 2.7429095925527644e-05, + "logits/chosen": 2.6891677379608154, + "logits/rejected": 2.7878708839416504, + "logps/chosen": -366.68463134765625, + "logps/rejected": -382.8919982910156, + "loss": 0.231, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.920161724090576, + "rewards/margins": 4.859793186187744, + "rewards/rejected": -7.779955863952637, + "step": 41580 + }, + { + "epoch": 1.3555897711627605, + "grad_norm": 2.4376304149627686, + "learning_rate": 2.7418233562529195e-05, + "logits/chosen": 3.2136013507843018, + "logits/rejected": 3.3619327545166016, + "logps/chosen": -361.27960205078125, + "logps/rejected": -361.3377380371094, + "loss": 0.3872, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.724745273590088, + "rewards/margins": 3.787281036376953, + "rewards/rejected": -7.512026786804199, + "step": 41600 + }, + { + "epoch": 1.3562414970142809, + "grad_norm": 1.866412878036499, + "learning_rate": 2.7407371199530746e-05, + "logits/chosen": 2.9895219802856445, + "logits/rejected": 3.111166477203369, + "logps/chosen": -370.8126525878906, + "logps/rejected": -339.55328369140625, + "loss": 0.4635, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.668499708175659, + "rewards/margins": 4.232931137084961, + "rewards/rejected": -6.901431083679199, + "step": 41620 + }, + { + "epoch": 1.3568932228658015, + "grad_norm": 2.0878660678863525, + "learning_rate": 2.7396508836532303e-05, + "logits/chosen": 3.2897086143493652, + "logits/rejected": 3.168671131134033, + "logps/chosen": -389.95977783203125, + "logps/rejected": -360.8770446777344, + "loss": 0.2811, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7141597270965576, + "rewards/margins": 4.864468574523926, + "rewards/rejected": -7.578627586364746, + "step": 41640 + }, + { + "epoch": 1.3575449487173221, + "grad_norm": 1.9591472148895264, + "learning_rate": 2.7385646473533854e-05, + "logits/chosen": 3.0397067070007324, + "logits/rejected": 3.2255420684814453, + "logps/chosen": -365.9883728027344, + "logps/rejected": -414.59405517578125, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.949267864227295, + "rewards/margins": 4.349067687988281, + "rewards/rejected": -7.298335075378418, + "step": 41660 + }, + { + "epoch": 1.3581966745688425, + "grad_norm": 0.8924974799156189, + "learning_rate": 2.7374784110535408e-05, + "logits/chosen": 2.6933224201202393, + "logits/rejected": 2.9423205852508545, + "logps/chosen": -326.7147216796875, + "logps/rejected": -312.8984680175781, + "loss": 0.2533, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.819124698638916, + "rewards/margins": 3.800574779510498, + "rewards/rejected": -6.619699001312256, + "step": 41680 + }, + { + "epoch": 1.3588484004203631, + "grad_norm": 1.3121418952941895, + "learning_rate": 2.736392174753696e-05, + "logits/chosen": 3.2540135383605957, + "logits/rejected": 3.1483988761901855, + "logps/chosen": -380.9153747558594, + "logps/rejected": -308.11981201171875, + "loss": 0.241, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0511436462402344, + "rewards/margins": 4.146324634552002, + "rewards/rejected": -7.1974687576293945, + "step": 41700 + }, + { + "epoch": 1.3595001262718838, + "grad_norm": 1.981067419052124, + "learning_rate": 2.7353059384538516e-05, + "logits/chosen": 3.1643226146698, + "logits/rejected": 3.2852261066436768, + "logps/chosen": -394.6600036621094, + "logps/rejected": -325.2879333496094, + "loss": 0.2173, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1173856258392334, + "rewards/margins": 4.242886066436768, + "rewards/rejected": -7.360272407531738, + "step": 41720 + }, + { + "epoch": 1.3601518521234044, + "grad_norm": 0.08277080953121185, + "learning_rate": 2.7342197021540067e-05, + "logits/chosen": 2.8536524772644043, + "logits/rejected": 3.017230272293091, + "logps/chosen": -359.81939697265625, + "logps/rejected": -334.2613830566406, + "loss": 0.3527, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3594672679901123, + "rewards/margins": 4.0359601974487305, + "rewards/rejected": -7.395427703857422, + "step": 41740 + }, + { + "epoch": 1.3608035779749248, + "grad_norm": 0.3186678886413574, + "learning_rate": 2.7331334658541618e-05, + "logits/chosen": 2.803846836090088, + "logits/rejected": 2.966190814971924, + "logps/chosen": -348.906494140625, + "logps/rejected": -316.0068054199219, + "loss": 0.2645, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6054728031158447, + "rewards/margins": 3.895231246948242, + "rewards/rejected": -6.500704288482666, + "step": 41760 + }, + { + "epoch": 1.3614553038264454, + "grad_norm": 11.824504852294922, + "learning_rate": 2.7320472295543176e-05, + "logits/chosen": 2.9342474937438965, + "logits/rejected": 2.9594407081604004, + "logps/chosen": -333.2513122558594, + "logps/rejected": -309.7200622558594, + "loss": 0.4547, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.7727582454681396, + "rewards/margins": 3.3027777671813965, + "rewards/rejected": -6.075536251068115, + "step": 41780 + }, + { + "epoch": 1.362107029677966, + "grad_norm": 0.15874022245407104, + "learning_rate": 2.7309609932544726e-05, + "logits/chosen": 3.208888292312622, + "logits/rejected": 3.1932384967803955, + "logps/chosen": -398.28179931640625, + "logps/rejected": -345.9381103515625, + "loss": 0.1718, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9533259868621826, + "rewards/margins": 5.000319957733154, + "rewards/rejected": -6.9536452293396, + "step": 41800 + }, + { + "epoch": 1.3627587555294864, + "grad_norm": 2.8437886238098145, + "learning_rate": 2.729874756954628e-05, + "logits/chosen": 3.2111594676971436, + "logits/rejected": 3.2755565643310547, + "logps/chosen": -365.13031005859375, + "logps/rejected": -330.4754943847656, + "loss": 0.4079, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.625307559967041, + "rewards/margins": 3.7621726989746094, + "rewards/rejected": -6.387480735778809, + "step": 41820 + }, + { + "epoch": 1.363410481381007, + "grad_norm": 2.339174747467041, + "learning_rate": 2.7287885206547835e-05, + "logits/chosen": 3.2057011127471924, + "logits/rejected": 3.3301711082458496, + "logps/chosen": -364.2801818847656, + "logps/rejected": -364.36529541015625, + "loss": 0.3153, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.500654458999634, + "rewards/margins": 3.472233533859253, + "rewards/rejected": -5.972887992858887, + "step": 41840 + }, + { + "epoch": 1.3640622072325277, + "grad_norm": 2.1857504844665527, + "learning_rate": 2.727702284354939e-05, + "logits/chosen": 2.8863472938537598, + "logits/rejected": 3.0336155891418457, + "logps/chosen": -310.1482849121094, + "logps/rejected": -306.3955993652344, + "loss": 0.2344, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.089836597442627, + "rewards/margins": 4.214724063873291, + "rewards/rejected": -7.304560661315918, + "step": 41860 + }, + { + "epoch": 1.3647139330840483, + "grad_norm": 1.479887843132019, + "learning_rate": 2.726616048055094e-05, + "logits/chosen": 2.896310806274414, + "logits/rejected": 3.0405757427215576, + "logps/chosen": -351.8935546875, + "logps/rejected": -332.2619934082031, + "loss": 0.2688, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.5360965728759766, + "rewards/margins": 4.789463520050049, + "rewards/rejected": -7.325560092926025, + "step": 41880 + }, + { + "epoch": 1.3653656589355687, + "grad_norm": 5.277497291564941, + "learning_rate": 2.725529811755249e-05, + "logits/chosen": 3.2122597694396973, + "logits/rejected": 3.2152867317199707, + "logps/chosen": -303.3759460449219, + "logps/rejected": -287.2282409667969, + "loss": 0.3035, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0368735790252686, + "rewards/margins": 3.5752015113830566, + "rewards/rejected": -6.612074375152588, + "step": 41900 + }, + { + "epoch": 1.3660173847870893, + "grad_norm": 3.4508352279663086, + "learning_rate": 2.7244435754554048e-05, + "logits/chosen": 3.0851988792419434, + "logits/rejected": 3.1460304260253906, + "logps/chosen": -334.1946716308594, + "logps/rejected": -343.4989318847656, + "loss": 0.3912, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1459383964538574, + "rewards/margins": 4.4051008224487305, + "rewards/rejected": -7.551039218902588, + "step": 41920 + }, + { + "epoch": 1.3666691106386097, + "grad_norm": 0.1191866397857666, + "learning_rate": 2.72335733915556e-05, + "logits/chosen": 2.747447967529297, + "logits/rejected": 2.8278727531433105, + "logps/chosen": -317.83148193359375, + "logps/rejected": -304.41912841796875, + "loss": 0.3696, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.5189995765686035, + "rewards/margins": 3.410205364227295, + "rewards/rejected": -5.929204940795898, + "step": 41940 + }, + { + "epoch": 1.3673208364901304, + "grad_norm": 1.9110153913497925, + "learning_rate": 2.7222711028557153e-05, + "logits/chosen": 3.080918788909912, + "logits/rejected": 3.155748128890991, + "logps/chosen": -345.55181884765625, + "logps/rejected": -328.8233947753906, + "loss": 0.2346, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.2744007110595703, + "rewards/margins": 4.162424564361572, + "rewards/rejected": -6.436825752258301, + "step": 41960 + }, + { + "epoch": 1.367972562341651, + "grad_norm": 0.43657878041267395, + "learning_rate": 2.721184866555871e-05, + "logits/chosen": 3.1879096031188965, + "logits/rejected": 3.262448787689209, + "logps/chosen": -318.5672302246094, + "logps/rejected": -339.8133239746094, + "loss": 0.2963, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.4239354133605957, + "rewards/margins": 3.451326370239258, + "rewards/rejected": -5.8752617835998535, + "step": 41980 + }, + { + "epoch": 1.3686242881931716, + "grad_norm": 0.698229968547821, + "learning_rate": 2.720098630256026e-05, + "logits/chosen": 3.10184907913208, + "logits/rejected": 3.064955234527588, + "logps/chosen": -391.1082458496094, + "logps/rejected": -385.6929016113281, + "loss": 0.3112, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.8343396186828613, + "rewards/margins": 4.184619903564453, + "rewards/rejected": -7.018959045410156, + "step": 42000 + }, + { + "epoch": 1.3692760140446922, + "grad_norm": 5.170802116394043, + "learning_rate": 2.7190123939561812e-05, + "logits/chosen": 3.2124381065368652, + "logits/rejected": 3.275144100189209, + "logps/chosen": -388.9017028808594, + "logps/rejected": -368.0245056152344, + "loss": 0.3244, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.774019241333008, + "rewards/margins": 5.072519779205322, + "rewards/rejected": -7.846539497375488, + "step": 42020 + }, + { + "epoch": 1.3699277398962126, + "grad_norm": 0.43189576268196106, + "learning_rate": 2.717926157656337e-05, + "logits/chosen": 3.317038059234619, + "logits/rejected": 3.2792515754699707, + "logps/chosen": -359.13714599609375, + "logps/rejected": -342.3946838378906, + "loss": 0.1402, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5175940990448, + "rewards/margins": 4.582385540008545, + "rewards/rejected": -7.099980354309082, + "step": 42040 + }, + { + "epoch": 1.3705794657477333, + "grad_norm": 1.4853273630142212, + "learning_rate": 2.716839921356492e-05, + "logits/chosen": 3.0121684074401855, + "logits/rejected": 3.084960460662842, + "logps/chosen": -356.9316711425781, + "logps/rejected": -386.6122131347656, + "loss": 0.3526, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4724924564361572, + "rewards/margins": 4.872684478759766, + "rewards/rejected": -7.34517765045166, + "step": 42060 + }, + { + "epoch": 1.3712311915992537, + "grad_norm": 0.3595186769962311, + "learning_rate": 2.7157536850566475e-05, + "logits/chosen": 3.0081164836883545, + "logits/rejected": 3.1630330085754395, + "logps/chosen": -360.8828125, + "logps/rejected": -344.0373229980469, + "loss": 0.366, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5465474128723145, + "rewards/margins": 4.519254207611084, + "rewards/rejected": -7.065802097320557, + "step": 42080 + }, + { + "epoch": 1.3718829174507743, + "grad_norm": 1.633378028869629, + "learning_rate": 2.7146674487568025e-05, + "logits/chosen": 3.0707106590270996, + "logits/rejected": 3.219521999359131, + "logps/chosen": -352.31732177734375, + "logps/rejected": -331.528076171875, + "loss": 0.329, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1209232807159424, + "rewards/margins": 3.592989444732666, + "rewards/rejected": -6.713912010192871, + "step": 42100 + }, + { + "epoch": 1.372534643302295, + "grad_norm": 0.6439725160598755, + "learning_rate": 2.7135812124569583e-05, + "logits/chosen": 2.7525668144226074, + "logits/rejected": 2.9910082817077637, + "logps/chosen": -288.247802734375, + "logps/rejected": -336.8344421386719, + "loss": 0.2184, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1849493980407715, + "rewards/margins": 4.238938331604004, + "rewards/rejected": -7.423887729644775, + "step": 42120 + }, + { + "epoch": 1.3731863691538155, + "grad_norm": 0.6847518086433411, + "learning_rate": 2.7124949761571134e-05, + "logits/chosen": 3.1644201278686523, + "logits/rejected": 3.306828022003174, + "logps/chosen": -352.48016357421875, + "logps/rejected": -317.36383056640625, + "loss": 0.2257, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.3966288566589355, + "rewards/margins": 4.310238361358643, + "rewards/rejected": -6.706866264343262, + "step": 42140 + }, + { + "epoch": 1.373838095005336, + "grad_norm": 2.6045565605163574, + "learning_rate": 2.7114087398572684e-05, + "logits/chosen": 2.9326090812683105, + "logits/rejected": 3.0016517639160156, + "logps/chosen": -345.37994384765625, + "logps/rejected": -362.60113525390625, + "loss": 0.1425, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.43987774848938, + "rewards/margins": 4.905055046081543, + "rewards/rejected": -7.344932556152344, + "step": 42160 + }, + { + "epoch": 1.3744898208568566, + "grad_norm": 3.817735195159912, + "learning_rate": 2.7103225035574242e-05, + "logits/chosen": 2.75661563873291, + "logits/rejected": 2.765155792236328, + "logps/chosen": -309.35565185546875, + "logps/rejected": -339.8120422363281, + "loss": 0.4087, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7168023586273193, + "rewards/margins": 3.4034221172332764, + "rewards/rejected": -6.1202239990234375, + "step": 42180 + }, + { + "epoch": 1.3751415467083772, + "grad_norm": 3.9074416160583496, + "learning_rate": 2.7092362672575793e-05, + "logits/chosen": 2.8859734535217285, + "logits/rejected": 3.08479380607605, + "logps/chosen": -378.0807800292969, + "logps/rejected": -344.9278869628906, + "loss": 0.2925, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.4811928272247314, + "rewards/margins": 3.7609355449676514, + "rewards/rejected": -7.242127895355225, + "step": 42200 + }, + { + "epoch": 1.3757932725598976, + "grad_norm": 2.009782075881958, + "learning_rate": 2.7081500309577347e-05, + "logits/chosen": 2.850399971008301, + "logits/rejected": 2.882781982421875, + "logps/chosen": -342.66143798828125, + "logps/rejected": -333.5711669921875, + "loss": 0.2207, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.021395206451416, + "rewards/margins": 3.8922438621520996, + "rewards/rejected": -6.913639068603516, + "step": 42220 + }, + { + "epoch": 1.3764449984114182, + "grad_norm": 3.3765573501586914, + "learning_rate": 2.7070637946578905e-05, + "logits/chosen": 2.907005786895752, + "logits/rejected": 3.031139850616455, + "logps/chosen": -361.64849853515625, + "logps/rejected": -362.91705322265625, + "loss": 0.3314, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3984546661376953, + "rewards/margins": 3.455711841583252, + "rewards/rejected": -6.854166507720947, + "step": 42240 + }, + { + "epoch": 1.3770967242629388, + "grad_norm": 1.244903326034546, + "learning_rate": 2.7059775583580455e-05, + "logits/chosen": 2.937476396560669, + "logits/rejected": 2.838658094406128, + "logps/chosen": -375.9602966308594, + "logps/rejected": -380.85626220703125, + "loss": 0.13, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.7442429065704346, + "rewards/margins": 5.154716968536377, + "rewards/rejected": -8.898959159851074, + "step": 42260 + }, + { + "epoch": 1.3777484501144595, + "grad_norm": 0.7675425410270691, + "learning_rate": 2.7048913220582006e-05, + "logits/chosen": 2.9939825534820557, + "logits/rejected": 2.81392240524292, + "logps/chosen": -373.4430236816406, + "logps/rejected": -316.82049560546875, + "loss": 0.1945, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0867717266082764, + "rewards/margins": 4.901711463928223, + "rewards/rejected": -7.988483428955078, + "step": 42280 + }, + { + "epoch": 1.3784001759659799, + "grad_norm": 1.8313626050949097, + "learning_rate": 2.7038050857583557e-05, + "logits/chosen": 2.8457729816436768, + "logits/rejected": 2.7688117027282715, + "logps/chosen": -340.4208068847656, + "logps/rejected": -330.6663513183594, + "loss": 0.4183, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5588595867156982, + "rewards/margins": 3.782519578933716, + "rewards/rejected": -7.341378688812256, + "step": 42300 + }, + { + "epoch": 1.3790519018175005, + "grad_norm": 6.221458911895752, + "learning_rate": 2.7027188494585114e-05, + "logits/chosen": 2.7020201683044434, + "logits/rejected": 2.6571602821350098, + "logps/chosen": -362.08697509765625, + "logps/rejected": -372.46063232421875, + "loss": 0.4958, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.7392897605895996, + "rewards/margins": 4.208250999450684, + "rewards/rejected": -7.947541236877441, + "step": 42320 + }, + { + "epoch": 1.379703627669021, + "grad_norm": 4.838074207305908, + "learning_rate": 2.7016326131586665e-05, + "logits/chosen": 2.7350454330444336, + "logits/rejected": 2.93247652053833, + "logps/chosen": -337.4032287597656, + "logps/rejected": -358.0648498535156, + "loss": 0.3678, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.693432569503784, + "rewards/margins": 3.719467878341675, + "rewards/rejected": -7.412900447845459, + "step": 42340 + }, + { + "epoch": 1.3803553535205415, + "grad_norm": 0.24882735311985016, + "learning_rate": 2.700546376858822e-05, + "logits/chosen": 2.497936725616455, + "logits/rejected": 2.679299831390381, + "logps/chosen": -309.31634521484375, + "logps/rejected": -351.23919677734375, + "loss": 0.1575, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.138261318206787, + "rewards/margins": 4.815915107727051, + "rewards/rejected": -7.954176425933838, + "step": 42360 + }, + { + "epoch": 1.3810070793720621, + "grad_norm": 2.276047706604004, + "learning_rate": 2.6994601405589777e-05, + "logits/chosen": 3.3247275352478027, + "logits/rejected": 3.062669038772583, + "logps/chosen": -382.7999572753906, + "logps/rejected": -355.7027893066406, + "loss": 0.1317, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.872415065765381, + "rewards/margins": 4.988711357116699, + "rewards/rejected": -7.861126899719238, + "step": 42380 + }, + { + "epoch": 1.3816588052235828, + "grad_norm": 3.634411096572876, + "learning_rate": 2.6983739042591328e-05, + "logits/chosen": 2.756840467453003, + "logits/rejected": 2.9092416763305664, + "logps/chosen": -338.4385070800781, + "logps/rejected": -339.16278076171875, + "loss": 0.1603, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3757736682891846, + "rewards/margins": 4.886358261108398, + "rewards/rejected": -8.26213264465332, + "step": 42400 + }, + { + "epoch": 1.3823105310751034, + "grad_norm": 2.3429291248321533, + "learning_rate": 2.697287667959288e-05, + "logits/chosen": 3.0609374046325684, + "logits/rejected": 3.0350735187530518, + "logps/chosen": -339.1978454589844, + "logps/rejected": -334.68218994140625, + "loss": 0.3753, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.975252628326416, + "rewards/margins": 3.888972043991089, + "rewards/rejected": -6.864224433898926, + "step": 42420 + }, + { + "epoch": 1.3829622569266238, + "grad_norm": 4.899526119232178, + "learning_rate": 2.696201431659443e-05, + "logits/chosen": 2.7839434146881104, + "logits/rejected": 2.824972629547119, + "logps/chosen": -332.8955383300781, + "logps/rejected": -339.8250427246094, + "loss": 0.2516, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.6474697589874268, + "rewards/margins": 4.570696830749512, + "rewards/rejected": -8.218167304992676, + "step": 42440 + }, + { + "epoch": 1.3836139827781444, + "grad_norm": 8.599358558654785, + "learning_rate": 2.6951151953595987e-05, + "logits/chosen": 2.757744073867798, + "logits/rejected": 2.810635805130005, + "logps/chosen": -365.873779296875, + "logps/rejected": -388.0255432128906, + "loss": 0.4792, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6524651050567627, + "rewards/margins": 4.272579669952393, + "rewards/rejected": -7.925045013427734, + "step": 42460 + }, + { + "epoch": 1.3842657086296648, + "grad_norm": 0.01946413516998291, + "learning_rate": 2.694028959059754e-05, + "logits/chosen": 2.693601608276367, + "logits/rejected": 2.8217577934265137, + "logps/chosen": -361.11785888671875, + "logps/rejected": -345.0450439453125, + "loss": 0.1708, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.368607759475708, + "rewards/margins": 5.423698425292969, + "rewards/rejected": -7.792306423187256, + "step": 42480 + }, + { + "epoch": 1.3849174344811854, + "grad_norm": 12.719889640808105, + "learning_rate": 2.6929427227599092e-05, + "logits/chosen": 2.716970920562744, + "logits/rejected": 2.688704013824463, + "logps/chosen": -365.27362060546875, + "logps/rejected": -358.92718505859375, + "loss": 0.4299, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.96024751663208, + "rewards/margins": 3.9267470836639404, + "rewards/rejected": -7.886994361877441, + "step": 42500 + }, + { + "epoch": 1.385569160332706, + "grad_norm": 3.1889920234680176, + "learning_rate": 2.691856486460065e-05, + "logits/chosen": 2.9691598415374756, + "logits/rejected": 3.1778228282928467, + "logps/chosen": -377.59979248046875, + "logps/rejected": -351.2986755371094, + "loss": 0.2475, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.7503981590270996, + "rewards/margins": 4.511201858520508, + "rewards/rejected": -8.26159954071045, + "step": 42520 + }, + { + "epoch": 1.3862208861842267, + "grad_norm": 0.09972859919071198, + "learning_rate": 2.69077025016022e-05, + "logits/chosen": 2.6159095764160156, + "logits/rejected": 2.7586493492126465, + "logps/chosen": -346.4560546875, + "logps/rejected": -343.07061767578125, + "loss": 0.1781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.692138671875, + "rewards/margins": 4.509974002838135, + "rewards/rejected": -8.202112197875977, + "step": 42540 + }, + { + "epoch": 1.3868726120357473, + "grad_norm": 0.32746344804763794, + "learning_rate": 2.689684013860375e-05, + "logits/chosen": 3.1178746223449707, + "logits/rejected": 3.1301703453063965, + "logps/chosen": -379.1728210449219, + "logps/rejected": -402.6018981933594, + "loss": 0.4351, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3223910331726074, + "rewards/margins": 3.9236323833465576, + "rewards/rejected": -7.246024131774902, + "step": 42560 + }, + { + "epoch": 1.3875243378872677, + "grad_norm": 0.33910831809043884, + "learning_rate": 2.688597777560531e-05, + "logits/chosen": 2.847212553024292, + "logits/rejected": 2.8432624340057373, + "logps/chosen": -383.9209289550781, + "logps/rejected": -365.77557373046875, + "loss": 0.2549, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.5622997283935547, + "rewards/margins": 5.158782005310059, + "rewards/rejected": -7.7210822105407715, + "step": 42580 + }, + { + "epoch": 1.3881760637387883, + "grad_norm": 5.494560718536377, + "learning_rate": 2.687511541260686e-05, + "logits/chosen": 2.7812042236328125, + "logits/rejected": 2.9149398803710938, + "logps/chosen": -355.0440368652344, + "logps/rejected": -386.0023193359375, + "loss": 0.3352, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2820942401885986, + "rewards/margins": 4.441958427429199, + "rewards/rejected": -7.724053382873535, + "step": 42600 + }, + { + "epoch": 1.3888277895903087, + "grad_norm": 7.461318016052246, + "learning_rate": 2.6864253049608413e-05, + "logits/chosen": 3.1937108039855957, + "logits/rejected": 3.1720452308654785, + "logps/chosen": -333.267822265625, + "logps/rejected": -343.5469665527344, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.094808578491211, + "rewards/margins": 3.8883140087127686, + "rewards/rejected": -6.983122825622559, + "step": 42620 + }, + { + "epoch": 1.3894795154418293, + "grad_norm": 3.7022624015808105, + "learning_rate": 2.6853390686609964e-05, + "logits/chosen": 2.9953665733337402, + "logits/rejected": 2.8152480125427246, + "logps/chosen": -365.7652587890625, + "logps/rejected": -372.83538818359375, + "loss": 0.2755, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.9843146800994873, + "rewards/margins": 4.182738304138184, + "rewards/rejected": -8.16705322265625, + "step": 42640 + }, + { + "epoch": 1.39013124129335, + "grad_norm": 1.7281455993652344, + "learning_rate": 2.6842528323611522e-05, + "logits/chosen": 3.294940233230591, + "logits/rejected": 3.2340826988220215, + "logps/chosen": -390.0007629394531, + "logps/rejected": -358.3250732421875, + "loss": 0.3103, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0713682174682617, + "rewards/margins": 4.819250583648682, + "rewards/rejected": -7.890618801116943, + "step": 42660 + }, + { + "epoch": 1.3907829671448706, + "grad_norm": 0.34618768095970154, + "learning_rate": 2.6831665960613073e-05, + "logits/chosen": 3.133376121520996, + "logits/rejected": 3.098573684692383, + "logps/chosen": -388.6717834472656, + "logps/rejected": -329.1380920410156, + "loss": 0.2047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.908334732055664, + "rewards/margins": 5.009298324584961, + "rewards/rejected": -7.917634010314941, + "step": 42680 + }, + { + "epoch": 1.391434692996391, + "grad_norm": 6.1737871170043945, + "learning_rate": 2.6820803597614623e-05, + "logits/chosen": 3.0486817359924316, + "logits/rejected": 3.080768346786499, + "logps/chosen": -361.85137939453125, + "logps/rejected": -352.41607666015625, + "loss": 0.3753, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.215457439422607, + "rewards/margins": 3.907435178756714, + "rewards/rejected": -8.122892379760742, + "step": 42700 + }, + { + "epoch": 1.3920864188479116, + "grad_norm": 0.8970543742179871, + "learning_rate": 2.680994123461618e-05, + "logits/chosen": 2.70536470413208, + "logits/rejected": 2.87231183052063, + "logps/chosen": -359.65484619140625, + "logps/rejected": -353.1298522949219, + "loss": 0.3041, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.471205949783325, + "rewards/margins": 5.031083106994629, + "rewards/rejected": -8.502288818359375, + "step": 42720 + }, + { + "epoch": 1.3927381446994322, + "grad_norm": 8.95015811920166, + "learning_rate": 2.6799078871617732e-05, + "logits/chosen": 2.74235463142395, + "logits/rejected": 2.894235372543335, + "logps/chosen": -374.08428955078125, + "logps/rejected": -363.6249084472656, + "loss": 0.3626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.6375625133514404, + "rewards/margins": 4.2804155349731445, + "rewards/rejected": -7.917977809906006, + "step": 42740 + }, + { + "epoch": 1.3933898705509526, + "grad_norm": 2.789599657058716, + "learning_rate": 2.6788216508619286e-05, + "logits/chosen": 2.747462034225464, + "logits/rejected": 2.962102174758911, + "logps/chosen": -387.663818359375, + "logps/rejected": -334.09991455078125, + "loss": 0.4636, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.384016513824463, + "rewards/margins": 4.362807273864746, + "rewards/rejected": -7.746823787689209, + "step": 42760 + }, + { + "epoch": 1.3940415964024733, + "grad_norm": 0.41195961833000183, + "learning_rate": 2.6777354145620843e-05, + "logits/chosen": 2.971083164215088, + "logits/rejected": 3.06976056098938, + "logps/chosen": -346.60675048828125, + "logps/rejected": -318.48516845703125, + "loss": 0.2282, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2109813690185547, + "rewards/margins": 4.341931343078613, + "rewards/rejected": -7.552912712097168, + "step": 42780 + }, + { + "epoch": 1.394693322253994, + "grad_norm": 2.7452690601348877, + "learning_rate": 2.6766491782622394e-05, + "logits/chosen": 3.0912158489227295, + "logits/rejected": 3.0873947143554688, + "logps/chosen": -345.0160217285156, + "logps/rejected": -323.5786437988281, + "loss": 0.2199, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3702712059020996, + "rewards/margins": 4.3314619064331055, + "rewards/rejected": -7.7017316818237305, + "step": 42800 + }, + { + "epoch": 1.3953450481055145, + "grad_norm": 1.0122014284133911, + "learning_rate": 2.6755629419623945e-05, + "logits/chosen": 2.9428253173828125, + "logits/rejected": 2.8649771213531494, + "logps/chosen": -368.7125549316406, + "logps/rejected": -336.232666015625, + "loss": 0.3954, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.688715696334839, + "rewards/margins": 3.9451816082000732, + "rewards/rejected": -7.633897304534912, + "step": 42820 + }, + { + "epoch": 1.395996773957035, + "grad_norm": 0.023519422858953476, + "learning_rate": 2.6744767056625496e-05, + "logits/chosen": 2.87367582321167, + "logits/rejected": 2.9693031311035156, + "logps/chosen": -359.2769775390625, + "logps/rejected": -364.17236328125, + "loss": 0.2169, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.276949405670166, + "rewards/margins": 4.568796634674072, + "rewards/rejected": -7.845746040344238, + "step": 42840 + }, + { + "epoch": 1.3966484998085555, + "grad_norm": 1.5573233366012573, + "learning_rate": 2.6733904693627053e-05, + "logits/chosen": 2.912076711654663, + "logits/rejected": 2.948204517364502, + "logps/chosen": -340.7378234863281, + "logps/rejected": -316.23236083984375, + "loss": 0.3559, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.839939594268799, + "rewards/margins": 3.718304395675659, + "rewards/rejected": -7.558244228363037, + "step": 42860 + }, + { + "epoch": 1.3973002256600762, + "grad_norm": 4.4738054275512695, + "learning_rate": 2.6723042330628608e-05, + "logits/chosen": 3.029313564300537, + "logits/rejected": 3.044527530670166, + "logps/chosen": -358.9796447753906, + "logps/rejected": -286.61651611328125, + "loss": 0.3234, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.4585514068603516, + "rewards/margins": 3.683349609375, + "rewards/rejected": -7.14190149307251, + "step": 42880 + }, + { + "epoch": 1.3979519515115966, + "grad_norm": 2.7750940322875977, + "learning_rate": 2.6712179967630158e-05, + "logits/chosen": 3.0566203594207764, + "logits/rejected": 2.8623344898223877, + "logps/chosen": -357.89886474609375, + "logps/rejected": -401.06085205078125, + "loss": 0.4264, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0682902336120605, + "rewards/margins": 4.12894344329834, + "rewards/rejected": -7.197232723236084, + "step": 42900 + }, + { + "epoch": 1.3986036773631172, + "grad_norm": 4.84989595413208, + "learning_rate": 2.6701317604631716e-05, + "logits/chosen": 2.755439043045044, + "logits/rejected": 2.9018301963806152, + "logps/chosen": -349.5210266113281, + "logps/rejected": -373.4514465332031, + "loss": 0.4399, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.927508592605591, + "rewards/margins": 4.351337432861328, + "rewards/rejected": -8.27884578704834, + "step": 42920 + }, + { + "epoch": 1.3992554032146378, + "grad_norm": 0.29334723949432373, + "learning_rate": 2.6690455241633267e-05, + "logits/chosen": 2.902775526046753, + "logits/rejected": 3.0687031745910645, + "logps/chosen": -353.22808837890625, + "logps/rejected": -313.5022888183594, + "loss": 0.2122, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.787616729736328, + "rewards/margins": 4.6919636726379395, + "rewards/rejected": -7.479580879211426, + "step": 42940 + }, + { + "epoch": 1.3999071290661584, + "grad_norm": 3.675074338912964, + "learning_rate": 2.6679592878634817e-05, + "logits/chosen": 2.7868943214416504, + "logits/rejected": 2.974961757659912, + "logps/chosen": -362.56207275390625, + "logps/rejected": -349.84552001953125, + "loss": 0.1669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.608804941177368, + "rewards/margins": 4.857590198516846, + "rewards/rejected": -7.466394901275635, + "step": 42960 + }, + { + "epoch": 1.4005588549176788, + "grad_norm": 6.032090187072754, + "learning_rate": 2.6668730515636375e-05, + "logits/chosen": 3.3588783740997314, + "logits/rejected": 3.2755560874938965, + "logps/chosen": -397.6248474121094, + "logps/rejected": -345.41619873046875, + "loss": 0.3525, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.022202968597412, + "rewards/margins": 3.8594532012939453, + "rewards/rejected": -6.881655693054199, + "step": 42980 + }, + { + "epoch": 1.4012105807691995, + "grad_norm": 1.9806400537490845, + "learning_rate": 2.6657868152637926e-05, + "logits/chosen": 2.82804799079895, + "logits/rejected": 2.6772103309631348, + "logps/chosen": -305.06390380859375, + "logps/rejected": -286.51287841796875, + "loss": 0.2827, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0319340229034424, + "rewards/margins": 3.949489116668701, + "rewards/rejected": -6.981423377990723, + "step": 43000 + }, + { + "epoch": 1.4018623066207199, + "grad_norm": 6.881690502166748, + "learning_rate": 2.664700578963948e-05, + "logits/chosen": 3.116203784942627, + "logits/rejected": 3.161367654800415, + "logps/chosen": -337.82049560546875, + "logps/rejected": -341.65203857421875, + "loss": 0.1746, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.863234043121338, + "rewards/margins": 4.60484504699707, + "rewards/rejected": -7.468080043792725, + "step": 43020 + }, + { + "epoch": 1.4025140324722405, + "grad_norm": 7.264423370361328, + "learning_rate": 2.663614342664103e-05, + "logits/chosen": 3.1679561138153076, + "logits/rejected": 2.946652889251709, + "logps/chosen": -368.5858459472656, + "logps/rejected": -357.00262451171875, + "loss": 0.4096, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2612507343292236, + "rewards/margins": 3.9716758728027344, + "rewards/rejected": -7.232926368713379, + "step": 43040 + }, + { + "epoch": 1.4031657583237611, + "grad_norm": 1.8390653133392334, + "learning_rate": 2.6625281063642588e-05, + "logits/chosen": 3.01383900642395, + "logits/rejected": 3.1535696983337402, + "logps/chosen": -332.2303161621094, + "logps/rejected": -363.8714294433594, + "loss": 0.1916, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.062002658843994, + "rewards/margins": 4.363285064697266, + "rewards/rejected": -7.425288200378418, + "step": 43060 + }, + { + "epoch": 1.4038174841752817, + "grad_norm": 2.1508140563964844, + "learning_rate": 2.661441870064414e-05, + "logits/chosen": 2.7266628742218018, + "logits/rejected": 2.7393510341644287, + "logps/chosen": -321.78741455078125, + "logps/rejected": -317.55029296875, + "loss": 0.4416, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.7362148761749268, + "rewards/margins": 3.7075695991516113, + "rewards/rejected": -7.443783760070801, + "step": 43080 + }, + { + "epoch": 1.4044692100268024, + "grad_norm": 1.1362566947937012, + "learning_rate": 2.660355633764569e-05, + "logits/chosen": 3.1250884532928467, + "logits/rejected": 3.0175912380218506, + "logps/chosen": -394.99859619140625, + "logps/rejected": -385.04180908203125, + "loss": 0.4104, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1891863346099854, + "rewards/margins": 4.49161434173584, + "rewards/rejected": -7.680799961090088, + "step": 43100 + }, + { + "epoch": 1.4051209358783228, + "grad_norm": 2.499504566192627, + "learning_rate": 2.6592693974647247e-05, + "logits/chosen": 3.0249359607696533, + "logits/rejected": 3.080103874206543, + "logps/chosen": -365.4206237792969, + "logps/rejected": -329.9017333984375, + "loss": 0.2529, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.1791887283325195, + "rewards/margins": 4.538372039794922, + "rewards/rejected": -7.7175612449646, + "step": 43120 + }, + { + "epoch": 1.4057726617298434, + "grad_norm": 3.169712781906128, + "learning_rate": 2.6581831611648798e-05, + "logits/chosen": 2.9245333671569824, + "logits/rejected": 3.187894582748413, + "logps/chosen": -374.2789001464844, + "logps/rejected": -346.7582092285156, + "loss": 0.2197, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8719749450683594, + "rewards/margins": 4.954981327056885, + "rewards/rejected": -7.826956748962402, + "step": 43140 + }, + { + "epoch": 1.4064243875813638, + "grad_norm": 3.992065668106079, + "learning_rate": 2.6570969248650352e-05, + "logits/chosen": 2.8663668632507324, + "logits/rejected": 2.962613344192505, + "logps/chosen": -316.9072570800781, + "logps/rejected": -339.02398681640625, + "loss": 0.3234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.022752285003662, + "rewards/margins": 4.377469539642334, + "rewards/rejected": -7.400221824645996, + "step": 43160 + }, + { + "epoch": 1.4070761134328844, + "grad_norm": 0.7754494547843933, + "learning_rate": 2.656010688565191e-05, + "logits/chosen": 3.025602340698242, + "logits/rejected": 2.9638171195983887, + "logps/chosen": -365.6903076171875, + "logps/rejected": -314.614990234375, + "loss": 0.1811, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.470036029815674, + "rewards/margins": 4.493351459503174, + "rewards/rejected": -6.963387489318848, + "step": 43180 + }, + { + "epoch": 1.407727839284405, + "grad_norm": 2.360023260116577, + "learning_rate": 2.654924452265346e-05, + "logits/chosen": 3.129563808441162, + "logits/rejected": 3.2053089141845703, + "logps/chosen": -361.4666442871094, + "logps/rejected": -334.45025634765625, + "loss": 0.3279, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.6458091735839844, + "rewards/margins": 4.163692951202393, + "rewards/rejected": -6.809501647949219, + "step": 43200 + }, + { + "epoch": 1.4083795651359257, + "grad_norm": 1.1386741399765015, + "learning_rate": 2.653838215965501e-05, + "logits/chosen": 3.1107687950134277, + "logits/rejected": 3.0407865047454834, + "logps/chosen": -358.8177795410156, + "logps/rejected": -347.2168273925781, + "loss": 0.3446, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4206032752990723, + "rewards/margins": 4.144595146179199, + "rewards/rejected": -6.5651984214782715, + "step": 43220 + }, + { + "epoch": 1.409031290987446, + "grad_norm": 4.309762954711914, + "learning_rate": 2.6527519796656562e-05, + "logits/chosen": 2.8893654346466064, + "logits/rejected": 2.9619946479797363, + "logps/chosen": -328.0361633300781, + "logps/rejected": -327.66033935546875, + "loss": 0.169, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.712345600128174, + "rewards/margins": 4.990317344665527, + "rewards/rejected": -7.702662467956543, + "step": 43240 + }, + { + "epoch": 1.4096830168389667, + "grad_norm": 1.622134804725647, + "learning_rate": 2.651665743365812e-05, + "logits/chosen": 3.32336163520813, + "logits/rejected": 3.2301647663116455, + "logps/chosen": -388.4174499511719, + "logps/rejected": -350.3134460449219, + "loss": 0.3421, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.512744426727295, + "rewards/margins": 3.9569003582000732, + "rewards/rejected": -6.469645023345947, + "step": 43260 + }, + { + "epoch": 1.4103347426904873, + "grad_norm": 2.8315412998199463, + "learning_rate": 2.6505795070659674e-05, + "logits/chosen": 2.691105365753174, + "logits/rejected": 3.0933074951171875, + "logps/chosen": -297.6305847167969, + "logps/rejected": -346.9998474121094, + "loss": 0.1156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.42390775680542, + "rewards/margins": 4.5659379959106445, + "rewards/rejected": -6.989845275878906, + "step": 43280 + }, + { + "epoch": 1.4109864685420077, + "grad_norm": 0.2753395438194275, + "learning_rate": 2.6494932707661225e-05, + "logits/chosen": 2.9180567264556885, + "logits/rejected": 2.8044042587280273, + "logps/chosen": -381.80242919921875, + "logps/rejected": -336.75445556640625, + "loss": 0.2309, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4181671142578125, + "rewards/margins": 4.577618598937988, + "rewards/rejected": -7.995786190032959, + "step": 43300 + }, + { + "epoch": 1.4116381943935283, + "grad_norm": 0.7772421836853027, + "learning_rate": 2.6484070344662782e-05, + "logits/chosen": 2.848482131958008, + "logits/rejected": 2.9140734672546387, + "logps/chosen": -361.46563720703125, + "logps/rejected": -357.5126953125, + "loss": 0.2075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0727436542510986, + "rewards/margins": 5.497607231140137, + "rewards/rejected": -8.570351600646973, + "step": 43320 + }, + { + "epoch": 1.412289920245049, + "grad_norm": 3.698310613632202, + "learning_rate": 2.6473207981664333e-05, + "logits/chosen": 2.7610952854156494, + "logits/rejected": 2.8281562328338623, + "logps/chosen": -341.49444580078125, + "logps/rejected": -350.1986389160156, + "loss": 0.3239, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.59477162361145, + "rewards/margins": 4.57219123840332, + "rewards/rejected": -8.166962623596191, + "step": 43340 + }, + { + "epoch": 1.4129416460965696, + "grad_norm": 3.2741658687591553, + "learning_rate": 2.6462345618665884e-05, + "logits/chosen": 2.8854010105133057, + "logits/rejected": 2.8122596740722656, + "logps/chosen": -326.99261474609375, + "logps/rejected": -335.3639831542969, + "loss": 0.3947, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.910757541656494, + "rewards/margins": 3.869356155395508, + "rewards/rejected": -7.78011417388916, + "step": 43360 + }, + { + "epoch": 1.41359337194809, + "grad_norm": 7.896108627319336, + "learning_rate": 2.645148325566744e-05, + "logits/chosen": 3.12673020362854, + "logits/rejected": 3.272228240966797, + "logps/chosen": -396.2419738769531, + "logps/rejected": -391.6969299316406, + "loss": 0.2549, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.952012300491333, + "rewards/margins": 4.667591571807861, + "rewards/rejected": -7.619603633880615, + "step": 43380 + }, + { + "epoch": 1.4142450977996106, + "grad_norm": 10.007984161376953, + "learning_rate": 2.6440620892668992e-05, + "logits/chosen": 2.886303663253784, + "logits/rejected": 2.988511562347412, + "logps/chosen": -351.4476623535156, + "logps/rejected": -343.7672424316406, + "loss": 0.3831, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.152735948562622, + "rewards/margins": 4.143369674682617, + "rewards/rejected": -7.296105861663818, + "step": 43400 + }, + { + "epoch": 1.4148968236511312, + "grad_norm": 0.8314996957778931, + "learning_rate": 2.6429758529670546e-05, + "logits/chosen": 3.225922107696533, + "logits/rejected": 3.254135847091675, + "logps/chosen": -377.19354248046875, + "logps/rejected": -335.90618896484375, + "loss": 0.3742, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.160884380340576, + "rewards/margins": 4.026655197143555, + "rewards/rejected": -7.187539577484131, + "step": 43420 + }, + { + "epoch": 1.4155485495026516, + "grad_norm": 2.462210178375244, + "learning_rate": 2.6418896166672097e-05, + "logits/chosen": 3.560781955718994, + "logits/rejected": 3.623866319656372, + "logps/chosen": -363.5093078613281, + "logps/rejected": -369.16143798828125, + "loss": 0.2457, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.868126392364502, + "rewards/margins": 4.744473457336426, + "rewards/rejected": -7.612599849700928, + "step": 43440 + }, + { + "epoch": 1.4162002753541723, + "grad_norm": 3.1089799404144287, + "learning_rate": 2.6408033803673655e-05, + "logits/chosen": 3.0051076412200928, + "logits/rejected": 3.065276622772217, + "logps/chosen": -338.5215148925781, + "logps/rejected": -348.5374755859375, + "loss": 0.255, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8299400806427, + "rewards/margins": 5.5728020668029785, + "rewards/rejected": -8.402741432189941, + "step": 43460 + }, + { + "epoch": 1.4168520012056929, + "grad_norm": 0.031131234019994736, + "learning_rate": 2.6397171440675206e-05, + "logits/chosen": 2.7074809074401855, + "logits/rejected": 2.9010186195373535, + "logps/chosen": -340.8890686035156, + "logps/rejected": -375.1905212402344, + "loss": 0.2917, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.706324815750122, + "rewards/margins": 4.921767234802246, + "rewards/rejected": -8.628091812133789, + "step": 43480 + }, + { + "epoch": 1.4175037270572135, + "grad_norm": 0.5621334910392761, + "learning_rate": 2.6386309077676756e-05, + "logits/chosen": 2.838188648223877, + "logits/rejected": 2.8326194286346436, + "logps/chosen": -338.7428894042969, + "logps/rejected": -320.0206604003906, + "loss": 0.168, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7317960262298584, + "rewards/margins": 4.518387317657471, + "rewards/rejected": -7.25018310546875, + "step": 43500 + }, + { + "epoch": 1.418155452908734, + "grad_norm": 2.8849971294403076, + "learning_rate": 2.6375446714678314e-05, + "logits/chosen": 2.944157361984253, + "logits/rejected": 3.271352767944336, + "logps/chosen": -359.5950622558594, + "logps/rejected": -335.0108947753906, + "loss": 0.4072, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.999716281890869, + "rewards/margins": 4.008286952972412, + "rewards/rejected": -7.008003234863281, + "step": 43520 + }, + { + "epoch": 1.4188071787602545, + "grad_norm": 1.3195806741714478, + "learning_rate": 2.6364584351679865e-05, + "logits/chosen": 3.380533218383789, + "logits/rejected": 3.3277504444122314, + "logps/chosen": -329.41796875, + "logps/rejected": -331.0675048828125, + "loss": 0.1795, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3628127574920654, + "rewards/margins": 4.743875503540039, + "rewards/rejected": -8.106689453125, + "step": 43540 + }, + { + "epoch": 1.419458904611775, + "grad_norm": 4.502197265625, + "learning_rate": 2.635372198868142e-05, + "logits/chosen": 3.2894184589385986, + "logits/rejected": 3.1213834285736084, + "logps/chosen": -375.5347595214844, + "logps/rejected": -382.40667724609375, + "loss": 0.2124, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1161797046661377, + "rewards/margins": 4.718972682952881, + "rewards/rejected": -7.835152626037598, + "step": 43560 + }, + { + "epoch": 1.4201106304632956, + "grad_norm": 2.746851682662964, + "learning_rate": 2.634285962568297e-05, + "logits/chosen": 2.7514891624450684, + "logits/rejected": 2.795297622680664, + "logps/chosen": -340.2427673339844, + "logps/rejected": -348.6074523925781, + "loss": 0.2557, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.279970645904541, + "rewards/margins": 5.751366138458252, + "rewards/rejected": -9.03133773803711, + "step": 43580 + }, + { + "epoch": 1.4207623563148162, + "grad_norm": 3.740967273712158, + "learning_rate": 2.6331997262684527e-05, + "logits/chosen": 3.1750271320343018, + "logits/rejected": 3.148775577545166, + "logps/chosen": -395.18292236328125, + "logps/rejected": -380.00335693359375, + "loss": 0.164, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8228347301483154, + "rewards/margins": 5.949399471282959, + "rewards/rejected": -8.772233963012695, + "step": 43600 + }, + { + "epoch": 1.4214140821663368, + "grad_norm": 0.6964420080184937, + "learning_rate": 2.6321134899686078e-05, + "logits/chosen": 2.967379570007324, + "logits/rejected": 3.0231966972351074, + "logps/chosen": -372.4400939941406, + "logps/rejected": -339.484375, + "loss": 0.3606, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0524086952209473, + "rewards/margins": 4.751233100891113, + "rewards/rejected": -7.803641319274902, + "step": 43620 + }, + { + "epoch": 1.4220658080178574, + "grad_norm": 4.751871109008789, + "learning_rate": 2.631027253668763e-05, + "logits/chosen": 2.9269814491271973, + "logits/rejected": 2.9670655727386475, + "logps/chosen": -325.3454284667969, + "logps/rejected": -365.03887939453125, + "loss": 0.1688, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.916268825531006, + "rewards/margins": 4.762847900390625, + "rewards/rejected": -7.679116725921631, + "step": 43640 + }, + { + "epoch": 1.4227175338693778, + "grad_norm": 3.338867664337158, + "learning_rate": 2.6299410173689186e-05, + "logits/chosen": 2.9391121864318848, + "logits/rejected": 2.856391191482544, + "logps/chosen": -372.3049011230469, + "logps/rejected": -373.39398193359375, + "loss": 0.2686, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.062713146209717, + "rewards/margins": 4.881858825683594, + "rewards/rejected": -7.944571495056152, + "step": 43660 + }, + { + "epoch": 1.4233692597208984, + "grad_norm": 0.38239169120788574, + "learning_rate": 2.628854781069074e-05, + "logits/chosen": 3.202117443084717, + "logits/rejected": 3.2106406688690186, + "logps/chosen": -403.5514831542969, + "logps/rejected": -369.0350036621094, + "loss": 0.2895, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.578831911087036, + "rewards/margins": 4.077765464782715, + "rewards/rejected": -7.656598091125488, + "step": 43680 + }, + { + "epoch": 1.4240209855724189, + "grad_norm": 5.524907112121582, + "learning_rate": 2.627768544769229e-05, + "logits/chosen": 2.989712953567505, + "logits/rejected": 3.267331600189209, + "logps/chosen": -340.92388916015625, + "logps/rejected": -319.9012451171875, + "loss": 0.3919, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.697765350341797, + "rewards/margins": 4.45229434967041, + "rewards/rejected": -8.150059700012207, + "step": 43700 + }, + { + "epoch": 1.4246727114239395, + "grad_norm": 2.892303228378296, + "learning_rate": 2.626682308469385e-05, + "logits/chosen": 2.9247097969055176, + "logits/rejected": 2.92694091796875, + "logps/chosen": -361.52935791015625, + "logps/rejected": -354.46942138671875, + "loss": 0.2322, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.117594003677368, + "rewards/margins": 4.421625137329102, + "rewards/rejected": -7.539219856262207, + "step": 43720 + }, + { + "epoch": 1.42532443727546, + "grad_norm": 0.9167161583900452, + "learning_rate": 2.62559607216954e-05, + "logits/chosen": 3.2569923400878906, + "logits/rejected": 3.3460171222686768, + "logps/chosen": -376.49945068359375, + "logps/rejected": -344.69207763671875, + "loss": 0.3003, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1614794731140137, + "rewards/margins": 4.590293884277344, + "rewards/rejected": -7.751772880554199, + "step": 43740 + }, + { + "epoch": 1.4259761631269807, + "grad_norm": 0.41617250442504883, + "learning_rate": 2.624509835869695e-05, + "logits/chosen": 3.2300961017608643, + "logits/rejected": 3.1939635276794434, + "logps/chosen": -386.82470703125, + "logps/rejected": -367.45794677734375, + "loss": 0.1432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.923654794692993, + "rewards/margins": 5.098263263702393, + "rewards/rejected": -8.021917343139648, + "step": 43760 + }, + { + "epoch": 1.4266278889785011, + "grad_norm": 8.269306182861328, + "learning_rate": 2.62342359956985e-05, + "logits/chosen": 3.072523355484009, + "logits/rejected": 3.0669429302215576, + "logps/chosen": -389.08831787109375, + "logps/rejected": -365.79254150390625, + "loss": 0.3488, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.131821393966675, + "rewards/margins": 5.212411880493164, + "rewards/rejected": -8.344233512878418, + "step": 43780 + }, + { + "epoch": 1.4272796148300217, + "grad_norm": 0.06328090280294418, + "learning_rate": 2.622337363270006e-05, + "logits/chosen": 3.0981290340423584, + "logits/rejected": 2.8231682777404785, + "logps/chosen": -323.41705322265625, + "logps/rejected": -370.14642333984375, + "loss": 0.3689, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1486871242523193, + "rewards/margins": 4.044694423675537, + "rewards/rejected": -7.193381309509277, + "step": 43800 + }, + { + "epoch": 1.4279313406815424, + "grad_norm": 2.2361626625061035, + "learning_rate": 2.6212511269701613e-05, + "logits/chosen": 2.8524715900421143, + "logits/rejected": 2.7695870399475098, + "logps/chosen": -338.9732360839844, + "logps/rejected": -342.0224304199219, + "loss": 0.2475, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6725401878356934, + "rewards/margins": 4.948959827423096, + "rewards/rejected": -8.621500015258789, + "step": 43820 + }, + { + "epoch": 1.4285830665330628, + "grad_norm": 3.31884765625, + "learning_rate": 2.6201648906703164e-05, + "logits/chosen": 2.641953706741333, + "logits/rejected": 2.8855910301208496, + "logps/chosen": -326.029296875, + "logps/rejected": -350.97772216796875, + "loss": 0.2225, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.152181386947632, + "rewards/margins": 5.305581569671631, + "rewards/rejected": -8.457761764526367, + "step": 43840 + }, + { + "epoch": 1.4292347923845834, + "grad_norm": 5.329334259033203, + "learning_rate": 2.619078654370472e-05, + "logits/chosen": 2.8622984886169434, + "logits/rejected": 2.9675230979919434, + "logps/chosen": -384.32470703125, + "logps/rejected": -349.52227783203125, + "loss": 0.5319, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.20109486579895, + "rewards/margins": 4.014819622039795, + "rewards/rejected": -7.215914249420166, + "step": 43860 + }, + { + "epoch": 1.429886518236104, + "grad_norm": 5.276125907897949, + "learning_rate": 2.6179924180706272e-05, + "logits/chosen": 2.7532496452331543, + "logits/rejected": 2.8555479049682617, + "logps/chosen": -329.9774169921875, + "logps/rejected": -334.376953125, + "loss": 0.3198, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.019792079925537, + "rewards/margins": 3.936659336090088, + "rewards/rejected": -6.956451416015625, + "step": 43880 + }, + { + "epoch": 1.4305382440876246, + "grad_norm": 1.9470213651657104, + "learning_rate": 2.6169061817707823e-05, + "logits/chosen": 2.7081634998321533, + "logits/rejected": 2.891474962234497, + "logps/chosen": -323.30133056640625, + "logps/rejected": -310.37255859375, + "loss": 0.368, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.43314790725708, + "rewards/margins": 4.745339393615723, + "rewards/rejected": -8.178486824035645, + "step": 43900 + }, + { + "epoch": 1.431189969939145, + "grad_norm": 1.8567438125610352, + "learning_rate": 2.615819945470938e-05, + "logits/chosen": 2.904358386993408, + "logits/rejected": 3.1401214599609375, + "logps/chosen": -349.7923889160156, + "logps/rejected": -354.05670166015625, + "loss": 0.2675, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.9150443077087402, + "rewards/margins": 4.173688888549805, + "rewards/rejected": -7.088733673095703, + "step": 43920 + }, + { + "epoch": 1.4318416957906657, + "grad_norm": 0.11434343457221985, + "learning_rate": 2.614733709171093e-05, + "logits/chosen": 2.9805586338043213, + "logits/rejected": 3.07446026802063, + "logps/chosen": -371.9018859863281, + "logps/rejected": -384.2696533203125, + "loss": 0.404, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.9104180335998535, + "rewards/margins": 3.608123779296875, + "rewards/rejected": -7.5185418128967285, + "step": 43940 + }, + { + "epoch": 1.4324934216421863, + "grad_norm": 0.48476338386535645, + "learning_rate": 2.6136474728712485e-05, + "logits/chosen": 2.9328157901763916, + "logits/rejected": 2.9763007164001465, + "logps/chosen": -380.48223876953125, + "logps/rejected": -345.3112487792969, + "loss": 0.321, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.3250317573547363, + "rewards/margins": 4.2405314445495605, + "rewards/rejected": -7.565564155578613, + "step": 43960 + }, + { + "epoch": 1.4331451474937067, + "grad_norm": 3.214158773422241, + "learning_rate": 2.6125612365714036e-05, + "logits/chosen": 2.828827381134033, + "logits/rejected": 2.900650978088379, + "logps/chosen": -378.4399108886719, + "logps/rejected": -344.3221740722656, + "loss": 0.277, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.545689821243286, + "rewards/margins": 4.201727390289307, + "rewards/rejected": -6.747417449951172, + "step": 43980 + }, + { + "epoch": 1.4337968733452273, + "grad_norm": 1.1220296621322632, + "learning_rate": 2.6114750002715594e-05, + "logits/chosen": 3.018665313720703, + "logits/rejected": 3.113740921020508, + "logps/chosen": -366.16217041015625, + "logps/rejected": -333.1024169921875, + "loss": 0.2248, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3203577995300293, + "rewards/margins": 4.3230414390563965, + "rewards/rejected": -7.643399238586426, + "step": 44000 + }, + { + "epoch": 1.434448599196748, + "grad_norm": 1.3470536470413208, + "learning_rate": 2.6103887639717144e-05, + "logits/chosen": 3.2508246898651123, + "logits/rejected": 3.2229912281036377, + "logps/chosen": -338.72821044921875, + "logps/rejected": -361.61041259765625, + "loss": 0.344, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.5622754096984863, + "rewards/margins": 4.364292144775391, + "rewards/rejected": -7.926568508148193, + "step": 44020 + }, + { + "epoch": 1.4351003250482686, + "grad_norm": 1.5414808988571167, + "learning_rate": 2.6093025276718695e-05, + "logits/chosen": 3.125105142593384, + "logits/rejected": 3.072634220123291, + "logps/chosen": -389.4272155761719, + "logps/rejected": -361.66473388671875, + "loss": 0.3787, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.736222505569458, + "rewards/margins": 4.616351127624512, + "rewards/rejected": -8.35257339477539, + "step": 44040 + }, + { + "epoch": 1.435752050899789, + "grad_norm": 2.1203370094299316, + "learning_rate": 2.6082162913720253e-05, + "logits/chosen": 3.0518462657928467, + "logits/rejected": 3.2185120582580566, + "logps/chosen": -368.822998046875, + "logps/rejected": -334.4240417480469, + "loss": 0.5394, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1117167472839355, + "rewards/margins": 4.113792419433594, + "rewards/rejected": -7.225508689880371, + "step": 44060 + }, + { + "epoch": 1.4364037767513096, + "grad_norm": 0.5823163390159607, + "learning_rate": 2.6071300550721807e-05, + "logits/chosen": 2.850040912628174, + "logits/rejected": 2.894183397293091, + "logps/chosen": -324.2867126464844, + "logps/rejected": -335.36920166015625, + "loss": 0.2842, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.5175087451934814, + "rewards/margins": 4.0466837882995605, + "rewards/rejected": -7.564192295074463, + "step": 44080 + }, + { + "epoch": 1.43705550260283, + "grad_norm": 6.067378997802734, + "learning_rate": 2.6060438187723358e-05, + "logits/chosen": 3.2211222648620605, + "logits/rejected": 3.170640468597412, + "logps/chosen": -358.09637451171875, + "logps/rejected": -333.36859130859375, + "loss": 0.2462, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.445408344268799, + "rewards/margins": 4.573918342590332, + "rewards/rejected": -7.019326686859131, + "step": 44100 + }, + { + "epoch": 1.4377072284543506, + "grad_norm": 0.013233812525868416, + "learning_rate": 2.6049575824724915e-05, + "logits/chosen": 3.031369686126709, + "logits/rejected": 3.12164568901062, + "logps/chosen": -350.7204895019531, + "logps/rejected": -350.21173095703125, + "loss": 0.1871, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.639214277267456, + "rewards/margins": 4.574221134185791, + "rewards/rejected": -7.213435173034668, + "step": 44120 + }, + { + "epoch": 1.4383589543058712, + "grad_norm": 3.1822421550750732, + "learning_rate": 2.6038713461726466e-05, + "logits/chosen": 3.0553512573242188, + "logits/rejected": 3.1618478298187256, + "logps/chosen": -329.5912780761719, + "logps/rejected": -308.61083984375, + "loss": 0.3427, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.850809097290039, + "rewards/margins": 4.629349708557129, + "rewards/rejected": -7.480158805847168, + "step": 44140 + }, + { + "epoch": 1.4390106801573919, + "grad_norm": 0.5233020186424255, + "learning_rate": 2.6027851098728017e-05, + "logits/chosen": 3.0037169456481934, + "logits/rejected": 3.0036964416503906, + "logps/chosen": -340.609375, + "logps/rejected": -357.97711181640625, + "loss": 0.2357, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.592400312423706, + "rewards/margins": 4.482752799987793, + "rewards/rejected": -8.075153350830078, + "step": 44160 + }, + { + "epoch": 1.4396624060089125, + "grad_norm": 3.25773024559021, + "learning_rate": 2.601698873572957e-05, + "logits/chosen": 3.3510711193084717, + "logits/rejected": 3.241018295288086, + "logps/chosen": -382.421630859375, + "logps/rejected": -334.9117736816406, + "loss": 0.281, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5915513038635254, + "rewards/margins": 4.112009525299072, + "rewards/rejected": -7.703561305999756, + "step": 44180 + }, + { + "epoch": 1.440314131860433, + "grad_norm": 4.488653659820557, + "learning_rate": 2.6006126372731125e-05, + "logits/chosen": 2.948965549468994, + "logits/rejected": 3.030235767364502, + "logps/chosen": -346.1293640136719, + "logps/rejected": -336.8999328613281, + "loss": 0.3223, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.9287712574005127, + "rewards/margins": 3.6951446533203125, + "rewards/rejected": -7.6239166259765625, + "step": 44200 + }, + { + "epoch": 1.4409658577119535, + "grad_norm": 5.202831745147705, + "learning_rate": 2.599526400973268e-05, + "logits/chosen": 2.991853952407837, + "logits/rejected": 3.014455795288086, + "logps/chosen": -329.4571533203125, + "logps/rejected": -331.91021728515625, + "loss": 0.3885, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.8160743713378906, + "rewards/margins": 3.6337780952453613, + "rewards/rejected": -7.44985294342041, + "step": 44220 + }, + { + "epoch": 1.441617583563474, + "grad_norm": 0.679135262966156, + "learning_rate": 2.598440164673423e-05, + "logits/chosen": 2.746267080307007, + "logits/rejected": 2.938859462738037, + "logps/chosen": -340.84478759765625, + "logps/rejected": -366.92236328125, + "loss": 0.1338, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8847079277038574, + "rewards/margins": 5.194717884063721, + "rewards/rejected": -9.079425811767578, + "step": 44240 + }, + { + "epoch": 1.4422693094149945, + "grad_norm": 8.545858383178711, + "learning_rate": 2.5973539283735788e-05, + "logits/chosen": 2.8149771690368652, + "logits/rejected": 2.944575548171997, + "logps/chosen": -340.1820373535156, + "logps/rejected": -399.2066650390625, + "loss": 0.2649, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.8980307579040527, + "rewards/margins": 4.391913414001465, + "rewards/rejected": -8.28994369506836, + "step": 44260 + }, + { + "epoch": 1.4429210352665152, + "grad_norm": 2.4381580352783203, + "learning_rate": 2.596267692073734e-05, + "logits/chosen": 2.9398703575134277, + "logits/rejected": 3.0637779235839844, + "logps/chosen": -350.23614501953125, + "logps/rejected": -369.9688415527344, + "loss": 0.2099, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.288267135620117, + "rewards/margins": 4.851130485534668, + "rewards/rejected": -9.139398574829102, + "step": 44280 + }, + { + "epoch": 1.4435727611180358, + "grad_norm": 1.0410935878753662, + "learning_rate": 2.595181455773889e-05, + "logits/chosen": 2.7356584072113037, + "logits/rejected": 2.672607898712158, + "logps/chosen": -358.9982604980469, + "logps/rejected": -368.6001892089844, + "loss": 0.4076, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.01709508895874, + "rewards/margins": 4.457031726837158, + "rewards/rejected": -8.474126815795898, + "step": 44300 + }, + { + "epoch": 1.4442244869695562, + "grad_norm": 0.9854913949966431, + "learning_rate": 2.5940952194740447e-05, + "logits/chosen": 2.7333202362060547, + "logits/rejected": 2.970202922821045, + "logps/chosen": -363.2304382324219, + "logps/rejected": -354.66241455078125, + "loss": 0.1915, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.6909871101379395, + "rewards/margins": 4.880914211273193, + "rewards/rejected": -8.571901321411133, + "step": 44320 + }, + { + "epoch": 1.4448762128210768, + "grad_norm": 9.554886817932129, + "learning_rate": 2.5930089831741998e-05, + "logits/chosen": 3.0059876441955566, + "logits/rejected": 3.18506121635437, + "logps/chosen": -338.2264404296875, + "logps/rejected": -391.3064880371094, + "loss": 0.3616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.890578508377075, + "rewards/margins": 5.1840996742248535, + "rewards/rejected": -9.074678421020508, + "step": 44340 + }, + { + "epoch": 1.4455279386725974, + "grad_norm": 3.197634220123291, + "learning_rate": 2.5919227468743552e-05, + "logits/chosen": 3.081350803375244, + "logits/rejected": 3.237133026123047, + "logps/chosen": -375.95355224609375, + "logps/rejected": -412.4092712402344, + "loss": 0.2919, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3131496906280518, + "rewards/margins": 4.582493782043457, + "rewards/rejected": -7.895643711090088, + "step": 44360 + }, + { + "epoch": 1.4461796645241178, + "grad_norm": 1.5615510940551758, + "learning_rate": 2.5908365105745103e-05, + "logits/chosen": 2.8271970748901367, + "logits/rejected": 3.1060798168182373, + "logps/chosen": -352.403076171875, + "logps/rejected": -370.06036376953125, + "loss": 0.188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.394553184509277, + "rewards/margins": 5.037221431732178, + "rewards/rejected": -9.431774139404297, + "step": 44380 + }, + { + "epoch": 1.4468313903756385, + "grad_norm": 4.078502655029297, + "learning_rate": 2.589750274274666e-05, + "logits/chosen": 2.830500364303589, + "logits/rejected": 3.193885326385498, + "logps/chosen": -363.2549133300781, + "logps/rejected": -358.3389892578125, + "loss": 0.2723, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.337214231491089, + "rewards/margins": 4.754358768463135, + "rewards/rejected": -8.091573715209961, + "step": 44400 + }, + { + "epoch": 1.447483116227159, + "grad_norm": 1.463731050491333, + "learning_rate": 2.588664037974821e-05, + "logits/chosen": 2.9714977741241455, + "logits/rejected": 3.065103769302368, + "logps/chosen": -351.42279052734375, + "logps/rejected": -311.6922912597656, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4091708660125732, + "rewards/margins": 4.724234580993652, + "rewards/rejected": -8.133405685424805, + "step": 44420 + }, + { + "epoch": 1.4481348420786797, + "grad_norm": 3.2470180988311768, + "learning_rate": 2.5875778016749762e-05, + "logits/chosen": 3.2239232063293457, + "logits/rejected": 3.0828704833984375, + "logps/chosen": -381.35382080078125, + "logps/rejected": -338.4111633300781, + "loss": 0.1814, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7005646228790283, + "rewards/margins": 5.253127098083496, + "rewards/rejected": -7.953691005706787, + "step": 44440 + }, + { + "epoch": 1.4487865679302, + "grad_norm": 6.770795822143555, + "learning_rate": 2.586491565375132e-05, + "logits/chosen": 3.2472240924835205, + "logits/rejected": 3.0608153343200684, + "logps/chosen": -358.581787109375, + "logps/rejected": -381.9258117675781, + "loss": 0.2961, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.66660475730896, + "rewards/margins": 4.02367639541626, + "rewards/rejected": -6.690281867980957, + "step": 44460 + }, + { + "epoch": 1.4494382937817207, + "grad_norm": 0.5161373019218445, + "learning_rate": 2.5854053290752873e-05, + "logits/chosen": 2.63671875, + "logits/rejected": 2.866770029067993, + "logps/chosen": -351.33349609375, + "logps/rejected": -332.0528564453125, + "loss": 0.2043, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.094569206237793, + "rewards/margins": 4.410530090332031, + "rewards/rejected": -7.505099296569824, + "step": 44480 + }, + { + "epoch": 1.4500900196332414, + "grad_norm": 0.20111079514026642, + "learning_rate": 2.5843190927754424e-05, + "logits/chosen": 3.120941638946533, + "logits/rejected": 3.257772922515869, + "logps/chosen": -370.56402587890625, + "logps/rejected": -340.5447082519531, + "loss": 0.2671, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.965630531311035, + "rewards/margins": 5.2598371505737305, + "rewards/rejected": -8.22546672821045, + "step": 44500 + }, + { + "epoch": 1.4507417454847618, + "grad_norm": 3.4682414531707764, + "learning_rate": 2.5832328564755982e-05, + "logits/chosen": 2.651793956756592, + "logits/rejected": 2.877932548522949, + "logps/chosen": -320.7098388671875, + "logps/rejected": -311.755859375, + "loss": 0.278, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.888704299926758, + "rewards/margins": 4.344584941864014, + "rewards/rejected": -8.233288764953613, + "step": 44520 + }, + { + "epoch": 1.4513934713362824, + "grad_norm": 6.506678104400635, + "learning_rate": 2.5821466201757533e-05, + "logits/chosen": 3.015608310699463, + "logits/rejected": 2.9988491535186768, + "logps/chosen": -368.63043212890625, + "logps/rejected": -370.3812561035156, + "loss": 0.335, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.039888858795166, + "rewards/margins": 4.5971574783325195, + "rewards/rejected": -7.637046813964844, + "step": 44540 + }, + { + "epoch": 1.452045197187803, + "grad_norm": 0.10048145055770874, + "learning_rate": 2.5810603838759083e-05, + "logits/chosen": 3.0628175735473633, + "logits/rejected": 3.101072072982788, + "logps/chosen": -345.16363525390625, + "logps/rejected": -341.79144287109375, + "loss": 0.1845, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.124100685119629, + "rewards/margins": 4.550323963165283, + "rewards/rejected": -8.67442512512207, + "step": 44560 + }, + { + "epoch": 1.4526969230393236, + "grad_norm": 6.26591682434082, + "learning_rate": 2.5799741475760638e-05, + "logits/chosen": 3.117419481277466, + "logits/rejected": 3.1370387077331543, + "logps/chosen": -360.3454284667969, + "logps/rejected": -341.7769470214844, + "loss": 0.3386, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2090911865234375, + "rewards/margins": 3.832125186920166, + "rewards/rejected": -7.0412163734436035, + "step": 44580 + }, + { + "epoch": 1.453348648890844, + "grad_norm": 7.18314790725708, + "learning_rate": 2.5788879112762192e-05, + "logits/chosen": 2.836369037628174, + "logits/rejected": 3.082062244415283, + "logps/chosen": -361.40557861328125, + "logps/rejected": -391.17498779296875, + "loss": 0.4572, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.607990264892578, + "rewards/margins": 3.697275161743164, + "rewards/rejected": -7.3052659034729, + "step": 44600 + }, + { + "epoch": 1.4540003747423647, + "grad_norm": 6.129943370819092, + "learning_rate": 2.5778016749763746e-05, + "logits/chosen": 2.9656190872192383, + "logits/rejected": 3.0042083263397217, + "logps/chosen": -382.02398681640625, + "logps/rejected": -393.95916748046875, + "loss": 0.2049, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5553436279296875, + "rewards/margins": 5.583305358886719, + "rewards/rejected": -8.138649940490723, + "step": 44620 + }, + { + "epoch": 1.454652100593885, + "grad_norm": 5.538296222686768, + "learning_rate": 2.5767154386765297e-05, + "logits/chosen": 3.2315831184387207, + "logits/rejected": 3.1526951789855957, + "logps/chosen": -372.542236328125, + "logps/rejected": -392.4991760253906, + "loss": 0.1844, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.950812578201294, + "rewards/margins": 5.4123945236206055, + "rewards/rejected": -8.36320686340332, + "step": 44640 + }, + { + "epoch": 1.4553038264454057, + "grad_norm": 0.328268826007843, + "learning_rate": 2.5756292023766854e-05, + "logits/chosen": 2.9459314346313477, + "logits/rejected": 2.9500832557678223, + "logps/chosen": -369.18304443359375, + "logps/rejected": -345.9239196777344, + "loss": 0.349, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.9628570079803467, + "rewards/margins": 5.317951679229736, + "rewards/rejected": -9.28080940246582, + "step": 44660 + }, + { + "epoch": 1.4559555522969263, + "grad_norm": 2.3439650535583496, + "learning_rate": 2.5745429660768405e-05, + "logits/chosen": 2.7521071434020996, + "logits/rejected": 2.9000015258789062, + "logps/chosen": -358.35076904296875, + "logps/rejected": -344.87359619140625, + "loss": 0.3648, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.006229877471924, + "rewards/margins": 3.937000274658203, + "rewards/rejected": -7.943229675292969, + "step": 44680 + }, + { + "epoch": 1.456607278148447, + "grad_norm": 13.157330513000488, + "learning_rate": 2.5734567297769956e-05, + "logits/chosen": 2.9914138317108154, + "logits/rejected": 3.3371779918670654, + "logps/chosen": -368.8359069824219, + "logps/rejected": -349.2590637207031, + "loss": 0.1541, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4258275032043457, + "rewards/margins": 5.793702125549316, + "rewards/rejected": -9.21953010559082, + "step": 44700 + }, + { + "epoch": 1.4572590039999676, + "grad_norm": 3.814628839492798, + "learning_rate": 2.572370493477151e-05, + "logits/chosen": 2.781165361404419, + "logits/rejected": 2.8713886737823486, + "logps/chosen": -381.98077392578125, + "logps/rejected": -343.54541015625, + "loss": 0.4477, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.87931489944458, + "rewards/margins": 4.8565568923950195, + "rewards/rejected": -7.7358717918396, + "step": 44720 + }, + { + "epoch": 1.457910729851488, + "grad_norm": 3.5445075035095215, + "learning_rate": 2.5712842571773068e-05, + "logits/chosen": 2.645437002182007, + "logits/rejected": 2.867041826248169, + "logps/chosen": -335.48931884765625, + "logps/rejected": -325.30029296875, + "loss": 0.2406, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7084527015686035, + "rewards/margins": 4.464923858642578, + "rewards/rejected": -7.173376560211182, + "step": 44740 + }, + { + "epoch": 1.4585624557030086, + "grad_norm": 3.622049331665039, + "learning_rate": 2.570198020877462e-05, + "logits/chosen": 3.0159010887145996, + "logits/rejected": 2.7918601036071777, + "logps/chosen": -388.4813537597656, + "logps/rejected": -340.1690673828125, + "loss": 0.2805, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5279738903045654, + "rewards/margins": 4.759879112243652, + "rewards/rejected": -8.287853240966797, + "step": 44760 + }, + { + "epoch": 1.459214181554529, + "grad_norm": 3.046760082244873, + "learning_rate": 2.569111784577617e-05, + "logits/chosen": 3.084583282470703, + "logits/rejected": 3.0928797721862793, + "logps/chosen": -334.02410888671875, + "logps/rejected": -351.1303405761719, + "loss": 0.3658, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.725919008255005, + "rewards/margins": 3.8001632690429688, + "rewards/rejected": -7.5260820388793945, + "step": 44780 + }, + { + "epoch": 1.4598659074060496, + "grad_norm": 4.27665901184082, + "learning_rate": 2.5680255482777727e-05, + "logits/chosen": 2.9269635677337646, + "logits/rejected": 3.2465362548828125, + "logps/chosen": -339.3690490722656, + "logps/rejected": -330.7058410644531, + "loss": 0.2794, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.232656478881836, + "rewards/margins": 4.2346577644348145, + "rewards/rejected": -7.46731424331665, + "step": 44800 + }, + { + "epoch": 1.4605176332575702, + "grad_norm": 0.1181681677699089, + "learning_rate": 2.5669393119779277e-05, + "logits/chosen": 2.830589771270752, + "logits/rejected": 3.115696668624878, + "logps/chosen": -322.1712646484375, + "logps/rejected": -357.0555419921875, + "loss": 0.2116, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6426920890808105, + "rewards/margins": 4.000268936157227, + "rewards/rejected": -6.642960548400879, + "step": 44820 + }, + { + "epoch": 1.4611693591090908, + "grad_norm": 0.19902420043945312, + "learning_rate": 2.5658530756780828e-05, + "logits/chosen": 3.05643630027771, + "logits/rejected": 3.0652036666870117, + "logps/chosen": -354.6441345214844, + "logps/rejected": -378.0607604980469, + "loss": 0.4761, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9644405841827393, + "rewards/margins": 3.636784315109253, + "rewards/rejected": -6.601224422454834, + "step": 44840 + }, + { + "epoch": 1.4618210849606113, + "grad_norm": 1.290140151977539, + "learning_rate": 2.5647668393782386e-05, + "logits/chosen": 2.799299478530884, + "logits/rejected": 3.0098390579223633, + "logps/chosen": -323.3069763183594, + "logps/rejected": -331.44580078125, + "loss": 0.3596, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.5761780738830566, + "rewards/margins": 4.014679908752441, + "rewards/rejected": -7.590858459472656, + "step": 44860 + }, + { + "epoch": 1.4624728108121319, + "grad_norm": 0.6864741444587708, + "learning_rate": 2.563680603078394e-05, + "logits/chosen": 3.2306008338928223, + "logits/rejected": 3.278977155685425, + "logps/chosen": -362.7711486816406, + "logps/rejected": -387.24072265625, + "loss": 0.1973, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.477240562438965, + "rewards/margins": 4.670248985290527, + "rewards/rejected": -7.14749002456665, + "step": 44880 + }, + { + "epoch": 1.4631245366636525, + "grad_norm": 0.8305959105491638, + "learning_rate": 2.562594366778549e-05, + "logits/chosen": 2.916201114654541, + "logits/rejected": 3.029742479324341, + "logps/chosen": -363.88397216796875, + "logps/rejected": -359.53826904296875, + "loss": 0.2322, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.227756977081299, + "rewards/margins": 5.360376358032227, + "rewards/rejected": -8.588132858276367, + "step": 44900 + }, + { + "epoch": 1.463776262515173, + "grad_norm": 3.174602746963501, + "learning_rate": 2.561508130478704e-05, + "logits/chosen": 3.1395134925842285, + "logits/rejected": 3.018697738647461, + "logps/chosen": -353.88787841796875, + "logps/rejected": -353.5594787597656, + "loss": 0.2879, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1452152729034424, + "rewards/margins": 4.565221786499023, + "rewards/rejected": -7.7104363441467285, + "step": 44920 + }, + { + "epoch": 1.4644279883666935, + "grad_norm": 5.986759185791016, + "learning_rate": 2.56042189417886e-05, + "logits/chosen": 3.1113665103912354, + "logits/rejected": 3.011436700820923, + "logps/chosen": -388.685546875, + "logps/rejected": -402.1575622558594, + "loss": 0.3454, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.233375549316406, + "rewards/margins": 3.771914005279541, + "rewards/rejected": -8.005289077758789, + "step": 44940 + }, + { + "epoch": 1.4650797142182141, + "grad_norm": 1.448407769203186, + "learning_rate": 2.559335657879015e-05, + "logits/chosen": 2.810837745666504, + "logits/rejected": 2.906494140625, + "logps/chosen": -330.60394287109375, + "logps/rejected": -340.77484130859375, + "loss": 0.4501, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3976216316223145, + "rewards/margins": 4.413825511932373, + "rewards/rejected": -7.8114471435546875, + "step": 44960 + }, + { + "epoch": 1.4657314400697348, + "grad_norm": 6.009817123413086, + "learning_rate": 2.5582494215791704e-05, + "logits/chosen": 2.527888774871826, + "logits/rejected": 2.7563581466674805, + "logps/chosen": -316.1652526855469, + "logps/rejected": -344.89959716796875, + "loss": 0.3094, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.6817893981933594, + "rewards/margins": 4.771745204925537, + "rewards/rejected": -8.453535079956055, + "step": 44980 + }, + { + "epoch": 1.4663831659212552, + "grad_norm": 0.6716554760932922, + "learning_rate": 2.5571631852793258e-05, + "logits/chosen": 3.1336748600006104, + "logits/rejected": 3.161067485809326, + "logps/chosen": -358.87335205078125, + "logps/rejected": -333.2352294921875, + "loss": 0.1509, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.6705322265625, + "rewards/margins": 4.187369346618652, + "rewards/rejected": -7.857901573181152, + "step": 45000 + }, + { + "epoch": 1.4663831659212552, + "eval_logits/chosen": 3.085916519165039, + "eval_logits/rejected": 3.096634864807129, + "eval_logps/chosen": -394.7108154296875, + "eval_logps/rejected": -380.11541748046875, + "eval_loss": 0.4869866371154785, + "eval_rewards/accuracies": 0.8324087262153625, + "eval_rewards/chosen": -4.013012886047363, + "eval_rewards/margins": 4.375205039978027, + "eval_rewards/rejected": -8.38821792602539, + "eval_runtime": 3545.4499, + "eval_samples_per_second": 3.152, + "eval_steps_per_second": 3.152, + "step": 45000 + }, + { + "epoch": 1.4670348917727758, + "grad_norm": 0.7301899790763855, + "learning_rate": 2.5560769489794812e-05, + "logits/chosen": 3.2123515605926514, + "logits/rejected": 3.1701314449310303, + "logps/chosen": -406.386474609375, + "logps/rejected": -373.4250793457031, + "loss": 0.4158, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.4420406818389893, + "rewards/margins": 4.241466045379639, + "rewards/rejected": -7.683506965637207, + "step": 45020 + }, + { + "epoch": 1.4676866176242964, + "grad_norm": 0.11257071793079376, + "learning_rate": 2.5549907126796363e-05, + "logits/chosen": 3.056630849838257, + "logits/rejected": 3.1398940086364746, + "logps/chosen": -356.94378662109375, + "logps/rejected": -362.23175048828125, + "loss": 0.2777, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.413820266723633, + "rewards/margins": 4.832298278808594, + "rewards/rejected": -8.246119499206543, + "step": 45040 + }, + { + "epoch": 1.4683383434758168, + "grad_norm": 1.3445392847061157, + "learning_rate": 2.553904476379792e-05, + "logits/chosen": 2.8673434257507324, + "logits/rejected": 3.0173046588897705, + "logps/chosen": -345.67022705078125, + "logps/rejected": -383.5451965332031, + "loss": 0.2144, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3897526264190674, + "rewards/margins": 4.6559553146362305, + "rewards/rejected": -8.045707702636719, + "step": 45060 + }, + { + "epoch": 1.4689900693273374, + "grad_norm": 5.475305080413818, + "learning_rate": 2.552818240079947e-05, + "logits/chosen": 2.791827440261841, + "logits/rejected": 2.8866705894470215, + "logps/chosen": -343.1132507324219, + "logps/rejected": -339.20941162109375, + "loss": 0.2493, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.9956958293914795, + "rewards/margins": 4.242152214050293, + "rewards/rejected": -7.23784875869751, + "step": 45080 + }, + { + "epoch": 1.469641795178858, + "grad_norm": 4.085535526275635, + "learning_rate": 2.5517320037801022e-05, + "logits/chosen": 2.7854857444763184, + "logits/rejected": 2.900527238845825, + "logps/chosen": -353.75091552734375, + "logps/rejected": -318.97296142578125, + "loss": 0.282, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.021568536758423, + "rewards/margins": 4.755739688873291, + "rewards/rejected": -7.777308464050293, + "step": 45100 + }, + { + "epoch": 1.4702935210303787, + "grad_norm": 3.175914764404297, + "learning_rate": 2.5506457674802576e-05, + "logits/chosen": 3.166123390197754, + "logits/rejected": 3.25797700881958, + "logps/chosen": -369.8715515136719, + "logps/rejected": -350.41632080078125, + "loss": 0.2098, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.59511661529541, + "rewards/margins": 4.811615467071533, + "rewards/rejected": -7.406732082366943, + "step": 45120 + }, + { + "epoch": 1.470945246881899, + "grad_norm": 9.322479248046875, + "learning_rate": 2.5495595311804134e-05, + "logits/chosen": 2.767453193664551, + "logits/rejected": 2.890385866165161, + "logps/chosen": -314.11859130859375, + "logps/rejected": -323.94049072265625, + "loss": 0.2361, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3839352130889893, + "rewards/margins": 4.234570503234863, + "rewards/rejected": -7.618505954742432, + "step": 45140 + }, + { + "epoch": 1.4715969727334197, + "grad_norm": 1.5109351873397827, + "learning_rate": 2.5484732948805685e-05, + "logits/chosen": 3.271782398223877, + "logits/rejected": 3.1920838356018066, + "logps/chosen": -355.50885009765625, + "logps/rejected": -363.5224609375, + "loss": 0.2683, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7737510204315186, + "rewards/margins": 5.0598273277282715, + "rewards/rejected": -7.833578586578369, + "step": 45160 + }, + { + "epoch": 1.4722486985849401, + "grad_norm": 5.4196271896362305, + "learning_rate": 2.5473870585807236e-05, + "logits/chosen": 2.818848133087158, + "logits/rejected": 2.796344757080078, + "logps/chosen": -346.6799621582031, + "logps/rejected": -328.5724182128906, + "loss": 0.3058, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.260089159011841, + "rewards/margins": 3.814732789993286, + "rewards/rejected": -7.074821472167969, + "step": 45180 + }, + { + "epoch": 1.4729004244364607, + "grad_norm": 1.1284619569778442, + "learning_rate": 2.5463008222808793e-05, + "logits/chosen": 2.9920566082000732, + "logits/rejected": 3.0538878440856934, + "logps/chosen": -321.9656982421875, + "logps/rejected": -330.91888427734375, + "loss": 0.3425, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.432530641555786, + "rewards/margins": 4.4049224853515625, + "rewards/rejected": -6.8374528884887695, + "step": 45200 + }, + { + "epoch": 1.4735521502879814, + "grad_norm": 1.3392488956451416, + "learning_rate": 2.5452145859810344e-05, + "logits/chosen": 3.0792629718780518, + "logits/rejected": 3.2384250164031982, + "logps/chosen": -353.8184509277344, + "logps/rejected": -321.01495361328125, + "loss": 0.2695, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.544667959213257, + "rewards/margins": 4.8638916015625, + "rewards/rejected": -7.408559322357178, + "step": 45220 + }, + { + "epoch": 1.474203876139502, + "grad_norm": 6.369245529174805, + "learning_rate": 2.5441283496811895e-05, + "logits/chosen": 2.775247573852539, + "logits/rejected": 2.9969725608825684, + "logps/chosen": -303.04913330078125, + "logps/rejected": -309.6043395996094, + "loss": 0.5497, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9605445861816406, + "rewards/margins": 3.5126755237579346, + "rewards/rejected": -5.473219871520996, + "step": 45240 + }, + { + "epoch": 1.4748556019910226, + "grad_norm": 4.023491382598877, + "learning_rate": 2.5430421133813452e-05, + "logits/chosen": 3.3749184608459473, + "logits/rejected": 3.5038013458251953, + "logps/chosen": -355.85345458984375, + "logps/rejected": -329.1654357910156, + "loss": 0.3242, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3318283557891846, + "rewards/margins": 3.873425006866455, + "rewards/rejected": -6.205252647399902, + "step": 45260 + }, + { + "epoch": 1.475507327842543, + "grad_norm": 0.38105425238609314, + "learning_rate": 2.5419558770815006e-05, + "logits/chosen": 2.921508312225342, + "logits/rejected": 3.1355953216552734, + "logps/chosen": -345.1643371582031, + "logps/rejected": -319.5451354980469, + "loss": 0.1094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.0078160762786865, + "rewards/margins": 4.539946556091309, + "rewards/rejected": -6.547762870788574, + "step": 45280 + }, + { + "epoch": 1.4761590536940636, + "grad_norm": 1.3683444261550903, + "learning_rate": 2.5408696407816557e-05, + "logits/chosen": 3.1654105186462402, + "logits/rejected": 3.3588790893554688, + "logps/chosen": -391.1479187011719, + "logps/rejected": -325.9275207519531, + "loss": 0.2295, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3674392700195312, + "rewards/margins": 4.614102840423584, + "rewards/rejected": -6.981541633605957, + "step": 45300 + }, + { + "epoch": 1.476810779545584, + "grad_norm": 1.7900387048721313, + "learning_rate": 2.5397834044818108e-05, + "logits/chosen": 2.8853366374969482, + "logits/rejected": 3.2057387828826904, + "logps/chosen": -370.29180908203125, + "logps/rejected": -338.0276184082031, + "loss": 0.1791, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.472695827484131, + "rewards/margins": 4.682590007781982, + "rewards/rejected": -7.155285835266113, + "step": 45320 + }, + { + "epoch": 1.4774625053971047, + "grad_norm": 4.684946537017822, + "learning_rate": 2.5386971681819666e-05, + "logits/chosen": 2.83545184135437, + "logits/rejected": 2.8947739601135254, + "logps/chosen": -331.9671936035156, + "logps/rejected": -323.7582702636719, + "loss": 0.1851, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.6159892082214355, + "rewards/margins": 5.134507179260254, + "rewards/rejected": -7.750496864318848, + "step": 45340 + }, + { + "epoch": 1.4781142312486253, + "grad_norm": 2.128159999847412, + "learning_rate": 2.5376109318821216e-05, + "logits/chosen": 2.9755325317382812, + "logits/rejected": 3.0693602561950684, + "logps/chosen": -340.85443115234375, + "logps/rejected": -317.27783203125, + "loss": 0.1867, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.397108554840088, + "rewards/margins": 4.432547092437744, + "rewards/rejected": -6.829655647277832, + "step": 45360 + }, + { + "epoch": 1.478765957100146, + "grad_norm": 2.4774868488311768, + "learning_rate": 2.536524695582277e-05, + "logits/chosen": 2.939074754714966, + "logits/rejected": 3.003117322921753, + "logps/chosen": -334.4947204589844, + "logps/rejected": -371.54400634765625, + "loss": 0.3684, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.335106611251831, + "rewards/margins": 4.526800632476807, + "rewards/rejected": -7.861907005310059, + "step": 45380 + }, + { + "epoch": 1.4794176829516663, + "grad_norm": 10.148992538452148, + "learning_rate": 2.5354384592824325e-05, + "logits/chosen": 3.1900930404663086, + "logits/rejected": 3.3205318450927734, + "logps/chosen": -361.0251159667969, + "logps/rejected": -349.9911193847656, + "loss": 0.3082, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.7961232662200928, + "rewards/margins": 4.45425271987915, + "rewards/rejected": -7.250376224517822, + "step": 45400 + }, + { + "epoch": 1.480069408803187, + "grad_norm": 11.71026611328125, + "learning_rate": 2.534352222982588e-05, + "logits/chosen": 2.8322246074676514, + "logits/rejected": 2.990823745727539, + "logps/chosen": -334.6319885253906, + "logps/rejected": -333.3815002441406, + "loss": 0.2104, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2688260078430176, + "rewards/margins": 4.450606822967529, + "rewards/rejected": -7.719433784484863, + "step": 45420 + }, + { + "epoch": 1.4807211346547076, + "grad_norm": 0.8932857513427734, + "learning_rate": 2.533265986682743e-05, + "logits/chosen": 2.9937901496887207, + "logits/rejected": 3.0979371070861816, + "logps/chosen": -378.2521057128906, + "logps/rejected": -340.5050354003906, + "loss": 0.2487, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.803549289703369, + "rewards/margins": 5.336302280426025, + "rewards/rejected": -8.139852523803711, + "step": 45440 + }, + { + "epoch": 1.481372860506228, + "grad_norm": 4.416879177093506, + "learning_rate": 2.5321797503828987e-05, + "logits/chosen": 2.5188467502593994, + "logits/rejected": 2.628512382507324, + "logps/chosen": -345.3777160644531, + "logps/rejected": -334.34014892578125, + "loss": 0.3809, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.105329990386963, + "rewards/margins": 4.951078414916992, + "rewards/rejected": -8.05640697479248, + "step": 45460 + }, + { + "epoch": 1.4820245863577486, + "grad_norm": 0.7578709125518799, + "learning_rate": 2.5310935140830538e-05, + "logits/chosen": 2.982374906539917, + "logits/rejected": 3.1674892902374268, + "logps/chosen": -350.099853515625, + "logps/rejected": -342.09375, + "loss": 0.1872, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.531073808670044, + "rewards/margins": 4.741768836975098, + "rewards/rejected": -8.272843360900879, + "step": 45480 + }, + { + "epoch": 1.4826763122092692, + "grad_norm": 3.9505202770233154, + "learning_rate": 2.530007277783209e-05, + "logits/chosen": 2.8283610343933105, + "logits/rejected": 2.9284873008728027, + "logps/chosen": -376.1953125, + "logps/rejected": -390.2897644042969, + "loss": 0.1955, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.96622633934021, + "rewards/margins": 5.71293306350708, + "rewards/rejected": -8.679159164428711, + "step": 45500 + }, + { + "epoch": 1.4833280380607898, + "grad_norm": 2.587783098220825, + "learning_rate": 2.5289210414833643e-05, + "logits/chosen": 2.884131669998169, + "logits/rejected": 2.843446731567383, + "logps/chosen": -351.33343505859375, + "logps/rejected": -358.4941711425781, + "loss": 0.296, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.034038543701172, + "rewards/margins": 5.037756443023682, + "rewards/rejected": -8.071794509887695, + "step": 45520 + }, + { + "epoch": 1.4839797639123102, + "grad_norm": 5.054379463195801, + "learning_rate": 2.52783480518352e-05, + "logits/chosen": 3.0041866302490234, + "logits/rejected": 3.0297465324401855, + "logps/chosen": -345.44598388671875, + "logps/rejected": -319.59429931640625, + "loss": 0.2997, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.837284803390503, + "rewards/margins": 4.071115016937256, + "rewards/rejected": -6.908400058746338, + "step": 45540 + }, + { + "epoch": 1.4846314897638309, + "grad_norm": 2.2776710987091064, + "learning_rate": 2.526748568883675e-05, + "logits/chosen": 2.9713778495788574, + "logits/rejected": 3.0964126586914062, + "logps/chosen": -346.423583984375, + "logps/rejected": -370.1973571777344, + "loss": 0.2247, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.5850672721862793, + "rewards/margins": 5.955874443054199, + "rewards/rejected": -8.540942192077637, + "step": 45560 + }, + { + "epoch": 1.4852832156153515, + "grad_norm": 1.1068862676620483, + "learning_rate": 2.5256623325838302e-05, + "logits/chosen": 2.708005905151367, + "logits/rejected": 2.9183096885681152, + "logps/chosen": -323.0584411621094, + "logps/rejected": -349.634521484375, + "loss": 0.1606, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1813549995422363, + "rewards/margins": 5.224708557128906, + "rewards/rejected": -8.406064987182617, + "step": 45580 + }, + { + "epoch": 1.4859349414668719, + "grad_norm": 5.692325592041016, + "learning_rate": 2.524576096283986e-05, + "logits/chosen": 2.9599366188049316, + "logits/rejected": 2.7843310832977295, + "logps/chosen": -398.0838928222656, + "logps/rejected": -375.95977783203125, + "loss": 0.1957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.418837785720825, + "rewards/margins": 5.350954532623291, + "rewards/rejected": -8.769792556762695, + "step": 45600 + }, + { + "epoch": 1.4865866673183925, + "grad_norm": 0.6992688179016113, + "learning_rate": 2.523489859984141e-05, + "logits/chosen": 2.820713520050049, + "logits/rejected": 2.9872632026672363, + "logps/chosen": -336.2626037597656, + "logps/rejected": -348.81097412109375, + "loss": 0.1115, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.9288394451141357, + "rewards/margins": 4.635500431060791, + "rewards/rejected": -8.564340591430664, + "step": 45620 + }, + { + "epoch": 1.4872383931699131, + "grad_norm": 0.5790651440620422, + "learning_rate": 2.522403623684296e-05, + "logits/chosen": 2.819139003753662, + "logits/rejected": 2.7176413536071777, + "logps/chosen": -357.061279296875, + "logps/rejected": -374.5121154785156, + "loss": 0.1246, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.388792037963867, + "rewards/margins": 5.383658409118652, + "rewards/rejected": -8.772451400756836, + "step": 45640 + }, + { + "epoch": 1.4878901190214338, + "grad_norm": 5.2032084465026855, + "learning_rate": 2.521317387384452e-05, + "logits/chosen": 2.7075209617614746, + "logits/rejected": 2.8379297256469727, + "logps/chosen": -330.12933349609375, + "logps/rejected": -325.5061950683594, + "loss": 0.2249, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.4017090797424316, + "rewards/margins": 4.872770309448242, + "rewards/rejected": -8.274479866027832, + "step": 45660 + }, + { + "epoch": 1.4885418448729542, + "grad_norm": 1.1295403242111206, + "learning_rate": 2.5202311510846073e-05, + "logits/chosen": 2.981706380844116, + "logits/rejected": 2.8509345054626465, + "logps/chosen": -363.7142639160156, + "logps/rejected": -374.60137939453125, + "loss": 0.3536, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.053186893463135, + "rewards/margins": 4.792578220367432, + "rewards/rejected": -8.845766067504883, + "step": 45680 + }, + { + "epoch": 1.4891935707244748, + "grad_norm": 6.59604549407959, + "learning_rate": 2.5191449147847624e-05, + "logits/chosen": 2.6523475646972656, + "logits/rejected": 2.7709736824035645, + "logps/chosen": -380.0145568847656, + "logps/rejected": -331.2594299316406, + "loss": 0.1732, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.443682909011841, + "rewards/margins": 5.3071417808532715, + "rewards/rejected": -8.750825881958008, + "step": 45700 + }, + { + "epoch": 1.4898452965759952, + "grad_norm": 6.345886707305908, + "learning_rate": 2.5180586784849175e-05, + "logits/chosen": 2.507354497909546, + "logits/rejected": 2.56644344329834, + "logps/chosen": -363.30670166015625, + "logps/rejected": -353.5516052246094, + "loss": 0.4782, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.0777268409729, + "rewards/margins": 3.7785658836364746, + "rewards/rejected": -7.856292724609375, + "step": 45720 + }, + { + "epoch": 1.4904970224275158, + "grad_norm": 10.668255805969238, + "learning_rate": 2.5169724421850732e-05, + "logits/chosen": 3.2400500774383545, + "logits/rejected": 3.2632079124450684, + "logps/chosen": -397.0028991699219, + "logps/rejected": -378.7421875, + "loss": 0.3841, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.246091842651367, + "rewards/margins": 4.220654487609863, + "rewards/rejected": -8.46674633026123, + "step": 45740 + }, + { + "epoch": 1.4911487482790364, + "grad_norm": 0.9476988911628723, + "learning_rate": 2.5158862058852283e-05, + "logits/chosen": 2.5426828861236572, + "logits/rejected": 2.4161007404327393, + "logps/chosen": -310.83489990234375, + "logps/rejected": -381.31414794921875, + "loss": 0.2338, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8816628456115723, + "rewards/margins": 4.876569747924805, + "rewards/rejected": -8.758233070373535, + "step": 45760 + }, + { + "epoch": 1.491800474130557, + "grad_norm": 6.355375289916992, + "learning_rate": 2.5147999695853837e-05, + "logits/chosen": 2.982919692993164, + "logits/rejected": 2.9660370349884033, + "logps/chosen": -352.68475341796875, + "logps/rejected": -331.5996398925781, + "loss": 0.3035, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.7552266120910645, + "rewards/margins": 3.9671730995178223, + "rewards/rejected": -7.7223992347717285, + "step": 45780 + }, + { + "epoch": 1.4924521999820777, + "grad_norm": 8.106673240661621, + "learning_rate": 2.513713733285539e-05, + "logits/chosen": 3.313321352005005, + "logits/rejected": 3.352497100830078, + "logps/chosen": -419.96966552734375, + "logps/rejected": -343.4375, + "loss": 0.1361, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7668213844299316, + "rewards/margins": 5.2534637451171875, + "rewards/rejected": -8.020284652709961, + "step": 45800 + }, + { + "epoch": 1.493103925833598, + "grad_norm": 3.341228723526001, + "learning_rate": 2.5126274969856945e-05, + "logits/chosen": 3.0470988750457764, + "logits/rejected": 3.154527187347412, + "logps/chosen": -392.1878356933594, + "logps/rejected": -374.31988525390625, + "loss": 0.2543, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.436516523361206, + "rewards/margins": 4.915737152099609, + "rewards/rejected": -8.352253913879395, + "step": 45820 + }, + { + "epoch": 1.4937556516851187, + "grad_norm": 4.786900997161865, + "learning_rate": 2.5115412606858496e-05, + "logits/chosen": 2.8468425273895264, + "logits/rejected": 2.981084108352661, + "logps/chosen": -409.28790283203125, + "logps/rejected": -352.8370056152344, + "loss": 0.4542, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8039097785949707, + "rewards/margins": 5.409432888031006, + "rewards/rejected": -8.213342666625977, + "step": 45840 + }, + { + "epoch": 1.494407377536639, + "grad_norm": 0.47973331809043884, + "learning_rate": 2.5104550243860054e-05, + "logits/chosen": 2.948251485824585, + "logits/rejected": 3.1184616088867188, + "logps/chosen": -346.1214294433594, + "logps/rejected": -343.4844970703125, + "loss": 0.1435, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.239699125289917, + "rewards/margins": 4.852656841278076, + "rewards/rejected": -8.092355728149414, + "step": 45860 + }, + { + "epoch": 1.4950591033881597, + "grad_norm": 1.4126782417297363, + "learning_rate": 2.5093687880861605e-05, + "logits/chosen": 3.03425669670105, + "logits/rejected": 3.1838319301605225, + "logps/chosen": -329.23785400390625, + "logps/rejected": -365.5701599121094, + "loss": 0.4181, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.959315299987793, + "rewards/margins": 4.941769599914551, + "rewards/rejected": -7.901084899902344, + "step": 45880 + }, + { + "epoch": 1.4957108292396804, + "grad_norm": 0.9393126368522644, + "learning_rate": 2.5082825517863155e-05, + "logits/chosen": 2.933046817779541, + "logits/rejected": 2.9698212146759033, + "logps/chosen": -358.09844970703125, + "logps/rejected": -341.1850280761719, + "loss": 0.4885, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.806002140045166, + "rewards/margins": 4.2568135261535645, + "rewards/rejected": -7.0628156661987305, + "step": 45900 + }, + { + "epoch": 1.496362555091201, + "grad_norm": 6.638294696807861, + "learning_rate": 2.5072506273014634e-05, + "logits/chosen": 3.004126787185669, + "logits/rejected": 3.056966781616211, + "logps/chosen": -325.5474548339844, + "logps/rejected": -322.2614440917969, + "loss": 0.5089, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.367473602294922, + "rewards/margins": 3.287853240966797, + "rewards/rejected": -6.655327796936035, + "step": 45920 + }, + { + "epoch": 1.4970142809427214, + "grad_norm": 1.8622196912765503, + "learning_rate": 2.5061643910016184e-05, + "logits/chosen": 3.068619728088379, + "logits/rejected": 3.238150119781494, + "logps/chosen": -393.27630615234375, + "logps/rejected": -354.2180480957031, + "loss": 0.3886, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7153234481811523, + "rewards/margins": 4.380537986755371, + "rewards/rejected": -7.095861911773682, + "step": 45940 + }, + { + "epoch": 1.497666006794242, + "grad_norm": 0.23732781410217285, + "learning_rate": 2.505078154701774e-05, + "logits/chosen": 2.5933735370635986, + "logits/rejected": 2.8582141399383545, + "logps/chosen": -324.781982421875, + "logps/rejected": -303.82537841796875, + "loss": 0.2549, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.1759419441223145, + "rewards/margins": 4.3255510330200195, + "rewards/rejected": -6.501492977142334, + "step": 45960 + }, + { + "epoch": 1.4983177326457626, + "grad_norm": 5.290441513061523, + "learning_rate": 2.5039919184019296e-05, + "logits/chosen": 3.101656675338745, + "logits/rejected": 3.1261649131774902, + "logps/chosen": -310.79449462890625, + "logps/rejected": -338.06207275390625, + "loss": 0.3921, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.916301727294922, + "rewards/margins": 3.654395580291748, + "rewards/rejected": -6.570697784423828, + "step": 45980 + }, + { + "epoch": 1.498969458497283, + "grad_norm": 4.775969505310059, + "learning_rate": 2.5029056821020847e-05, + "logits/chosen": 3.317570209503174, + "logits/rejected": 3.3782882690429688, + "logps/chosen": -350.2411193847656, + "logps/rejected": -356.6383056640625, + "loss": 0.3414, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5749928951263428, + "rewards/margins": 4.770869731903076, + "rewards/rejected": -7.34586238861084, + "step": 46000 + }, + { + "epoch": 1.4996211843488036, + "grad_norm": 1.3514264822006226, + "learning_rate": 2.5018194458022398e-05, + "logits/chosen": 3.1546192169189453, + "logits/rejected": 3.2495779991149902, + "logps/chosen": -365.68035888671875, + "logps/rejected": -388.9416198730469, + "loss": 0.2735, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3184654712677, + "rewards/margins": 4.116665363311768, + "rewards/rejected": -7.435132026672363, + "step": 46020 + }, + { + "epoch": 1.5002729102003243, + "grad_norm": 7.834432125091553, + "learning_rate": 2.5007332095023955e-05, + "logits/chosen": 3.0899410247802734, + "logits/rejected": 3.3041954040527344, + "logps/chosen": -356.1417541503906, + "logps/rejected": -327.3580627441406, + "loss": 0.2563, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6472480297088623, + "rewards/margins": 4.230648994445801, + "rewards/rejected": -6.877896785736084, + "step": 46040 + }, + { + "epoch": 1.500924636051845, + "grad_norm": 0.7178816199302673, + "learning_rate": 2.4996469732025506e-05, + "logits/chosen": 2.9505343437194824, + "logits/rejected": 3.141270875930786, + "logps/chosen": -358.96795654296875, + "logps/rejected": -333.5631408691406, + "loss": 0.2245, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.1377992630004883, + "rewards/margins": 4.106805801391602, + "rewards/rejected": -7.244604587554932, + "step": 46060 + }, + { + "epoch": 1.5015763619033653, + "grad_norm": 0.14140602946281433, + "learning_rate": 2.498560736902706e-05, + "logits/chosen": 3.1919524669647217, + "logits/rejected": 3.0398833751678467, + "logps/chosen": -376.7148132324219, + "logps/rejected": -354.06121826171875, + "loss": 0.348, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.161954879760742, + "rewards/margins": 4.238705158233643, + "rewards/rejected": -7.400660037994385, + "step": 46080 + }, + { + "epoch": 1.502228087754886, + "grad_norm": 2.9070985317230225, + "learning_rate": 2.4974745006028614e-05, + "logits/chosen": 2.7266926765441895, + "logits/rejected": 2.8447651863098145, + "logps/chosen": -319.81011962890625, + "logps/rejected": -333.0292663574219, + "loss": 0.225, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.773444652557373, + "rewards/margins": 5.036558628082275, + "rewards/rejected": -7.81000280380249, + "step": 46100 + }, + { + "epoch": 1.5028798136064063, + "grad_norm": 0.016834119334816933, + "learning_rate": 2.4963882643030165e-05, + "logits/chosen": 3.357433319091797, + "logits/rejected": 3.2452492713928223, + "logps/chosen": -345.060302734375, + "logps/rejected": -359.4610900878906, + "loss": 0.2676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.605056047439575, + "rewards/margins": 4.371623992919922, + "rewards/rejected": -7.976679801940918, + "step": 46120 + }, + { + "epoch": 1.503531539457927, + "grad_norm": 0.6226794123649597, + "learning_rate": 2.495302028003172e-05, + "logits/chosen": 2.630265951156616, + "logits/rejected": 2.8559770584106445, + "logps/chosen": -356.8809814453125, + "logps/rejected": -354.16021728515625, + "loss": 0.1584, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6310012340545654, + "rewards/margins": 4.751465797424316, + "rewards/rejected": -8.382467269897461, + "step": 46140 + }, + { + "epoch": 1.5041832653094476, + "grad_norm": 5.274168014526367, + "learning_rate": 2.4942157917033274e-05, + "logits/chosen": 2.8022103309631348, + "logits/rejected": 3.074486494064331, + "logps/chosen": -362.9224548339844, + "logps/rejected": -399.2565002441406, + "loss": 0.2713, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.6277718544006348, + "rewards/margins": 4.863004207611084, + "rewards/rejected": -8.490776062011719, + "step": 46160 + }, + { + "epoch": 1.5048349911609682, + "grad_norm": 34.390167236328125, + "learning_rate": 2.4931295554034824e-05, + "logits/chosen": 3.097400426864624, + "logits/rejected": 3.202019214630127, + "logps/chosen": -363.35870361328125, + "logps/rejected": -381.6062316894531, + "loss": 0.3089, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7473037242889404, + "rewards/margins": 4.676815986633301, + "rewards/rejected": -8.42411994934082, + "step": 46180 + }, + { + "epoch": 1.5054867170124888, + "grad_norm": 0.6545085906982422, + "learning_rate": 2.492043319103638e-05, + "logits/chosen": 2.708815336227417, + "logits/rejected": 2.8107826709747314, + "logps/chosen": -373.9330749511719, + "logps/rejected": -353.95751953125, + "loss": 0.3073, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.717865467071533, + "rewards/margins": 4.8648681640625, + "rewards/rejected": -7.582733154296875, + "step": 46200 + }, + { + "epoch": 1.5061384428640092, + "grad_norm": 0.07820504903793335, + "learning_rate": 2.4909570828037933e-05, + "logits/chosen": 3.0280213356018066, + "logits/rejected": 3.2127652168273926, + "logps/chosen": -350.71685791015625, + "logps/rejected": -370.86968994140625, + "loss": 0.1999, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1054370403289795, + "rewards/margins": 4.7104315757751465, + "rewards/rejected": -7.815869331359863, + "step": 46220 + }, + { + "epoch": 1.5067901687155298, + "grad_norm": 0.03261277452111244, + "learning_rate": 2.4898708465039487e-05, + "logits/chosen": 2.7130391597747803, + "logits/rejected": 2.795928955078125, + "logps/chosen": -343.8630676269531, + "logps/rejected": -368.5834655761719, + "loss": 0.2297, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.441570281982422, + "rewards/margins": 5.400332450866699, + "rewards/rejected": -8.841901779174805, + "step": 46240 + }, + { + "epoch": 1.5074418945670502, + "grad_norm": 8.277109146118164, + "learning_rate": 2.488784610204104e-05, + "logits/chosen": 3.2344279289245605, + "logits/rejected": 3.190537929534912, + "logps/chosen": -355.02734375, + "logps/rejected": -381.49334716796875, + "loss": 0.2703, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8904547691345215, + "rewards/margins": 4.366458892822266, + "rewards/rejected": -8.256914138793945, + "step": 46260 + }, + { + "epoch": 1.5080936204185709, + "grad_norm": 4.807264804840088, + "learning_rate": 2.4876983739042592e-05, + "logits/chosen": 3.152690887451172, + "logits/rejected": 3.155707359313965, + "logps/chosen": -416.1663513183594, + "logps/rejected": -376.085693359375, + "loss": 0.2886, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.30440616607666, + "rewards/margins": 5.361538887023926, + "rewards/rejected": -9.66594409942627, + "step": 46280 + }, + { + "epoch": 1.5087453462700915, + "grad_norm": 0.04032020643353462, + "learning_rate": 2.4866121376044146e-05, + "logits/chosen": 3.0370678901672363, + "logits/rejected": 3.1808245182037354, + "logps/chosen": -353.1404113769531, + "logps/rejected": -375.5109558105469, + "loss": 0.3061, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.2096452713012695, + "rewards/margins": 4.45550537109375, + "rewards/rejected": -8.665151596069336, + "step": 46300 + }, + { + "epoch": 1.5093970721216121, + "grad_norm": 16.35740089416504, + "learning_rate": 2.4855259013045697e-05, + "logits/chosen": 3.1232833862304688, + "logits/rejected": 3.025204658508301, + "logps/chosen": -333.9910888671875, + "logps/rejected": -367.4676208496094, + "loss": 0.3717, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.434157371520996, + "rewards/margins": 4.352827548980713, + "rewards/rejected": -6.786984920501709, + "step": 46320 + }, + { + "epoch": 1.5100487979731327, + "grad_norm": 12.040300369262695, + "learning_rate": 2.484439665004725e-05, + "logits/chosen": 3.0064585208892822, + "logits/rejected": 3.151362895965576, + "logps/chosen": -347.420654296875, + "logps/rejected": -327.67657470703125, + "loss": 0.2955, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3513050079345703, + "rewards/margins": 4.599715709686279, + "rewards/rejected": -7.95102071762085, + "step": 46340 + }, + { + "epoch": 1.5107005238246531, + "grad_norm": 2.7127883434295654, + "learning_rate": 2.483353428704881e-05, + "logits/chosen": 2.617447853088379, + "logits/rejected": 2.77531361579895, + "logps/chosen": -333.24786376953125, + "logps/rejected": -371.5814514160156, + "loss": 0.1618, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.2096168994903564, + "rewards/margins": 5.343268394470215, + "rewards/rejected": -8.552885055541992, + "step": 46360 + }, + { + "epoch": 1.5113522496761738, + "grad_norm": 4.2413330078125, + "learning_rate": 2.482267192405036e-05, + "logits/chosen": 3.096613645553589, + "logits/rejected": 3.2555370330810547, + "logps/chosen": -340.965576171875, + "logps/rejected": -342.1857604980469, + "loss": 0.316, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.4634718894958496, + "rewards/margins": 4.328984260559082, + "rewards/rejected": -7.79245662689209, + "step": 46380 + }, + { + "epoch": 1.5120039755276942, + "grad_norm": 2.4516992568969727, + "learning_rate": 2.4811809561051913e-05, + "logits/chosen": 2.7637219429016113, + "logits/rejected": 2.899445056915283, + "logps/chosen": -326.5643005371094, + "logps/rejected": -318.9764709472656, + "loss": 0.3337, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.7344608306884766, + "rewards/margins": 4.784697532653809, + "rewards/rejected": -8.519158363342285, + "step": 46400 + }, + { + "epoch": 1.5126557013792148, + "grad_norm": 4.592413902282715, + "learning_rate": 2.4800947198053464e-05, + "logits/chosen": 2.9851953983306885, + "logits/rejected": 3.186896800994873, + "logps/chosen": -364.27215576171875, + "logps/rejected": -346.06170654296875, + "loss": 0.256, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.729339838027954, + "rewards/margins": 4.163564682006836, + "rewards/rejected": -6.892903804779053, + "step": 46420 + }, + { + "epoch": 1.5133074272307354, + "grad_norm": 1.633949875831604, + "learning_rate": 2.479008483505502e-05, + "logits/chosen": 3.2129077911376953, + "logits/rejected": 3.1761910915374756, + "logps/chosen": -336.0630798339844, + "logps/rejected": -324.9976501464844, + "loss": 0.3579, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8776469230651855, + "rewards/margins": 4.298529148101807, + "rewards/rejected": -7.176175594329834, + "step": 46440 + }, + { + "epoch": 1.513959153082256, + "grad_norm": 5.120908737182617, + "learning_rate": 2.4779222472056573e-05, + "logits/chosen": 2.77199649810791, + "logits/rejected": 2.9005229473114014, + "logps/chosen": -379.0164489746094, + "logps/rejected": -359.4737854003906, + "loss": 0.3752, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2880008220672607, + "rewards/margins": 4.765768051147461, + "rewards/rejected": -8.053768157958984, + "step": 46460 + }, + { + "epoch": 1.5146108789337767, + "grad_norm": 3.5515613555908203, + "learning_rate": 2.4768360109058127e-05, + "logits/chosen": 2.8700623512268066, + "logits/rejected": 2.9436774253845215, + "logps/chosen": -322.6849060058594, + "logps/rejected": -310.11993408203125, + "loss": 0.3632, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.629241943359375, + "rewards/margins": 4.5642571449279785, + "rewards/rejected": -8.193498611450195, + "step": 46480 + }, + { + "epoch": 1.515262604785297, + "grad_norm": 2.0946202278137207, + "learning_rate": 2.475749774605968e-05, + "logits/chosen": 2.6302452087402344, + "logits/rejected": 2.998534679412842, + "logps/chosen": -356.3906555175781, + "logps/rejected": -342.69769287109375, + "loss": 0.2159, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.743356704711914, + "rewards/margins": 4.9950361251831055, + "rewards/rejected": -8.73839282989502, + "step": 46500 + }, + { + "epoch": 1.5159143306368175, + "grad_norm": 4.876975059509277, + "learning_rate": 2.474663538306123e-05, + "logits/chosen": 3.04748797416687, + "logits/rejected": 3.213413715362549, + "logps/chosen": -345.3662414550781, + "logps/rejected": -332.19696044921875, + "loss": 0.2284, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.555553913116455, + "rewards/margins": 3.8081791400909424, + "rewards/rejected": -7.363732814788818, + "step": 46520 + }, + { + "epoch": 1.516566056488338, + "grad_norm": 4.402562141418457, + "learning_rate": 2.4735773020062786e-05, + "logits/chosen": 3.0007078647613525, + "logits/rejected": 2.907426357269287, + "logps/chosen": -348.0930480957031, + "logps/rejected": -334.4562072753906, + "loss": 0.244, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8405189514160156, + "rewards/margins": 4.887146949768066, + "rewards/rejected": -7.72766637802124, + "step": 46540 + }, + { + "epoch": 1.5172177823398587, + "grad_norm": 2.5823779106140137, + "learning_rate": 2.472491065706434e-05, + "logits/chosen": 2.7493972778320312, + "logits/rejected": 2.8611721992492676, + "logps/chosen": -333.39630126953125, + "logps/rejected": -355.60394287109375, + "loss": 0.3646, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.19315242767334, + "rewards/margins": 3.8480124473571777, + "rewards/rejected": -7.041165351867676, + "step": 46560 + }, + { + "epoch": 1.5178695081913793, + "grad_norm": 0.23885732889175415, + "learning_rate": 2.471404829406589e-05, + "logits/chosen": 3.0461955070495605, + "logits/rejected": 2.985731601715088, + "logps/chosen": -438.3363342285156, + "logps/rejected": -399.3999328613281, + "loss": 0.3044, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.644953966140747, + "rewards/margins": 4.5418853759765625, + "rewards/rejected": -8.18683910369873, + "step": 46580 + }, + { + "epoch": 1.5185212340429, + "grad_norm": 3.4663825035095215, + "learning_rate": 2.4703185931067445e-05, + "logits/chosen": 2.8100173473358154, + "logits/rejected": 3.077273368835449, + "logps/chosen": -345.6070861816406, + "logps/rejected": -339.8207702636719, + "loss": 0.3452, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.364231586456299, + "rewards/margins": 4.481266975402832, + "rewards/rejected": -7.845497131347656, + "step": 46600 + }, + { + "epoch": 1.5191729598944204, + "grad_norm": 5.455982208251953, + "learning_rate": 2.4692323568069e-05, + "logits/chosen": 2.858384609222412, + "logits/rejected": 3.002187967300415, + "logps/chosen": -378.3850402832031, + "logps/rejected": -394.24676513671875, + "loss": 0.219, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.134710788726807, + "rewards/margins": 4.9485249519348145, + "rewards/rejected": -9.083236694335938, + "step": 46620 + }, + { + "epoch": 1.519824685745941, + "grad_norm": 2.8538801670074463, + "learning_rate": 2.4681461205070553e-05, + "logits/chosen": 2.8021304607391357, + "logits/rejected": 2.8292076587677, + "logps/chosen": -355.39404296875, + "logps/rejected": -322.92095947265625, + "loss": 0.419, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.8687286376953125, + "rewards/margins": 4.068373203277588, + "rewards/rejected": -7.937100887298584, + "step": 46640 + }, + { + "epoch": 1.5204764115974614, + "grad_norm": 8.576823234558105, + "learning_rate": 2.4670598842072107e-05, + "logits/chosen": 2.6404829025268555, + "logits/rejected": 2.6693825721740723, + "logps/chosen": -333.72796630859375, + "logps/rejected": -342.75762939453125, + "loss": 0.4074, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.516552686691284, + "rewards/margins": 3.4834511280059814, + "rewards/rejected": -7.000004768371582, + "step": 46660 + }, + { + "epoch": 1.521128137448982, + "grad_norm": 3.9188406467437744, + "learning_rate": 2.4659736479073658e-05, + "logits/chosen": 2.906914234161377, + "logits/rejected": 2.956655979156494, + "logps/chosen": -388.5183410644531, + "logps/rejected": -345.33221435546875, + "loss": 0.31, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.1358442306518555, + "rewards/margins": 4.040639877319336, + "rewards/rejected": -8.176484107971191, + "step": 46680 + }, + { + "epoch": 1.5217798633005026, + "grad_norm": 1.8290820121765137, + "learning_rate": 2.4648874116075212e-05, + "logits/chosen": 3.159359931945801, + "logits/rejected": 3.351400852203369, + "logps/chosen": -382.9386291503906, + "logps/rejected": -409.5201721191406, + "loss": 0.2792, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.6199660301208496, + "rewards/margins": 4.407707691192627, + "rewards/rejected": -8.027673721313477, + "step": 46700 + }, + { + "epoch": 1.5224315891520233, + "grad_norm": 3.3587863445281982, + "learning_rate": 2.4638011753076763e-05, + "logits/chosen": 2.600834608078003, + "logits/rejected": 2.733126401901245, + "logps/chosen": -349.70086669921875, + "logps/rejected": -375.2479553222656, + "loss": 0.229, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.59138560295105, + "rewards/margins": 4.635002613067627, + "rewards/rejected": -8.226387977600098, + "step": 46720 + }, + { + "epoch": 1.5230833150035439, + "grad_norm": 2.2193844318389893, + "learning_rate": 2.4627149390078317e-05, + "logits/chosen": 2.9274868965148926, + "logits/rejected": 2.9980132579803467, + "logps/chosen": -362.99920654296875, + "logps/rejected": -350.823486328125, + "loss": 0.1327, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.3247947692871094, + "rewards/margins": 4.721134185791016, + "rewards/rejected": -8.045928955078125, + "step": 46740 + }, + { + "epoch": 1.5237350408550643, + "grad_norm": 1.0157263278961182, + "learning_rate": 2.4616287027079875e-05, + "logits/chosen": 3.656463623046875, + "logits/rejected": 3.4475715160369873, + "logps/chosen": -404.5909118652344, + "logps/rejected": -397.86669921875, + "loss": 0.4187, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.516162157058716, + "rewards/margins": 3.430615186691284, + "rewards/rejected": -6.94677734375, + "step": 46760 + }, + { + "epoch": 1.524386766706585, + "grad_norm": 3.9205732345581055, + "learning_rate": 2.4605424664081426e-05, + "logits/chosen": 2.9206900596618652, + "logits/rejected": 2.9462475776672363, + "logps/chosen": -342.3529052734375, + "logps/rejected": -346.117431640625, + "loss": 0.2996, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.991806745529175, + "rewards/margins": 3.9161829948425293, + "rewards/rejected": -7.907989501953125, + "step": 46780 + }, + { + "epoch": 1.5250384925581053, + "grad_norm": 1.7272318601608276, + "learning_rate": 2.459456230108298e-05, + "logits/chosen": 2.865572690963745, + "logits/rejected": 3.011774778366089, + "logps/chosen": -322.0169982910156, + "logps/rejected": -353.48419189453125, + "loss": 0.3691, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.731707811355591, + "rewards/margins": 3.9507758617401123, + "rewards/rejected": -7.682482719421387, + "step": 46800 + }, + { + "epoch": 1.525690218409626, + "grad_norm": 3.872014045715332, + "learning_rate": 2.458369993808453e-05, + "logits/chosen": 3.324885129928589, + "logits/rejected": 3.2184901237487793, + "logps/chosen": -401.4813537597656, + "logps/rejected": -356.65216064453125, + "loss": 0.2588, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.939223527908325, + "rewards/margins": 4.337066650390625, + "rewards/rejected": -7.276289939880371, + "step": 46820 + }, + { + "epoch": 1.5263419442611466, + "grad_norm": 8.120295524597168, + "learning_rate": 2.4572837575086085e-05, + "logits/chosen": 3.0532734394073486, + "logits/rejected": 3.0965025424957275, + "logps/chosen": -349.1806335449219, + "logps/rejected": -343.693359375, + "loss": 0.2484, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7393295764923096, + "rewards/margins": 4.437226295471191, + "rewards/rejected": -8.176556587219238, + "step": 46840 + }, + { + "epoch": 1.5269936701126672, + "grad_norm": 6.687378406524658, + "learning_rate": 2.456197521208764e-05, + "logits/chosen": 3.0848865509033203, + "logits/rejected": 3.092088222503662, + "logps/chosen": -392.2403564453125, + "logps/rejected": -356.14971923828125, + "loss": 0.3155, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.180607557296753, + "rewards/margins": 4.356574058532715, + "rewards/rejected": -7.537181854248047, + "step": 46860 + }, + { + "epoch": 1.5276453959641878, + "grad_norm": 5.095002174377441, + "learning_rate": 2.4551112849089193e-05, + "logits/chosen": 2.8458023071289062, + "logits/rejected": 3.3038814067840576, + "logps/chosen": -319.3997497558594, + "logps/rejected": -347.95880126953125, + "loss": 0.463, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3502678871154785, + "rewards/margins": 3.553309917449951, + "rewards/rejected": -6.9035773277282715, + "step": 46880 + }, + { + "epoch": 1.5282971218157082, + "grad_norm": 0.9744176268577576, + "learning_rate": 2.4540250486090747e-05, + "logits/chosen": 2.8829808235168457, + "logits/rejected": 2.8611273765563965, + "logps/chosen": -404.2281188964844, + "logps/rejected": -345.95477294921875, + "loss": 0.4443, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.6807479858398438, + "rewards/margins": 3.525517702102661, + "rewards/rejected": -7.206265449523926, + "step": 46900 + }, + { + "epoch": 1.5289488476672288, + "grad_norm": 0.9039067625999451, + "learning_rate": 2.4529388123092298e-05, + "logits/chosen": 2.817713499069214, + "logits/rejected": 2.948029041290283, + "logps/chosen": -366.7959289550781, + "logps/rejected": -335.24432373046875, + "loss": 0.2176, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.294297456741333, + "rewards/margins": 4.608363628387451, + "rewards/rejected": -7.9026618003845215, + "step": 46920 + }, + { + "epoch": 1.5296005735187492, + "grad_norm": 5.9715986251831055, + "learning_rate": 2.4518525760093852e-05, + "logits/chosen": 3.0222244262695312, + "logits/rejected": 3.1067306995391846, + "logps/chosen": -307.66314697265625, + "logps/rejected": -318.52044677734375, + "loss": 0.3054, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.6477162837982178, + "rewards/margins": 3.7241318225860596, + "rewards/rejected": -6.371847629547119, + "step": 46940 + }, + { + "epoch": 1.5302522993702699, + "grad_norm": 0.5870254039764404, + "learning_rate": 2.4507663397095403e-05, + "logits/chosen": 2.9607672691345215, + "logits/rejected": 2.9936766624450684, + "logps/chosen": -326.37420654296875, + "logps/rejected": -305.9781188964844, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4133830070495605, + "rewards/margins": 4.192424774169922, + "rewards/rejected": -6.605807304382324, + "step": 46960 + }, + { + "epoch": 1.5309040252217905, + "grad_norm": 5.446567058563232, + "learning_rate": 2.4496801034096957e-05, + "logits/chosen": 2.9188156127929688, + "logits/rejected": 2.9155526161193848, + "logps/chosen": -347.92022705078125, + "logps/rejected": -324.02581787109375, + "loss": 0.5123, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.554739475250244, + "rewards/margins": 3.92872953414917, + "rewards/rejected": -7.483468532562256, + "step": 46980 + }, + { + "epoch": 1.531555751073311, + "grad_norm": 3.1753804683685303, + "learning_rate": 2.448593867109851e-05, + "logits/chosen": 2.9142990112304688, + "logits/rejected": 3.068911075592041, + "logps/chosen": -341.7940979003906, + "logps/rejected": -334.36529541015625, + "loss": 0.2717, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.1603760719299316, + "rewards/margins": 4.343580722808838, + "rewards/rejected": -6.5039567947387695, + "step": 47000 + }, + { + "epoch": 1.5322074769248317, + "grad_norm": 1.0340644121170044, + "learning_rate": 2.4475076308100066e-05, + "logits/chosen": 3.324720859527588, + "logits/rejected": 3.46390962600708, + "logps/chosen": -385.86663818359375, + "logps/rejected": -320.79742431640625, + "loss": 0.2936, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7403855323791504, + "rewards/margins": 4.0507707595825195, + "rewards/rejected": -6.791156768798828, + "step": 47020 + }, + { + "epoch": 1.5328592027763521, + "grad_norm": 2.4583001136779785, + "learning_rate": 2.446421394510162e-05, + "logits/chosen": 2.7963180541992188, + "logits/rejected": 3.0747909545898438, + "logps/chosen": -359.1187438964844, + "logps/rejected": -380.40350341796875, + "loss": 0.0895, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.653916835784912, + "rewards/margins": 5.566712379455566, + "rewards/rejected": -8.22062873840332, + "step": 47040 + }, + { + "epoch": 1.5335109286278725, + "grad_norm": 1.5692507028579712, + "learning_rate": 2.445335158210317e-05, + "logits/chosen": 3.0063107013702393, + "logits/rejected": 3.154320001602173, + "logps/chosen": -382.2151184082031, + "logps/rejected": -363.16033935546875, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4047789573669434, + "rewards/margins": 4.7227067947387695, + "rewards/rejected": -8.127485275268555, + "step": 47060 + }, + { + "epoch": 1.5341626544793932, + "grad_norm": 4.353551387786865, + "learning_rate": 2.4442489219104725e-05, + "logits/chosen": 2.8797290325164795, + "logits/rejected": 2.9482576847076416, + "logps/chosen": -308.92022705078125, + "logps/rejected": -356.44757080078125, + "loss": 0.4106, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6155991554260254, + "rewards/margins": 3.798609972000122, + "rewards/rejected": -7.414208889007568, + "step": 47080 + }, + { + "epoch": 1.5348143803309138, + "grad_norm": 1.6882609128952026, + "learning_rate": 2.443162685610628e-05, + "logits/chosen": 3.2851357460021973, + "logits/rejected": 3.2704861164093018, + "logps/chosen": -396.98687744140625, + "logps/rejected": -391.484619140625, + "loss": 0.3227, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3181865215301514, + "rewards/margins": 4.927928924560547, + "rewards/rejected": -8.246114730834961, + "step": 47100 + }, + { + "epoch": 1.5354661061824344, + "grad_norm": 1.020176649093628, + "learning_rate": 2.442076449310783e-05, + "logits/chosen": 3.1564793586730957, + "logits/rejected": 3.064519166946411, + "logps/chosen": -382.24664306640625, + "logps/rejected": -374.0194091796875, + "loss": 0.2788, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3345322608947754, + "rewards/margins": 4.448270320892334, + "rewards/rejected": -7.782801628112793, + "step": 47120 + }, + { + "epoch": 1.536117832033955, + "grad_norm": 4.677000999450684, + "learning_rate": 2.4409902130109384e-05, + "logits/chosen": 3.0935027599334717, + "logits/rejected": 3.0452773571014404, + "logps/chosen": -392.226318359375, + "logps/rejected": -362.25665283203125, + "loss": 0.1966, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.121293306350708, + "rewards/margins": 4.831225395202637, + "rewards/rejected": -7.952519416809082, + "step": 47140 + }, + { + "epoch": 1.5367695578854754, + "grad_norm": 2.9505529403686523, + "learning_rate": 2.4399039767110938e-05, + "logits/chosen": 2.9934678077697754, + "logits/rejected": 3.1707749366760254, + "logps/chosen": -339.1491394042969, + "logps/rejected": -330.89654541015625, + "loss": 0.3928, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.392564058303833, + "rewards/margins": 4.5631513595581055, + "rewards/rejected": -7.955715179443359, + "step": 47160 + }, + { + "epoch": 1.537421283736996, + "grad_norm": 1.4775818586349487, + "learning_rate": 2.4388177404112492e-05, + "logits/chosen": 3.2138564586639404, + "logits/rejected": 3.2894446849823, + "logps/chosen": -405.77117919921875, + "logps/rejected": -371.27178955078125, + "loss": 0.2075, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.5583724975585938, + "rewards/margins": 4.292193412780762, + "rewards/rejected": -7.850565433502197, + "step": 47180 + }, + { + "epoch": 1.5380730095885164, + "grad_norm": 7.110275745391846, + "learning_rate": 2.4377315041114046e-05, + "logits/chosen": 3.0187811851501465, + "logits/rejected": 3.0815365314483643, + "logps/chosen": -315.50030517578125, + "logps/rejected": -307.5865173339844, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9978623390197754, + "rewards/margins": 3.9648633003234863, + "rewards/rejected": -6.962725639343262, + "step": 47200 + }, + { + "epoch": 1.538724735440037, + "grad_norm": 1.271522045135498, + "learning_rate": 2.4366452678115597e-05, + "logits/chosen": 2.7147862911224365, + "logits/rejected": 3.0317728519439697, + "logps/chosen": -312.5230407714844, + "logps/rejected": -325.0689392089844, + "loss": 0.4078, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.3684909343719482, + "rewards/margins": 4.019382953643799, + "rewards/rejected": -7.387874603271484, + "step": 47220 + }, + { + "epoch": 1.5393764612915577, + "grad_norm": 0.6936112642288208, + "learning_rate": 2.435559031511715e-05, + "logits/chosen": 2.8398594856262207, + "logits/rejected": 2.8659844398498535, + "logps/chosen": -354.71649169921875, + "logps/rejected": -360.6065979003906, + "loss": 0.3525, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.156116485595703, + "rewards/margins": 3.9063491821289062, + "rewards/rejected": -8.06246566772461, + "step": 47240 + }, + { + "epoch": 1.5400281871430783, + "grad_norm": 3.3186748027801514, + "learning_rate": 2.4344727952118702e-05, + "logits/chosen": 2.956646680831909, + "logits/rejected": 2.9104714393615723, + "logps/chosen": -354.4120788574219, + "logps/rejected": -342.81005859375, + "loss": 0.29, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.972501277923584, + "rewards/margins": 4.41515588760376, + "rewards/rejected": -8.387657165527344, + "step": 47260 + }, + { + "epoch": 1.540679912994599, + "grad_norm": 8.482156753540039, + "learning_rate": 2.433386558912026e-05, + "logits/chosen": 2.9762444496154785, + "logits/rejected": 3.0722365379333496, + "logps/chosen": -346.72955322265625, + "logps/rejected": -308.9620056152344, + "loss": 0.3278, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.594264507293701, + "rewards/margins": 4.2341508865356445, + "rewards/rejected": -7.8284149169921875, + "step": 47280 + }, + { + "epoch": 1.5413316388461193, + "grad_norm": 2.1886420249938965, + "learning_rate": 2.4323003226121814e-05, + "logits/chosen": 2.863459348678589, + "logits/rejected": 2.9923388957977295, + "logps/chosen": -403.99847412109375, + "logps/rejected": -391.29541015625, + "loss": 0.2439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.501112461090088, + "rewards/margins": 4.675766944885254, + "rewards/rejected": -8.1768798828125, + "step": 47300 + }, + { + "epoch": 1.54198336469764, + "grad_norm": 6.288395404815674, + "learning_rate": 2.4312140863123365e-05, + "logits/chosen": 3.148878574371338, + "logits/rejected": 3.1026461124420166, + "logps/chosen": -408.3789978027344, + "logps/rejected": -341.90789794921875, + "loss": 0.4967, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.171786308288574, + "rewards/margins": 3.459773540496826, + "rewards/rejected": -7.6315598487854, + "step": 47320 + }, + { + "epoch": 1.5426350905491604, + "grad_norm": 1.483563780784607, + "learning_rate": 2.430127850012492e-05, + "logits/chosen": 3.1734111309051514, + "logits/rejected": 3.3686084747314453, + "logps/chosen": -360.6659240722656, + "logps/rejected": -298.04510498046875, + "loss": 0.1236, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.527611255645752, + "rewards/margins": 4.915004730224609, + "rewards/rejected": -7.442615509033203, + "step": 47340 + }, + { + "epoch": 1.543286816400681, + "grad_norm": 15.34343433380127, + "learning_rate": 2.429041613712647e-05, + "logits/chosen": 2.8900976181030273, + "logits/rejected": 3.1715991497039795, + "logps/chosen": -366.10491943359375, + "logps/rejected": -334.89599609375, + "loss": 0.3047, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.053858995437622, + "rewards/margins": 4.094682693481445, + "rewards/rejected": -7.1485419273376465, + "step": 47360 + }, + { + "epoch": 1.5439385422522016, + "grad_norm": 0.8902096152305603, + "learning_rate": 2.4279553774128024e-05, + "logits/chosen": 3.481733798980713, + "logits/rejected": 3.575305938720703, + "logps/chosen": -362.5108337402344, + "logps/rejected": -403.0234375, + "loss": 0.3398, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1944003105163574, + "rewards/margins": 4.410116672515869, + "rewards/rejected": -7.604516506195068, + "step": 47380 + }, + { + "epoch": 1.5445902681037222, + "grad_norm": 0.5341829061508179, + "learning_rate": 2.4268691411129578e-05, + "logits/chosen": 3.380821704864502, + "logits/rejected": 3.3745293617248535, + "logps/chosen": -392.61871337890625, + "logps/rejected": -361.6218566894531, + "loss": 0.2786, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3836066722869873, + "rewards/margins": 5.017121315002441, + "rewards/rejected": -7.40072774887085, + "step": 47400 + }, + { + "epoch": 1.5452419939552429, + "grad_norm": 4.322538375854492, + "learning_rate": 2.4257829048131132e-05, + "logits/chosen": 3.0291221141815186, + "logits/rejected": 3.1337952613830566, + "logps/chosen": -333.1748046875, + "logps/rejected": -361.633544921875, + "loss": 0.3123, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.251420259475708, + "rewards/margins": 3.893303394317627, + "rewards/rejected": -7.144723415374756, + "step": 47420 + }, + { + "epoch": 1.5458937198067633, + "grad_norm": 0.08369240909814835, + "learning_rate": 2.4246966685132686e-05, + "logits/chosen": 3.3373656272888184, + "logits/rejected": 3.3785576820373535, + "logps/chosen": -346.84814453125, + "logps/rejected": -350.13067626953125, + "loss": 0.4765, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7874112129211426, + "rewards/margins": 3.6770052909851074, + "rewards/rejected": -6.46441650390625, + "step": 47440 + }, + { + "epoch": 1.546545445658284, + "grad_norm": 1.3245307207107544, + "learning_rate": 2.4236104322134237e-05, + "logits/chosen": 3.354340076446533, + "logits/rejected": 3.5859763622283936, + "logps/chosen": -352.4827575683594, + "logps/rejected": -353.1660461425781, + "loss": 0.2717, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9837136268615723, + "rewards/margins": 4.241698265075684, + "rewards/rejected": -7.225411415100098, + "step": 47460 + }, + { + "epoch": 1.5471971715098043, + "grad_norm": 0.474825918674469, + "learning_rate": 2.422524195913579e-05, + "logits/chosen": 2.6819634437561035, + "logits/rejected": 2.784219264984131, + "logps/chosen": -336.89617919921875, + "logps/rejected": -319.8518981933594, + "loss": 0.3128, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2191519737243652, + "rewards/margins": 4.0408172607421875, + "rewards/rejected": -7.259970188140869, + "step": 47480 + }, + { + "epoch": 1.547848897361325, + "grad_norm": 0.08457114547491074, + "learning_rate": 2.4214379596137345e-05, + "logits/chosen": 3.0469970703125, + "logits/rejected": 3.137316942214966, + "logps/chosen": -335.639404296875, + "logps/rejected": -332.9694519042969, + "loss": 0.156, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.595693349838257, + "rewards/margins": 5.090440273284912, + "rewards/rejected": -7.68613338470459, + "step": 47500 + }, + { + "epoch": 1.5485006232128455, + "grad_norm": 7.691556453704834, + "learning_rate": 2.4203517233138896e-05, + "logits/chosen": 2.9363465309143066, + "logits/rejected": 3.137235164642334, + "logps/chosen": -336.63079833984375, + "logps/rejected": -378.7700500488281, + "loss": 0.2053, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.4994354248046875, + "rewards/margins": 4.691874980926514, + "rewards/rejected": -8.19131088256836, + "step": 47520 + }, + { + "epoch": 1.5491523490643662, + "grad_norm": 1.372283935546875, + "learning_rate": 2.419265487014045e-05, + "logits/chosen": 3.0792407989501953, + "logits/rejected": 2.875053882598877, + "logps/chosen": -348.52911376953125, + "logps/rejected": -364.5553283691406, + "loss": 0.3081, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1667563915252686, + "rewards/margins": 4.648416519165039, + "rewards/rejected": -7.815173149108887, + "step": 47540 + }, + { + "epoch": 1.5498040749158868, + "grad_norm": 3.4280247688293457, + "learning_rate": 2.4181792507142005e-05, + "logits/chosen": 3.227029800415039, + "logits/rejected": 3.1682028770446777, + "logps/chosen": -377.2408142089844, + "logps/rejected": -311.8102722167969, + "loss": 0.2219, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.5147509574890137, + "rewards/margins": 4.239048957824707, + "rewards/rejected": -7.753800392150879, + "step": 47560 + }, + { + "epoch": 1.5504558007674072, + "grad_norm": 5.627992153167725, + "learning_rate": 2.417093014414356e-05, + "logits/chosen": 2.7772278785705566, + "logits/rejected": 2.7918548583984375, + "logps/chosen": -324.34478759765625, + "logps/rejected": -339.71136474609375, + "loss": 0.2738, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.903878688812256, + "rewards/margins": 4.081917762756348, + "rewards/rejected": -7.9857964515686035, + "step": 47580 + }, + { + "epoch": 1.5511075266189276, + "grad_norm": 2.778506278991699, + "learning_rate": 2.4160067781145113e-05, + "logits/chosen": 2.8042666912078857, + "logits/rejected": 3.005889415740967, + "logps/chosen": -329.99041748046875, + "logps/rejected": -342.585205078125, + "loss": 0.2207, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.5383574962615967, + "rewards/margins": 4.6266937255859375, + "rewards/rejected": -8.16505241394043, + "step": 47600 + }, + { + "epoch": 1.5517592524704482, + "grad_norm": 4.764005184173584, + "learning_rate": 2.4149205418146664e-05, + "logits/chosen": 3.104970693588257, + "logits/rejected": 3.1594743728637695, + "logps/chosen": -337.1197814941406, + "logps/rejected": -355.1172790527344, + "loss": 0.2812, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5779526233673096, + "rewards/margins": 4.544331073760986, + "rewards/rejected": -7.122282981872559, + "step": 47620 + }, + { + "epoch": 1.5524109783219688, + "grad_norm": 2.9867186546325684, + "learning_rate": 2.4138343055148218e-05, + "logits/chosen": 3.153978109359741, + "logits/rejected": 3.194681167602539, + "logps/chosen": -369.2735595703125, + "logps/rejected": -389.087890625, + "loss": 0.3368, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.064532279968262, + "rewards/margins": 4.282090187072754, + "rewards/rejected": -8.346622467041016, + "step": 47640 + }, + { + "epoch": 1.5530627041734895, + "grad_norm": 9.647098541259766, + "learning_rate": 2.4127480692149772e-05, + "logits/chosen": 2.959134340286255, + "logits/rejected": 2.7926652431488037, + "logps/chosen": -371.5303955078125, + "logps/rejected": -391.8028869628906, + "loss": 0.2828, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.052228927612305, + "rewards/margins": 4.812849521636963, + "rewards/rejected": -8.865079879760742, + "step": 47660 + }, + { + "epoch": 1.55371443002501, + "grad_norm": 0.5772363543510437, + "learning_rate": 2.4116618329151326e-05, + "logits/chosen": 2.7488532066345215, + "logits/rejected": 2.6721198558807373, + "logps/chosen": -312.3885803222656, + "logps/rejected": -359.354736328125, + "loss": 0.2615, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.83984637260437, + "rewards/margins": 4.460952281951904, + "rewards/rejected": -8.300798416137695, + "step": 47680 + }, + { + "epoch": 1.5543661558765305, + "grad_norm": 3.5861973762512207, + "learning_rate": 2.410575596615288e-05, + "logits/chosen": 2.9534993171691895, + "logits/rejected": 3.0230605602264404, + "logps/chosen": -355.0060119628906, + "logps/rejected": -380.2003173828125, + "loss": 0.1579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.603558301925659, + "rewards/margins": 5.5706329345703125, + "rewards/rejected": -9.17419147491455, + "step": 47700 + }, + { + "epoch": 1.555017881728051, + "grad_norm": 1.0080478191375732, + "learning_rate": 2.409489360315443e-05, + "logits/chosen": 3.103787660598755, + "logits/rejected": 2.898833751678467, + "logps/chosen": -362.6045837402344, + "logps/rejected": -330.8790283203125, + "loss": 0.326, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.987011671066284, + "rewards/margins": 4.149323463439941, + "rewards/rejected": -7.1363348960876465, + "step": 47720 + }, + { + "epoch": 1.5556696075795715, + "grad_norm": 5.671264171600342, + "learning_rate": 2.4084031240155985e-05, + "logits/chosen": 2.923931360244751, + "logits/rejected": 2.8645949363708496, + "logps/chosen": -384.6461486816406, + "logps/rejected": -367.0308837890625, + "loss": 0.3398, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.5193989276885986, + "rewards/margins": 5.096757411956787, + "rewards/rejected": -8.616155624389648, + "step": 47740 + }, + { + "epoch": 1.5563213334310921, + "grad_norm": 8.173160552978516, + "learning_rate": 2.4073168877157536e-05, + "logits/chosen": 3.263275146484375, + "logits/rejected": 3.246628522872925, + "logps/chosen": -375.13214111328125, + "logps/rejected": -372.9625549316406, + "loss": 0.2995, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6700305938720703, + "rewards/margins": 4.0575456619262695, + "rewards/rejected": -7.72757625579834, + "step": 47760 + }, + { + "epoch": 1.5569730592826128, + "grad_norm": 6.152856826782227, + "learning_rate": 2.406230651415909e-05, + "logits/chosen": 2.9254133701324463, + "logits/rejected": 3.198756217956543, + "logps/chosen": -360.8356628417969, + "logps/rejected": -361.0765075683594, + "loss": 0.4703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1208958625793457, + "rewards/margins": 3.534550428390503, + "rewards/rejected": -6.6554460525512695, + "step": 47780 + }, + { + "epoch": 1.5576247851341334, + "grad_norm": 3.4504058361053467, + "learning_rate": 2.4051444151160644e-05, + "logits/chosen": 3.114237070083618, + "logits/rejected": 3.1227946281433105, + "logps/chosen": -386.62860107421875, + "logps/rejected": -353.1175842285156, + "loss": 0.1912, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.114821195602417, + "rewards/margins": 5.321528434753418, + "rewards/rejected": -8.436349868774414, + "step": 47800 + }, + { + "epoch": 1.558276510985654, + "grad_norm": 0.2775672376155853, + "learning_rate": 2.40405817881622e-05, + "logits/chosen": 2.82165789604187, + "logits/rejected": 2.9188942909240723, + "logps/chosen": -382.5315246582031, + "logps/rejected": -376.5762634277344, + "loss": 0.2609, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.652517795562744, + "rewards/margins": 4.1635637283325195, + "rewards/rejected": -7.816082954406738, + "step": 47820 + }, + { + "epoch": 1.5589282368371744, + "grad_norm": 2.11773419380188, + "learning_rate": 2.4029719425163753e-05, + "logits/chosen": 2.9100875854492188, + "logits/rejected": 3.0461771488189697, + "logps/chosen": -358.64111328125, + "logps/rejected": -371.9058532714844, + "loss": 0.1881, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3571105003356934, + "rewards/margins": 4.514225959777832, + "rewards/rejected": -7.871336460113525, + "step": 47840 + }, + { + "epoch": 1.559579962688695, + "grad_norm": 29.971458435058594, + "learning_rate": 2.4018857062165304e-05, + "logits/chosen": 3.0935516357421875, + "logits/rejected": 3.1858091354370117, + "logps/chosen": -395.69305419921875, + "logps/rejected": -367.33734130859375, + "loss": 0.2162, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.689068555831909, + "rewards/margins": 5.16897439956665, + "rewards/rejected": -7.858042240142822, + "step": 47860 + }, + { + "epoch": 1.5602316885402154, + "grad_norm": 0.7468813061714172, + "learning_rate": 2.4007994699166858e-05, + "logits/chosen": 2.929831027984619, + "logits/rejected": 3.156452178955078, + "logps/chosen": -349.3305969238281, + "logps/rejected": -358.25177001953125, + "loss": 0.2524, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2301621437072754, + "rewards/margins": 5.368889808654785, + "rewards/rejected": -8.599051475524902, + "step": 47880 + }, + { + "epoch": 1.560883414391736, + "grad_norm": 0.35107696056365967, + "learning_rate": 2.3997132336168412e-05, + "logits/chosen": 3.099654197692871, + "logits/rejected": 3.1298346519470215, + "logps/chosen": -373.2898864746094, + "logps/rejected": -358.3042907714844, + "loss": 0.2372, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.16087007522583, + "rewards/margins": 4.087180137634277, + "rewards/rejected": -7.248050689697266, + "step": 47900 + }, + { + "epoch": 1.5615351402432567, + "grad_norm": 2.7655301094055176, + "learning_rate": 2.3986269973169963e-05, + "logits/chosen": 2.9338796138763428, + "logits/rejected": 3.0752627849578857, + "logps/chosen": -345.437744140625, + "logps/rejected": -373.601318359375, + "loss": 0.4098, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.3010451793670654, + "rewards/margins": 4.193851947784424, + "rewards/rejected": -7.49489688873291, + "step": 47920 + }, + { + "epoch": 1.5621868660947773, + "grad_norm": 8.930872917175293, + "learning_rate": 2.397540761017152e-05, + "logits/chosen": 3.281019687652588, + "logits/rejected": 3.3037655353546143, + "logps/chosen": -387.7132568359375, + "logps/rejected": -346.6271667480469, + "loss": 0.1473, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0005898475646973, + "rewards/margins": 4.808010578155518, + "rewards/rejected": -7.808600425720215, + "step": 47940 + }, + { + "epoch": 1.562838591946298, + "grad_norm": 0.8331851959228516, + "learning_rate": 2.396454524717307e-05, + "logits/chosen": 2.8710708618164062, + "logits/rejected": 2.927414894104004, + "logps/chosen": -330.0704345703125, + "logps/rejected": -329.0881042480469, + "loss": 0.2631, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.91858172416687, + "rewards/margins": 5.505433082580566, + "rewards/rejected": -8.424015045166016, + "step": 47960 + }, + { + "epoch": 1.5634903177978183, + "grad_norm": 4.127053260803223, + "learning_rate": 2.3953682884174625e-05, + "logits/chosen": 3.2216784954071045, + "logits/rejected": 3.3277134895324707, + "logps/chosen": -364.5812683105469, + "logps/rejected": -344.3700866699219, + "loss": 0.2933, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.997852087020874, + "rewards/margins": 4.463468074798584, + "rewards/rejected": -7.461320400238037, + "step": 47980 + }, + { + "epoch": 1.564142043649339, + "grad_norm": 2.2640836238861084, + "learning_rate": 2.394282052117618e-05, + "logits/chosen": 2.818772554397583, + "logits/rejected": 2.761716842651367, + "logps/chosen": -335.38677978515625, + "logps/rejected": -324.8480529785156, + "loss": 0.38, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.7738184928894043, + "rewards/margins": 3.758033037185669, + "rewards/rejected": -6.531851768493652, + "step": 48000 + }, + { + "epoch": 1.5647937695008594, + "grad_norm": 0.5597114562988281, + "learning_rate": 2.393195815817773e-05, + "logits/chosen": 3.1449506282806396, + "logits/rejected": 3.288790464401245, + "logps/chosen": -369.2069091796875, + "logps/rejected": -370.89788818359375, + "loss": 0.3155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.877203941345215, + "rewards/margins": 4.983097076416016, + "rewards/rejected": -7.860300540924072, + "step": 48020 + }, + { + "epoch": 1.56544549535238, + "grad_norm": 0.7033064961433411, + "learning_rate": 2.3921095795179284e-05, + "logits/chosen": 2.9445128440856934, + "logits/rejected": 3.0957274436950684, + "logps/chosen": -347.91961669921875, + "logps/rejected": -360.58795166015625, + "loss": 0.2506, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5398857593536377, + "rewards/margins": 4.784229755401611, + "rewards/rejected": -7.324114799499512, + "step": 48040 + }, + { + "epoch": 1.5660972212039006, + "grad_norm": 6.223433017730713, + "learning_rate": 2.391023343218084e-05, + "logits/chosen": 3.0875024795532227, + "logits/rejected": 3.1703693866729736, + "logps/chosen": -353.3943176269531, + "logps/rejected": -317.14422607421875, + "loss": 0.2075, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.937234401702881, + "rewards/margins": 4.383636951446533, + "rewards/rejected": -7.320871829986572, + "step": 48060 + }, + { + "epoch": 1.5667489470554212, + "grad_norm": 0.04060354456305504, + "learning_rate": 2.3899371069182393e-05, + "logits/chosen": 3.112250804901123, + "logits/rejected": 3.091481924057007, + "logps/chosen": -378.3039245605469, + "logps/rejected": -373.5606689453125, + "loss": 0.1809, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2028586864471436, + "rewards/margins": 5.326205253601074, + "rewards/rejected": -8.52906322479248, + "step": 48080 + }, + { + "epoch": 1.5674006729069418, + "grad_norm": 3.1942615509033203, + "learning_rate": 2.3888508706183947e-05, + "logits/chosen": 3.023144245147705, + "logits/rejected": 3.04844069480896, + "logps/chosen": -348.4328308105469, + "logps/rejected": -367.33721923828125, + "loss": 0.1932, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7623040676116943, + "rewards/margins": 4.500598907470703, + "rewards/rejected": -7.262902736663818, + "step": 48100 + }, + { + "epoch": 1.5680523987584623, + "grad_norm": 1.2870032787322998, + "learning_rate": 2.3877646343185498e-05, + "logits/chosen": 3.1025872230529785, + "logits/rejected": 3.0924735069274902, + "logps/chosen": -333.203125, + "logps/rejected": -321.564697265625, + "loss": 0.2052, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.4346556663513184, + "rewards/margins": 4.0805888175964355, + "rewards/rejected": -6.515244960784912, + "step": 48120 + }, + { + "epoch": 1.5687041246099827, + "grad_norm": 1.2476202249526978, + "learning_rate": 2.3866783980187052e-05, + "logits/chosen": 3.080615520477295, + "logits/rejected": 3.1561241149902344, + "logps/chosen": -397.00885009765625, + "logps/rejected": -352.60760498046875, + "loss": 0.1349, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2369751930236816, + "rewards/margins": 4.639936447143555, + "rewards/rejected": -7.8769121170043945, + "step": 48140 + }, + { + "epoch": 1.5693558504615033, + "grad_norm": 0.10928735136985779, + "learning_rate": 2.3855921617188603e-05, + "logits/chosen": 3.1057097911834717, + "logits/rejected": 3.1057162284851074, + "logps/chosen": -343.595947265625, + "logps/rejected": -380.3309631347656, + "loss": 0.2933, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.2144322395324707, + "rewards/margins": 4.839587211608887, + "rewards/rejected": -7.054019927978516, + "step": 48160 + }, + { + "epoch": 1.570007576313024, + "grad_norm": 3.3151941299438477, + "learning_rate": 2.3845059254190157e-05, + "logits/chosen": 2.552497148513794, + "logits/rejected": 2.6761183738708496, + "logps/chosen": -305.7020568847656, + "logps/rejected": -329.071044921875, + "loss": 0.3145, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.110847234725952, + "rewards/margins": 4.013246536254883, + "rewards/rejected": -7.124094486236572, + "step": 48180 + }, + { + "epoch": 1.5706593021645445, + "grad_norm": 9.626677513122559, + "learning_rate": 2.383419689119171e-05, + "logits/chosen": 2.937991142272949, + "logits/rejected": 2.9603450298309326, + "logps/chosen": -344.5231628417969, + "logps/rejected": -371.2861633300781, + "loss": 0.3245, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.8006346225738525, + "rewards/margins": 4.338347911834717, + "rewards/rejected": -7.13898229598999, + "step": 48200 + }, + { + "epoch": 1.5713110280160651, + "grad_norm": 1.3132420778274536, + "learning_rate": 2.3823334528193265e-05, + "logits/chosen": 3.0950026512145996, + "logits/rejected": 3.2205700874328613, + "logps/chosen": -326.11029052734375, + "logps/rejected": -348.4542541503906, + "loss": 0.2962, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2664647102355957, + "rewards/margins": 4.729035377502441, + "rewards/rejected": -7.995499610900879, + "step": 48220 + }, + { + "epoch": 1.5719627538675855, + "grad_norm": 1.0430549383163452, + "learning_rate": 2.381247216519482e-05, + "logits/chosen": 3.286719560623169, + "logits/rejected": 3.438159942626953, + "logps/chosen": -373.35723876953125, + "logps/rejected": -347.009521484375, + "loss": 0.3057, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.687150239944458, + "rewards/margins": 4.30105447769165, + "rewards/rejected": -6.9882049560546875, + "step": 48240 + }, + { + "epoch": 1.5726144797191062, + "grad_norm": 10.868276596069336, + "learning_rate": 2.380160980219637e-05, + "logits/chosen": 3.3361968994140625, + "logits/rejected": 3.4171764850616455, + "logps/chosen": -363.8382263183594, + "logps/rejected": -336.85980224609375, + "loss": 0.5288, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8708906173706055, + "rewards/margins": 4.040804862976074, + "rewards/rejected": -6.911694526672363, + "step": 48260 + }, + { + "epoch": 1.5732662055706266, + "grad_norm": 11.717275619506836, + "learning_rate": 2.3790747439197924e-05, + "logits/chosen": 3.4255897998809814, + "logits/rejected": 3.478825330734253, + "logps/chosen": -343.09503173828125, + "logps/rejected": -345.3067932128906, + "loss": 0.2819, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5947787761688232, + "rewards/margins": 4.164495944976807, + "rewards/rejected": -6.759274959564209, + "step": 48280 + }, + { + "epoch": 1.5739179314221472, + "grad_norm": 2.7645647525787354, + "learning_rate": 2.3779885076199475e-05, + "logits/chosen": 3.2834839820861816, + "logits/rejected": 3.392256259918213, + "logps/chosen": -339.6672058105469, + "logps/rejected": -364.0317077636719, + "loss": 0.3783, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8232431411743164, + "rewards/margins": 4.572394847869873, + "rewards/rejected": -7.395637512207031, + "step": 48300 + }, + { + "epoch": 1.5745696572736678, + "grad_norm": 1.8417963981628418, + "learning_rate": 2.376902271320103e-05, + "logits/chosen": 2.9075469970703125, + "logits/rejected": 3.1250452995300293, + "logps/chosen": -335.1153259277344, + "logps/rejected": -345.25238037109375, + "loss": 0.4013, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.4306864738464355, + "rewards/margins": 4.970005512237549, + "rewards/rejected": -7.400691032409668, + "step": 48320 + }, + { + "epoch": 1.5752213831251884, + "grad_norm": 0.24366623163223267, + "learning_rate": 2.3758160350202587e-05, + "logits/chosen": 3.4547417163848877, + "logits/rejected": 3.4572975635528564, + "logps/chosen": -396.88348388671875, + "logps/rejected": -316.787841796875, + "loss": 0.4512, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.459794759750366, + "rewards/margins": 3.8991332054138184, + "rewards/rejected": -6.3589277267456055, + "step": 48340 + }, + { + "epoch": 1.575873108976709, + "grad_norm": 4.176121711730957, + "learning_rate": 2.3747297987204138e-05, + "logits/chosen": 3.122847557067871, + "logits/rejected": 3.1078498363494873, + "logps/chosen": -363.79425048828125, + "logps/rejected": -325.7791442871094, + "loss": 0.3336, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.823275089263916, + "rewards/margins": 4.131659984588623, + "rewards/rejected": -6.954935550689697, + "step": 48360 + }, + { + "epoch": 1.5765248348282295, + "grad_norm": 0.3236366808414459, + "learning_rate": 2.3736435624205692e-05, + "logits/chosen": 3.2617008686065674, + "logits/rejected": 3.2736334800720215, + "logps/chosen": -389.054931640625, + "logps/rejected": -361.4235534667969, + "loss": 0.1853, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.914975643157959, + "rewards/margins": 4.694234371185303, + "rewards/rejected": -7.609210968017578, + "step": 48380 + }, + { + "epoch": 1.57717656067975, + "grad_norm": 2.9278564453125, + "learning_rate": 2.3725573261207242e-05, + "logits/chosen": 2.802446126937866, + "logits/rejected": 2.8316850662231445, + "logps/chosen": -380.7583923339844, + "logps/rejected": -405.677978515625, + "loss": 0.2233, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7135627269744873, + "rewards/margins": 5.3725104331970215, + "rewards/rejected": -9.08607292175293, + "step": 48400 + }, + { + "epoch": 1.5778282865312705, + "grad_norm": 0.6837007403373718, + "learning_rate": 2.3714710898208797e-05, + "logits/chosen": 2.9972712993621826, + "logits/rejected": 3.1238853931427, + "logps/chosen": -391.9122314453125, + "logps/rejected": -391.2652282714844, + "loss": 0.2927, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.104825019836426, + "rewards/margins": 5.669535160064697, + "rewards/rejected": -8.774359703063965, + "step": 48420 + }, + { + "epoch": 1.5784800123827911, + "grad_norm": 6.467489242553711, + "learning_rate": 2.370384853521035e-05, + "logits/chosen": 3.0920281410217285, + "logits/rejected": 3.1175434589385986, + "logps/chosen": -384.317626953125, + "logps/rejected": -358.70513916015625, + "loss": 0.2213, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0333704948425293, + "rewards/margins": 4.26186466217041, + "rewards/rejected": -7.295236110687256, + "step": 48440 + }, + { + "epoch": 1.5791317382343117, + "grad_norm": 2.386455774307251, + "learning_rate": 2.3692986172211905e-05, + "logits/chosen": 3.1507110595703125, + "logits/rejected": 3.388460159301758, + "logps/chosen": -383.0445556640625, + "logps/rejected": -383.12689208984375, + "loss": 0.3093, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.789644241333008, + "rewards/margins": 4.759286880493164, + "rewards/rejected": -8.548930168151855, + "step": 48460 + }, + { + "epoch": 1.5797834640858324, + "grad_norm": 0.38501057028770447, + "learning_rate": 2.368212380921346e-05, + "logits/chosen": 2.75091552734375, + "logits/rejected": 2.7402586936950684, + "logps/chosen": -360.98272705078125, + "logps/rejected": -337.39398193359375, + "loss": 0.3397, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.79706072807312, + "rewards/margins": 4.849315166473389, + "rewards/rejected": -8.64637565612793, + "step": 48480 + }, + { + "epoch": 1.580435189937353, + "grad_norm": 0.25314998626708984, + "learning_rate": 2.367126144621501e-05, + "logits/chosen": 2.958691358566284, + "logits/rejected": 2.959115505218506, + "logps/chosen": -403.61456298828125, + "logps/rejected": -391.9292907714844, + "loss": 0.2205, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5786590576171875, + "rewards/margins": 5.471283912658691, + "rewards/rejected": -9.049942016601562, + "step": 48500 + }, + { + "epoch": 1.5810869157888734, + "grad_norm": 1.6475565433502197, + "learning_rate": 2.3660399083216564e-05, + "logits/chosen": 3.0265445709228516, + "logits/rejected": 3.0011942386627197, + "logps/chosen": -382.23626708984375, + "logps/rejected": -360.345703125, + "loss": 0.3562, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.320287704467773, + "rewards/margins": 4.097567558288574, + "rewards/rejected": -8.417856216430664, + "step": 48520 + }, + { + "epoch": 1.581738641640394, + "grad_norm": 2.6508517265319824, + "learning_rate": 2.3649536720218118e-05, + "logits/chosen": 3.1623780727386475, + "logits/rejected": 3.2822937965393066, + "logps/chosen": -382.03009033203125, + "logps/rejected": -381.72137451171875, + "loss": 0.1973, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.9870247840881348, + "rewards/margins": 5.608763694763184, + "rewards/rejected": -9.595788955688477, + "step": 48540 + }, + { + "epoch": 1.5823903674919144, + "grad_norm": 3.088273286819458, + "learning_rate": 2.363867435721967e-05, + "logits/chosen": 2.9181666374206543, + "logits/rejected": 3.047262668609619, + "logps/chosen": -384.85198974609375, + "logps/rejected": -371.6404113769531, + "loss": 0.1236, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.699521064758301, + "rewards/margins": 5.738251686096191, + "rewards/rejected": -10.437771797180176, + "step": 48560 + }, + { + "epoch": 1.583042093343435, + "grad_norm": 4.816008567810059, + "learning_rate": 2.3627811994221223e-05, + "logits/chosen": 2.949202299118042, + "logits/rejected": 3.0750274658203125, + "logps/chosen": -378.3948669433594, + "logps/rejected": -355.10870361328125, + "loss": 0.2667, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.741070508956909, + "rewards/margins": 4.588704586029053, + "rewards/rejected": -8.329774856567383, + "step": 48580 + }, + { + "epoch": 1.5836938191949557, + "grad_norm": 9.537858009338379, + "learning_rate": 2.3616949631222777e-05, + "logits/chosen": 2.95383882522583, + "logits/rejected": 3.050624132156372, + "logps/chosen": -378.2767639160156, + "logps/rejected": -360.9707946777344, + "loss": 0.1882, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.649284839630127, + "rewards/margins": 4.739728927612305, + "rewards/rejected": -8.389013290405273, + "step": 48600 + }, + { + "epoch": 1.5843455450464763, + "grad_norm": 0.5237187743186951, + "learning_rate": 2.360608726822433e-05, + "logits/chosen": 3.0992062091827393, + "logits/rejected": 3.111459255218506, + "logps/chosen": -332.04833984375, + "logps/rejected": -341.10333251953125, + "loss": 0.2144, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.934575319290161, + "rewards/margins": 5.173268795013428, + "rewards/rejected": -9.107844352722168, + "step": 48620 + }, + { + "epoch": 1.584997270897997, + "grad_norm": 1.257238745689392, + "learning_rate": 2.3595224905225886e-05, + "logits/chosen": 3.0153963565826416, + "logits/rejected": 2.986351490020752, + "logps/chosen": -419.8421325683594, + "logps/rejected": -389.493408203125, + "loss": 0.1426, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.4731392860412598, + "rewards/margins": 5.576827526092529, + "rewards/rejected": -9.049966812133789, + "step": 48640 + }, + { + "epoch": 1.5856489967495173, + "grad_norm": 2.8734970092773438, + "learning_rate": 2.3584362542227437e-05, + "logits/chosen": 3.1595067977905273, + "logits/rejected": 3.0793445110321045, + "logps/chosen": -366.67742919921875, + "logps/rejected": -371.4615173339844, + "loss": 0.2303, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.194291591644287, + "rewards/margins": 4.409900188446045, + "rewards/rejected": -8.604192733764648, + "step": 48660 + }, + { + "epoch": 1.5863007226010377, + "grad_norm": 1.1790645122528076, + "learning_rate": 2.357350017922899e-05, + "logits/chosen": 3.1654059886932373, + "logits/rejected": 3.3382105827331543, + "logps/chosen": -362.08746337890625, + "logps/rejected": -384.39093017578125, + "loss": 0.2553, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.9904208183288574, + "rewards/margins": 5.0874342918396, + "rewards/rejected": -9.077855110168457, + "step": 48680 + }, + { + "epoch": 1.5869524484525583, + "grad_norm": 6.785734176635742, + "learning_rate": 2.356263781623054e-05, + "logits/chosen": 3.0290608406066895, + "logits/rejected": 3.125944137573242, + "logps/chosen": -311.5033264160156, + "logps/rejected": -361.16473388671875, + "loss": 0.3049, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.9750938415527344, + "rewards/margins": 5.645039081573486, + "rewards/rejected": -9.620132446289062, + "step": 48700 + }, + { + "epoch": 1.587604174304079, + "grad_norm": 2.012859344482422, + "learning_rate": 2.3551775453232096e-05, + "logits/chosen": 2.9472548961639404, + "logits/rejected": 3.0956790447235107, + "logps/chosen": -361.2550354003906, + "logps/rejected": -374.7557678222656, + "loss": 0.388, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.728907823562622, + "rewards/margins": 4.632444858551025, + "rewards/rejected": -8.361352920532227, + "step": 48720 + }, + { + "epoch": 1.5882559001555996, + "grad_norm": 0.17572306096553802, + "learning_rate": 2.3540913090233653e-05, + "logits/chosen": 2.9591331481933594, + "logits/rejected": 3.0345687866210938, + "logps/chosen": -357.02972412109375, + "logps/rejected": -385.2792053222656, + "loss": 0.2544, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.159234523773193, + "rewards/margins": 4.962649822235107, + "rewards/rejected": -9.1218843460083, + "step": 48740 + }, + { + "epoch": 1.5889076260071202, + "grad_norm": 7.364583492279053, + "learning_rate": 2.3530050727235204e-05, + "logits/chosen": 2.8004589080810547, + "logits/rejected": 2.900588035583496, + "logps/chosen": -366.40460205078125, + "logps/rejected": -346.65838623046875, + "loss": 0.1913, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.882422685623169, + "rewards/margins": 5.236029148101807, + "rewards/rejected": -9.118452072143555, + "step": 48760 + }, + { + "epoch": 1.5895593518586406, + "grad_norm": 9.857831954956055, + "learning_rate": 2.3519188364236758e-05, + "logits/chosen": 3.092656135559082, + "logits/rejected": 3.0360655784606934, + "logps/chosen": -341.4164123535156, + "logps/rejected": -356.02825927734375, + "loss": 0.2223, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.8196074962615967, + "rewards/margins": 5.5613298416137695, + "rewards/rejected": -9.380936622619629, + "step": 48780 + }, + { + "epoch": 1.5902110777101612, + "grad_norm": 4.921411514282227, + "learning_rate": 2.350832600123831e-05, + "logits/chosen": 3.033698320388794, + "logits/rejected": 3.103816509246826, + "logps/chosen": -421.5226135253906, + "logps/rejected": -387.9981384277344, + "loss": 0.251, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8781514167785645, + "rewards/margins": 4.853976726531982, + "rewards/rejected": -8.732128143310547, + "step": 48800 + }, + { + "epoch": 1.5908628035616816, + "grad_norm": 5.504306793212891, + "learning_rate": 2.3497463638239863e-05, + "logits/chosen": 2.743175506591797, + "logits/rejected": 2.6965670585632324, + "logps/chosen": -313.4986267089844, + "logps/rejected": -327.76556396484375, + "loss": 0.2063, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7049560546875, + "rewards/margins": 4.57639741897583, + "rewards/rejected": -8.281352996826172, + "step": 48820 + }, + { + "epoch": 1.5915145294132023, + "grad_norm": 2.405773878097534, + "learning_rate": 2.3486601275241417e-05, + "logits/chosen": 3.1152894496917725, + "logits/rejected": 3.1736900806427, + "logps/chosen": -388.95782470703125, + "logps/rejected": -362.7076110839844, + "loss": 0.3741, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.374922752380371, + "rewards/margins": 4.2184224128723145, + "rewards/rejected": -8.593343734741211, + "step": 48840 + }, + { + "epoch": 1.5921662552647229, + "grad_norm": 3.905205488204956, + "learning_rate": 2.3476282030392892e-05, + "logits/chosen": 2.741490125656128, + "logits/rejected": 2.990711212158203, + "logps/chosen": -372.6162109375, + "logps/rejected": -359.34478759765625, + "loss": 0.2679, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.328974723815918, + "rewards/margins": 4.671539306640625, + "rewards/rejected": -9.000514030456543, + "step": 48860 + }, + { + "epoch": 1.5928179811162435, + "grad_norm": 1.3027139902114868, + "learning_rate": 2.3465419667394446e-05, + "logits/chosen": 2.6581504344940186, + "logits/rejected": 2.972367763519287, + "logps/chosen": -378.82562255859375, + "logps/rejected": -363.37738037109375, + "loss": 0.2351, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5777359008789062, + "rewards/margins": 5.319777488708496, + "rewards/rejected": -8.897513389587402, + "step": 48880 + }, + { + "epoch": 1.5934697069677641, + "grad_norm": 5.109238147735596, + "learning_rate": 2.3454557304396e-05, + "logits/chosen": 2.8143177032470703, + "logits/rejected": 2.865457057952881, + "logps/chosen": -342.70404052734375, + "logps/rejected": -340.98687744140625, + "loss": 0.1984, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.266944169998169, + "rewards/margins": 4.977555274963379, + "rewards/rejected": -8.244500160217285, + "step": 48900 + }, + { + "epoch": 1.5941214328192845, + "grad_norm": 7.104873180389404, + "learning_rate": 2.3443694941397555e-05, + "logits/chosen": 2.7132182121276855, + "logits/rejected": 2.89555025100708, + "logps/chosen": -349.1607971191406, + "logps/rejected": -326.74322509765625, + "loss": 0.2305, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.541715145111084, + "rewards/margins": 4.520394325256348, + "rewards/rejected": -8.062108993530273, + "step": 48920 + }, + { + "epoch": 1.5947731586708052, + "grad_norm": 5.050650119781494, + "learning_rate": 2.3432832578399106e-05, + "logits/chosen": 2.7908737659454346, + "logits/rejected": 2.9677281379699707, + "logps/chosen": -357.4103088378906, + "logps/rejected": -362.1496276855469, + "loss": 0.4115, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.229279041290283, + "rewards/margins": 5.425237655639648, + "rewards/rejected": -9.65451717376709, + "step": 48940 + }, + { + "epoch": 1.5954248845223256, + "grad_norm": 3.061535120010376, + "learning_rate": 2.342197021540066e-05, + "logits/chosen": 2.4583492279052734, + "logits/rejected": 2.6644132137298584, + "logps/chosen": -340.57989501953125, + "logps/rejected": -331.5163879394531, + "loss": 0.1781, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.289316177368164, + "rewards/margins": 4.845767021179199, + "rewards/rejected": -9.135083198547363, + "step": 48960 + }, + { + "epoch": 1.5960766103738462, + "grad_norm": 0.2682401239871979, + "learning_rate": 2.341110785240221e-05, + "logits/chosen": 3.1088061332702637, + "logits/rejected": 3.101003646850586, + "logps/chosen": -340.3014831542969, + "logps/rejected": -325.8031005859375, + "loss": 0.4667, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.234434127807617, + "rewards/margins": 3.8131370544433594, + "rewards/rejected": -8.047571182250977, + "step": 48980 + }, + { + "epoch": 1.5967283362253668, + "grad_norm": 5.3084492683410645, + "learning_rate": 2.3400245489403765e-05, + "logits/chosen": 2.8910036087036133, + "logits/rejected": 2.9799551963806152, + "logps/chosen": -354.9590148925781, + "logps/rejected": -338.8826599121094, + "loss": 0.2505, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.9770705699920654, + "rewards/margins": 4.700737476348877, + "rewards/rejected": -8.677807807922363, + "step": 49000 + }, + { + "epoch": 1.5973800620768874, + "grad_norm": 6.204286575317383, + "learning_rate": 2.338938312640532e-05, + "logits/chosen": 3.1857829093933105, + "logits/rejected": 3.065577745437622, + "logps/chosen": -362.074951171875, + "logps/rejected": -321.3095703125, + "loss": 0.2413, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.4728317260742188, + "rewards/margins": 4.950928688049316, + "rewards/rejected": -8.423760414123535, + "step": 49020 + }, + { + "epoch": 1.598031787928408, + "grad_norm": 3.482322931289673, + "learning_rate": 2.3378520763406873e-05, + "logits/chosen": 2.923844814300537, + "logits/rejected": 2.9921212196350098, + "logps/chosen": -346.74371337890625, + "logps/rejected": -360.1824035644531, + "loss": 0.3374, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3354225158691406, + "rewards/margins": 4.065802574157715, + "rewards/rejected": -7.401223659515381, + "step": 49040 + }, + { + "epoch": 1.5986835137799285, + "grad_norm": 0.6399060487747192, + "learning_rate": 2.3367658400408427e-05, + "logits/chosen": 3.0092661380767822, + "logits/rejected": 3.086725950241089, + "logps/chosen": -382.1448669433594, + "logps/rejected": -395.3185119628906, + "loss": 0.4465, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.416825294494629, + "rewards/margins": 4.4055962562561035, + "rewards/rejected": -8.82242202758789, + "step": 49060 + }, + { + "epoch": 1.599335239631449, + "grad_norm": 1.8335940837860107, + "learning_rate": 2.3356796037409978e-05, + "logits/chosen": 2.9240918159484863, + "logits/rejected": 3.045194149017334, + "logps/chosen": -373.50006103515625, + "logps/rejected": -344.9103698730469, + "loss": 0.3084, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.881133556365967, + "rewards/margins": 4.270883560180664, + "rewards/rejected": -8.152017593383789, + "step": 49080 + }, + { + "epoch": 1.5999869654829695, + "grad_norm": 2.571074962615967, + "learning_rate": 2.3345933674411532e-05, + "logits/chosen": 2.9259581565856934, + "logits/rejected": 2.9268720149993896, + "logps/chosen": -339.6220703125, + "logps/rejected": -395.7144470214844, + "loss": 0.2883, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.712181806564331, + "rewards/margins": 4.2802605628967285, + "rewards/rejected": -7.9924421310424805, + "step": 49100 + }, + { + "epoch": 1.60063869133449, + "grad_norm": 8.045319557189941, + "learning_rate": 2.3335071311413086e-05, + "logits/chosen": 2.6395692825317383, + "logits/rejected": 2.989586591720581, + "logps/chosen": -327.531982421875, + "logps/rejected": -325.30535888671875, + "loss": 0.4634, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7042477130889893, + "rewards/margins": 4.593677520751953, + "rewards/rejected": -8.297924995422363, + "step": 49120 + }, + { + "epoch": 1.6012904171860107, + "grad_norm": 0.004270435776561499, + "learning_rate": 2.3324208948414637e-05, + "logits/chosen": 3.1726956367492676, + "logits/rejected": 3.031501054763794, + "logps/chosen": -372.7959289550781, + "logps/rejected": -394.1033630371094, + "loss": 0.2197, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.466111660003662, + "rewards/margins": 5.620660781860352, + "rewards/rejected": -9.086771965026855, + "step": 49140 + }, + { + "epoch": 1.6019421430375314, + "grad_norm": 0.6108613014221191, + "learning_rate": 2.3313346585416195e-05, + "logits/chosen": 2.924807071685791, + "logits/rejected": 2.8931589126586914, + "logps/chosen": -381.825439453125, + "logps/rejected": -365.8531188964844, + "loss": 0.2262, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5474705696105957, + "rewards/margins": 5.043208122253418, + "rewards/rejected": -8.590678215026855, + "step": 49160 + }, + { + "epoch": 1.602593868889052, + "grad_norm": 0.8895410895347595, + "learning_rate": 2.3302484222417745e-05, + "logits/chosen": 2.754976272583008, + "logits/rejected": 2.6207809448242188, + "logps/chosen": -359.0281677246094, + "logps/rejected": -341.2139587402344, + "loss": 0.2755, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.416965961456299, + "rewards/margins": 4.473359107971191, + "rewards/rejected": -7.890324592590332, + "step": 49180 + }, + { + "epoch": 1.6032455947405724, + "grad_norm": 3.7757718563079834, + "learning_rate": 2.32916218594193e-05, + "logits/chosen": 3.255783796310425, + "logits/rejected": 3.1052935123443604, + "logps/chosen": -365.32464599609375, + "logps/rejected": -352.4889831542969, + "loss": 0.3202, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1561288833618164, + "rewards/margins": 4.529509544372559, + "rewards/rejected": -7.685639381408691, + "step": 49200 + }, + { + "epoch": 1.6038973205920928, + "grad_norm": 0.018995534628629684, + "learning_rate": 2.3280759496420854e-05, + "logits/chosen": 2.8952908515930176, + "logits/rejected": 3.1739988327026367, + "logps/chosen": -317.005126953125, + "logps/rejected": -331.7239074707031, + "loss": 0.2696, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.2049267292022705, + "rewards/margins": 4.320925712585449, + "rewards/rejected": -7.525852203369141, + "step": 49220 + }, + { + "epoch": 1.6045490464436134, + "grad_norm": 0.40394213795661926, + "learning_rate": 2.3269897133422405e-05, + "logits/chosen": 3.140575885772705, + "logits/rejected": 3.2455101013183594, + "logps/chosen": -364.4975891113281, + "logps/rejected": -366.41363525390625, + "loss": 0.2509, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.5486958026885986, + "rewards/margins": 4.332826614379883, + "rewards/rejected": -7.881521701812744, + "step": 49240 + }, + { + "epoch": 1.605200772295134, + "grad_norm": 3.0537590980529785, + "learning_rate": 2.325903477042396e-05, + "logits/chosen": 2.729637861251831, + "logits/rejected": 2.757690668106079, + "logps/chosen": -353.7655334472656, + "logps/rejected": -365.1950378417969, + "loss": 0.38, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3987326622009277, + "rewards/margins": 4.362998008728027, + "rewards/rejected": -6.761731147766113, + "step": 49260 + }, + { + "epoch": 1.6058524981466546, + "grad_norm": 1.74220871925354, + "learning_rate": 2.3248172407425513e-05, + "logits/chosen": 3.028895616531372, + "logits/rejected": 3.158616542816162, + "logps/chosen": -348.73309326171875, + "logps/rejected": -326.3205871582031, + "loss": 0.2326, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.6033682823181152, + "rewards/margins": 4.404727935791016, + "rewards/rejected": -7.008096218109131, + "step": 49280 + }, + { + "epoch": 1.6065042239981753, + "grad_norm": 1.9462016820907593, + "learning_rate": 2.3237310044427067e-05, + "logits/chosen": 3.00945782661438, + "logits/rejected": 3.1559255123138428, + "logps/chosen": -358.18927001953125, + "logps/rejected": -341.6080017089844, + "loss": 0.3032, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.6836743354797363, + "rewards/margins": 5.11653995513916, + "rewards/rejected": -7.8002142906188965, + "step": 49300 + }, + { + "epoch": 1.6071559498496957, + "grad_norm": 4.022315979003906, + "learning_rate": 2.322644768142862e-05, + "logits/chosen": 3.184446334838867, + "logits/rejected": 3.230285167694092, + "logps/chosen": -358.30816650390625, + "logps/rejected": -363.25152587890625, + "loss": 0.332, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.670267343521118, + "rewards/margins": 4.526421546936035, + "rewards/rejected": -7.196688652038574, + "step": 49320 + }, + { + "epoch": 1.6078076757012163, + "grad_norm": 6.492680072784424, + "learning_rate": 2.3215585318430172e-05, + "logits/chosen": 2.79982328414917, + "logits/rejected": 2.743591547012329, + "logps/chosen": -366.0579528808594, + "logps/rejected": -332.95550537109375, + "loss": 0.4585, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.7788681983947754, + "rewards/margins": 3.9990718364715576, + "rewards/rejected": -7.777940273284912, + "step": 49340 + }, + { + "epoch": 1.6084594015527367, + "grad_norm": 2.462933301925659, + "learning_rate": 2.3204722955431726e-05, + "logits/chosen": 2.744985580444336, + "logits/rejected": 2.834531545639038, + "logps/chosen": -304.87200927734375, + "logps/rejected": -347.85791015625, + "loss": 0.106, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.063763380050659, + "rewards/margins": 5.667952537536621, + "rewards/rejected": -8.73171615600586, + "step": 49360 + }, + { + "epoch": 1.6091111274042573, + "grad_norm": 5.304034233093262, + "learning_rate": 2.3193860592433277e-05, + "logits/chosen": 3.0450656414031982, + "logits/rejected": 2.860278367996216, + "logps/chosen": -359.14373779296875, + "logps/rejected": -381.0156555175781, + "loss": 0.2637, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8282246589660645, + "rewards/margins": 4.386308193206787, + "rewards/rejected": -7.214533805847168, + "step": 49380 + }, + { + "epoch": 1.609762853255778, + "grad_norm": 3.462704658508301, + "learning_rate": 2.318299822943483e-05, + "logits/chosen": 3.0398499965667725, + "logits/rejected": 3.0684597492218018, + "logps/chosen": -351.6471252441406, + "logps/rejected": -318.3976135253906, + "loss": 0.2862, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.253180742263794, + "rewards/margins": 4.671764373779297, + "rewards/rejected": -6.924944877624512, + "step": 49400 + }, + { + "epoch": 1.6104145791072986, + "grad_norm": 8.439933776855469, + "learning_rate": 2.3172135866436385e-05, + "logits/chosen": 3.146005868911743, + "logits/rejected": 3.340402126312256, + "logps/chosen": -355.400390625, + "logps/rejected": -321.6925048828125, + "loss": 0.2635, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.511056900024414, + "rewards/margins": 4.268962860107422, + "rewards/rejected": -7.780020236968994, + "step": 49420 + }, + { + "epoch": 1.6110663049588192, + "grad_norm": 0.9874507188796997, + "learning_rate": 2.316127350343794e-05, + "logits/chosen": 3.4130947589874268, + "logits/rejected": 3.302091121673584, + "logps/chosen": -376.1107177734375, + "logps/rejected": -369.98919677734375, + "loss": 0.3226, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.754711627960205, + "rewards/margins": 4.349940299987793, + "rewards/rejected": -7.104652404785156, + "step": 49440 + }, + { + "epoch": 1.6117180308103396, + "grad_norm": 4.123676776885986, + "learning_rate": 2.3150411140439494e-05, + "logits/chosen": 2.8670222759246826, + "logits/rejected": 2.9579787254333496, + "logps/chosen": -357.8203430175781, + "logps/rejected": -373.93292236328125, + "loss": 0.1904, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.9288289546966553, + "rewards/margins": 4.940762996673584, + "rewards/rejected": -7.869592189788818, + "step": 49460 + }, + { + "epoch": 1.6123697566618602, + "grad_norm": 9.160096168518066, + "learning_rate": 2.3139548777441044e-05, + "logits/chosen": 2.901075839996338, + "logits/rejected": 3.092010974884033, + "logps/chosen": -341.58251953125, + "logps/rejected": -345.6556701660156, + "loss": 0.3641, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.6144826412200928, + "rewards/margins": 4.837538242340088, + "rewards/rejected": -7.45202112197876, + "step": 49480 + }, + { + "epoch": 1.6130214825133806, + "grad_norm": 2.7539446353912354, + "learning_rate": 2.31286864144426e-05, + "logits/chosen": 2.9369301795959473, + "logits/rejected": 2.881423234939575, + "logps/chosen": -351.6465759277344, + "logps/rejected": -344.59454345703125, + "loss": 0.4383, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.5083186626434326, + "rewards/margins": 4.107295989990234, + "rewards/rejected": -7.615614891052246, + "step": 49500 + }, + { + "epoch": 1.6136732083649012, + "grad_norm": 2.011568784713745, + "learning_rate": 2.3117824051444153e-05, + "logits/chosen": 2.83314847946167, + "logits/rejected": 2.9985663890838623, + "logps/chosen": -359.1805114746094, + "logps/rejected": -334.45220947265625, + "loss": 0.1958, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.825706958770752, + "rewards/margins": 4.780521869659424, + "rewards/rejected": -7.606229305267334, + "step": 49520 + }, + { + "epoch": 1.6143249342164219, + "grad_norm": 2.2858145236968994, + "learning_rate": 2.3106961688445704e-05, + "logits/chosen": 2.9062180519104004, + "logits/rejected": 2.7415995597839355, + "logps/chosen": -338.6441345214844, + "logps/rejected": -331.5352478027344, + "loss": 0.2438, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2075183391571045, + "rewards/margins": 4.688336372375488, + "rewards/rejected": -7.8958539962768555, + "step": 49540 + }, + { + "epoch": 1.6149766600679425, + "grad_norm": 1.0270835161209106, + "learning_rate": 2.309609932544726e-05, + "logits/chosen": 2.9314842224121094, + "logits/rejected": 3.0579299926757812, + "logps/chosen": -355.739990234375, + "logps/rejected": -341.80487060546875, + "loss": 0.4331, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.890474796295166, + "rewards/margins": 3.7154033184051514, + "rewards/rejected": -7.6058783531188965, + "step": 49560 + }, + { + "epoch": 1.6156283859194631, + "grad_norm": 0.19996559619903564, + "learning_rate": 2.3085236962448812e-05, + "logits/chosen": 3.381385087966919, + "logits/rejected": 3.377838134765625, + "logps/chosen": -400.654052734375, + "logps/rejected": -380.4075927734375, + "loss": 0.4574, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.142698287963867, + "rewards/margins": 4.034360885620117, + "rewards/rejected": -7.177059173583984, + "step": 49580 + }, + { + "epoch": 1.6162801117709835, + "grad_norm": 3.726032257080078, + "learning_rate": 2.3074374599450366e-05, + "logits/chosen": 3.002457857131958, + "logits/rejected": 2.968024730682373, + "logps/chosen": -358.50799560546875, + "logps/rejected": -361.1194763183594, + "loss": 0.2515, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.9650378227233887, + "rewards/margins": 3.8803818225860596, + "rewards/rejected": -6.845419883728027, + "step": 49600 + }, + { + "epoch": 1.6169318376225041, + "grad_norm": 3.4977638721466064, + "learning_rate": 2.306351223645192e-05, + "logits/chosen": 2.744654655456543, + "logits/rejected": 3.117551326751709, + "logps/chosen": -349.4659423828125, + "logps/rejected": -337.9775085449219, + "loss": 0.2215, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.3095576763153076, + "rewards/margins": 4.597569465637207, + "rewards/rejected": -7.907127380371094, + "step": 49620 + }, + { + "epoch": 1.6175835634740245, + "grad_norm": 0.17745837569236755, + "learning_rate": 2.305264987345347e-05, + "logits/chosen": 3.1019396781921387, + "logits/rejected": 3.387418270111084, + "logps/chosen": -408.34344482421875, + "logps/rejected": -361.35784912109375, + "loss": 0.1671, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.2312896251678467, + "rewards/margins": 5.413308143615723, + "rewards/rejected": -8.644598007202148, + "step": 49640 + }, + { + "epoch": 1.6182352893255452, + "grad_norm": 3.508451461791992, + "learning_rate": 2.3041787510455025e-05, + "logits/chosen": 3.0126194953918457, + "logits/rejected": 3.02250337600708, + "logps/chosen": -378.5431823730469, + "logps/rejected": -309.43865966796875, + "loss": 0.1757, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.426309585571289, + "rewards/margins": 4.672289848327637, + "rewards/rejected": -8.098600387573242, + "step": 49660 + }, + { + "epoch": 1.6188870151770658, + "grad_norm": 6.540155410766602, + "learning_rate": 2.303092514745658e-05, + "logits/chosen": 2.7665932178497314, + "logits/rejected": 2.8858025074005127, + "logps/chosen": -334.1335754394531, + "logps/rejected": -316.46783447265625, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6269428730010986, + "rewards/margins": 3.842444658279419, + "rewards/rejected": -7.469387054443359, + "step": 49680 + }, + { + "epoch": 1.6195387410285864, + "grad_norm": 0.267711341381073, + "learning_rate": 2.3020062784458134e-05, + "logits/chosen": 2.8858160972595215, + "logits/rejected": 2.9912829399108887, + "logps/chosen": -312.030029296875, + "logps/rejected": -330.87054443359375, + "loss": 0.2559, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4151275157928467, + "rewards/margins": 4.8486528396606445, + "rewards/rejected": -8.26378059387207, + "step": 49700 + }, + { + "epoch": 1.620190466880107, + "grad_norm": 2.8944265842437744, + "learning_rate": 2.3009200421459688e-05, + "logits/chosen": 3.144416332244873, + "logits/rejected": 3.18589448928833, + "logps/chosen": -364.88555908203125, + "logps/rejected": -356.4075622558594, + "loss": 0.2321, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.3630168437957764, + "rewards/margins": 4.4279046058654785, + "rewards/rejected": -7.790921211242676, + "step": 49720 + }, + { + "epoch": 1.6208421927316274, + "grad_norm": 8.721359252929688, + "learning_rate": 2.299833805846124e-05, + "logits/chosen": 2.7753121852874756, + "logits/rejected": 2.995523452758789, + "logps/chosen": -310.2757263183594, + "logps/rejected": -329.46038818359375, + "loss": 0.3881, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4365315437316895, + "rewards/margins": 4.606518745422363, + "rewards/rejected": -8.043050765991211, + "step": 49740 + }, + { + "epoch": 1.6214939185831478, + "grad_norm": 2.533679962158203, + "learning_rate": 2.2987475695462793e-05, + "logits/chosen": 2.814685344696045, + "logits/rejected": 3.0669682025909424, + "logps/chosen": -363.96942138671875, + "logps/rejected": -397.2508239746094, + "loss": 0.2335, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.6083221435546875, + "rewards/margins": 4.988816261291504, + "rewards/rejected": -8.597139358520508, + "step": 49760 + }, + { + "epoch": 1.6221456444346685, + "grad_norm": 5.447138786315918, + "learning_rate": 2.2976613332464343e-05, + "logits/chosen": 2.8329920768737793, + "logits/rejected": 2.824441432952881, + "logps/chosen": -347.64398193359375, + "logps/rejected": -372.06005859375, + "loss": 0.2865, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8682265281677246, + "rewards/margins": 5.2864508628845215, + "rewards/rejected": -8.15467643737793, + "step": 49780 + }, + { + "epoch": 1.622797370286189, + "grad_norm": 1.4216309785842896, + "learning_rate": 2.2965750969465898e-05, + "logits/chosen": 3.114471912384033, + "logits/rejected": 3.162898302078247, + "logps/chosen": -376.42449951171875, + "logps/rejected": -351.53656005859375, + "loss": 0.2536, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.081361770629883, + "rewards/margins": 4.587998390197754, + "rewards/rejected": -7.669360160827637, + "step": 49800 + }, + { + "epoch": 1.6234490961377097, + "grad_norm": 1.5357059240341187, + "learning_rate": 2.2954888606467452e-05, + "logits/chosen": 2.9047036170959473, + "logits/rejected": 3.055088996887207, + "logps/chosen": -344.1758728027344, + "logps/rejected": -358.4161682128906, + "loss": 0.3558, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.615610122680664, + "rewards/margins": 4.806178092956543, + "rewards/rejected": -7.421788692474365, + "step": 49820 + }, + { + "epoch": 1.6241008219892303, + "grad_norm": 13.475485801696777, + "learning_rate": 2.2944026243469006e-05, + "logits/chosen": 3.1078691482543945, + "logits/rejected": 3.099653959274292, + "logps/chosen": -370.12286376953125, + "logps/rejected": -391.00030517578125, + "loss": 0.4093, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.795397996902466, + "rewards/margins": 4.932214736938477, + "rewards/rejected": -7.7276129722595215, + "step": 49840 + }, + { + "epoch": 1.6247525478407507, + "grad_norm": 0.7974008917808533, + "learning_rate": 2.293316388047056e-05, + "logits/chosen": 3.1121363639831543, + "logits/rejected": 3.0196523666381836, + "logps/chosen": -386.6156005859375, + "logps/rejected": -315.56304931640625, + "loss": 0.3685, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1555874347686768, + "rewards/margins": 3.825077533721924, + "rewards/rejected": -6.980665683746338, + "step": 49860 + }, + { + "epoch": 1.6254042736922714, + "grad_norm": 0.14528614282608032, + "learning_rate": 2.292230151747211e-05, + "logits/chosen": 3.4419307708740234, + "logits/rejected": 3.344963788986206, + "logps/chosen": -382.26446533203125, + "logps/rejected": -361.9426574707031, + "loss": 0.3536, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.7351012229919434, + "rewards/margins": 4.1993513107299805, + "rewards/rejected": -7.934453010559082, + "step": 49880 + }, + { + "epoch": 1.6260559995437918, + "grad_norm": 0.08529692888259888, + "learning_rate": 2.2911439154473665e-05, + "logits/chosen": 2.6810050010681152, + "logits/rejected": 2.996727228164673, + "logps/chosen": -315.4955749511719, + "logps/rejected": -360.18121337890625, + "loss": 0.2474, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.1756186485290527, + "rewards/margins": 5.013758182525635, + "rewards/rejected": -8.189376831054688, + "step": 49900 + }, + { + "epoch": 1.6267077253953124, + "grad_norm": 0.2683076858520508, + "learning_rate": 2.2900576791475216e-05, + "logits/chosen": 2.8792383670806885, + "logits/rejected": 3.0090878009796143, + "logps/chosen": -328.9118347167969, + "logps/rejected": -332.697998046875, + "loss": 0.3046, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.39393949508667, + "rewards/margins": 4.732417106628418, + "rewards/rejected": -8.12635612487793, + "step": 49920 + }, + { + "epoch": 1.627359451246833, + "grad_norm": 0.39883512258529663, + "learning_rate": 2.288971442847677e-05, + "logits/chosen": 3.117436408996582, + "logits/rejected": 3.182399034500122, + "logps/chosen": -394.33477783203125, + "logps/rejected": -401.6648864746094, + "loss": 0.4501, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.7390785217285156, + "rewards/margins": 5.040881633758545, + "rewards/rejected": -8.779960632324219, + "step": 49940 + }, + { + "epoch": 1.6280111770983536, + "grad_norm": 0.0642978772521019, + "learning_rate": 2.2878852065478328e-05, + "logits/chosen": 3.169487714767456, + "logits/rejected": 3.404759645462036, + "logps/chosen": -389.0780334472656, + "logps/rejected": -383.26165771484375, + "loss": 0.2725, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9437379837036133, + "rewards/margins": 4.551203727722168, + "rewards/rejected": -7.494941711425781, + "step": 49960 + }, + { + "epoch": 1.6286629029498743, + "grad_norm": 5.868561267852783, + "learning_rate": 2.286798970247988e-05, + "logits/chosen": 2.910477876663208, + "logits/rejected": 2.94297456741333, + "logps/chosen": -340.83001708984375, + "logps/rejected": -318.2735900878906, + "loss": 0.4395, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6318752765655518, + "rewards/margins": 3.7329087257385254, + "rewards/rejected": -7.36478328704834, + "step": 49980 + }, + { + "epoch": 1.6293146288013947, + "grad_norm": 4.531007766723633, + "learning_rate": 2.2857127339481433e-05, + "logits/chosen": 2.698951244354248, + "logits/rejected": 2.869119882583618, + "logps/chosen": -325.3194274902344, + "logps/rejected": -308.85992431640625, + "loss": 0.3621, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.865496873855591, + "rewards/margins": 4.004071235656738, + "rewards/rejected": -6.869568824768066, + "step": 50000 + }, + { + "epoch": 1.6299663546529153, + "grad_norm": 0.7487460970878601, + "learning_rate": 2.2846264976482983e-05, + "logits/chosen": 3.2578320503234863, + "logits/rejected": 3.2574009895324707, + "logps/chosen": -381.478515625, + "logps/rejected": -354.48382568359375, + "loss": 0.2295, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0159671306610107, + "rewards/margins": 4.516800403594971, + "rewards/rejected": -7.532767295837402, + "step": 50020 + }, + { + "epoch": 1.6306180805044357, + "grad_norm": 9.134336471557617, + "learning_rate": 2.2835402613484538e-05, + "logits/chosen": 2.617047071456909, + "logits/rejected": 2.762911558151245, + "logps/chosen": -354.22259521484375, + "logps/rejected": -330.77313232421875, + "loss": 0.3817, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.331557035446167, + "rewards/margins": 3.9489803314208984, + "rewards/rejected": -7.2805376052856445, + "step": 50040 + }, + { + "epoch": 1.6312698063559563, + "grad_norm": 2.9958949089050293, + "learning_rate": 2.2824540250486092e-05, + "logits/chosen": 3.1414992809295654, + "logits/rejected": 3.150759220123291, + "logps/chosen": -369.4018859863281, + "logps/rejected": -366.7098693847656, + "loss": 0.3534, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.644145965576172, + "rewards/margins": 3.6245155334472656, + "rewards/rejected": -7.268660545349121, + "step": 50060 + }, + { + "epoch": 1.631921532207477, + "grad_norm": 0.691737711429596, + "learning_rate": 2.2813677887487646e-05, + "logits/chosen": 3.046827793121338, + "logits/rejected": 3.12648344039917, + "logps/chosen": -398.2712097167969, + "logps/rejected": -337.8538513183594, + "loss": 0.1527, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.538100004196167, + "rewards/margins": 5.377953052520752, + "rewards/rejected": -7.91605281829834, + "step": 50080 + }, + { + "epoch": 1.6325732580589976, + "grad_norm": 4.291834354400635, + "learning_rate": 2.28028155244892e-05, + "logits/chosen": 3.0375285148620605, + "logits/rejected": 3.254241943359375, + "logps/chosen": -356.260009765625, + "logps/rejected": -350.16802978515625, + "loss": 0.3365, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.8298773765563965, + "rewards/margins": 4.169909477233887, + "rewards/rejected": -6.999786376953125, + "step": 50100 + }, + { + "epoch": 1.6332249839105182, + "grad_norm": 8.473187446594238, + "learning_rate": 2.279195316149075e-05, + "logits/chosen": 3.0880682468414307, + "logits/rejected": 3.1452431678771973, + "logps/chosen": -353.1591796875, + "logps/rejected": -344.76513671875, + "loss": 0.3301, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.418912887573242, + "rewards/margins": 3.9931259155273438, + "rewards/rejected": -6.412038326263428, + "step": 50120 + }, + { + "epoch": 1.6338767097620386, + "grad_norm": 11.320267677307129, + "learning_rate": 2.2781090798492305e-05, + "logits/chosen": 3.1492323875427246, + "logits/rejected": 3.229130268096924, + "logps/chosen": -353.3573913574219, + "logps/rejected": -311.25225830078125, + "loss": 0.2986, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5800514221191406, + "rewards/margins": 4.41864013671875, + "rewards/rejected": -6.998690605163574, + "step": 50140 + }, + { + "epoch": 1.6345284356135592, + "grad_norm": 3.004966974258423, + "learning_rate": 2.277022843549386e-05, + "logits/chosen": 3.183980703353882, + "logits/rejected": 3.570006847381592, + "logps/chosen": -351.0714111328125, + "logps/rejected": -414.11761474609375, + "loss": 0.3092, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.148423433303833, + "rewards/margins": 4.423720359802246, + "rewards/rejected": -7.5721435546875, + "step": 50160 + }, + { + "epoch": 1.6351801614650796, + "grad_norm": 6.311765193939209, + "learning_rate": 2.275936607249541e-05, + "logits/chosen": 2.828303337097168, + "logits/rejected": 3.0722317695617676, + "logps/chosen": -379.61602783203125, + "logps/rejected": -330.4299011230469, + "loss": 0.2678, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6127421855926514, + "rewards/margins": 5.302727222442627, + "rewards/rejected": -6.915468692779541, + "step": 50180 + }, + { + "epoch": 1.6358318873166002, + "grad_norm": 1.394921064376831, + "learning_rate": 2.2748503709496964e-05, + "logits/chosen": 3.3425040245056152, + "logits/rejected": 3.3244102001190186, + "logps/chosen": -401.21337890625, + "logps/rejected": -368.5771484375, + "loss": 0.3045, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.9053277969360352, + "rewards/margins": 4.65162992477417, + "rewards/rejected": -6.556958198547363, + "step": 50200 + }, + { + "epoch": 1.6364836131681209, + "grad_norm": 1.150673508644104, + "learning_rate": 2.273764134649852e-05, + "logits/chosen": 3.271768569946289, + "logits/rejected": 3.2561771869659424, + "logps/chosen": -371.554443359375, + "logps/rejected": -367.335205078125, + "loss": 0.3606, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.084162473678589, + "rewards/margins": 3.8176848888397217, + "rewards/rejected": -6.901847839355469, + "step": 50220 + }, + { + "epoch": 1.6371353390196415, + "grad_norm": 2.7565362453460693, + "learning_rate": 2.2726778983500073e-05, + "logits/chosen": 2.9736459255218506, + "logits/rejected": 3.2444007396698, + "logps/chosen": -338.0980529785156, + "logps/rejected": -342.42901611328125, + "loss": 0.2732, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5455214977264404, + "rewards/margins": 4.078973293304443, + "rewards/rejected": -6.624495029449463, + "step": 50240 + }, + { + "epoch": 1.637787064871162, + "grad_norm": 2.0490758419036865, + "learning_rate": 2.2715916620501627e-05, + "logits/chosen": 3.0408072471618652, + "logits/rejected": 3.138152599334717, + "logps/chosen": -376.8714599609375, + "logps/rejected": -367.7257080078125, + "loss": 0.2367, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.5481455326080322, + "rewards/margins": 5.774170875549316, + "rewards/rejected": -8.32231616973877, + "step": 50260 + }, + { + "epoch": 1.6384387907226825, + "grad_norm": 4.10296630859375, + "learning_rate": 2.2705054257503177e-05, + "logits/chosen": 2.9471840858459473, + "logits/rejected": 2.961124897003174, + "logps/chosen": -306.9251708984375, + "logps/rejected": -292.21173095703125, + "loss": 0.2764, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7918848991394043, + "rewards/margins": 3.8972842693328857, + "rewards/rejected": -6.689169406890869, + "step": 50280 + }, + { + "epoch": 1.639090516574203, + "grad_norm": 1.1711914539337158, + "learning_rate": 2.269419189450473e-05, + "logits/chosen": 2.938847064971924, + "logits/rejected": 3.038872718811035, + "logps/chosen": -357.41168212890625, + "logps/rejected": -360.4729309082031, + "loss": 0.3947, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.862602710723877, + "rewards/margins": 3.915863513946533, + "rewards/rejected": -6.77846622467041, + "step": 50300 + }, + { + "epoch": 1.6397422424257235, + "grad_norm": 3.663778781890869, + "learning_rate": 2.2683329531506282e-05, + "logits/chosen": 3.117276668548584, + "logits/rejected": 3.230437755584717, + "logps/chosen": -376.84197998046875, + "logps/rejected": -345.09765625, + "loss": 0.2413, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6638264656066895, + "rewards/margins": 3.710491895675659, + "rewards/rejected": -6.3743181228637695, + "step": 50320 + }, + { + "epoch": 1.6403939682772442, + "grad_norm": 1.5724260807037354, + "learning_rate": 2.2672467168507837e-05, + "logits/chosen": 2.967984437942505, + "logits/rejected": 3.0139567852020264, + "logps/chosen": -361.04937744140625, + "logps/rejected": -342.9726257324219, + "loss": 0.188, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.500347852706909, + "rewards/margins": 5.181713104248047, + "rewards/rejected": -7.682061195373535, + "step": 50340 + }, + { + "epoch": 1.6410456941287648, + "grad_norm": 4.860113143920898, + "learning_rate": 2.2661604805509394e-05, + "logits/chosen": 2.9634737968444824, + "logits/rejected": 3.06986665725708, + "logps/chosen": -333.4290466308594, + "logps/rejected": -320.69549560546875, + "loss": 0.3365, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.6606152057647705, + "rewards/margins": 3.464963436126709, + "rewards/rejected": -6.125577926635742, + "step": 50360 + }, + { + "epoch": 1.6416974199802854, + "grad_norm": 5.550968647003174, + "learning_rate": 2.2650742442510945e-05, + "logits/chosen": 2.871609687805176, + "logits/rejected": 3.039905548095703, + "logps/chosen": -327.99505615234375, + "logps/rejected": -367.08135986328125, + "loss": 0.2136, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.5017142295837402, + "rewards/margins": 4.7977800369262695, + "rewards/rejected": -7.299493312835693, + "step": 50380 + }, + { + "epoch": 1.6423491458318058, + "grad_norm": 0.5521049499511719, + "learning_rate": 2.26398800795125e-05, + "logits/chosen": 3.5466904640197754, + "logits/rejected": 3.585732936859131, + "logps/chosen": -366.83538818359375, + "logps/rejected": -354.1551208496094, + "loss": 0.2524, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.601304531097412, + "rewards/margins": 4.642442226409912, + "rewards/rejected": -7.243746280670166, + "step": 50400 + }, + { + "epoch": 1.6430008716833264, + "grad_norm": 0.7798062562942505, + "learning_rate": 2.262901771651405e-05, + "logits/chosen": 3.2662570476531982, + "logits/rejected": 3.183997869491577, + "logps/chosen": -331.2005310058594, + "logps/rejected": -317.33770751953125, + "loss": 0.2867, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.862905502319336, + "rewards/margins": 3.6960320472717285, + "rewards/rejected": -6.558938503265381, + "step": 50420 + }, + { + "epoch": 1.6436525975348468, + "grad_norm": 0.9184467792510986, + "learning_rate": 2.2618155353515604e-05, + "logits/chosen": 3.1297144889831543, + "logits/rejected": 3.082796096801758, + "logps/chosen": -379.1151428222656, + "logps/rejected": -343.60626220703125, + "loss": 0.3176, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7693960666656494, + "rewards/margins": 4.1271138191223145, + "rewards/rejected": -6.896509647369385, + "step": 50440 + }, + { + "epoch": 1.6443043233863675, + "grad_norm": 1.557390809059143, + "learning_rate": 2.2607292990517158e-05, + "logits/chosen": 2.8326640129089355, + "logits/rejected": 3.109055995941162, + "logps/chosen": -352.5948181152344, + "logps/rejected": -317.46759033203125, + "loss": 0.1938, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.401707887649536, + "rewards/margins": 4.622281074523926, + "rewards/rejected": -7.023988246917725, + "step": 50460 + }, + { + "epoch": 1.644956049237888, + "grad_norm": 0.09993775188922882, + "learning_rate": 2.2596430627518712e-05, + "logits/chosen": 2.9428024291992188, + "logits/rejected": 3.0928030014038086, + "logps/chosen": -381.2808532714844, + "logps/rejected": -357.1573181152344, + "loss": 0.1866, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.3985683917999268, + "rewards/margins": 4.85231876373291, + "rewards/rejected": -7.250887393951416, + "step": 50480 + }, + { + "epoch": 1.6456077750894087, + "grad_norm": 17.211820602416992, + "learning_rate": 2.2585568264520267e-05, + "logits/chosen": 3.278488874435425, + "logits/rejected": 3.3033652305603027, + "logps/chosen": -382.47418212890625, + "logps/rejected": -350.05902099609375, + "loss": 0.3532, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7488200664520264, + "rewards/margins": 5.363893508911133, + "rewards/rejected": -8.112713813781738, + "step": 50500 + }, + { + "epoch": 1.6462595009409293, + "grad_norm": 2.0111019611358643, + "learning_rate": 2.2574705901521817e-05, + "logits/chosen": 3.1837775707244873, + "logits/rejected": 3.0899147987365723, + "logps/chosen": -355.99639892578125, + "logps/rejected": -344.2687072753906, + "loss": 0.2501, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.02097749710083, + "rewards/margins": 4.4714813232421875, + "rewards/rejected": -7.492459297180176, + "step": 50520 + }, + { + "epoch": 1.6469112267924497, + "grad_norm": 2.2389068603515625, + "learning_rate": 2.256384353852337e-05, + "logits/chosen": 2.6620805263519287, + "logits/rejected": 2.995730400085449, + "logps/chosen": -320.53070068359375, + "logps/rejected": -407.74481201171875, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0063812732696533, + "rewards/margins": 3.827439785003662, + "rewards/rejected": -6.8338212966918945, + "step": 50540 + }, + { + "epoch": 1.6475629526439703, + "grad_norm": 7.210673809051514, + "learning_rate": 2.2552981175524926e-05, + "logits/chosen": 2.6896421909332275, + "logits/rejected": 2.736192464828491, + "logps/chosen": -329.63165283203125, + "logps/rejected": -332.7167053222656, + "loss": 0.2595, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.778548002243042, + "rewards/margins": 4.712696075439453, + "rewards/rejected": -7.491243839263916, + "step": 50560 + }, + { + "epoch": 1.6482146784954907, + "grad_norm": 0.3857002854347229, + "learning_rate": 2.2542118812526476e-05, + "logits/chosen": 3.0658884048461914, + "logits/rejected": 3.123149871826172, + "logps/chosen": -341.10235595703125, + "logps/rejected": -320.2115173339844, + "loss": 0.3017, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.384795665740967, + "rewards/margins": 4.388270854949951, + "rewards/rejected": -7.773068428039551, + "step": 50580 + }, + { + "epoch": 1.6488664043470114, + "grad_norm": 5.348842144012451, + "learning_rate": 2.253125644952803e-05, + "logits/chosen": 2.892733097076416, + "logits/rejected": 3.0353875160217285, + "logps/chosen": -346.0537414550781, + "logps/rejected": -355.5171813964844, + "loss": 0.3468, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.583155393600464, + "rewards/margins": 4.099767208099365, + "rewards/rejected": -7.682923316955566, + "step": 50600 + }, + { + "epoch": 1.649518130198532, + "grad_norm": 6.268542289733887, + "learning_rate": 2.2520394086529585e-05, + "logits/chosen": 2.885291814804077, + "logits/rejected": 2.8446412086486816, + "logps/chosen": -323.68914794921875, + "logps/rejected": -335.0577697753906, + "loss": 0.2599, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0085299015045166, + "rewards/margins": 4.41144323348999, + "rewards/rejected": -7.419972896575928, + "step": 50620 + }, + { + "epoch": 1.6501698560500526, + "grad_norm": 0.47773614525794983, + "learning_rate": 2.250953172353114e-05, + "logits/chosen": 3.0418009757995605, + "logits/rejected": 3.1363158226013184, + "logps/chosen": -406.08660888671875, + "logps/rejected": -339.6676330566406, + "loss": 0.427, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3545145988464355, + "rewards/margins": 3.540489673614502, + "rewards/rejected": -6.8950042724609375, + "step": 50640 + }, + { + "epoch": 1.6508215819015732, + "grad_norm": 0.7286627292633057, + "learning_rate": 2.2498669360532693e-05, + "logits/chosen": 3.016944408416748, + "logits/rejected": 2.9209630489349365, + "logps/chosen": -393.2317810058594, + "logps/rejected": -341.83709716796875, + "loss": 0.3125, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2561373710632324, + "rewards/margins": 4.137354373931885, + "rewards/rejected": -7.393492221832275, + "step": 50660 + }, + { + "epoch": 1.6514733077530936, + "grad_norm": 1.4250301122665405, + "learning_rate": 2.2487806997534244e-05, + "logits/chosen": 2.994227647781372, + "logits/rejected": 2.88197660446167, + "logps/chosen": -369.3062438964844, + "logps/rejected": -357.15399169921875, + "loss": 0.2501, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.2897565364837646, + "rewards/margins": 4.925500392913818, + "rewards/rejected": -7.215257167816162, + "step": 50680 + }, + { + "epoch": 1.6521250336046143, + "grad_norm": 3.7492663860321045, + "learning_rate": 2.2476944634535798e-05, + "logits/chosen": 3.007387638092041, + "logits/rejected": 3.0991294384002686, + "logps/chosen": -343.3382263183594, + "logps/rejected": -335.2872009277344, + "loss": 0.4404, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3717072010040283, + "rewards/margins": 3.509850263595581, + "rewards/rejected": -6.881556510925293, + "step": 50700 + }, + { + "epoch": 1.6527767594561347, + "grad_norm": 11.786653518676758, + "learning_rate": 2.246608227153735e-05, + "logits/chosen": 3.0282952785491943, + "logits/rejected": 3.001399517059326, + "logps/chosen": -373.09710693359375, + "logps/rejected": -371.8935852050781, + "loss": 0.3014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.036881446838379, + "rewards/margins": 5.03115177154541, + "rewards/rejected": -8.068034172058105, + "step": 50720 + }, + { + "epoch": 1.6534284853076553, + "grad_norm": 3.622427463531494, + "learning_rate": 2.2455219908538906e-05, + "logits/chosen": 3.007999897003174, + "logits/rejected": 3.312774181365967, + "logps/chosen": -386.0018615722656, + "logps/rejected": -379.1722412109375, + "loss": 0.2036, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0268006324768066, + "rewards/margins": 4.629674911499023, + "rewards/rejected": -7.656475067138672, + "step": 50740 + }, + { + "epoch": 1.654080211159176, + "grad_norm": 1.7745261192321777, + "learning_rate": 2.244435754554046e-05, + "logits/chosen": 3.355170488357544, + "logits/rejected": 3.2959485054016113, + "logps/chosen": -413.29205322265625, + "logps/rejected": -352.9306945800781, + "loss": 0.3811, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.3689894676208496, + "rewards/margins": 4.400759696960449, + "rewards/rejected": -7.769748687744141, + "step": 50760 + }, + { + "epoch": 1.6547319370106965, + "grad_norm": 2.662479877471924, + "learning_rate": 2.243349518254201e-05, + "logits/chosen": 2.758206605911255, + "logits/rejected": 2.8179073333740234, + "logps/chosen": -346.75946044921875, + "logps/rejected": -370.70623779296875, + "loss": 0.2543, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7692618370056152, + "rewards/margins": 4.924914360046387, + "rewards/rejected": -7.694177150726318, + "step": 50780 + }, + { + "epoch": 1.6553836628622172, + "grad_norm": 1.3780975341796875, + "learning_rate": 2.2422632819543566e-05, + "logits/chosen": 2.85029935836792, + "logits/rejected": 2.829285144805908, + "logps/chosen": -330.9534606933594, + "logps/rejected": -335.78924560546875, + "loss": 0.2053, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9841885566711426, + "rewards/margins": 5.246164321899414, + "rewards/rejected": -8.230353355407715, + "step": 50800 + }, + { + "epoch": 1.6560353887137376, + "grad_norm": 13.61634635925293, + "learning_rate": 2.2411770456545116e-05, + "logits/chosen": 3.2098846435546875, + "logits/rejected": 3.0900702476501465, + "logps/chosen": -387.8196105957031, + "logps/rejected": -378.0350036621094, + "loss": 0.2217, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3044018745422363, + "rewards/margins": 5.169687747955322, + "rewards/rejected": -8.474088668823242, + "step": 50820 + }, + { + "epoch": 1.656687114565258, + "grad_norm": 3.6620776653289795, + "learning_rate": 2.240090809354667e-05, + "logits/chosen": 2.6253552436828613, + "logits/rejected": 2.7112040519714355, + "logps/chosen": -348.4719543457031, + "logps/rejected": -358.12005615234375, + "loss": 0.2991, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.507965564727783, + "rewards/margins": 4.043755531311035, + "rewards/rejected": -7.55172061920166, + "step": 50840 + }, + { + "epoch": 1.6573388404167786, + "grad_norm": 1.1088931560516357, + "learning_rate": 2.2390045730548225e-05, + "logits/chosen": 2.9445207118988037, + "logits/rejected": 2.6663942337036133, + "logps/chosen": -364.1517333984375, + "logps/rejected": -358.72357177734375, + "loss": 0.2878, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.6612136363983154, + "rewards/margins": 4.471937656402588, + "rewards/rejected": -8.133151054382324, + "step": 50860 + }, + { + "epoch": 1.6579905662682992, + "grad_norm": 0.41578730940818787, + "learning_rate": 2.237918336754978e-05, + "logits/chosen": 2.6789374351501465, + "logits/rejected": 2.914163589477539, + "logps/chosen": -338.52728271484375, + "logps/rejected": -359.568115234375, + "loss": 0.2248, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.082801103591919, + "rewards/margins": 4.574549674987793, + "rewards/rejected": -7.657351016998291, + "step": 50880 + }, + { + "epoch": 1.6586422921198198, + "grad_norm": 1.1147947311401367, + "learning_rate": 2.2368321004551333e-05, + "logits/chosen": 2.7669851779937744, + "logits/rejected": 2.7846016883850098, + "logps/chosen": -312.50665283203125, + "logps/rejected": -372.4079284667969, + "loss": 0.2052, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.994225025177002, + "rewards/margins": 4.186005592346191, + "rewards/rejected": -7.180230140686035, + "step": 50900 + }, + { + "epoch": 1.6592940179713405, + "grad_norm": 4.48382568359375, + "learning_rate": 2.2357458641552884e-05, + "logits/chosen": 2.8558566570281982, + "logits/rejected": 3.0398926734924316, + "logps/chosen": -382.40338134765625, + "logps/rejected": -369.140869140625, + "loss": 0.3706, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.216352462768555, + "rewards/margins": 4.120358467102051, + "rewards/rejected": -8.336710929870605, + "step": 50920 + }, + { + "epoch": 1.6599457438228609, + "grad_norm": 1.3999524116516113, + "learning_rate": 2.2346596278554438e-05, + "logits/chosen": 2.7739310264587402, + "logits/rejected": 2.970701217651367, + "logps/chosen": -371.59100341796875, + "logps/rejected": -368.75750732421875, + "loss": 0.3226, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.4995715618133545, + "rewards/margins": 4.337897300720215, + "rewards/rejected": -7.837468147277832, + "step": 50940 + }, + { + "epoch": 1.6605974696743815, + "grad_norm": 1.5012540817260742, + "learning_rate": 2.2335733915555992e-05, + "logits/chosen": 2.6040492057800293, + "logits/rejected": 2.6545841693878174, + "logps/chosen": -393.71319580078125, + "logps/rejected": -362.28802490234375, + "loss": 0.2152, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.535837173461914, + "rewards/margins": 4.388018608093262, + "rewards/rejected": -7.923854827880859, + "step": 50960 + }, + { + "epoch": 1.661249195525902, + "grad_norm": 0.3805747628211975, + "learning_rate": 2.2324871552557543e-05, + "logits/chosen": 2.941274881362915, + "logits/rejected": 3.009904146194458, + "logps/chosen": -327.53546142578125, + "logps/rejected": -329.68511962890625, + "loss": 0.2532, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7520601749420166, + "rewards/margins": 4.810019016265869, + "rewards/rejected": -7.562078952789307, + "step": 50980 + }, + { + "epoch": 1.6619009213774225, + "grad_norm": 9.6272611618042, + "learning_rate": 2.2314009189559097e-05, + "logits/chosen": 3.1633663177490234, + "logits/rejected": 3.190056800842285, + "logps/chosen": -361.93865966796875, + "logps/rejected": -366.01416015625, + "loss": 0.4342, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.4184024333953857, + "rewards/margins": 4.4098992347717285, + "rewards/rejected": -7.828300476074219, + "step": 51000 + }, + { + "epoch": 1.6625526472289431, + "grad_norm": 10.15963077545166, + "learning_rate": 2.230314682656065e-05, + "logits/chosen": 2.8187832832336426, + "logits/rejected": 2.7437801361083984, + "logps/chosen": -357.35491943359375, + "logps/rejected": -347.6815185546875, + "loss": 0.1901, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.416738986968994, + "rewards/margins": 5.052667617797852, + "rewards/rejected": -8.469406127929688, + "step": 51020 + }, + { + "epoch": 1.6632043730804638, + "grad_norm": 1.1386018991470337, + "learning_rate": 2.2292284463562205e-05, + "logits/chosen": 2.9491138458251953, + "logits/rejected": 2.8102173805236816, + "logps/chosen": -371.548828125, + "logps/rejected": -381.2657470703125, + "loss": 0.1868, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.97322416305542, + "rewards/margins": 5.301505088806152, + "rewards/rejected": -8.274728775024414, + "step": 51040 + }, + { + "epoch": 1.6638560989319844, + "grad_norm": 1.2558673620224, + "learning_rate": 2.2281422100563756e-05, + "logits/chosen": 2.7532715797424316, + "logits/rejected": 2.973968982696533, + "logps/chosen": -398.2465515136719, + "logps/rejected": -361.3500061035156, + "loss": 0.3925, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.299797534942627, + "rewards/margins": 4.305182933807373, + "rewards/rejected": -7.60498046875, + "step": 51060 + }, + { + "epoch": 1.6645078247835048, + "grad_norm": 0.8003730177879333, + "learning_rate": 2.227055973756531e-05, + "logits/chosen": 2.7482101917266846, + "logits/rejected": 2.9247517585754395, + "logps/chosen": -343.1300964355469, + "logps/rejected": -367.4540710449219, + "loss": 0.2169, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.278097629547119, + "rewards/margins": 4.685575008392334, + "rewards/rejected": -7.963672637939453, + "step": 51080 + }, + { + "epoch": 1.6651595506350254, + "grad_norm": 0.10886581242084503, + "learning_rate": 2.2259697374566865e-05, + "logits/chosen": 2.8318703174591064, + "logits/rejected": 2.9048409461975098, + "logps/chosen": -325.2841796875, + "logps/rejected": -315.24188232421875, + "loss": 0.1741, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.746340036392212, + "rewards/margins": 4.307202339172363, + "rewards/rejected": -7.053542137145996, + "step": 51100 + }, + { + "epoch": 1.6658112764865458, + "grad_norm": 3.564838171005249, + "learning_rate": 2.2248835011568415e-05, + "logits/chosen": 3.254387617111206, + "logits/rejected": 3.282022476196289, + "logps/chosen": -445.835205078125, + "logps/rejected": -416.4703674316406, + "loss": 0.262, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.460817813873291, + "rewards/margins": 5.566853046417236, + "rewards/rejected": -9.027669906616211, + "step": 51120 + }, + { + "epoch": 1.6664630023380664, + "grad_norm": 0.1243802011013031, + "learning_rate": 2.2237972648569973e-05, + "logits/chosen": 2.7784268856048584, + "logits/rejected": 2.9815163612365723, + "logps/chosen": -336.08209228515625, + "logps/rejected": -360.28521728515625, + "loss": 0.6255, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8333771228790283, + "rewards/margins": 4.024033546447754, + "rewards/rejected": -7.8574113845825195, + "step": 51140 + }, + { + "epoch": 1.667114728189587, + "grad_norm": 1.1774340867996216, + "learning_rate": 2.2227110285571524e-05, + "logits/chosen": 2.9362306594848633, + "logits/rejected": 2.602884531021118, + "logps/chosen": -379.10198974609375, + "logps/rejected": -350.75897216796875, + "loss": 0.3085, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2096340656280518, + "rewards/margins": 4.309336185455322, + "rewards/rejected": -7.518969535827637, + "step": 51160 + }, + { + "epoch": 1.6677664540411077, + "grad_norm": 6.712400913238525, + "learning_rate": 2.2216247922573078e-05, + "logits/chosen": 2.8409698009490967, + "logits/rejected": 2.8753349781036377, + "logps/chosen": -389.9633483886719, + "logps/rejected": -376.9849548339844, + "loss": 0.3201, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.6432852745056152, + "rewards/margins": 4.219769477844238, + "rewards/rejected": -7.8630547523498535, + "step": 51180 + }, + { + "epoch": 1.6684181798926283, + "grad_norm": 5.596046447753906, + "learning_rate": 2.2205385559574632e-05, + "logits/chosen": 3.2310104370117188, + "logits/rejected": 3.107905149459839, + "logps/chosen": -362.0110778808594, + "logps/rejected": -355.29486083984375, + "loss": 0.2394, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9753775596618652, + "rewards/margins": 4.549224376678467, + "rewards/rejected": -7.52460241317749, + "step": 51200 + }, + { + "epoch": 1.6690699057441487, + "grad_norm": 4.499862194061279, + "learning_rate": 2.2194523196576183e-05, + "logits/chosen": 2.911956310272217, + "logits/rejected": 2.877070188522339, + "logps/chosen": -355.0531311035156, + "logps/rejected": -299.5860900878906, + "loss": 0.4477, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3905155658721924, + "rewards/margins": 3.70617413520813, + "rewards/rejected": -7.096688747406006, + "step": 51220 + }, + { + "epoch": 1.6697216315956693, + "grad_norm": 2.233142852783203, + "learning_rate": 2.2183660833577737e-05, + "logits/chosen": 2.8685214519500732, + "logits/rejected": 2.8008718490600586, + "logps/chosen": -330.6158142089844, + "logps/rejected": -357.0657958984375, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.337327480316162, + "rewards/margins": 3.567600965499878, + "rewards/rejected": -6.904927730560303, + "step": 51240 + }, + { + "epoch": 1.6703733574471897, + "grad_norm": 1.340347170829773, + "learning_rate": 2.217279847057929e-05, + "logits/chosen": 3.1200368404388428, + "logits/rejected": 2.9836173057556152, + "logps/chosen": -400.62115478515625, + "logps/rejected": -366.7025146484375, + "loss": 0.3113, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.7199807167053223, + "rewards/margins": 4.024300575256348, + "rewards/rejected": -7.744281768798828, + "step": 51260 + }, + { + "epoch": 1.6710250832987104, + "grad_norm": 3.6039133071899414, + "learning_rate": 2.2161936107580845e-05, + "logits/chosen": 3.044058322906494, + "logits/rejected": 2.950819492340088, + "logps/chosen": -345.40289306640625, + "logps/rejected": -354.2045593261719, + "loss": 0.3143, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.049715280532837, + "rewards/margins": 4.069616317749023, + "rewards/rejected": -7.119331359863281, + "step": 51280 + }, + { + "epoch": 1.671676809150231, + "grad_norm": 13.56783390045166, + "learning_rate": 2.21510737445824e-05, + "logits/chosen": 2.7752952575683594, + "logits/rejected": 2.7314541339874268, + "logps/chosen": -335.8523864746094, + "logps/rejected": -333.3578796386719, + "loss": 0.4051, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.3838553428649902, + "rewards/margins": 4.556440830230713, + "rewards/rejected": -7.940296173095703, + "step": 51300 + }, + { + "epoch": 1.6723285350017516, + "grad_norm": 1.1309651136398315, + "learning_rate": 2.214021138158395e-05, + "logits/chosen": 2.8409836292266846, + "logits/rejected": 2.959538698196411, + "logps/chosen": -347.68023681640625, + "logps/rejected": -343.3911437988281, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.769761562347412, + "rewards/margins": 4.269696235656738, + "rewards/rejected": -7.03945779800415, + "step": 51320 + }, + { + "epoch": 1.6729802608532722, + "grad_norm": 5.5301995277404785, + "learning_rate": 2.2129349018585505e-05, + "logits/chosen": 3.152130126953125, + "logits/rejected": 3.1869494915008545, + "logps/chosen": -405.1345520019531, + "logps/rejected": -343.6365661621094, + "loss": 0.2949, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0091168880462646, + "rewards/margins": 4.1188063621521, + "rewards/rejected": -7.127923488616943, + "step": 51340 + }, + { + "epoch": 1.6736319867047926, + "grad_norm": 0.3217616677284241, + "learning_rate": 2.2118486655587055e-05, + "logits/chosen": 3.0371289253234863, + "logits/rejected": 3.0415966510772705, + "logps/chosen": -372.270751953125, + "logps/rejected": -331.6794128417969, + "loss": 0.2951, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0152831077575684, + "rewards/margins": 4.846810340881348, + "rewards/rejected": -7.862093448638916, + "step": 51360 + }, + { + "epoch": 1.674283712556313, + "grad_norm": 11.688060760498047, + "learning_rate": 2.210762429258861e-05, + "logits/chosen": 2.828225612640381, + "logits/rejected": 2.9283642768859863, + "logps/chosen": -374.24365234375, + "logps/rejected": -365.212890625, + "loss": 0.2436, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0160088539123535, + "rewards/margins": 4.753903865814209, + "rewards/rejected": -7.769913673400879, + "step": 51380 + }, + { + "epoch": 1.6749354384078337, + "grad_norm": 1.3684093952178955, + "learning_rate": 2.2096761929590164e-05, + "logits/chosen": 3.2697913646698, + "logits/rejected": 3.3186964988708496, + "logps/chosen": -375.1898193359375, + "logps/rejected": -361.2965393066406, + "loss": 0.2438, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.889028549194336, + "rewards/margins": 3.8629233837127686, + "rewards/rejected": -6.751951694488525, + "step": 51400 + }, + { + "epoch": 1.6755871642593543, + "grad_norm": 2.7517035007476807, + "learning_rate": 2.2085899566591718e-05, + "logits/chosen": 2.6862359046936035, + "logits/rejected": 2.775503158569336, + "logps/chosen": -346.65960693359375, + "logps/rejected": -345.47564697265625, + "loss": 0.2187, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7353179454803467, + "rewards/margins": 4.56951379776001, + "rewards/rejected": -7.304831027984619, + "step": 51420 + }, + { + "epoch": 1.676238890110875, + "grad_norm": 6.54379940032959, + "learning_rate": 2.2075037203593272e-05, + "logits/chosen": 3.056964159011841, + "logits/rejected": 3.078439235687256, + "logps/chosen": -386.6111755371094, + "logps/rejected": -347.18121337890625, + "loss": 0.1675, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5764541625976562, + "rewards/margins": 4.691084384918213, + "rewards/rejected": -8.267538070678711, + "step": 51440 + }, + { + "epoch": 1.6768906159623955, + "grad_norm": 9.539932250976562, + "learning_rate": 2.2064174840594823e-05, + "logits/chosen": 2.9917120933532715, + "logits/rejected": 2.895944118499756, + "logps/chosen": -395.76617431640625, + "logps/rejected": -374.2176513671875, + "loss": 0.1652, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.068007230758667, + "rewards/margins": 4.85322904586792, + "rewards/rejected": -7.921236515045166, + "step": 51460 + }, + { + "epoch": 1.677542341813916, + "grad_norm": 5.79727840423584, + "learning_rate": 2.2053312477596377e-05, + "logits/chosen": 2.944286346435547, + "logits/rejected": 2.9890859127044678, + "logps/chosen": -351.6230773925781, + "logps/rejected": -341.8663635253906, + "loss": 0.2777, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.034109592437744, + "rewards/margins": 5.069498538970947, + "rewards/rejected": -8.103609085083008, + "step": 51480 + }, + { + "epoch": 1.6781940676654366, + "grad_norm": 0.3724825978279114, + "learning_rate": 2.204245011459793e-05, + "logits/chosen": 3.309253692626953, + "logits/rejected": 3.0762956142425537, + "logps/chosen": -364.2828674316406, + "logps/rejected": -330.13665771484375, + "loss": 0.2364, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.830190896987915, + "rewards/margins": 4.770234107971191, + "rewards/rejected": -7.600424289703369, + "step": 51500 + }, + { + "epoch": 1.678845793516957, + "grad_norm": 3.223487615585327, + "learning_rate": 2.2031587751599482e-05, + "logits/chosen": 3.3869967460632324, + "logits/rejected": 3.5444939136505127, + "logps/chosen": -390.9617004394531, + "logps/rejected": -392.16864013671875, + "loss": 0.4857, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.417506456375122, + "rewards/margins": 4.605403423309326, + "rewards/rejected": -8.022909164428711, + "step": 51520 + }, + { + "epoch": 1.6794975193684776, + "grad_norm": 5.705449104309082, + "learning_rate": 2.202072538860104e-05, + "logits/chosen": 2.633073329925537, + "logits/rejected": 2.9655349254608154, + "logps/chosen": -338.5724182128906, + "logps/rejected": -339.4932556152344, + "loss": 0.1897, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.5034942626953125, + "rewards/margins": 4.126744270324707, + "rewards/rejected": -6.630239009857178, + "step": 51540 + }, + { + "epoch": 1.6801492452199982, + "grad_norm": 0.8373200297355652, + "learning_rate": 2.200986302560259e-05, + "logits/chosen": 3.200016736984253, + "logits/rejected": 3.1205086708068848, + "logps/chosen": -413.5492248535156, + "logps/rejected": -351.0524597167969, + "loss": 0.1722, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.125321388244629, + "rewards/margins": 4.895491123199463, + "rewards/rejected": -8.020812034606934, + "step": 51560 + }, + { + "epoch": 1.6808009710715188, + "grad_norm": 7.777500152587891, + "learning_rate": 2.1999000662604144e-05, + "logits/chosen": 2.807088851928711, + "logits/rejected": 3.0161292552948, + "logps/chosen": -381.2278137207031, + "logps/rejected": -332.9525146484375, + "loss": 0.4289, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.1478617191314697, + "rewards/margins": 4.669977188110352, + "rewards/rejected": -7.8178391456604, + "step": 51580 + }, + { + "epoch": 1.6814526969230394, + "grad_norm": 0.09567653387784958, + "learning_rate": 2.19881382996057e-05, + "logits/chosen": 2.8533291816711426, + "logits/rejected": 2.858030319213867, + "logps/chosen": -347.89312744140625, + "logps/rejected": -369.36138916015625, + "loss": 0.2046, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4364781379699707, + "rewards/margins": 5.1968207359313965, + "rewards/rejected": -8.633298873901367, + "step": 51600 + }, + { + "epoch": 1.6821044227745598, + "grad_norm": 6.433924674987793, + "learning_rate": 2.197727593660725e-05, + "logits/chosen": 3.47560453414917, + "logits/rejected": 3.5563740730285645, + "logps/chosen": -436.32965087890625, + "logps/rejected": -353.94219970703125, + "loss": 0.2823, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.202451705932617, + "rewards/margins": 4.818530082702637, + "rewards/rejected": -8.02098274230957, + "step": 51620 + }, + { + "epoch": 1.6827561486260805, + "grad_norm": 0.7227498292922974, + "learning_rate": 2.1966413573608804e-05, + "logits/chosen": 3.0137267112731934, + "logits/rejected": 3.1987926959991455, + "logps/chosen": -342.4000244140625, + "logps/rejected": -379.6413879394531, + "loss": 0.3011, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5402369499206543, + "rewards/margins": 4.6852030754089355, + "rewards/rejected": -8.22544002532959, + "step": 51640 + }, + { + "epoch": 1.6834078744776009, + "grad_norm": 0.6894111037254333, + "learning_rate": 2.1955551210610358e-05, + "logits/chosen": 2.6701114177703857, + "logits/rejected": 2.7666521072387695, + "logps/chosen": -370.578857421875, + "logps/rejected": -329.023193359375, + "loss": 0.2167, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3528664112091064, + "rewards/margins": 4.669728755950928, + "rewards/rejected": -8.022595405578613, + "step": 51660 + }, + { + "epoch": 1.6840596003291215, + "grad_norm": 2.123944044113159, + "learning_rate": 2.1944688847611912e-05, + "logits/chosen": 2.743882656097412, + "logits/rejected": 2.9054596424102783, + "logps/chosen": -353.9554138183594, + "logps/rejected": -351.9676513671875, + "loss": 0.2257, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.191735029220581, + "rewards/margins": 4.507001876831055, + "rewards/rejected": -7.698736667633057, + "step": 51680 + }, + { + "epoch": 1.6847113261806421, + "grad_norm": 3.6044321060180664, + "learning_rate": 2.1933826484613466e-05, + "logits/chosen": 2.9579970836639404, + "logits/rejected": 3.0222668647766113, + "logps/chosen": -407.90716552734375, + "logps/rejected": -359.15875244140625, + "loss": 0.2089, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.60310697555542, + "rewards/margins": 5.163002014160156, + "rewards/rejected": -8.766109466552734, + "step": 51700 + }, + { + "epoch": 1.6853630520321627, + "grad_norm": 1.7588847875595093, + "learning_rate": 2.1922964121615017e-05, + "logits/chosen": 3.3166141510009766, + "logits/rejected": 3.1577162742614746, + "logps/chosen": -409.827880859375, + "logps/rejected": -391.8436584472656, + "loss": 0.337, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.930041551589966, + "rewards/margins": 4.8453145027160645, + "rewards/rejected": -8.775355339050293, + "step": 51720 + }, + { + "epoch": 1.6860147778836834, + "grad_norm": 9.559508323669434, + "learning_rate": 2.191210175861657e-05, + "logits/chosen": 2.8159961700439453, + "logits/rejected": 2.8620548248291016, + "logps/chosen": -311.4064025878906, + "logps/rejected": -328.19256591796875, + "loss": 0.4174, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6493403911590576, + "rewards/margins": 4.199822425842285, + "rewards/rejected": -7.8491621017456055, + "step": 51740 + }, + { + "epoch": 1.6866665037352038, + "grad_norm": 9.633124351501465, + "learning_rate": 2.1901239395618122e-05, + "logits/chosen": 2.855238437652588, + "logits/rejected": 3.050184726715088, + "logps/chosen": -369.78900146484375, + "logps/rejected": -334.512451171875, + "loss": 0.2105, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7891318798065186, + "rewards/margins": 5.013020992279053, + "rewards/rejected": -8.802152633666992, + "step": 51760 + }, + { + "epoch": 1.6873182295867244, + "grad_norm": 5.1927080154418945, + "learning_rate": 2.1890377032619676e-05, + "logits/chosen": 2.950770854949951, + "logits/rejected": 3.101463794708252, + "logps/chosen": -377.6283264160156, + "logps/rejected": -360.6609191894531, + "loss": 0.2664, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.6536126136779785, + "rewards/margins": 4.393035411834717, + "rewards/rejected": -8.046648025512695, + "step": 51780 + }, + { + "epoch": 1.6879699554382448, + "grad_norm": 11.683548927307129, + "learning_rate": 2.187951466962123e-05, + "logits/chosen": 2.9326400756835938, + "logits/rejected": 3.0623974800109863, + "logps/chosen": -348.1426086425781, + "logps/rejected": -352.12847900390625, + "loss": 0.3678, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.4470832347869873, + "rewards/margins": 4.065074443817139, + "rewards/rejected": -7.512158393859863, + "step": 51800 + }, + { + "epoch": 1.6886216812897654, + "grad_norm": 1.3491731882095337, + "learning_rate": 2.1868652306622784e-05, + "logits/chosen": 3.0776658058166504, + "logits/rejected": 2.97273588180542, + "logps/chosen": -362.24383544921875, + "logps/rejected": -372.82806396484375, + "loss": 0.3597, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3266639709472656, + "rewards/margins": 5.156022071838379, + "rewards/rejected": -8.482686042785645, + "step": 51820 + }, + { + "epoch": 1.689273407141286, + "grad_norm": 12.175695419311523, + "learning_rate": 2.185778994362434e-05, + "logits/chosen": 2.91903018951416, + "logits/rejected": 2.816633939743042, + "logps/chosen": -347.9843444824219, + "logps/rejected": -335.553466796875, + "loss": 0.2875, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6547207832336426, + "rewards/margins": 4.971064567565918, + "rewards/rejected": -8.625785827636719, + "step": 51840 + }, + { + "epoch": 1.6899251329928067, + "grad_norm": 3.8331949710845947, + "learning_rate": 2.184692758062589e-05, + "logits/chosen": 2.9020845890045166, + "logits/rejected": 3.0335114002227783, + "logps/chosen": -323.1875, + "logps/rejected": -364.380126953125, + "loss": 0.1885, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2754909992218018, + "rewards/margins": 4.82058048248291, + "rewards/rejected": -8.096071243286133, + "step": 51860 + }, + { + "epoch": 1.6905768588443273, + "grad_norm": 6.794792652130127, + "learning_rate": 2.1836065217627443e-05, + "logits/chosen": 2.8968393802642822, + "logits/rejected": 3.0056774616241455, + "logps/chosen": -338.1926574707031, + "logps/rejected": -367.82342529296875, + "loss": 0.3547, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.300872325897217, + "rewards/margins": 4.5999932289123535, + "rewards/rejected": -8.900864601135254, + "step": 51880 + }, + { + "epoch": 1.6912285846958477, + "grad_norm": 0.08758356422185898, + "learning_rate": 2.1825202854628998e-05, + "logits/chosen": 2.8979954719543457, + "logits/rejected": 2.8122775554656982, + "logps/chosen": -363.5429382324219, + "logps/rejected": -384.61102294921875, + "loss": 0.1865, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2400741577148438, + "rewards/margins": 5.253930568695068, + "rewards/rejected": -8.49400520324707, + "step": 51900 + }, + { + "epoch": 1.691880310547368, + "grad_norm": 8.9513578414917, + "learning_rate": 2.181434049163055e-05, + "logits/chosen": 2.5910491943359375, + "logits/rejected": 2.968919038772583, + "logps/chosen": -362.88232421875, + "logps/rejected": -342.5240783691406, + "loss": 0.3328, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.74573016166687, + "rewards/margins": 4.512886047363281, + "rewards/rejected": -8.25861644744873, + "step": 51920 + }, + { + "epoch": 1.6925320363988887, + "grad_norm": 6.5860419273376465, + "learning_rate": 2.1803478128632106e-05, + "logits/chosen": 3.0081627368927, + "logits/rejected": 2.971038341522217, + "logps/chosen": -349.26239013671875, + "logps/rejected": -320.54315185546875, + "loss": 0.1603, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.116686820983887, + "rewards/margins": 5.015560150146484, + "rewards/rejected": -9.132246971130371, + "step": 51940 + }, + { + "epoch": 1.6931837622504093, + "grad_norm": 1.207473874092102, + "learning_rate": 2.1792615765633657e-05, + "logits/chosen": 2.670769214630127, + "logits/rejected": 2.856384754180908, + "logps/chosen": -366.85992431640625, + "logps/rejected": -317.0414733886719, + "loss": 0.2916, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7187907695770264, + "rewards/margins": 4.781485557556152, + "rewards/rejected": -8.500276565551758, + "step": 51960 + }, + { + "epoch": 1.69383548810193, + "grad_norm": 1.2427934408187866, + "learning_rate": 2.178175340263521e-05, + "logits/chosen": 2.947432041168213, + "logits/rejected": 3.022411346435547, + "logps/chosen": -345.9934997558594, + "logps/rejected": -323.5576477050781, + "loss": 0.2544, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.282684803009033, + "rewards/margins": 4.417555809020996, + "rewards/rejected": -7.700240135192871, + "step": 51980 + }, + { + "epoch": 1.6944872139534506, + "grad_norm": 5.290392875671387, + "learning_rate": 2.1770891039636765e-05, + "logits/chosen": 2.983792304992676, + "logits/rejected": 2.97894024848938, + "logps/chosen": -355.73187255859375, + "logps/rejected": -357.424072265625, + "loss": 0.3579, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.6498310565948486, + "rewards/margins": 4.722323894500732, + "rewards/rejected": -8.37215518951416, + "step": 52000 + }, + { + "epoch": 1.695138939804971, + "grad_norm": 4.149811267852783, + "learning_rate": 2.1760028676638316e-05, + "logits/chosen": 2.694443941116333, + "logits/rejected": 2.777813673019409, + "logps/chosen": -358.0856018066406, + "logps/rejected": -338.7392272949219, + "loss": 0.2707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4353079795837402, + "rewards/margins": 4.3386688232421875, + "rewards/rejected": -7.7739763259887695, + "step": 52020 + }, + { + "epoch": 1.6957906656564916, + "grad_norm": 4.937337398529053, + "learning_rate": 2.174916631363987e-05, + "logits/chosen": 2.6488046646118164, + "logits/rejected": 2.8041558265686035, + "logps/chosen": -368.80230712890625, + "logps/rejected": -401.0780029296875, + "loss": 0.1813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.369225978851318, + "rewards/margins": 4.64394474029541, + "rewards/rejected": -9.01317024230957, + "step": 52040 + }, + { + "epoch": 1.696442391508012, + "grad_norm": 1.3332655429840088, + "learning_rate": 2.1738303950641424e-05, + "logits/chosen": 2.958461046218872, + "logits/rejected": 3.1421966552734375, + "logps/chosen": -355.70489501953125, + "logps/rejected": -319.9436950683594, + "loss": 0.2862, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.316847324371338, + "rewards/margins": 4.00998067855835, + "rewards/rejected": -7.3268280029296875, + "step": 52060 + }, + { + "epoch": 1.6970941173595326, + "grad_norm": 1.0727628469467163, + "learning_rate": 2.172744158764298e-05, + "logits/chosen": 2.6148407459259033, + "logits/rejected": 2.804199695587158, + "logps/chosen": -317.6590881347656, + "logps/rejected": -348.094970703125, + "loss": 0.3363, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.8226237297058105, + "rewards/margins": 3.983699083328247, + "rewards/rejected": -7.806323051452637, + "step": 52080 + }, + { + "epoch": 1.6977458432110533, + "grad_norm": 1.6599375009536743, + "learning_rate": 2.1716579224644533e-05, + "logits/chosen": 2.7396023273468018, + "logits/rejected": 2.8081634044647217, + "logps/chosen": -361.6292419433594, + "logps/rejected": -362.8099670410156, + "loss": 0.4259, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.566604137420654, + "rewards/margins": 4.311910152435303, + "rewards/rejected": -8.878514289855957, + "step": 52100 + }, + { + "epoch": 1.6983975690625739, + "grad_norm": 4.719406604766846, + "learning_rate": 2.1705716861646083e-05, + "logits/chosen": 2.3393850326538086, + "logits/rejected": 2.472792148590088, + "logps/chosen": -378.7547302246094, + "logps/rejected": -344.2581481933594, + "loss": 0.3784, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.8184006214141846, + "rewards/margins": 4.755797386169434, + "rewards/rejected": -8.574197769165039, + "step": 52120 + }, + { + "epoch": 1.6990492949140945, + "grad_norm": 5.173814296722412, + "learning_rate": 2.1694854498647637e-05, + "logits/chosen": 2.9063544273376465, + "logits/rejected": 2.832613945007324, + "logps/chosen": -371.3934326171875, + "logps/rejected": -393.5244140625, + "loss": 0.2434, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.263818264007568, + "rewards/margins": 5.074753284454346, + "rewards/rejected": -9.338571548461914, + "step": 52140 + }, + { + "epoch": 1.699701020765615, + "grad_norm": 0.15976688265800476, + "learning_rate": 2.1683992135649188e-05, + "logits/chosen": 2.7765536308288574, + "logits/rejected": 2.87677264213562, + "logps/chosen": -371.3898620605469, + "logps/rejected": -358.23907470703125, + "loss": 0.3474, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.8105595111846924, + "rewards/margins": 4.851896286010742, + "rewards/rejected": -8.662455558776855, + "step": 52160 + }, + { + "epoch": 1.7003527466171355, + "grad_norm": 0.720253050327301, + "learning_rate": 2.1673129772650742e-05, + "logits/chosen": 2.71136736869812, + "logits/rejected": 2.833465099334717, + "logps/chosen": -405.44427490234375, + "logps/rejected": -356.7533264160156, + "loss": 0.1592, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.460837364196777, + "rewards/margins": 5.35818338394165, + "rewards/rejected": -9.81902027130127, + "step": 52180 + }, + { + "epoch": 1.701004472468656, + "grad_norm": 3.8571879863739014, + "learning_rate": 2.1662267409652297e-05, + "logits/chosen": 2.950331211090088, + "logits/rejected": 2.917362689971924, + "logps/chosen": -391.5950622558594, + "logps/rejected": -357.28973388671875, + "loss": 0.3467, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.8278229236602783, + "rewards/margins": 5.398169994354248, + "rewards/rejected": -9.225992202758789, + "step": 52200 + }, + { + "epoch": 1.7016561983201766, + "grad_norm": 1.7503725290298462, + "learning_rate": 2.165140504665385e-05, + "logits/chosen": 2.803091287612915, + "logits/rejected": 2.9486615657806396, + "logps/chosen": -381.84954833984375, + "logps/rejected": -354.1528015136719, + "loss": 0.2724, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.1496806144714355, + "rewards/margins": 4.822152614593506, + "rewards/rejected": -8.971832275390625, + "step": 52220 + }, + { + "epoch": 1.7023079241716972, + "grad_norm": 3.470137596130371, + "learning_rate": 2.1640542683655405e-05, + "logits/chosen": 2.9684512615203857, + "logits/rejected": 2.87742280960083, + "logps/chosen": -366.1014709472656, + "logps/rejected": -363.4745178222656, + "loss": 0.2735, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3134303092956543, + "rewards/margins": 5.101914405822754, + "rewards/rejected": -8.415346145629883, + "step": 52240 + }, + { + "epoch": 1.7029596500232178, + "grad_norm": 0.45750293135643005, + "learning_rate": 2.1629680320656956e-05, + "logits/chosen": 3.2333438396453857, + "logits/rejected": 3.2288806438446045, + "logps/chosen": -405.86199951171875, + "logps/rejected": -335.7257385253906, + "loss": 0.2645, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9807612895965576, + "rewards/margins": 4.576167106628418, + "rewards/rejected": -7.5569281578063965, + "step": 52260 + }, + { + "epoch": 1.7036113758747384, + "grad_norm": 0.8034988641738892, + "learning_rate": 2.161881795765851e-05, + "logits/chosen": 3.10217022895813, + "logits/rejected": 3.1888585090637207, + "logps/chosen": -408.14898681640625, + "logps/rejected": -397.38262939453125, + "loss": 0.2768, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.5124526023864746, + "rewards/margins": 4.941011905670166, + "rewards/rejected": -8.45346450805664, + "step": 52280 + }, + { + "epoch": 1.7042631017262588, + "grad_norm": 0.452240526676178, + "learning_rate": 2.160795559466006e-05, + "logits/chosen": 3.0441060066223145, + "logits/rejected": 3.124101161956787, + "logps/chosen": -397.00482177734375, + "logps/rejected": -365.2588195800781, + "loss": 0.4611, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.6986916065216064, + "rewards/margins": 4.787667274475098, + "rewards/rejected": -8.486358642578125, + "step": 52300 + }, + { + "epoch": 1.7049148275777795, + "grad_norm": 4.038734436035156, + "learning_rate": 2.1597093231661615e-05, + "logits/chosen": 2.7219786643981934, + "logits/rejected": 2.8581080436706543, + "logps/chosen": -329.6221923828125, + "logps/rejected": -322.60626220703125, + "loss": 0.2611, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2193140983581543, + "rewards/margins": 4.040524005889893, + "rewards/rejected": -7.259839057922363, + "step": 52320 + }, + { + "epoch": 1.7055665534292999, + "grad_norm": 0.6124870181083679, + "learning_rate": 2.1586230868663172e-05, + "logits/chosen": 3.1366324424743652, + "logits/rejected": 3.104152202606201, + "logps/chosen": -397.38934326171875, + "logps/rejected": -364.52032470703125, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.307480573654175, + "rewards/margins": 4.944165229797363, + "rewards/rejected": -8.251646041870117, + "step": 52340 + }, + { + "epoch": 1.7062182792808205, + "grad_norm": 3.948974132537842, + "learning_rate": 2.1575368505664723e-05, + "logits/chosen": 2.904550552368164, + "logits/rejected": 3.0590896606445312, + "logps/chosen": -367.80621337890625, + "logps/rejected": -360.7848815917969, + "loss": 0.3819, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.379692792892456, + "rewards/margins": 4.037418365478516, + "rewards/rejected": -7.417111396789551, + "step": 52360 + }, + { + "epoch": 1.706870005132341, + "grad_norm": 3.9559783935546875, + "learning_rate": 2.1564506142666277e-05, + "logits/chosen": 3.0805323123931885, + "logits/rejected": 3.0589847564697266, + "logps/chosen": -361.183349609375, + "logps/rejected": -316.13800048828125, + "loss": 0.3666, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.242156982421875, + "rewards/margins": 4.565409183502197, + "rewards/rejected": -7.8075666427612305, + "step": 52380 + }, + { + "epoch": 1.7075217309838617, + "grad_norm": 2.406156063079834, + "learning_rate": 2.1553643779667828e-05, + "logits/chosen": 2.9962496757507324, + "logits/rejected": 3.013831615447998, + "logps/chosen": -310.82489013671875, + "logps/rejected": -328.4425354003906, + "loss": 0.4467, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5914809703826904, + "rewards/margins": 4.43130350112915, + "rewards/rejected": -8.022784233093262, + "step": 52400 + }, + { + "epoch": 1.7081734568353824, + "grad_norm": 2.230024814605713, + "learning_rate": 2.1542781416669382e-05, + "logits/chosen": 3.277987003326416, + "logits/rejected": 3.0659279823303223, + "logps/chosen": -412.9029235839844, + "logps/rejected": -418.4031677246094, + "loss": 0.1976, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3746795654296875, + "rewards/margins": 5.121485710144043, + "rewards/rejected": -8.49616527557373, + "step": 52420 + }, + { + "epoch": 1.7088251826869028, + "grad_norm": 5.874877452850342, + "learning_rate": 2.1531919053670937e-05, + "logits/chosen": 2.700112819671631, + "logits/rejected": 2.835411310195923, + "logps/chosen": -322.60791015625, + "logps/rejected": -327.37847900390625, + "loss": 0.2935, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3421471118927, + "rewards/margins": 4.041619300842285, + "rewards/rejected": -7.383767127990723, + "step": 52440 + }, + { + "epoch": 1.7094769085384232, + "grad_norm": 2.6131131649017334, + "learning_rate": 2.152105669067249e-05, + "logits/chosen": 2.6751625537872314, + "logits/rejected": 2.8440613746643066, + "logps/chosen": -340.0489196777344, + "logps/rejected": -313.6860046386719, + "loss": 0.2026, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.806601047515869, + "rewards/margins": 4.4260406494140625, + "rewards/rejected": -8.232641220092773, + "step": 52460 + }, + { + "epoch": 1.7101286343899438, + "grad_norm": 1.2124234437942505, + "learning_rate": 2.1510194327674045e-05, + "logits/chosen": 3.1598846912384033, + "logits/rejected": 3.3150229454040527, + "logps/chosen": -381.5647888183594, + "logps/rejected": -386.5665588378906, + "loss": 0.266, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0874435901641846, + "rewards/margins": 5.1367363929748535, + "rewards/rejected": -8.2241792678833, + "step": 52480 + }, + { + "epoch": 1.7107803602414644, + "grad_norm": 3.548224925994873, + "learning_rate": 2.1499331964675596e-05, + "logits/chosen": 2.9517643451690674, + "logits/rejected": 2.931514263153076, + "logps/chosen": -346.41326904296875, + "logps/rejected": -338.41473388671875, + "loss": 0.2333, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5350089073181152, + "rewards/margins": 5.246595859527588, + "rewards/rejected": -8.781604766845703, + "step": 52500 + }, + { + "epoch": 1.711432086092985, + "grad_norm": 1.3866214752197266, + "learning_rate": 2.148846960167715e-05, + "logits/chosen": 2.811889171600342, + "logits/rejected": 3.048523187637329, + "logps/chosen": -361.606689453125, + "logps/rejected": -381.40478515625, + "loss": 0.177, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.4205074310302734, + "rewards/margins": 4.6489949226379395, + "rewards/rejected": -8.069501876831055, + "step": 52520 + }, + { + "epoch": 1.7120838119445057, + "grad_norm": 0.049562931060791016, + "learning_rate": 2.1477607238678704e-05, + "logits/chosen": 2.7706329822540283, + "logits/rejected": 2.81404709815979, + "logps/chosen": -323.22711181640625, + "logps/rejected": -386.1522521972656, + "loss": 0.2848, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7596065998077393, + "rewards/margins": 4.500493049621582, + "rewards/rejected": -8.260099411010742, + "step": 52540 + }, + { + "epoch": 1.712735537796026, + "grad_norm": 4.445038318634033, + "learning_rate": 2.1466744875680255e-05, + "logits/chosen": 3.147188901901245, + "logits/rejected": 3.0609748363494873, + "logps/chosen": -392.0711364746094, + "logps/rejected": -345.28546142578125, + "loss": 0.2263, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.040627956390381, + "rewards/margins": 4.870856761932373, + "rewards/rejected": -8.91148567199707, + "step": 52560 + }, + { + "epoch": 1.7133872636475467, + "grad_norm": 0.008451730944216251, + "learning_rate": 2.145588251268181e-05, + "logits/chosen": 2.969602584838867, + "logits/rejected": 2.8116860389709473, + "logps/chosen": -375.180908203125, + "logps/rejected": -352.28167724609375, + "loss": 0.3414, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.257339954376221, + "rewards/margins": 4.363604545593262, + "rewards/rejected": -8.620944023132324, + "step": 52580 + }, + { + "epoch": 1.714038989499067, + "grad_norm": 0.5765267610549927, + "learning_rate": 2.1445020149683363e-05, + "logits/chosen": 2.871939182281494, + "logits/rejected": 2.9905529022216797, + "logps/chosen": -342.69610595703125, + "logps/rejected": -343.62457275390625, + "loss": 0.2639, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.697972536087036, + "rewards/margins": 4.6793975830078125, + "rewards/rejected": -8.37736988067627, + "step": 52600 + }, + { + "epoch": 1.7146907153505877, + "grad_norm": 2.1876614093780518, + "learning_rate": 2.1434157786684917e-05, + "logits/chosen": 2.746011257171631, + "logits/rejected": 2.9210000038146973, + "logps/chosen": -372.9847106933594, + "logps/rejected": -357.40716552734375, + "loss": 0.3457, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.263155460357666, + "rewards/margins": 4.658692359924316, + "rewards/rejected": -7.921848297119141, + "step": 52620 + }, + { + "epoch": 1.7153424412021083, + "grad_norm": 7.53002405166626, + "learning_rate": 2.142329542368647e-05, + "logits/chosen": 3.0570290088653564, + "logits/rejected": 2.972270965576172, + "logps/chosen": -359.526611328125, + "logps/rejected": -373.39263916015625, + "loss": 0.3723, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.8385238647460938, + "rewards/margins": 4.030639171600342, + "rewards/rejected": -7.869162082672119, + "step": 52640 + }, + { + "epoch": 1.715994167053629, + "grad_norm": 0.47020018100738525, + "learning_rate": 2.1412433060688022e-05, + "logits/chosen": 2.7672314643859863, + "logits/rejected": 2.986281156539917, + "logps/chosen": -333.153076171875, + "logps/rejected": -307.5015869140625, + "loss": 0.2022, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.7568199634552, + "rewards/margins": 4.5946173667907715, + "rewards/rejected": -7.351437568664551, + "step": 52660 + }, + { + "epoch": 1.7166458929051496, + "grad_norm": 0.04994954168796539, + "learning_rate": 2.1401570697689576e-05, + "logits/chosen": 2.685330390930176, + "logits/rejected": 2.6909024715423584, + "logps/chosen": -334.00823974609375, + "logps/rejected": -343.5183410644531, + "loss": 0.4079, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6265475749969482, + "rewards/margins": 4.594051837921143, + "rewards/rejected": -8.220599174499512, + "step": 52680 + }, + { + "epoch": 1.71729761875667, + "grad_norm": 9.312775611877441, + "learning_rate": 2.1390708334691127e-05, + "logits/chosen": 3.1595005989074707, + "logits/rejected": 3.2951958179473877, + "logps/chosen": -326.4331970214844, + "logps/rejected": -348.1930847167969, + "loss": 0.3248, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1469054222106934, + "rewards/margins": 4.3833842277526855, + "rewards/rejected": -7.530290126800537, + "step": 52700 + }, + { + "epoch": 1.7179493446081906, + "grad_norm": 2.0601162910461426, + "learning_rate": 2.137984597169268e-05, + "logits/chosen": 2.8223071098327637, + "logits/rejected": 2.8744912147521973, + "logps/chosen": -340.572021484375, + "logps/rejected": -353.90777587890625, + "loss": 0.2263, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8659863471984863, + "rewards/margins": 5.064877510070801, + "rewards/rejected": -7.930863857269287, + "step": 52720 + }, + { + "epoch": 1.718601070459711, + "grad_norm": 0.5866690278053284, + "learning_rate": 2.136898360869424e-05, + "logits/chosen": 3.0067858695983887, + "logits/rejected": 3.0230119228363037, + "logps/chosen": -336.67315673828125, + "logps/rejected": -358.8733825683594, + "loss": 0.1816, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.068082571029663, + "rewards/margins": 4.484213352203369, + "rewards/rejected": -7.5522966384887695, + "step": 52740 + }, + { + "epoch": 1.7192527963112316, + "grad_norm": 0.706824779510498, + "learning_rate": 2.135812124569579e-05, + "logits/chosen": 3.0873208045959473, + "logits/rejected": 3.2836012840270996, + "logps/chosen": -327.2189636230469, + "logps/rejected": -305.88116455078125, + "loss": 0.3239, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.717292070388794, + "rewards/margins": 3.5206940174102783, + "rewards/rejected": -7.237986087799072, + "step": 52760 + }, + { + "epoch": 1.7199045221627522, + "grad_norm": 0.17616556584835052, + "learning_rate": 2.1347258882697344e-05, + "logits/chosen": 2.885789155960083, + "logits/rejected": 2.7890000343322754, + "logps/chosen": -324.8633728027344, + "logps/rejected": -345.33123779296875, + "loss": 0.3588, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2814369201660156, + "rewards/margins": 4.345324516296387, + "rewards/rejected": -7.626762390136719, + "step": 52780 + }, + { + "epoch": 1.7205562480142729, + "grad_norm": 5.409356117248535, + "learning_rate": 2.1336396519698895e-05, + "logits/chosen": 3.144073486328125, + "logits/rejected": 3.1933257579803467, + "logps/chosen": -314.6667785644531, + "logps/rejected": -340.298095703125, + "loss": 0.3168, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4191794395446777, + "rewards/margins": 3.927022933959961, + "rewards/rejected": -7.3462018966674805, + "step": 52800 + }, + { + "epoch": 1.7212079738657935, + "grad_norm": 0.6041615605354309, + "learning_rate": 2.132553415670045e-05, + "logits/chosen": 3.033372402191162, + "logits/rejected": 3.182572603225708, + "logps/chosen": -361.08172607421875, + "logps/rejected": -363.2991638183594, + "loss": 0.3543, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.6894912719726562, + "rewards/margins": 3.934530258178711, + "rewards/rejected": -7.624022006988525, + "step": 52820 + }, + { + "epoch": 1.721859699717314, + "grad_norm": 0.3443450629711151, + "learning_rate": 2.1314671793702003e-05, + "logits/chosen": 2.6462509632110596, + "logits/rejected": 2.8640923500061035, + "logps/chosen": -300.7018737792969, + "logps/rejected": -317.2325134277344, + "loss": 0.3254, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.8893113136291504, + "rewards/margins": 3.8907370567321777, + "rewards/rejected": -6.780048370361328, + "step": 52840 + }, + { + "epoch": 1.7225114255688345, + "grad_norm": 2.316420078277588, + "learning_rate": 2.1303809430703557e-05, + "logits/chosen": 3.235135316848755, + "logits/rejected": 3.1416373252868652, + "logps/chosen": -351.12152099609375, + "logps/rejected": -316.4228820800781, + "loss": 0.2365, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7730233669281006, + "rewards/margins": 4.304845333099365, + "rewards/rejected": -7.077868461608887, + "step": 52860 + }, + { + "epoch": 1.723163151420355, + "grad_norm": 3.8740830421447754, + "learning_rate": 2.129294706770511e-05, + "logits/chosen": 3.0079123973846436, + "logits/rejected": 3.0876357555389404, + "logps/chosen": -350.050048828125, + "logps/rejected": -334.0777893066406, + "loss": 0.4179, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8002846240997314, + "rewards/margins": 4.1588454246521, + "rewards/rejected": -6.95913028717041, + "step": 52880 + }, + { + "epoch": 1.7238148772718755, + "grad_norm": 1.6191225051879883, + "learning_rate": 2.1282084704706662e-05, + "logits/chosen": 3.2631053924560547, + "logits/rejected": 3.371575117111206, + "logps/chosen": -368.22003173828125, + "logps/rejected": -376.5498046875, + "loss": 0.3491, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0090579986572266, + "rewards/margins": 3.657946825027466, + "rewards/rejected": -6.667004585266113, + "step": 52900 + }, + { + "epoch": 1.7244666031233962, + "grad_norm": 6.468412399291992, + "learning_rate": 2.1271222341708216e-05, + "logits/chosen": 3.3550822734832764, + "logits/rejected": 3.167130947113037, + "logps/chosen": -346.78759765625, + "logps/rejected": -325.35321044921875, + "loss": 0.3279, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.293910264968872, + "rewards/margins": 4.0923171043396, + "rewards/rejected": -7.386227607727051, + "step": 52920 + }, + { + "epoch": 1.7251183289749168, + "grad_norm": 2.4637222290039062, + "learning_rate": 2.126035997870977e-05, + "logits/chosen": 2.968642473220825, + "logits/rejected": 3.0719919204711914, + "logps/chosen": -345.52276611328125, + "logps/rejected": -354.678955078125, + "loss": 0.2828, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.538177967071533, + "rewards/margins": 3.692336320877075, + "rewards/rejected": -7.2305145263671875, + "step": 52940 + }, + { + "epoch": 1.7257700548264374, + "grad_norm": 4.2399797439575195, + "learning_rate": 2.124949761571132e-05, + "logits/chosen": 3.210277557373047, + "logits/rejected": 3.1830904483795166, + "logps/chosen": -372.4859924316406, + "logps/rejected": -333.2574768066406, + "loss": 0.4379, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.774186134338379, + "rewards/margins": 3.6797420978546143, + "rewards/rejected": -6.453928470611572, + "step": 52960 + }, + { + "epoch": 1.7264217806779578, + "grad_norm": 5.473138332366943, + "learning_rate": 2.1238635252712875e-05, + "logits/chosen": 3.182633876800537, + "logits/rejected": 3.2846426963806152, + "logps/chosen": -348.79400634765625, + "logps/rejected": -342.50421142578125, + "loss": 0.3619, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2087478637695312, + "rewards/margins": 4.146261692047119, + "rewards/rejected": -7.35500955581665, + "step": 52980 + }, + { + "epoch": 1.7270735065294782, + "grad_norm": 3.014742851257324, + "learning_rate": 2.122777288971443e-05, + "logits/chosen": 2.8518600463867188, + "logits/rejected": 2.7828176021575928, + "logps/chosen": -334.14794921875, + "logps/rejected": -324.62554931640625, + "loss": 0.3292, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.1850945949554443, + "rewards/margins": 3.9035983085632324, + "rewards/rejected": -7.088693141937256, + "step": 53000 + }, + { + "epoch": 1.7277252323809988, + "grad_norm": 11.789417266845703, + "learning_rate": 2.1216910526715984e-05, + "logits/chosen": 2.983130693435669, + "logits/rejected": 2.945158004760742, + "logps/chosen": -384.5246276855469, + "logps/rejected": -326.08087158203125, + "loss": 0.4266, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.884418249130249, + "rewards/margins": 4.5630364418029785, + "rewards/rejected": -7.447454929351807, + "step": 53020 + }, + { + "epoch": 1.7283769582325195, + "grad_norm": 2.529110908508301, + "learning_rate": 2.1206048163717538e-05, + "logits/chosen": 3.0544865131378174, + "logits/rejected": 3.1530370712280273, + "logps/chosen": -350.8189392089844, + "logps/rejected": -335.42095947265625, + "loss": 0.3751, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.709609270095825, + "rewards/margins": 4.064639091491699, + "rewards/rejected": -6.7742486000061035, + "step": 53040 + }, + { + "epoch": 1.72902868408404, + "grad_norm": 5.971282005310059, + "learning_rate": 2.119518580071909e-05, + "logits/chosen": 3.586137294769287, + "logits/rejected": 3.599865674972534, + "logps/chosen": -382.4283752441406, + "logps/rejected": -386.93499755859375, + "loss": 0.2576, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9130754470825195, + "rewards/margins": 4.352889060974121, + "rewards/rejected": -7.265964508056641, + "step": 53060 + }, + { + "epoch": 1.7296804099355607, + "grad_norm": 0.560688853263855, + "learning_rate": 2.1184323437720643e-05, + "logits/chosen": 3.068758249282837, + "logits/rejected": 3.206298351287842, + "logps/chosen": -350.2564392089844, + "logps/rejected": -364.43328857421875, + "loss": 0.1973, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7606072425842285, + "rewards/margins": 5.003901481628418, + "rewards/rejected": -7.7645087242126465, + "step": 53080 + }, + { + "epoch": 1.7303321357870811, + "grad_norm": 2.900714874267578, + "learning_rate": 2.1173461074722194e-05, + "logits/chosen": 2.9239954948425293, + "logits/rejected": 3.2160537242889404, + "logps/chosen": -393.9674377441406, + "logps/rejected": -338.8376770019531, + "loss": 0.2294, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.9623866081237793, + "rewards/margins": 5.099582672119141, + "rewards/rejected": -8.061968803405762, + "step": 53100 + }, + { + "epoch": 1.7309838616386017, + "grad_norm": 2.455326795578003, + "learning_rate": 2.1163141829873672e-05, + "logits/chosen": 3.300358533859253, + "logits/rejected": 3.486319065093994, + "logps/chosen": -400.6410217285156, + "logps/rejected": -368.57562255859375, + "loss": 0.3811, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3413612842559814, + "rewards/margins": 4.1995649337768555, + "rewards/rejected": -7.540926456451416, + "step": 53120 + }, + { + "epoch": 1.7316355874901221, + "grad_norm": 0.7639175653457642, + "learning_rate": 2.1152279466875223e-05, + "logits/chosen": 2.953286647796631, + "logits/rejected": 3.0629866123199463, + "logps/chosen": -352.06024169921875, + "logps/rejected": -365.8093566894531, + "loss": 0.3727, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.8521971702575684, + "rewards/margins": 4.234505653381348, + "rewards/rejected": -7.086702823638916, + "step": 53140 + }, + { + "epoch": 1.7322873133416428, + "grad_norm": 8.02718734741211, + "learning_rate": 2.114141710387678e-05, + "logits/chosen": 2.6658504009246826, + "logits/rejected": 2.9263758659362793, + "logps/chosen": -356.859130859375, + "logps/rejected": -364.4564514160156, + "loss": 0.1969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2176105976104736, + "rewards/margins": 4.960850238800049, + "rewards/rejected": -8.178461074829102, + "step": 53160 + }, + { + "epoch": 1.7329390391931634, + "grad_norm": 2.070235013961792, + "learning_rate": 2.113055474087833e-05, + "logits/chosen": 3.618272066116333, + "logits/rejected": 3.424123764038086, + "logps/chosen": -341.53057861328125, + "logps/rejected": -388.26507568359375, + "loss": 0.2826, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5459671020507812, + "rewards/margins": 4.924498081207275, + "rewards/rejected": -8.470464706420898, + "step": 53180 + }, + { + "epoch": 1.733590765044684, + "grad_norm": 0.9985716342926025, + "learning_rate": 2.1119692377879885e-05, + "logits/chosen": 3.0623316764831543, + "logits/rejected": 3.0356032848358154, + "logps/chosen": -366.20111083984375, + "logps/rejected": -373.56146240234375, + "loss": 0.1823, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.4261744022369385, + "rewards/margins": 5.090047359466553, + "rewards/rejected": -7.516221523284912, + "step": 53200 + }, + { + "epoch": 1.7342424908962046, + "grad_norm": 1.9360250234603882, + "learning_rate": 2.110937313303136e-05, + "logits/chosen": 2.8176932334899902, + "logits/rejected": 2.8572604656219482, + "logps/chosen": -302.29510498046875, + "logps/rejected": -354.1962890625, + "loss": 0.243, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1708552837371826, + "rewards/margins": 5.084229946136475, + "rewards/rejected": -8.255085945129395, + "step": 53220 + }, + { + "epoch": 1.734894216747725, + "grad_norm": 1.0202032327651978, + "learning_rate": 2.1098510770032914e-05, + "logits/chosen": 3.1475510597229004, + "logits/rejected": 3.171229362487793, + "logps/chosen": -383.9803161621094, + "logps/rejected": -390.9184265136719, + "loss": 0.2703, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.0269036293029785, + "rewards/margins": 3.9579315185546875, + "rewards/rejected": -7.98483419418335, + "step": 53240 + }, + { + "epoch": 1.7355459425992457, + "grad_norm": 0.3680661916732788, + "learning_rate": 2.1087648407034465e-05, + "logits/chosen": 2.849923610687256, + "logits/rejected": 2.932368516921997, + "logps/chosen": -311.90771484375, + "logps/rejected": -367.7380676269531, + "loss": 0.1686, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.0949361324310303, + "rewards/margins": 5.014796257019043, + "rewards/rejected": -8.109733581542969, + "step": 53260 + }, + { + "epoch": 1.736197668450766, + "grad_norm": 0.20195411145687103, + "learning_rate": 2.107678604403602e-05, + "logits/chosen": 3.048405647277832, + "logits/rejected": 3.172780990600586, + "logps/chosen": -345.4024353027344, + "logps/rejected": -330.7233581542969, + "loss": 0.2437, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7346057891845703, + "rewards/margins": 4.191922187805176, + "rewards/rejected": -7.926527976989746, + "step": 53280 + }, + { + "epoch": 1.7368493943022867, + "grad_norm": 0.4826774001121521, + "learning_rate": 2.1065923681037574e-05, + "logits/chosen": 3.188706398010254, + "logits/rejected": 3.2462570667266846, + "logps/chosen": -401.32012939453125, + "logps/rejected": -350.23162841796875, + "loss": 0.2808, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6464076042175293, + "rewards/margins": 4.3159098625183105, + "rewards/rejected": -7.96231746673584, + "step": 53300 + }, + { + "epoch": 1.7375011201538073, + "grad_norm": 2.4472672939300537, + "learning_rate": 2.1055061318039128e-05, + "logits/chosen": 3.057408571243286, + "logits/rejected": 3.070796251296997, + "logps/chosen": -357.28350830078125, + "logps/rejected": -355.866943359375, + "loss": 0.1727, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7965762615203857, + "rewards/margins": 4.926562309265137, + "rewards/rejected": -8.723138809204102, + "step": 53320 + }, + { + "epoch": 1.738152846005328, + "grad_norm": 1.5565863847732544, + "learning_rate": 2.1044198955040682e-05, + "logits/chosen": 2.9224417209625244, + "logits/rejected": 3.076873779296875, + "logps/chosen": -361.44866943359375, + "logps/rejected": -351.9649353027344, + "loss": 0.1806, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5124564170837402, + "rewards/margins": 5.350985527038574, + "rewards/rejected": -8.863443374633789, + "step": 53340 + }, + { + "epoch": 1.7388045718568486, + "grad_norm": 0.3480989634990692, + "learning_rate": 2.1033336592042233e-05, + "logits/chosen": 3.0638890266418457, + "logits/rejected": 3.0600321292877197, + "logps/chosen": -321.9134826660156, + "logps/rejected": -311.16558837890625, + "loss": 0.2255, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.6872916221618652, + "rewards/margins": 5.02321195602417, + "rewards/rejected": -8.710503578186035, + "step": 53360 + }, + { + "epoch": 1.739456297708369, + "grad_norm": 0.41957804560661316, + "learning_rate": 2.1022474229043787e-05, + "logits/chosen": 3.3007609844207764, + "logits/rejected": 3.0517849922180176, + "logps/chosen": -372.08831787109375, + "logps/rejected": -383.88848876953125, + "loss": 0.1572, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.8163161277770996, + "rewards/margins": 5.250955581665039, + "rewards/rejected": -8.067272186279297, + "step": 53380 + }, + { + "epoch": 1.7401080235598896, + "grad_norm": 1.4725227355957031, + "learning_rate": 2.101161186604534e-05, + "logits/chosen": 2.965878486633301, + "logits/rejected": 3.0745387077331543, + "logps/chosen": -381.39215087890625, + "logps/rejected": -417.08245849609375, + "loss": 0.2291, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.03745174407959, + "rewards/margins": 5.08864688873291, + "rewards/rejected": -9.126097679138184, + "step": 53400 + }, + { + "epoch": 1.74075974941141, + "grad_norm": 1.1751552820205688, + "learning_rate": 2.1000749503046892e-05, + "logits/chosen": 2.6470351219177246, + "logits/rejected": 2.7670295238494873, + "logps/chosen": -314.5076599121094, + "logps/rejected": -363.37481689453125, + "loss": 0.2302, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.415625810623169, + "rewards/margins": 4.821955680847168, + "rewards/rejected": -8.237582206726074, + "step": 53420 + }, + { + "epoch": 1.7414114752629306, + "grad_norm": 3.8198976516723633, + "learning_rate": 2.0989887140048446e-05, + "logits/chosen": 3.008805751800537, + "logits/rejected": 3.0331549644470215, + "logps/chosen": -321.02777099609375, + "logps/rejected": -333.6812438964844, + "loss": 0.3246, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.56605863571167, + "rewards/margins": 4.491066932678223, + "rewards/rejected": -8.05712604522705, + "step": 53440 + }, + { + "epoch": 1.7420632011144512, + "grad_norm": 4.715965270996094, + "learning_rate": 2.097902477705e-05, + "logits/chosen": 2.8059048652648926, + "logits/rejected": 3.0245137214660645, + "logps/chosen": -355.399169921875, + "logps/rejected": -340.9657287597656, + "loss": 0.3436, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.619739532470703, + "rewards/margins": 5.295022010803223, + "rewards/rejected": -8.914761543273926, + "step": 53460 + }, + { + "epoch": 1.7427149269659719, + "grad_norm": 2.1728551387786865, + "learning_rate": 2.0968162414051554e-05, + "logits/chosen": 3.212942123413086, + "logits/rejected": 3.2649426460266113, + "logps/chosen": -378.3273010253906, + "logps/rejected": -348.24017333984375, + "loss": 0.2922, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4064788818359375, + "rewards/margins": 4.027344226837158, + "rewards/rejected": -8.433822631835938, + "step": 53480 + }, + { + "epoch": 1.7433666528174925, + "grad_norm": 7.7751288414001465, + "learning_rate": 2.095730005105311e-05, + "logits/chosen": 3.319124937057495, + "logits/rejected": 3.246063709259033, + "logps/chosen": -364.1673583984375, + "logps/rejected": -359.3438415527344, + "loss": 0.2187, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.798297882080078, + "rewards/margins": 4.525094985961914, + "rewards/rejected": -8.323392868041992, + "step": 53500 + }, + { + "epoch": 1.7440183786690129, + "grad_norm": 3.2791857719421387, + "learning_rate": 2.094643768805466e-05, + "logits/chosen": 3.0660667419433594, + "logits/rejected": 3.073791980743408, + "logps/chosen": -374.41998291015625, + "logps/rejected": -346.54144287109375, + "loss": 0.2925, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9960479736328125, + "rewards/margins": 4.3186750411987305, + "rewards/rejected": -8.314723014831543, + "step": 53520 + }, + { + "epoch": 1.7446701045205333, + "grad_norm": 1.1118440628051758, + "learning_rate": 2.0935575325056213e-05, + "logits/chosen": 3.3280155658721924, + "logits/rejected": 3.1455867290496826, + "logps/chosen": -349.6155700683594, + "logps/rejected": -352.7362976074219, + "loss": 0.3666, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3674635887145996, + "rewards/margins": 4.8627800941467285, + "rewards/rejected": -8.230242729187012, + "step": 53540 + }, + { + "epoch": 1.745321830372054, + "grad_norm": 0.30342718958854675, + "learning_rate": 2.0924712962057764e-05, + "logits/chosen": 2.70636248588562, + "logits/rejected": 2.931473731994629, + "logps/chosen": -372.4390563964844, + "logps/rejected": -361.0166320800781, + "loss": 0.253, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.507452964782715, + "rewards/margins": 4.563547134399414, + "rewards/rejected": -9.070999145507812, + "step": 53560 + }, + { + "epoch": 1.7459735562235745, + "grad_norm": 5.296016216278076, + "learning_rate": 2.0913850599059322e-05, + "logits/chosen": 3.1016652584075928, + "logits/rejected": 3.20658540725708, + "logps/chosen": -390.01007080078125, + "logps/rejected": -374.2552795410156, + "loss": 0.3932, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.568005084991455, + "rewards/margins": 4.773602485656738, + "rewards/rejected": -8.341606140136719, + "step": 53580 + }, + { + "epoch": 1.7466252820750952, + "grad_norm": 3.1252033710479736, + "learning_rate": 2.0902988236060876e-05, + "logits/chosen": 2.8380234241485596, + "logits/rejected": 3.118720531463623, + "logps/chosen": -369.808837890625, + "logps/rejected": -345.8714294433594, + "loss": 0.1963, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.578935146331787, + "rewards/margins": 4.766245365142822, + "rewards/rejected": -8.345181465148926, + "step": 53600 + }, + { + "epoch": 1.7472770079266158, + "grad_norm": 1.206436276435852, + "learning_rate": 2.0892125873062427e-05, + "logits/chosen": 3.168651580810547, + "logits/rejected": 3.2514851093292236, + "logps/chosen": -352.5486755371094, + "logps/rejected": -368.21209716796875, + "loss": 0.2633, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.935251235961914, + "rewards/margins": 4.17918586730957, + "rewards/rejected": -8.114437103271484, + "step": 53620 + }, + { + "epoch": 1.7479287337781362, + "grad_norm": 2.1031668186187744, + "learning_rate": 2.088126351006398e-05, + "logits/chosen": 2.911649227142334, + "logits/rejected": 3.0403811931610107, + "logps/chosen": -320.7434997558594, + "logps/rejected": -326.7784729003906, + "loss": 0.3357, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.7447972297668457, + "rewards/margins": 4.096566677093506, + "rewards/rejected": -7.84136438369751, + "step": 53640 + }, + { + "epoch": 1.7485804596296568, + "grad_norm": 0.7564829587936401, + "learning_rate": 2.0870401147065532e-05, + "logits/chosen": 3.238614559173584, + "logits/rejected": 3.243464708328247, + "logps/chosen": -398.60150146484375, + "logps/rejected": -377.82147216796875, + "loss": 0.1867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.683058500289917, + "rewards/margins": 4.168243408203125, + "rewards/rejected": -7.851302146911621, + "step": 53660 + }, + { + "epoch": 1.7492321854811772, + "grad_norm": 3.196772575378418, + "learning_rate": 2.0859538784067086e-05, + "logits/chosen": 3.118086338043213, + "logits/rejected": 3.3055686950683594, + "logps/chosen": -418.94207763671875, + "logps/rejected": -391.387451171875, + "loss": 0.3719, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.130677223205566, + "rewards/margins": 4.716008186340332, + "rewards/rejected": -8.846685409545898, + "step": 53680 + }, + { + "epoch": 1.7498839113326978, + "grad_norm": 2.5743632316589355, + "learning_rate": 2.084867642106864e-05, + "logits/chosen": 2.9283509254455566, + "logits/rejected": 3.0171680450439453, + "logps/chosen": -375.9070739746094, + "logps/rejected": -357.75628662109375, + "loss": 0.2659, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.789339065551758, + "rewards/margins": 4.443312644958496, + "rewards/rejected": -8.23265266418457, + "step": 53700 + }, + { + "epoch": 1.7505356371842185, + "grad_norm": 2.254911422729492, + "learning_rate": 2.0837814058070194e-05, + "logits/chosen": 2.9984378814697266, + "logits/rejected": 3.027169704437256, + "logps/chosen": -355.6968688964844, + "logps/rejected": -394.5773010253906, + "loss": 0.1803, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.932783842086792, + "rewards/margins": 5.525224685668945, + "rewards/rejected": -9.458009719848633, + "step": 53720 + }, + { + "epoch": 1.751187363035739, + "grad_norm": 3.518096923828125, + "learning_rate": 2.082695169507175e-05, + "logits/chosen": 2.9296517372131348, + "logits/rejected": 2.7867321968078613, + "logps/chosen": -371.6079406738281, + "logps/rejected": -350.2498474121094, + "loss": 0.3024, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6742019653320312, + "rewards/margins": 3.8634533882141113, + "rewards/rejected": -7.537654876708984, + "step": 53740 + }, + { + "epoch": 1.7518390888872597, + "grad_norm": 0.8016734719276428, + "learning_rate": 2.08160893320733e-05, + "logits/chosen": 2.7471580505371094, + "logits/rejected": 2.924013614654541, + "logps/chosen": -355.0122985839844, + "logps/rejected": -322.2711486816406, + "loss": 0.2601, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.604403018951416, + "rewards/margins": 4.276628017425537, + "rewards/rejected": -7.881031036376953, + "step": 53760 + }, + { + "epoch": 1.75249081473878, + "grad_norm": 0.46698063611984253, + "learning_rate": 2.0805226969074853e-05, + "logits/chosen": 3.3194637298583984, + "logits/rejected": 3.3002171516418457, + "logps/chosen": -349.56390380859375, + "logps/rejected": -326.1524353027344, + "loss": 0.3647, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.980966091156006, + "rewards/margins": 3.826758623123169, + "rewards/rejected": -7.807724952697754, + "step": 53780 + }, + { + "epoch": 1.7531425405903007, + "grad_norm": 3.2353830337524414, + "learning_rate": 2.0794364606076408e-05, + "logits/chosen": 3.0171189308166504, + "logits/rejected": 3.1769418716430664, + "logps/chosen": -382.3933410644531, + "logps/rejected": -404.55853271484375, + "loss": 0.2739, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.9698586463928223, + "rewards/margins": 5.265276908874512, + "rewards/rejected": -8.235135078430176, + "step": 53800 + }, + { + "epoch": 1.7537942664418211, + "grad_norm": 1.530100703239441, + "learning_rate": 2.0783502243077958e-05, + "logits/chosen": 3.1410136222839355, + "logits/rejected": 3.2798476219177246, + "logps/chosen": -425.9920349121094, + "logps/rejected": -387.627197265625, + "loss": 0.3892, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.230708360671997, + "rewards/margins": 4.666292667388916, + "rewards/rejected": -7.897000312805176, + "step": 53820 + }, + { + "epoch": 1.7544459922933417, + "grad_norm": 4.438148021697998, + "learning_rate": 2.0772639880079512e-05, + "logits/chosen": 2.8129029273986816, + "logits/rejected": 3.105478286743164, + "logps/chosen": -343.9136657714844, + "logps/rejected": -372.3033142089844, + "loss": 0.227, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.4597084522247314, + "rewards/margins": 4.786907196044922, + "rewards/rejected": -8.246614456176758, + "step": 53840 + }, + { + "epoch": 1.7550977181448624, + "grad_norm": 0.5709053874015808, + "learning_rate": 2.0761777517081067e-05, + "logits/chosen": 3.0324270725250244, + "logits/rejected": 2.9506754875183105, + "logps/chosen": -376.13616943359375, + "logps/rejected": -364.02886962890625, + "loss": 0.2122, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.3559365272521973, + "rewards/margins": 4.91517972946167, + "rewards/rejected": -8.27111530303955, + "step": 53860 + }, + { + "epoch": 1.755749443996383, + "grad_norm": 0.7649480104446411, + "learning_rate": 2.075091515408262e-05, + "logits/chosen": 3.2663521766662598, + "logits/rejected": 3.1207659244537354, + "logps/chosen": -390.07763671875, + "logps/rejected": -338.50341796875, + "loss": 0.1869, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5624802112579346, + "rewards/margins": 4.050530433654785, + "rewards/rejected": -7.613010406494141, + "step": 53880 + }, + { + "epoch": 1.7564011698479036, + "grad_norm": 0.07146328687667847, + "learning_rate": 2.0740052791084175e-05, + "logits/chosen": 3.0700831413269043, + "logits/rejected": 3.125312328338623, + "logps/chosen": -385.7370300292969, + "logps/rejected": -368.81787109375, + "loss": 0.2555, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.543219804763794, + "rewards/margins": 4.320036888122559, + "rewards/rejected": -7.863256931304932, + "step": 53900 + }, + { + "epoch": 1.757052895699424, + "grad_norm": 6.920665740966797, + "learning_rate": 2.0729190428085726e-05, + "logits/chosen": 2.8131601810455322, + "logits/rejected": 2.941415309906006, + "logps/chosen": -312.1759033203125, + "logps/rejected": -363.77935791015625, + "loss": 0.3613, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9347150325775146, + "rewards/margins": 4.839930534362793, + "rewards/rejected": -7.774645805358887, + "step": 53920 + }, + { + "epoch": 1.7577046215509446, + "grad_norm": 1.6165814399719238, + "learning_rate": 2.071832806508728e-05, + "logits/chosen": 3.4108757972717285, + "logits/rejected": 3.367539882659912, + "logps/chosen": -384.5004577636719, + "logps/rejected": -413.40863037109375, + "loss": 0.1257, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.6469998359680176, + "rewards/margins": 5.492286682128906, + "rewards/rejected": -8.139286041259766, + "step": 53940 + }, + { + "epoch": 1.758356347402465, + "grad_norm": 7.449623107910156, + "learning_rate": 2.070746570208883e-05, + "logits/chosen": 2.8876118659973145, + "logits/rejected": 2.8388524055480957, + "logps/chosen": -328.226806640625, + "logps/rejected": -389.1358947753906, + "loss": 0.4076, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.720121383666992, + "rewards/margins": 4.087901592254639, + "rewards/rejected": -7.808022975921631, + "step": 53960 + }, + { + "epoch": 1.7590080732539857, + "grad_norm": 1.4524906873703003, + "learning_rate": 2.0696603339090388e-05, + "logits/chosen": 2.728365421295166, + "logits/rejected": 2.672757625579834, + "logps/chosen": -341.33673095703125, + "logps/rejected": -335.9598083496094, + "loss": 0.1669, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3237831592559814, + "rewards/margins": 4.83596134185791, + "rewards/rejected": -8.159744262695312, + "step": 53980 + }, + { + "epoch": 1.7596597991055063, + "grad_norm": 0.752940833568573, + "learning_rate": 2.0685740976091942e-05, + "logits/chosen": 2.971083879470825, + "logits/rejected": 3.1873483657836914, + "logps/chosen": -331.19586181640625, + "logps/rejected": -363.5233459472656, + "loss": 0.289, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.8747398853302, + "rewards/margins": 4.5574140548706055, + "rewards/rejected": -8.432153701782227, + "step": 54000 + }, + { + "epoch": 1.7596597991055063, + "eval_logits/chosen": 3.1123528480529785, + "eval_logits/rejected": 3.113154411315918, + "eval_logps/chosen": -393.7906188964844, + "eval_logps/rejected": -378.55145263671875, + "eval_loss": 0.4756561517715454, + "eval_rewards/accuracies": 0.8335719108581543, + "eval_rewards/chosen": -3.9209983348846436, + "eval_rewards/margins": 4.310827732086182, + "eval_rewards/rejected": -8.231825828552246, + "eval_runtime": 3544.8391, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 3.153, + "step": 54000 + } + ], + "logging_steps": 20, + "max_steps": 92061, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9000, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}