diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9549 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999214865218529, + "eval_steps": 100, + "global_step": 5730, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005234231876472127, + "grad_norm": 0.5165453173260315, + "kl": 0.0361328125, + "learning_rate": 8.726003490401396e-07, + "logits/chosen": -1284925056.0, + "logits/rejected": -1155530752.0, + "logps/chosen": -305.5562310030395, + "logps/rejected": -284.19292604501607, + "loss": 0.4999, + "rewards/chosen": 0.0001331085854388298, + "rewards/margins": 4.391090532427996e-05, + "rewards/rejected": 8.919768011454984e-05, + "step": 10 + }, + { + "epoch": 0.010468463752944255, + "grad_norm": 0.507696597667233, + "kl": 0.13066406548023224, + "learning_rate": 1.7452006980802793e-06, + "logits/chosen": -1350985344.0, + "logits/rejected": -1120508288.0, + "logps/chosen": -327.7463976945245, + "logps/rejected": -297.61092150170646, + "loss": 0.4999, + "rewards/chosen": 0.001419726984645173, + "rewards/margins": 0.0006502778092238846, + "rewards/rejected": 0.0007694491754212883, + "step": 20 + }, + { + "epoch": 0.015702695629416383, + "grad_norm": 0.5221877360005158, + "kl": 0.18417969346046448, + "learning_rate": 2.617801047120419e-06, + "logits/chosen": -1257032960.0, + "logits/rejected": -1239207168.0, + "logps/chosen": -317.86996904024767, + "logps/rejected": -271.64668769716087, + "loss": 0.4997, + "rewards/chosen": 0.00518288627128483, + "rewards/margins": 0.0010991144603621173, + "rewards/rejected": 0.004083771810922713, + "step": 30 + }, + { + "epoch": 0.02093692750588851, + "grad_norm": 0.514175907898874, + "kl": 0.12470702826976776, + "learning_rate": 3.4904013961605585e-06, + "logits/chosen": -1334417792.0, + "logits/rejected": -1143367296.0, + "logps/chosen": -323.3053892215569, + "logps/rejected": -279.16339869281046, + "loss": 0.4993, + "rewards/chosen": 0.19018326262514035, + "rewards/margins": 0.1845642384855366, + "rewards/rejected": 0.0056190241396037585, + "step": 40 + }, + { + "epoch": 0.02617115938236064, + "grad_norm": 0.5085275851932604, + "kl": 0.13925781846046448, + "learning_rate": 4.363001745200698e-06, + "logits/chosen": -1259549440.0, + "logits/rejected": -1169791360.0, + "logps/chosen": -361.9696048632219, + "logps/rejected": -280.18006430868166, + "loss": 0.4981, + "rewards/chosen": 0.017455683653115502, + "rewards/margins": 0.013712324906129971, + "rewards/rejected": 0.0037433587469855307, + "step": 50 + }, + { + "epoch": 0.031405391258832765, + "grad_norm": 0.46829128041895135, + "kl": 0.0, + "learning_rate": 5.235602094240838e-06, + "logits/chosen": -1275907328.0, + "logits/rejected": -1146722688.0, + "logps/chosen": -318.89156626506025, + "logps/rejected": -301.97402597402595, + "loss": 0.4982, + "rewards/chosen": 0.1808394121836467, + "rewards/margins": 0.18323246590726683, + "rewards/rejected": -0.00239305372362013, + "step": 60 + }, + { + "epoch": 0.036639623135304895, + "grad_norm": 0.5227551593809364, + "kl": 0.0, + "learning_rate": 6.108202443280978e-06, + "logits/chosen": -1260598016.0, + "logits/rejected": -1303589632.0, + "logps/chosen": -374.28753993610223, + "logps/rejected": -286.38532110091745, + "loss": 0.4954, + "rewards/chosen": 0.015276729108426517, + "rewards/margins": 0.03535449573686689, + "rewards/rejected": -0.020077766628440366, + "step": 70 + }, + { + "epoch": 0.04187385501177702, + "grad_norm": 0.5592960536432072, + "kl": 0.0, + "learning_rate": 6.980802792321117e-06, + "logits/chosen": -1319947520.0, + "logits/rejected": -1250951168.0, + "logps/chosen": -333.88957055214723, + "logps/rejected": -304.71337579617835, + "loss": 0.4945, + "rewards/chosen": -0.0124519207726227, + "rewards/margins": 0.0456597681206894, + "rewards/rejected": -0.0581116888933121, + "step": 80 + }, + { + "epoch": 0.04710808688824915, + "grad_norm": 0.5380069808961143, + "kl": 0.0, + "learning_rate": 7.853403141361257e-06, + "logits/chosen": -1415158144.0, + "logits/rejected": -1307154816.0, + "logps/chosen": -353.02127659574467, + "logps/rejected": -296.38585209003213, + "loss": 0.4907, + "rewards/chosen": -0.07930934175531915, + "rewards/margins": 0.08092226998101526, + "rewards/rejected": -0.1602316117363344, + "step": 90 + }, + { + "epoch": 0.05234231876472128, + "grad_norm": 0.5652054096019196, + "kl": 0.0, + "learning_rate": 8.726003490401396e-06, + "logits/chosen": -1422078720.0, + "logits/rejected": -1432354816.0, + "logps/chosen": -374.0444444444444, + "logps/rejected": -317.04615384615386, + "loss": 0.4859, + "rewards/chosen": -0.3256448412698413, + "rewards/margins": 0.1057493894993895, + "rewards/rejected": -0.4313942307692308, + "step": 100 + }, + { + "epoch": 0.05234231876472128, + "eval_kl": 0.0, + "eval_logits/chosen": -3175687424.0, + "eval_logits/rejected": -3128284928.0, + "eval_logps/chosen": -393.0410687778328, + "eval_logps/rejected": -370.77672799602186, + "eval_loss": 0.485539048910141, + "eval_rewards/chosen": -0.6135576447303315, + "eval_rewards/margins": 0.14153932195291075, + "eval_rewards/rejected": -0.7550969666832422, + "eval_runtime": 93.7067, + "eval_samples_per_second": 42.686, + "eval_steps_per_second": 0.672, + "step": 100 + }, + { + "epoch": 0.05757655064119341, + "grad_norm": 0.731851227511473, + "kl": 0.0, + "learning_rate": 9.598603839441536e-06, + "logits/chosen": -1513304832.0, + "logits/rejected": -1316382336.0, + "logps/chosen": -370.15204678362574, + "logps/rejected": -375.89261744966444, + "loss": 0.4897, + "rewards/chosen": -0.49762426900584794, + "rewards/margins": 0.17886482495388362, + "rewards/rejected": -0.6764890939597316, + "step": 110 + }, + { + "epoch": 0.06281078251766553, + "grad_norm": 0.8844414419531235, + "kl": 0.0, + "learning_rate": 1.0471204188481676e-05, + "logits/chosen": -1446615424.0, + "logits/rejected": -1554409088.0, + "logps/chosen": -438.3061889250814, + "logps/rejected": -390.4384384384384, + "loss": 0.4743, + "rewards/chosen": -0.9323086319218241, + "rewards/margins": 0.2139000767868846, + "rewards/rejected": -1.1462087087087087, + "step": 120 + }, + { + "epoch": 0.06804501439413765, + "grad_norm": 0.5762003679726659, + "kl": 0.0, + "learning_rate": 1.1343804537521815e-05, + "logits/chosen": -1495479040.0, + "logits/rejected": -1531759872.0, + "logps/chosen": -415.79421221864953, + "logps/rejected": -433.3130699088146, + "loss": 0.4632, + "rewards/chosen": -0.9915594855305466, + "rewards/margins": 0.35456513453024374, + "rewards/rejected": -1.3461246200607904, + "step": 130 + }, + { + "epoch": 0.07327924627060979, + "grad_norm": 0.7230629257517631, + "kl": 0.0, + "learning_rate": 1.2216404886561955e-05, + "logits/chosen": -1567201664.0, + "logits/rejected": -1556715904.0, + "logps/chosen": -462.81481481481484, + "logps/rejected": -487.49367088607596, + "loss": 0.4764, + "rewards/chosen": -1.3439429012345678, + "rewards/margins": 0.37441152914517906, + "rewards/rejected": -1.7183544303797469, + "step": 140 + }, + { + "epoch": 0.07851347814708191, + "grad_norm": 0.6568103530741317, + "kl": 0.0, + "learning_rate": 1.3089005235602096e-05, + "logits/chosen": -1576638848.0, + "logits/rejected": -1556506240.0, + "logps/chosen": -454.82866043613706, + "logps/rejected": -408.9780564263323, + "loss": 0.4709, + "rewards/chosen": -0.938376168224299, + "rewards/margins": 0.3631520449418453, + "rewards/rejected": -1.3015282131661443, + "step": 150 + }, + { + "epoch": 0.08374771002355404, + "grad_norm": 0.7681435781467648, + "kl": 0.0, + "learning_rate": 1.3961605584642234e-05, + "logits/chosen": -1524839168.0, + "logits/rejected": -1607886464.0, + "logps/chosen": -391.9225806451613, + "logps/rejected": -412.3151515151515, + "loss": 0.462, + "rewards/chosen": -0.7109879032258064, + "rewards/margins": 0.56287573313783, + "rewards/rejected": -1.2738636363636364, + "step": 160 + }, + { + "epoch": 0.08898194190002617, + "grad_norm": 0.6470606006807348, + "kl": 0.0, + "learning_rate": 1.4834205933682374e-05, + "logits/chosen": -1812358784.0, + "logits/rejected": -1592996608.0, + "logps/chosen": -403.7957957957958, + "logps/rejected": -453.628664495114, + "loss": 0.4832, + "rewards/chosen": -0.9804804804804805, + "rewards/margins": 0.4303501384120276, + "rewards/rejected": -1.4108306188925082, + "step": 170 + }, + { + "epoch": 0.0942161737764983, + "grad_norm": 0.6062523081778814, + "kl": 0.0, + "learning_rate": 1.5706806282722515e-05, + "logits/chosen": -1425224448.0, + "logits/rejected": -1423337088.0, + "logps/chosen": -353.8, + "logps/rejected": -389.05, + "loss": 0.4639, + "rewards/chosen": -0.6763671875, + "rewards/margins": 0.5416015625, + "rewards/rejected": -1.21796875, + "step": 180 + }, + { + "epoch": 0.09945040565297043, + "grad_norm": 0.5853123473869152, + "kl": 0.0, + "learning_rate": 1.6579406631762653e-05, + "logits/chosen": -1505126016.0, + "logits/rejected": -1503028864.0, + "logps/chosen": -414.8, + "logps/rejected": -472.75, + "loss": 0.4585, + "rewards/chosen": -0.96484375, + "rewards/margins": 0.764453125, + "rewards/rejected": -1.729296875, + "step": 190 + }, + { + "epoch": 0.10468463752944256, + "grad_norm": 0.5544132434798503, + "kl": 0.0, + "learning_rate": 1.7452006980802792e-05, + "logits/chosen": -1624034560.0, + "logits/rejected": -1637456256.0, + "logps/chosen": -481.99376947040497, + "logps/rejected": -547.7115987460814, + "loss": 0.4556, + "rewards/chosen": -1.3319704049844237, + "rewards/margins": 1.131783513510874, + "rewards/rejected": -2.4637539184952977, + "step": 200 + }, + { + "epoch": 0.10468463752944256, + "eval_kl": 0.0, + "eval_logits/chosen": -3518155520.0, + "eval_logits/rejected": -3502177280.0, + "eval_logps/chosen": -452.2592775853538, + "eval_logps/rejected": -491.090999502735, + "eval_loss": 0.46578124165534973, + "eval_rewards/chosen": -1.2023132112815438, + "eval_rewards/margins": 0.757905585337054, + "eval_rewards/rejected": -1.9602187966185978, + "eval_runtime": 93.4704, + "eval_samples_per_second": 42.794, + "eval_steps_per_second": 0.674, + "step": 200 + }, + { + "epoch": 0.10991886940591468, + "grad_norm": 0.5633400221663752, + "kl": 0.0, + "learning_rate": 1.8324607329842934e-05, + "logits/chosen": -1558393600.0, + "logits/rejected": -1515401984.0, + "logps/chosen": -446.51851851851853, + "logps/rejected": -465.82278481012656, + "loss": 0.4702, + "rewards/chosen": -1.308641975308642, + "rewards/margins": 0.7303216322862949, + "rewards/rejected": -2.038963607594937, + "step": 210 + }, + { + "epoch": 0.11515310128238682, + "grad_norm": 0.8284904491112901, + "kl": 0.0, + "learning_rate": 1.9197207678883072e-05, + "logits/chosen": -1535954176.0, + "logits/rejected": -1548537088.0, + "logps/chosen": -430.65, + "logps/rejected": -465.8, + "loss": 0.4699, + "rewards/chosen": -1.17978515625, + "rewards/margins": 0.42138671875, + "rewards/rejected": -1.601171875, + "step": 220 + }, + { + "epoch": 0.12038733315885894, + "grad_norm": 0.5556844000663201, + "kl": 0.0, + "learning_rate": 2.006980802792321e-05, + "logits/chosen": -1586914944.0, + "logits/rejected": -1648780928.0, + "logps/chosen": -457.20127795527156, + "logps/rejected": -569.6391437308869, + "loss": 0.4462, + "rewards/chosen": -1.5564097444089458, + "rewards/margins": 1.1716101332668953, + "rewards/rejected": -2.728019877675841, + "step": 230 + }, + { + "epoch": 0.12562156503533106, + "grad_norm": 0.6855447682569064, + "kl": 0.0, + "learning_rate": 2.0942408376963353e-05, + "logits/chosen": -1610193280.0, + "logits/rejected": -1664719232.0, + "logps/chosen": -527.1847133757962, + "logps/rejected": -572.2699386503067, + "loss": 0.4581, + "rewards/chosen": -1.6970541401273886, + "rewards/margins": 1.0817035285842678, + "rewards/rejected": -2.7787576687116564, + "step": 240 + }, + { + "epoch": 0.13085579691180318, + "grad_norm": 0.5884560345985042, + "kl": 0.0, + "learning_rate": 2.181500872600349e-05, + "logits/chosen": -1686949120.0, + "logits/rejected": -1573283456.0, + "logps/chosen": -567.063063063063, + "logps/rejected": -599.4527687296417, + "loss": 0.4696, + "rewards/chosen": -2.0405405405405403, + "rewards/margins": 1.0877167884496877, + "rewards/rejected": -3.128257328990228, + "step": 250 + }, + { + "epoch": 0.1360900287882753, + "grad_norm": 0.6800356341208197, + "kl": 0.0, + "learning_rate": 2.268760907504363e-05, + "logits/chosen": -1592157824.0, + "logits/rejected": -1575800064.0, + "logps/chosen": -456.7725856697819, + "logps/rejected": -522.3322884012539, + "loss": 0.4699, + "rewards/chosen": -1.407904984423676, + "rewards/margins": 0.6860409403412142, + "rewards/rejected": -2.09394592476489, + "step": 260 + }, + { + "epoch": 0.14132426066474746, + "grad_norm": 0.605890168743028, + "kl": 0.0, + "learning_rate": 2.3560209424083772e-05, + "logits/chosen": -1526307200.0, + "logits/rejected": -1540358144.0, + "logps/chosen": -449.85, + "logps/rejected": -448.7, + "loss": 0.4709, + "rewards/chosen": -1.132080078125, + "rewards/margins": 0.3761230468750001, + "rewards/rejected": -1.508203125, + "step": 270 + }, + { + "epoch": 0.14655849254121958, + "grad_norm": 0.5866481803635993, + "kl": 0.0, + "learning_rate": 2.443280977312391e-05, + "logits/chosen": -1631374592.0, + "logits/rejected": -1556925696.0, + "logps/chosen": -431.41104294478527, + "logps/rejected": -466.0891719745223, + "loss": 0.4686, + "rewards/chosen": -1.1589340490797546, + "rewards/margins": 0.5214003458247041, + "rewards/rejected": -1.6803343949044587, + "step": 280 + }, + { + "epoch": 0.1517927244176917, + "grad_norm": 0.6838595939783, + "kl": 0.0, + "learning_rate": 2.5305410122164053e-05, + "logits/chosen": -1706242816.0, + "logits/rejected": -1674785536.0, + "logps/chosen": -463.2049689440994, + "logps/rejected": -457.40880503144655, + "loss": 0.4689, + "rewards/chosen": -1.1663431677018634, + "rewards/margins": 0.46455305871323094, + "rewards/rejected": -1.6308962264150944, + "step": 290 + }, + { + "epoch": 0.15702695629416383, + "grad_norm": 0.657362779444818, + "kl": 0.0, + "learning_rate": 2.617801047120419e-05, + "logits/chosen": -1563007360.0, + "logits/rejected": -1538680448.0, + "logps/chosen": -410.1183800623053, + "logps/rejected": -425.8808777429467, + "loss": 0.4658, + "rewards/chosen": -1.044489875389408, + "rewards/margins": 0.5211449208488363, + "rewards/rejected": -1.5656347962382444, + "step": 300 + }, + { + "epoch": 0.15702695629416383, + "eval_kl": 0.0, + "eval_logits/chosen": -3454242304.0, + "eval_logits/rejected": -3428943360.0, + "eval_logps/chosen": -416.22167243938645, + "eval_logps/rejected": -439.7414221780209, + "eval_loss": 0.46299219131469727, + "eval_rewards/chosen": -0.8429614052449282, + "eval_rewards/margins": 0.6032096539296118, + "eval_rewards/rejected": -1.44617105917454, + "eval_runtime": 93.4505, + "eval_samples_per_second": 42.803, + "eval_steps_per_second": 0.674, + "step": 300 + }, + { + "epoch": 0.16226118817063595, + "grad_norm": 0.5595274955541248, + "kl": 0.0, + "learning_rate": 2.7050610820244333e-05, + "logits/chosen": -1672269056.0, + "logits/rejected": -1616065280.0, + "logps/chosen": -503.0617283950617, + "logps/rejected": -515.746835443038, + "loss": 0.4648, + "rewards/chosen": -1.6059992283950617, + "rewards/margins": 0.6509232399593685, + "rewards/rejected": -2.2569224683544302, + "step": 310 + }, + { + "epoch": 0.16749542004710807, + "grad_norm": 0.5451493353326954, + "kl": 0.0, + "learning_rate": 2.792321116928447e-05, + "logits/chosen": -1698273664.0, + "logits/rejected": -1834588544.0, + "logps/chosen": -461.6848874598071, + "logps/rejected": -551.0030395136778, + "loss": 0.4571, + "rewards/chosen": -1.5639067524115755, + "rewards/margins": 0.9851054056431359, + "rewards/rejected": -2.5490121580547114, + "step": 320 + }, + { + "epoch": 0.17272965192358022, + "grad_norm": 0.5540765788797768, + "kl": 0.0, + "learning_rate": 2.879581151832461e-05, + "logits/chosen": -1663880448.0, + "logits/rejected": -1705194240.0, + "logps/chosen": -472.02492211838006, + "logps/rejected": -502.77115987460814, + "loss": 0.4663, + "rewards/chosen": -1.4273753894080996, + "rewards/margins": 0.7626716325354739, + "rewards/rejected": -2.1900470219435735, + "step": 330 + }, + { + "epoch": 0.17796388380005235, + "grad_norm": 0.6702465977915886, + "kl": 0.0, + "learning_rate": 2.966841186736475e-05, + "logits/chosen": -1732247552.0, + "logits/rejected": -1823264000.0, + "logps/chosen": -472.2547770700637, + "logps/rejected": -495.0184049079755, + "loss": 0.4653, + "rewards/chosen": -1.3769904458598725, + "rewards/margins": 0.5137304130358331, + "rewards/rejected": -1.8907208588957056, + "step": 340 + }, + { + "epoch": 0.18319811567652447, + "grad_norm": 0.5946325216041444, + "kl": 0.0, + "learning_rate": 3.054101221640489e-05, + "logits/chosen": -1580413696.0, + "logits/rejected": -1975517184.0, + "logps/chosen": -502.18815331010455, + "logps/rejected": -546.356940509915, + "loss": 0.4393, + "rewards/chosen": -1.6064895470383276, + "rewards/margins": 0.7727597447463748, + "rewards/rejected": -2.3792492917847023, + "step": 350 + }, + { + "epoch": 0.1884323475529966, + "grad_norm": 0.7184388546396678, + "kl": 0.0, + "learning_rate": 3.141361256544503e-05, + "logits/chosen": -1819279360.0, + "logits/rejected": -1610612736.0, + "logps/chosen": -578.5060240963855, + "logps/rejected": -636.0519480519481, + "loss": 0.4787, + "rewards/chosen": -2.4390060240963853, + "rewards/margins": 1.0800686512282898, + "rewards/rejected": -3.519074675324675, + "step": 360 + }, + { + "epoch": 0.19366657942946872, + "grad_norm": 0.5963756654926242, + "kl": 0.0, + "learning_rate": 3.228621291448517e-05, + "logits/chosen": -1832910848.0, + "logits/rejected": -1772932352.0, + "logps/chosen": -433.1636363636364, + "logps/rejected": -440.46451612903223, + "loss": 0.4755, + "rewards/chosen": -1.0137310606060606, + "rewards/margins": 0.49554313294232655, + "rewards/rejected": -1.509274193548387, + "step": 370 + }, + { + "epoch": 0.19890081130594087, + "grad_norm": 0.7653020375478802, + "kl": 0.0, + "learning_rate": 3.3158813263525307e-05, + "logits/chosen": -1737280768.0, + "logits/rejected": -1738958464.0, + "logps/chosen": -419.92452830188677, + "logps/rejected": -482.18633540372673, + "loss": 0.4546, + "rewards/chosen": -1.0692315251572326, + "rewards/margins": 0.561785555588109, + "rewards/rejected": -1.6310170807453417, + "step": 380 + }, + { + "epoch": 0.204135043182413, + "grad_norm": 0.5470598105886728, + "kl": 0.0, + "learning_rate": 3.403141361256545e-05, + "logits/chosen": -1615645952.0, + "logits/rejected": -1695757056.0, + "logps/chosen": -523.5527156549521, + "logps/rejected": -659.8654434250765, + "loss": 0.4491, + "rewards/chosen": -1.9094448881789137, + "rewards/margins": 1.6620382922492207, + "rewards/rejected": -3.5714831804281344, + "step": 390 + }, + { + "epoch": 0.2093692750588851, + "grad_norm": 0.4397255692980879, + "kl": 0.0, + "learning_rate": 3.4904013961605584e-05, + "logits/chosen": -1483944704.0, + "logits/rejected": -1774610048.0, + "logps/chosen": -507.66101694915255, + "logps/rejected": -540.9391304347826, + "loss": 0.4543, + "rewards/chosen": -1.9313559322033897, + "rewards/margins": 0.41791943011545096, + "rewards/rejected": -2.3492753623188407, + "step": 400 + }, + { + "epoch": 0.2093692750588851, + "eval_kl": 0.0, + "eval_logits/chosen": -3808428032.0, + "eval_logits/rejected": -3728536576.0, + "eval_logps/chosen": -562.5413161801089, + "eval_logps/rejected": -589.5574341123819, + "eval_loss": 0.47276562452316284, + "eval_rewards/chosen": -2.3096239485403265, + "eval_rewards/margins": 0.6356768968102453, + "eval_rewards/rejected": -2.945300845350572, + "eval_runtime": 93.4384, + "eval_samples_per_second": 42.809, + "eval_steps_per_second": 0.674, + "step": 400 + }, + { + "epoch": 0.21460350693535724, + "grad_norm": 0.6281588994781631, + "kl": 0.0, + "learning_rate": 3.5776614310645726e-05, + "logits/chosen": -1783837440.0, + "logits/rejected": -1711066368.0, + "logps/chosen": -513.4723926380368, + "logps/rejected": -534.624203821656, + "loss": 0.4732, + "rewards/chosen": -1.7965874233128833, + "rewards/margins": 0.6082692645852059, + "rewards/rejected": -2.4048566878980893, + "step": 410 + }, + { + "epoch": 0.21983773881182936, + "grad_norm": 0.6464496703356261, + "kl": 0.0, + "learning_rate": 3.664921465968587e-05, + "logits/chosen": -1744830464.0, + "logits/rejected": -1841718912.0, + "logps/chosen": -475.9, + "logps/rejected": -487.8, + "loss": 0.4621, + "rewards/chosen": -1.2525390625, + "rewards/margins": 0.6818359375, + "rewards/rejected": -1.934375, + "step": 420 + }, + { + "epoch": 0.22507197068830148, + "grad_norm": 0.5760537357814977, + "kl": 0.0, + "learning_rate": 3.752181500872601e-05, + "logits/chosen": -1720084096.0, + "logits/rejected": -1644586624.0, + "logps/chosen": -572.1212121212121, + "logps/rejected": -579.0967741935484, + "loss": 0.4851, + "rewards/chosen": -2.3323863636363638, + "rewards/margins": 0.5716458944281522, + "rewards/rejected": -2.904032258064516, + "step": 430 + }, + { + "epoch": 0.23030620256477363, + "grad_norm": 0.46548421992086714, + "kl": 0.0, + "learning_rate": 3.8394415357766145e-05, + "logits/chosen": -1524839168.0, + "logits/rejected": -1457101184.0, + "logps/chosen": -469.3333333333333, + "logps/rejected": -510.7848101265823, + "loss": 0.4737, + "rewards/chosen": -1.548707561728395, + "rewards/margins": 0.6397813623222379, + "rewards/rejected": -2.188488924050633, + "step": 440 + }, + { + "epoch": 0.23554043444124576, + "grad_norm": 0.632322529320257, + "kl": 0.0, + "learning_rate": 3.926701570680629e-05, + "logits/chosen": -1409705600.0, + "logits/rejected": -1382442624.0, + "logps/chosen": -430.2278481012658, + "logps/rejected": -464.98765432098764, + "loss": 0.4558, + "rewards/chosen": -0.9287974683544303, + "rewards/margins": 0.8775296921393968, + "rewards/rejected": -1.8063271604938271, + "step": 450 + }, + { + "epoch": 0.24077466631771788, + "grad_norm": 0.863998535094776, + "kl": 0.0, + "learning_rate": 4.013961605584642e-05, + "logits/chosen": -1358744832.0, + "logits/rejected": -1437807360.0, + "logps/chosen": -477.8877887788779, + "logps/rejected": -527.8575667655787, + "loss": 0.4473, + "rewards/chosen": -1.2968234323432344, + "rewards/margins": 1.076692888131543, + "rewards/rejected": -2.3735163204747773, + "step": 460 + }, + { + "epoch": 0.24600889819419, + "grad_norm": 0.5718990529028527, + "kl": 0.0, + "learning_rate": 4.1012216404886564e-05, + "logits/chosen": -1557554816.0, + "logits/rejected": -1502399744.0, + "logps/chosen": -697.2, + "logps/rejected": -779.5, + "loss": 0.476, + "rewards/chosen": -2.8913015365600585, + "rewards/margins": 2.1313547134399413, + "rewards/rejected": -5.02265625, + "step": 470 + }, + { + "epoch": 0.2512431300706621, + "grad_norm": 0.4910531304249267, + "kl": 0.0, + "learning_rate": 4.1884816753926706e-05, + "logits/chosen": -1737700096.0, + "logits/rejected": -1578945792.0, + "logps/chosen": -573.3809523809524, + "logps/rejected": -857.7894736842105, + "loss": 0.4705, + "rewards/chosen": -2.5364583333333335, + "rewards/margins": 3.074561403508772, + "rewards/rejected": -5.611019736842105, + "step": 480 + }, + { + "epoch": 0.2564773619471343, + "grad_norm": 0.623179288487328, + "kl": 0.0, + "learning_rate": 4.275741710296685e-05, + "logits/chosen": -1720922880.0, + "logits/rejected": -1587963520.0, + "logps/chosen": -648.072072072072, + "logps/rejected": -719.4267100977198, + "loss": 0.4775, + "rewards/chosen": -3.0417605105105103, + "rewards/margins": 1.1585652223885123, + "rewards/rejected": -4.200325732899023, + "step": 490 + }, + { + "epoch": 0.26171159382360637, + "grad_norm": 0.5699561570689229, + "kl": 0.0, + "learning_rate": 4.363001745200698e-05, + "logits/chosen": -1380974592.0, + "logits/rejected": -1450390272.0, + "logps/chosen": -548.4967320261438, + "logps/rejected": -635.4011976047905, + "loss": 0.4445, + "rewards/chosen": -2.0408496732026142, + "rewards/margins": 1.4277132010488827, + "rewards/rejected": -3.468562874251497, + "step": 500 + }, + { + "epoch": 0.26171159382360637, + "eval_kl": 0.0, + "eval_logits/chosen": -3128418048.0, + "eval_logits/rejected": -2977955840.0, + "eval_logps/chosen": -534.3255813953489, + "eval_logps/rejected": -585.3565390353058, + "eval_loss": 0.4657031297683716, + "eval_rewards/chosen": -2.027090549233053, + "eval_rewards/margins": 0.8744509723979763, + "eval_rewards/rejected": -2.9015415216310294, + "eval_runtime": 93.4439, + "eval_samples_per_second": 42.806, + "eval_steps_per_second": 0.674, + "step": 500 + }, + { + "epoch": 0.2669458257000785, + "grad_norm": 0.5227183067644776, + "kl": 0.0, + "learning_rate": 4.4502617801047125e-05, + "logits/chosen": -1514143744.0, + "logits/rejected": -1418094208.0, + "logps/chosen": -464.4938271604938, + "logps/rejected": -544.1012658227849, + "loss": 0.4525, + "rewards/chosen": -1.46875, + "rewards/margins": 1.2005537974683542, + "rewards/rejected": -2.6693037974683542, + "step": 510 + }, + { + "epoch": 0.2721800575765506, + "grad_norm": 0.7979790881328824, + "kl": 0.0, + "learning_rate": 4.537521815008726e-05, + "logits/chosen": -1408447232.0, + "logits/rejected": -1350356224.0, + "logps/chosen": -539.2445820433436, + "logps/rejected": -590.9400630914827, + "loss": 0.4654, + "rewards/chosen": -1.9049922600619196, + "rewards/margins": 1.1391717777929702, + "rewards/rejected": -3.0441640378548898, + "step": 520 + }, + { + "epoch": 0.27741428945302277, + "grad_norm": 0.5782821069877753, + "kl": 0.0, + "learning_rate": 4.62478184991274e-05, + "logits/chosen": -1633471744.0, + "logits/rejected": -1529452928.0, + "logps/chosen": -447.219512195122, + "logps/rejected": -482.56410256410254, + "loss": 0.4671, + "rewards/chosen": -1.1834984756097562, + "rewards/margins": 0.7592098577235771, + "rewards/rejected": -1.9427083333333333, + "step": 530 + }, + { + "epoch": 0.2826485213294949, + "grad_norm": 0.5557000802623413, + "kl": 0.0, + "learning_rate": 4.7120418848167544e-05, + "logits/chosen": -1497576192.0, + "logits/rejected": -1404252928.0, + "logps/chosen": -536.9107692307692, + "logps/rejected": -561.168253968254, + "loss": 0.4802, + "rewards/chosen": -2.223846153846154, + "rewards/margins": 0.345995115995116, + "rewards/rejected": -2.56984126984127, + "step": 540 + }, + { + "epoch": 0.287882753205967, + "grad_norm": 0.5326154463719267, + "kl": 0.0, + "learning_rate": 4.7993019197207686e-05, + "logits/chosen": -1470732672.0, + "logits/rejected": -1550214784.0, + "logps/chosen": -600.2077922077922, + "logps/rejected": -632.5783132530121, + "loss": 0.4592, + "rewards/chosen": -2.6318993506493507, + "rewards/margins": 0.8703596854952278, + "rewards/rejected": -3.5022590361445785, + "step": 550 + }, + { + "epoch": 0.29311698508243916, + "grad_norm": 1.3655207310078739, + "kl": 0.0, + "learning_rate": 4.886561954624782e-05, + "logits/chosen": -1554828544.0, + "logits/rejected": -1595513216.0, + "logps/chosen": -491.62700964630227, + "logps/rejected": -535.2462006079028, + "loss": 0.4513, + "rewards/chosen": -1.6141479099678457, + "rewards/margins": 0.9189068012783548, + "rewards/rejected": -2.5330547112462005, + "step": 560 + }, + { + "epoch": 0.29835121695891126, + "grad_norm": 0.647903484038066, + "kl": 0.0, + "learning_rate": 4.973821989528796e-05, + "logits/chosen": -1670801024.0, + "logits/rejected": -1741894400.0, + "logps/chosen": -527.0709677419355, + "logps/rejected": -641.8424242424243, + "loss": 0.4469, + "rewards/chosen": -2.0377016129032257, + "rewards/margins": 1.4562377810361684, + "rewards/rejected": -3.493939393939394, + "step": 570 + }, + { + "epoch": 0.3035854488353834, + "grad_norm": 0.46006692434550134, + "kl": 0.0, + "learning_rate": 4.999977269399062e-05, + "logits/chosen": -1725956096.0, + "logits/rejected": -1775029504.0, + "logps/chosen": -619.1746031746031, + "logps/rejected": -694.5476923076923, + "loss": 0.4667, + "rewards/chosen": -2.857738095238095, + "rewards/margins": 1.0472619047619047, + "rewards/rejected": -3.905, + "step": 580 + }, + { + "epoch": 0.30881968071185556, + "grad_norm": 0.505652615532779, + "kl": 0.0, + "learning_rate": 4.9998659368385024e-05, + "logits/chosen": -1675204992.0, + "logits/rejected": -1821586176.0, + "logps/chosen": -670.7368421052631, + "logps/rejected": -852.5714285714286, + "loss": 0.4549, + "rewards/chosen": -3.2960526315789473, + "rewards/margins": 2.3029057017543857, + "rewards/rejected": -5.598958333333333, + "step": 590 + }, + { + "epoch": 0.31405391258832765, + "grad_norm": 0.47080891304692235, + "kl": 0.0, + "learning_rate": 4.999661831436499e-05, + "logits/chosen": -1803970176.0, + "logits/rejected": -1722600704.0, + "logps/chosen": -609.3700305810397, + "logps/rejected": -651.7571884984026, + "loss": 0.4654, + "rewards/chosen": -2.3581804281345566, + "rewards/margins": 1.0863083897568173, + "rewards/rejected": -3.444488817891374, + "step": 600 + }, + { + "epoch": 0.31405391258832765, + "eval_kl": 0.0, + "eval_logits/chosen": -3596449280.0, + "eval_logits/rejected": -3444389120.0, + "eval_logps/chosen": -529.132112815438, + "eval_logps/rejected": -582.269517652909, + "eval_loss": 0.4657500088214874, + "eval_rewards/chosen": -1.9722909450766948, + "eval_rewards/margins": 0.896306767504111, + "eval_rewards/rejected": -2.8685977125808058, + "eval_runtime": 93.4365, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 600 + }, + { + "epoch": 0.3192881444647998, + "grad_norm": 0.5339767024170512, + "kl": 0.0, + "learning_rate": 4.9993649607676306e-05, + "logits/chosen": -1825361152.0, + "logits/rejected": -1622776192.0, + "logps/chosen": -454.8433734939759, + "logps/rejected": -541.922077922078, + "loss": 0.4677, + "rewards/chosen": -1.3794239457831325, + "rewards/margins": 0.8608357944766076, + "rewards/rejected": -2.24025974025974, + "step": 610 + }, + { + "epoch": 0.3245223763412719, + "grad_norm": 0.5213452170759816, + "kl": 0.0, + "learning_rate": 4.998975335849104e-05, + "logits/chosen": -1747347072.0, + "logits/rejected": -2048078592.0, + "logps/chosen": -562.7034482758621, + "logps/rejected": -542.5371428571428, + "loss": 0.451, + "rewards/chosen": -2.232112068965517, + "rewards/margins": 0.32038793103448304, + "rewards/rejected": -2.5525, + "step": 620 + }, + { + "epoch": 0.32975660821774405, + "grad_norm": 0.4969950941570501, + "kl": 0.0, + "learning_rate": 4.998492971140339e-05, + "logits/chosen": -1885759104.0, + "logits/rejected": -1890792192.0, + "logps/chosen": -744.7923322683706, + "logps/rejected": -789.9204892966361, + "loss": 0.4539, + "rewards/chosen": -4.179313099041534, + "rewards/margins": 0.8566196226710048, + "rewards/rejected": -5.0359327217125385, + "step": 630 + }, + { + "epoch": 0.33499084009421615, + "grad_norm": 0.5451050219577263, + "kl": 0.0, + "learning_rate": 4.997917884542433e-05, + "logits/chosen": -1594674432.0, + "logits/rejected": -1598449280.0, + "logps/chosen": -773.7539432176657, + "logps/rejected": -728.1733746130031, + "loss": 0.4474, + "rewards/chosen": -4.292981072555205, + "rewards/margins": 0.29331923704231855, + "rewards/rejected": -4.586300309597523, + "step": 640 + }, + { + "epoch": 0.3402250719706883, + "grad_norm": 0.5566572286394867, + "kl": 0.0, + "learning_rate": 4.997250097397497e-05, + "logits/chosen": -1689885056.0, + "logits/rejected": -1711695488.0, + "logps/chosen": -810.5, + "logps/rejected": -962.2, + "loss": 0.458, + "rewards/chosen": -4.571484375, + "rewards/margins": 1.9644531250000004, + "rewards/rejected": -6.5359375, + "step": 650 + }, + { + "epoch": 0.34545930384716045, + "grad_norm": 0.7822806261860601, + "kl": 0.0, + "learning_rate": 4.9964896344878655e-05, + "logits/chosen": -1595932672.0, + "logits/rejected": -1489816832.0, + "logps/chosen": -600.5696594427245, + "logps/rejected": -857.4384858044164, + "loss": 0.4588, + "rewards/chosen": -2.655185758513932, + "rewards/margins": 3.174072916564932, + "rewards/rejected": -5.829258675078864, + "step": 660 + }, + { + "epoch": 0.35069353572363254, + "grad_norm": 0.5814521512972838, + "kl": 0.0, + "learning_rate": 4.995636524035173e-05, + "logits/chosen": -1610193280.0, + "logits/rejected": -1613758464.0, + "logps/chosen": -492.0261437908497, + "logps/rejected": -678.9940119760479, + "loss": 0.4322, + "rewards/chosen": -1.9438316993464053, + "rewards/margins": 1.8391024323901217, + "rewards/rejected": -3.782934131736527, + "step": 670 + }, + { + "epoch": 0.3559277676001047, + "grad_norm": 0.9280019507247134, + "kl": 0.0, + "learning_rate": 4.9946907976993104e-05, + "logits/chosen": -1555667328.0, + "logits/rejected": -1712114944.0, + "logps/chosen": -446.7854785478548, + "logps/rejected": -587.0148367952522, + "loss": 0.4426, + "rewards/chosen": -1.2946575907590758, + "rewards/margins": 1.6893928543447818, + "rewards/rejected": -2.9840504451038576, + "step": 680 + }, + { + "epoch": 0.3611619994765768, + "grad_norm": 0.5627181242723076, + "kl": 0.0, + "learning_rate": 4.9936524905772464e-05, + "logits/chosen": -1844654848.0, + "logits/rejected": -1409076480.0, + "logps/chosen": -602.8156424581006, + "logps/rejected": -633.0780141843971, + "loss": 0.4862, + "rewards/chosen": -2.725034916201117, + "rewards/margins": 0.7687594100400177, + "rewards/rejected": -3.493794326241135, + "step": 690 + }, + { + "epoch": 0.36639623135304894, + "grad_norm": 0.7905331398546509, + "kl": 0.0, + "learning_rate": 4.992521641201728e-05, + "logits/chosen": -1645215744.0, + "logits/rejected": -1646054656.0, + "logps/chosen": -442.0253164556962, + "logps/rejected": -546.2716049382716, + "loss": 0.4517, + "rewards/chosen": -1.162381329113924, + "rewards/margins": 1.1979581770589154, + "rewards/rejected": -2.3603395061728394, + "step": 700 + }, + { + "epoch": 0.36639623135304894, + "eval_kl": 0.0, + "eval_logits/chosen": -3631068928.0, + "eval_logits/rejected": -3583666688.0, + "eval_logps/chosen": -452.750123701138, + "eval_logps/rejected": -492.39582297364495, + "eval_loss": 0.4689921736717224, + "eval_rewards/chosen": -1.2098589807026225, + "eval_rewards/margins": 0.7624184931909628, + "eval_rewards/rejected": -1.9722774738935853, + "eval_runtime": 93.452, + "eval_samples_per_second": 42.803, + "eval_steps_per_second": 0.674, + "step": 700 + }, + { + "epoch": 0.3716304632295211, + "grad_norm": 0.9400152465029169, + "kl": 0.0, + "learning_rate": 4.991298291539852e-05, + "logits/chosen": -1697434880.0, + "logits/rejected": -1574122240.0, + "logps/chosen": -561.9318885448916, + "logps/rejected": -566.813880126183, + "loss": 0.4746, + "rewards/chosen": -2.135642414860681, + "rewards/margins": 0.567038973151937, + "rewards/rejected": -2.702681388012618, + "step": 710 + }, + { + "epoch": 0.3768646951059932, + "grad_norm": 0.7974875506467937, + "kl": 0.0, + "learning_rate": 4.9899824869915e-05, + "logits/chosen": -1780482048.0, + "logits/rejected": -1550004992.0, + "logps/chosen": -558.2222222222222, + "logps/rejected": -649.9865771812081, + "loss": 0.4847, + "rewards/chosen": -2.4365862573099415, + "rewards/margins": 1.2320379037638842, + "rewards/rejected": -3.6686241610738257, + "step": 720 + }, + { + "epoch": 0.38209892698246534, + "grad_norm": 0.6046106097461434, + "kl": 0.0, + "learning_rate": 4.988574276387662e-05, + "logits/chosen": -1531759872.0, + "logits/rejected": -1609354496.0, + "logps/chosen": -733.272131147541, + "logps/rejected": -945.7671641791045, + "loss": 0.4492, + "rewards/chosen": -3.739344262295082, + "rewards/margins": 2.6472229018840223, + "rewards/rejected": -6.386567164179104, + "step": 730 + }, + { + "epoch": 0.38733315885893743, + "grad_norm": 0.7231031658869789, + "kl": 0.0, + "learning_rate": 4.9870737119886216e-05, + "logits/chosen": -1537212416.0, + "logits/rejected": -1564475392.0, + "logps/chosen": -622.6542056074767, + "logps/rejected": -822.4702194357367, + "loss": 0.4693, + "rewards/chosen": -2.7961448598130842, + "rewards/margins": 2.4091842937919314, + "rewards/rejected": -5.205329153605016, + "step": 740 + }, + { + "epoch": 0.3925673907354096, + "grad_norm": 0.6790396014146793, + "kl": 0.0, + "learning_rate": 4.985480849482012e-05, + "logits/chosen": -1664299776.0, + "logits/rejected": -1463392640.0, + "logps/chosen": -504.02373887240356, + "logps/rejected": -571.7755775577558, + "loss": 0.469, + "rewards/chosen": -1.559532640949555, + "rewards/margins": 1.1335366659811383, + "rewards/rejected": -2.6930693069306932, + "step": 750 + }, + { + "epoch": 0.39780162261188173, + "grad_norm": 0.7572345641049093, + "kl": 0.0, + "learning_rate": 4.983795747980757e-05, + "logits/chosen": -1453326336.0, + "logits/rejected": -1286812416.0, + "logps/chosen": -544.6706586826348, + "logps/rejected": -572.7581699346405, + "loss": 0.4779, + "rewards/chosen": -2.0988023952095807, + "rewards/margins": 0.9269328989080665, + "rewards/rejected": -3.025735294117647, + "step": 760 + }, + { + "epoch": 0.40303585448835383, + "grad_norm": 0.4563766273978695, + "kl": 0.0, + "learning_rate": 4.982018470020871e-05, + "logits/chosen": -1355808768.0, + "logits/rejected": -1307364608.0, + "logps/chosen": -589.6, + "logps/rejected": -710.9, + "loss": 0.4678, + "rewards/chosen": -2.70078125, + "rewards/margins": 1.475, + "rewards/rejected": -4.17578125, + "step": 770 + }, + { + "epoch": 0.408270086364826, + "grad_norm": 0.6737898724374982, + "kl": 0.0, + "learning_rate": 4.980149081559142e-05, + "logits/chosen": -1413060992.0, + "logits/rejected": -1393767168.0, + "logps/chosen": -566.2, + "logps/rejected": -598.1, + "loss": 0.4712, + "rewards/chosen": -2.268359375, + "rewards/margins": 0.7742187499999997, + "rewards/rejected": -3.042578125, + "step": 780 + }, + { + "epoch": 0.4135043182412981, + "grad_norm": 0.9789884036557728, + "kl": 0.0, + "learning_rate": 4.978187651970683e-05, + "logits/chosen": -1426692480.0, + "logits/rejected": -1546439936.0, + "logps/chosen": -710.5667752442997, + "logps/rejected": -951.5435435435436, + "loss": 0.4526, + "rewards/chosen": -3.73371335504886, + "rewards/margins": 2.6003707290352245, + "rewards/rejected": -6.334084084084084, + "step": 790 + }, + { + "epoch": 0.4187385501177702, + "grad_norm": 0.5400402364635294, + "kl": 0.0, + "learning_rate": 4.976134254046353e-05, + "logits/chosen": -1541616384.0, + "logits/rejected": -1428579968.0, + "logps/chosen": -742.3806646525679, + "logps/rejected": -925.9288025889967, + "loss": 0.4701, + "rewards/chosen": -4.000377643504532, + "rewards/margins": 2.705933036107119, + "rewards/rejected": -6.706310679611651, + "step": 800 + }, + { + "epoch": 0.4187385501177702, + "eval_kl": 0.0, + "eval_logits/chosen": -3269426688.0, + "eval_logits/rejected": -3207910144.0, + "eval_logps/chosen": -550.3809995051954, + "eval_logps/rejected": -656.9627051218299, + "eval_loss": 0.45948827266693115, + "eval_rewards/chosen": -2.187654626422563, + "eval_rewards/margins": 1.4299485560737075, + "eval_rewards/rejected": -3.6176031824962704, + "eval_runtime": 93.4665, + "eval_samples_per_second": 42.796, + "eval_steps_per_second": 0.674, + "step": 800 + }, + { + "epoch": 0.4239727819942423, + "grad_norm": 0.43312826690611683, + "kl": 0.0, + "learning_rate": 4.973988963990065e-05, + "logits/chosen": -1373215104.0, + "logits/rejected": -1463392640.0, + "logps/chosen": -502.2540716612378, + "logps/rejected": -595.0750750750751, + "loss": 0.4514, + "rewards/chosen": -1.7492874592833876, + "rewards/margins": 1.3389257539298258, + "rewards/rejected": -3.0882132132132134, + "step": 810 + }, + { + "epoch": 0.42920701387071447, + "grad_norm": 0.5639702493055443, + "kl": 0.0, + "learning_rate": 4.9717518614159496e-05, + "logits/chosen": -1284086144.0, + "logits/rejected": -1526307200.0, + "logps/chosen": -545.2179930795847, + "logps/rejected": -586.3931623931624, + "loss": 0.4329, + "rewards/chosen": -2.0311418685121105, + "rewards/margins": 0.7427185303482884, + "rewards/rejected": -2.773860398860399, + "step": 820 + }, + { + "epoch": 0.4344412457471866, + "grad_norm": 0.5479773194208483, + "kl": 0.0, + "learning_rate": 4.9694230293454034e-05, + "logits/chosen": -1395025536.0, + "logits/rejected": -1466957824.0, + "logps/chosen": -584.906148867314, + "logps/rejected": -660.2054380664653, + "loss": 0.4574, + "rewards/chosen": -2.1725323624595467, + "rewards/margins": 1.3969540423742903, + "rewards/rejected": -3.569486404833837, + "step": 830 + }, + { + "epoch": 0.4396754776236587, + "grad_norm": 1.2259598727664864, + "kl": 0.0, + "learning_rate": 4.9670025542040085e-05, + "logits/chosen": -1483105920.0, + "logits/rejected": -1462973184.0, + "logps/chosen": -602.2, + "logps/rejected": -715.8, + "loss": 0.4609, + "rewards/chosen": -2.4546875, + "rewards/margins": 1.9167968749999997, + "rewards/rejected": -4.371484375, + "step": 840 + }, + { + "epoch": 0.44490970950013087, + "grad_norm": 0.467639624489036, + "kl": 0.0, + "learning_rate": 4.964490525818325e-05, + "logits/chosen": -1678979840.0, + "logits/rejected": -1438226816.0, + "logps/chosen": -578.0645161290323, + "logps/rejected": -736.9632107023411, + "loss": 0.4765, + "rewards/chosen": -2.4149560117302054, + "rewards/margins": 1.5164821153600956, + "rewards/rejected": -3.931438127090301, + "step": 850 + }, + { + "epoch": 0.45014394137660296, + "grad_norm": 0.5160824174070019, + "kl": 0.0, + "learning_rate": 4.9618870374125554e-05, + "logits/chosen": -1446195968.0, + "logits/rejected": -1525258624.0, + "logps/chosen": -492.3154574132492, + "logps/rejected": -531.3188854489164, + "loss": 0.4725, + "rewards/chosen": -1.4590863248903292, + "rewards/margins": 0.8056195574626119, + "rewards/rejected": -2.264705882352941, + "step": 860 + }, + { + "epoch": 0.4553781732530751, + "grad_norm": 0.5034576976762464, + "kl": 0.0, + "learning_rate": 4.959192185605088e-05, + "logits/chosen": -1166016512.0, + "logits/rejected": -1171259392.0, + "logps/chosen": -500.0883280757098, + "logps/rejected": -518.4396284829721, + "loss": 0.4681, + "rewards/chosen": -1.6275630914826498, + "rewards/margins": 0.7458889209012511, + "rewards/rejected": -2.373452012383901, + "step": 870 + }, + { + "epoch": 0.46061240512954726, + "grad_norm": 0.529996647847534, + "kl": 0.0, + "learning_rate": 4.956406070404911e-05, + "logits/chosen": -1160144512.0, + "logits/rejected": -1177131392.0, + "logps/chosen": -497.6551724137931, + "logps/rejected": -610.8909657320872, + "loss": 0.4637, + "rewards/chosen": -1.6755485893416928, + "rewards/margins": 1.2753859900975595, + "rewards/rejected": -2.9509345794392523, + "step": 880 + }, + { + "epoch": 0.46584663700601936, + "grad_norm": 0.5312067368635373, + "kl": 0.0, + "learning_rate": 4.953528795207896e-05, + "logits/chosen": -1360212736.0, + "logits/rejected": -1401526656.0, + "logps/chosen": -615.0543130990416, + "logps/rejected": -729.7370030581039, + "loss": 0.4479, + "rewards/chosen": -2.7581869009584663, + "rewards/margins": 1.5912014782464263, + "rewards/rejected": -4.349388379204893, + "step": 890 + }, + { + "epoch": 0.4710808688824915, + "grad_norm": 0.5427173257752647, + "kl": 0.0, + "learning_rate": 4.9505604667929694e-05, + "logits/chosen": -1431096576.0, + "logits/rejected": -1388104960.0, + "logps/chosen": -644.1823708206687, + "logps/rejected": -853.8649517684887, + "loss": 0.4711, + "rewards/chosen": -2.942629179331307, + "rewards/margins": 2.707290434816603, + "rewards/rejected": -5.64991961414791, + "step": 900 + }, + { + "epoch": 0.4710808688824915, + "eval_kl": 0.0, + "eval_logits/chosen": -3022695168.0, + "eval_logits/rejected": -3008713984.0, + "eval_logps/chosen": -592.4354280059376, + "eval_logps/rejected": -758.0387866732968, + "eval_loss": 0.46263280510902405, + "eval_rewards/chosen": -2.6066303809995053, + "eval_rewards/margins": 2.022409897468918, + "eval_rewards/rejected": -4.629040278468423, + "eval_runtime": 93.4443, + "eval_samples_per_second": 42.806, + "eval_steps_per_second": 0.674, + "step": 900 + }, + { + "epoch": 0.4763151007589636, + "grad_norm": 0.5670490372210191, + "kl": 0.0, + "learning_rate": 4.947501195318143e-05, + "logits/chosen": -1286393088.0, + "logits/rejected": -1343016192.0, + "logps/chosen": -541.8770226537217, + "logps/rejected": -748.5679758308157, + "loss": 0.4443, + "rewards/chosen": -2.2771035598705502, + "rewards/margins": 2.1466124522140415, + "rewards/rejected": -4.423716012084592, + "step": 910 + }, + { + "epoch": 0.48154933263543576, + "grad_norm": 0.9235768559898764, + "kl": 0.0, + "learning_rate": 4.9443510943164264e-05, + "logits/chosen": -1360632192.0, + "logits/rejected": -1285134720.0, + "logps/chosen": -606.2545454545455, + "logps/rejected": -831.3806451612903, + "loss": 0.4702, + "rewards/chosen": -2.9428030303030304, + "rewards/margins": 2.578164711632453, + "rewards/rejected": -5.5209677419354835, + "step": 920 + }, + { + "epoch": 0.48678356451190785, + "grad_norm": 0.9874634734285241, + "kl": 0.0, + "learning_rate": 4.941110280691619e-05, + "logits/chosen": -1304218880.0, + "logits/rejected": -1258920320.0, + "logps/chosen": -656.0, + "logps/rejected": -765.0445859872611, + "loss": 0.4713, + "rewards/chosen": -3.4745015337423313, + "rewards/margins": 1.234893370716267, + "rewards/rejected": -4.709394904458598, + "step": 930 + }, + { + "epoch": 0.49201779638838, + "grad_norm": 1.2200993550123265, + "kl": 0.0, + "learning_rate": 4.937778874713963e-05, + "logits/chosen": -1235851648.0, + "logits/rejected": -1326658304.0, + "logps/chosen": -526.688524590164, + "logps/rejected": -661.3014925373134, + "loss": 0.4579, + "rewards/chosen": -1.9245901639344263, + "rewards/margins": 1.7929471494984095, + "rewards/rejected": -3.7175373134328358, + "step": 940 + }, + { + "epoch": 0.49725202826485215, + "grad_norm": 0.5242476580045962, + "kl": 0.0, + "learning_rate": 4.93435700001569e-05, + "logits/chosen": -1391460352.0, + "logits/rejected": -1220961920.0, + "logps/chosen": -505.7142857142857, + "logps/rejected": -512.5263157894736, + "loss": 0.4774, + "rewards/chosen": -1.6203497023809523, + "rewards/margins": 0.6912251331453636, + "rewards/rejected": -2.311574835526316, + "step": 950 + }, + { + "epoch": 0.5024862601413242, + "grad_norm": 0.4569538224283365, + "kl": 0.0, + "learning_rate": 4.930844783586425e-05, + "logits/chosen": -1389992320.0, + "logits/rejected": -1364616832.0, + "logps/chosen": -444.84923076923076, + "logps/rejected": -559.5428571428571, + "loss": 0.4583, + "rewards/chosen": -1.3008653846153846, + "rewards/margins": 1.5765155677655678, + "rewards/rejected": -2.8773809523809524, + "step": 960 + }, + { + "epoch": 0.5077204920177963, + "grad_norm": 0.43309063699281336, + "kl": 0.0, + "learning_rate": 4.927242355768477e-05, + "logits/chosen": -1397961472.0, + "logits/rejected": -1420610816.0, + "logps/chosen": -605.3587301587302, + "logps/rejected": -647.1876923076923, + "loss": 0.4748, + "rewards/chosen": -2.7121031746031745, + "rewards/margins": 0.9955891330891333, + "rewards/rejected": -3.707692307692308, + "step": 970 + }, + { + "epoch": 0.5129547238942685, + "grad_norm": 0.5990213278748463, + "kl": 0.0, + "learning_rate": 4.923549850251999e-05, + "logits/chosen": -1486461312.0, + "logits/rejected": -1447244544.0, + "logps/chosen": -569.9378881987577, + "logps/rejected": -693.1320754716982, + "loss": 0.4635, + "rewards/chosen": -2.345302795031056, + "rewards/margins": 1.5124016074846676, + "rewards/rejected": -3.8577044025157234, + "step": 980 + }, + { + "epoch": 0.5181889557707406, + "grad_norm": 0.7014728476500259, + "kl": 0.0, + "learning_rate": 4.9197674040700333e-05, + "logits/chosen": -1509320320.0, + "logits/rejected": -1295620480.0, + "logps/chosen": -584.8674698795181, + "logps/rejected": -670.3376623376623, + "loss": 0.4781, + "rewards/chosen": -2.560617469879518, + "rewards/margins": 1.3107299327178845, + "rewards/rejected": -3.8713474025974026, + "step": 990 + }, + { + "epoch": 0.5234231876472127, + "grad_norm": 0.5015363941820027, + "kl": 0.0, + "learning_rate": 4.915895157593418e-05, + "logits/chosen": -993420928.0, + "logits/rejected": -932393792.0, + "logps/chosen": -586.2360248447205, + "logps/rejected": -731.4716981132076, + "loss": 0.4534, + "rewards/chosen": -2.779114906832298, + "rewards/margins": 1.5565769170670727, + "rewards/rejected": -4.335691823899371, + "step": 1000 + }, + { + "epoch": 0.5234231876472127, + "eval_kl": 0.0, + "eval_logits/chosen": -2621240320.0, + "eval_logits/rejected": -2359728640.0, + "eval_logps/chosen": -548.955962394854, + "eval_logps/rejected": -656.3580308304327, + "eval_loss": 0.4593867063522339, + "eval_rewards/chosen": -2.1703983176645223, + "eval_rewards/margins": 1.4413620005851047, + "eval_rewards/rejected": -3.611760318249627, + "eval_runtime": 93.4283, + "eval_samples_per_second": 42.814, + "eval_steps_per_second": 0.674, + "step": 1000 + }, + { + "epoch": 0.528657419523685, + "grad_norm": 0.4651845185250115, + "kl": 0.0, + "learning_rate": 4.911933254525583e-05, + "logits/chosen": -1264582656.0, + "logits/rejected": -1249692928.0, + "logps/chosen": -512.2025316455696, + "logps/rejected": -624.9876543209876, + "loss": 0.4461, + "rewards/chosen": -1.9050632911392404, + "rewards/margins": 1.5521126347866856, + "rewards/rejected": -3.457175925925926, + "step": 1010 + }, + { + "epoch": 0.533891651400157, + "grad_norm": 0.8209032470774238, + "kl": 0.0, + "learning_rate": 4.907881841897216e-05, + "logits/chosen": -1299395328.0, + "logits/rejected": -1353921280.0, + "logps/chosen": -617.7215189873418, + "logps/rejected": -818.1728395061729, + "loss": 0.4533, + "rewards/chosen": -3.068631329113924, + "rewards/margins": 2.218405707923113, + "rewards/rejected": -5.287037037037037, + "step": 1020 + }, + { + "epoch": 0.5391258832766291, + "grad_norm": 0.8025767619285588, + "kl": 0.0, + "learning_rate": 4.903741070060802e-05, + "logits/chosen": -1437178240.0, + "logits/rejected": -1362939136.0, + "logps/chosen": -801.6534954407294, + "logps/rejected": -1041.491961414791, + "loss": 0.4819, + "rewards/chosen": -4.88031914893617, + "rewards/margins": 2.763169597044537, + "rewards/rejected": -7.643488745980707, + "step": 1030 + }, + { + "epoch": 0.5443601151531012, + "grad_norm": 0.5755210037603522, + "kl": 0.0, + "learning_rate": 4.899511092685051e-05, + "logits/chosen": -1300024576.0, + "logits/rejected": -1426273024.0, + "logps/chosen": -617.6300940438872, + "logps/rejected": -793.0218068535826, + "loss": 0.448, + "rewards/chosen": -2.7321708463949843, + "rewards/margins": 2.284184293791932, + "rewards/rejected": -5.016355140186916, + "step": 1040 + }, + { + "epoch": 0.5495943470295734, + "grad_norm": 0.6536447981970849, + "kl": 0.0, + "learning_rate": 4.895192066749189e-05, + "logits/chosen": -1290587392.0, + "logits/rejected": -1367972224.0, + "logps/chosen": -537.9872611464968, + "logps/rejected": -568.0981595092024, + "loss": 0.4598, + "rewards/chosen": -1.950437898089172, + "rewards/margins": 0.961659494548865, + "rewards/rejected": -2.912097392638037, + "step": 1050 + }, + { + "epoch": 0.5548285789060455, + "grad_norm": 0.7464676262058021, + "kl": 0.0, + "learning_rate": 4.890784152537134e-05, + "logits/chosen": -1363358464.0, + "logits/rejected": -1421030144.0, + "logps/chosen": -612.5859872611464, + "logps/rejected": -741.8895705521472, + "loss": 0.4466, + "rewards/chosen": -2.6575437898089174, + "rewards/margins": 1.7922261488413893, + "rewards/rejected": -4.449769938650307, + "step": 1060 + }, + { + "epoch": 0.5600628107825176, + "grad_norm": 0.5391370350173297, + "kl": 0.0, + "learning_rate": 4.886287513631548e-05, + "logits/chosen": -1447034880.0, + "logits/rejected": -1292684544.0, + "logps/chosen": -652.1904761904761, + "logps/rejected": -886.7368421052631, + "loss": 0.4765, + "rewards/chosen": -3.2641369047619047, + "rewards/margins": 2.480106516290727, + "rewards/rejected": -5.744243421052632, + "step": 1070 + }, + { + "epoch": 0.5652970426589898, + "grad_norm": 0.6642351807621497, + "kl": 0.0, + "learning_rate": 4.881702316907768e-05, + "logits/chosen": -1291216512.0, + "logits/rejected": -1240465408.0, + "logps/chosen": -657.3211009174312, + "logps/rejected": -793.0479233226837, + "loss": 0.4577, + "rewards/chosen": -3.2305045871559632, + "rewards/margins": 1.8625465310548992, + "rewards/rejected": -5.093051118210862, + "step": 1080 + }, + { + "epoch": 0.5705312745354619, + "grad_norm": 0.7711978299001714, + "kl": 0.0, + "learning_rate": 4.8770287325276116e-05, + "logits/chosen": -1279262720.0, + "logits/rejected": -1235641984.0, + "logps/chosen": -615.0219435736677, + "logps/rejected": -897.196261682243, + "loss": 0.4411, + "rewards/chosen": -2.970219435736677, + "rewards/margins": 2.7291575113038213, + "rewards/rejected": -5.6993769470404985, + "step": 1090 + }, + { + "epoch": 0.575765506411934, + "grad_norm": 0.4862124971831588, + "kl": 0.0, + "learning_rate": 4.872266933933058e-05, + "logits/chosen": -1231867136.0, + "logits/rejected": -1382442624.0, + "logps/chosen": -613.1178451178451, + "logps/rejected": -751.6734693877551, + "loss": 0.4428, + "rewards/chosen": -2.7518939393939394, + "rewards/margins": 1.8669107253290926, + "rewards/rejected": -4.618804664723032, + "step": 1100 + }, + { + "epoch": 0.575765506411934, + "eval_kl": 0.0, + "eval_logits/chosen": -2962243840.0, + "eval_logits/rejected": -2838944512.0, + "eval_logps/chosen": -587.938644235527, + "eval_logps/rejected": -759.0253605171556, + "eval_loss": 0.4583164155483246, + "eval_rewards/chosen": -2.5622216724393865, + "eval_rewards/margins": 2.079374548346292, + "eval_rewards/rejected": -4.6415962207856785, + "eval_runtime": 93.4298, + "eval_samples_per_second": 42.813, + "eval_steps_per_second": 0.674, + "step": 1100 + }, + { + "epoch": 0.5809997382884062, + "grad_norm": 0.5321270952633724, + "kl": 0.0, + "learning_rate": 4.86741709783982e-05, + "logits/chosen": -1324980608.0, + "logits/rejected": -1339660672.0, + "logps/chosen": -612.2322580645161, + "logps/rejected": -860.4121212121212, + "loss": 0.442, + "rewards/chosen": -2.820161290322581, + "rewards/margins": 2.7540811339198434, + "rewards/rejected": -5.574242424242424, + "step": 1110 + }, + { + "epoch": 0.5862339701648783, + "grad_norm": 0.5390449896110832, + "kl": 0.0, + "learning_rate": 4.86247940423078e-05, + "logits/chosen": -1407608448.0, + "logits/rejected": -1356228224.0, + "logps/chosen": -845.8404907975461, + "logps/rejected": -1190.216560509554, + "loss": 0.4735, + "rewards/chosen": -5.088957055214724, + "rewards/margins": 3.746234027587823, + "rewards/rejected": -8.835191082802547, + "step": 1120 + }, + { + "epoch": 0.5914682020413504, + "grad_norm": 1.013994676787218, + "kl": 0.0, + "learning_rate": 4.857454036349308e-05, + "logits/chosen": -1230399104.0, + "logits/rejected": -1360003072.0, + "logps/chosen": -815.6862745098039, + "logps/rejected": -1295.5209580838323, + "loss": 0.4508, + "rewards/chosen": -5.069444444444445, + "rewards/margins": 4.926813040585495, + "rewards/rejected": -9.99625748502994, + "step": 1130 + }, + { + "epoch": 0.5967024339178225, + "grad_norm": 0.4959809334818276, + "kl": 0.0, + "learning_rate": 4.8523411806924704e-05, + "logits/chosen": -1222429952.0, + "logits/rejected": -1233964288.0, + "logps/chosen": -732.4668769716088, + "logps/rejected": -1033.1145510835913, + "loss": 0.4554, + "rewards/chosen": -4.1857255520504735, + "rewards/margins": 3.266286831850455, + "rewards/rejected": -7.452012383900929, + "step": 1140 + }, + { + "epoch": 0.6019366657942947, + "grad_norm": 0.5525159790700757, + "kl": 0.0, + "learning_rate": 4.8471410270041e-05, + "logits/chosen": -1273600384.0, + "logits/rejected": -1167694208.0, + "logps/chosen": -672.0963855421687, + "logps/rejected": -895.2727272727273, + "loss": 0.4676, + "rewards/chosen": -3.2428463855421685, + "rewards/margins": 2.5915691988734157, + "rewards/rejected": -5.834415584415584, + "step": 1150 + }, + { + "epoch": 0.6071708976707668, + "grad_norm": 1.020879817955065, + "kl": 0.0, + "learning_rate": 4.84185376826776e-05, + "logits/chosen": -1160144512.0, + "logits/rejected": -1336934400.0, + "logps/chosen": -576.9664429530201, + "logps/rejected": -796.5380116959064, + "loss": 0.4354, + "rewards/chosen": -2.484060402684564, + "rewards/margins": 2.3613343341575415, + "rewards/rejected": -4.845394736842105, + "step": 1160 + }, + { + "epoch": 0.6124051295472389, + "grad_norm": 1.5753296833031845, + "kl": 0.0, + "learning_rate": 4.8364796006995785e-05, + "logits/chosen": -1258291200.0, + "logits/rejected": -1169791360.0, + "logps/chosen": -574.6424242424242, + "logps/rejected": -794.2193548387097, + "loss": 0.4648, + "rewards/chosen": -2.3223484848484848, + "rewards/margins": 2.761925708699902, + "rewards/rejected": -5.084274193548387, + "step": 1170 + }, + { + "epoch": 0.6176393614237111, + "grad_norm": 0.7771283437694375, + "kl": 0.0, + "learning_rate": 4.831018723740969e-05, + "logits/chosen": -1246337408.0, + "logits/rejected": -1175453696.0, + "logps/chosen": -609.6738461538462, + "logps/rejected": -783.847619047619, + "loss": 0.456, + "rewards/chosen": -2.5955769230769232, + "rewards/margins": 2.1702960927960926, + "rewards/rejected": -4.765873015873016, + "step": 1180 + }, + { + "epoch": 0.6228735933001832, + "grad_norm": 0.6937936950072409, + "kl": 0.0, + "learning_rate": 4.825471340051228e-05, + "logits/chosen": -1167065088.0, + "logits/rejected": -1241304320.0, + "logps/chosen": -640.4155844155844, + "logps/rejected": -821.5903614457832, + "loss": 0.4504, + "rewards/chosen": -2.888392857142857, + "rewards/margins": 2.294589070567986, + "rewards/rejected": -5.182981927710843, + "step": 1190 + }, + { + "epoch": 0.6281078251766553, + "grad_norm": 0.550959849863717, + "kl": 0.0, + "learning_rate": 4.8198376555000134e-05, + "logits/chosen": -1269615872.0, + "logits/rejected": -1262485504.0, + "logps/chosen": -579.5555555555555, + "logps/rejected": -637.2658227848101, + "loss": 0.4619, + "rewards/chosen": -2.6766975308641974, + "rewards/margins": 0.9190303172370684, + "rewards/rejected": -3.5957278481012658, + "step": 1200 + }, + { + "epoch": 0.6281078251766553, + "eval_kl": 0.0, + "eval_logits/chosen": -2887944704.0, + "eval_logits/rejected": -2875961088.0, + "eval_logps/chosen": -570.1098466105888, + "eval_logps/rejected": -742.3172550969667, + "eval_loss": 0.45485547184944153, + "eval_rewards/chosen": -2.3843394359228105, + "eval_rewards/margins": 2.0890568843158768, + "eval_rewards/rejected": -4.473396320238687, + "eval_runtime": 93.4311, + "eval_samples_per_second": 42.812, + "eval_steps_per_second": 0.674, + "step": 1200 + }, + { + "epoch": 0.6333420570531274, + "grad_norm": 0.4957479086059728, + "kl": 0.0, + "learning_rate": 4.8141178791597086e-05, + "logits/chosen": -1334417792.0, + "logits/rejected": -1312607488.0, + "logps/chosen": -544.2006269592476, + "logps/rejected": -696.1246105919004, + "loss": 0.4547, + "rewards/chosen": -2.127742946708464, + "rewards/margins": 1.9672726296155236, + "rewards/rejected": -4.095015576323988, + "step": 1210 + }, + { + "epoch": 0.6385762889295996, + "grad_norm": 0.5599659559777491, + "kl": 0.0, + "learning_rate": 4.8083122232976555e-05, + "logits/chosen": -1310720000.0, + "logits/rejected": -1296878848.0, + "logps/chosen": -533.7358490566038, + "logps/rejected": -600.1490683229814, + "loss": 0.4536, + "rewards/chosen": -2.0184748427672954, + "rewards/margins": 1.1729071448103445, + "rewards/rejected": -3.19138198757764, + "step": 1220 + }, + { + "epoch": 0.6438105208060717, + "grad_norm": 0.4205980725915211, + "kl": 0.0, + "learning_rate": 4.802420903368285e-05, + "logits/chosen": -1353921280.0, + "logits/rejected": -1259339776.0, + "logps/chosen": -521.015479876161, + "logps/rejected": -639.7981072555204, + "loss": 0.4561, + "rewards/chosen": -1.7972136222910218, + "rewards/margins": 1.373527702630114, + "rewards/rejected": -3.170741324921136, + "step": 1230 + }, + { + "epoch": 0.6490447526825438, + "grad_norm": 0.938997171512235, + "kl": 0.0, + "learning_rate": 4.7964441380051184e-05, + "logits/chosen": -1280311296.0, + "logits/rejected": -1343854976.0, + "logps/chosen": -537.3935483870968, + "logps/rejected": -607.4181818181818, + "loss": 0.4526, + "rewards/chosen": -1.9568548387096774, + "rewards/margins": 1.3946603128054742, + "rewards/rejected": -3.3515151515151516, + "step": 1240 + }, + { + "epoch": 0.654278984559016, + "grad_norm": 0.471438924793617, + "kl": 0.0, + "learning_rate": 4.790382149012651e-05, + "logits/chosen": -1317640576.0, + "logits/rejected": -1276850944.0, + "logps/chosen": -549.7, + "logps/rejected": -682.3, + "loss": 0.4476, + "rewards/chosen": -2.1296875, + "rewards/margins": 1.491796875, + "rewards/rejected": -3.621484375, + "step": 1250 + }, + { + "epoch": 0.6595132164354881, + "grad_norm": 0.5189481411209818, + "kl": 0.0, + "learning_rate": 4.7842351613581235e-05, + "logits/chosen": -1350356224.0, + "logits/rejected": -1310510336.0, + "logps/chosen": -529.65, + "logps/rejected": -629.9, + "loss": 0.4479, + "rewards/chosen": -1.728515625, + "rewards/margins": 1.648046875, + "rewards/rejected": -3.3765625, + "step": 1260 + }, + { + "epoch": 0.6647474483119602, + "grad_norm": 0.3747743160718501, + "kl": 0.0, + "learning_rate": 4.778003403163175e-05, + "logits/chosen": -1436339456.0, + "logits/rejected": -1278004480.0, + "logps/chosen": -494.65060240963857, + "logps/rejected": -561.3506493506494, + "loss": 0.4579, + "rewards/chosen": -1.4790097891566265, + "rewards/margins": 1.2811363147394774, + "rewards/rejected": -2.760146103896104, + "step": 1270 + }, + { + "epoch": 0.6699816801884323, + "grad_norm": 0.6112560725436108, + "kl": 0.0, + "learning_rate": 4.771687105695373e-05, + "logits/chosen": -1431935360.0, + "logits/rejected": -1293103872.0, + "logps/chosen": -485.3172205438066, + "logps/rejected": -602.0970873786408, + "loss": 0.4595, + "rewards/chosen": -1.2869146525679758, + "rewards/margins": 1.8643798458139014, + "rewards/rejected": -3.151294498381877, + "step": 1280 + }, + { + "epoch": 0.6752159120649045, + "grad_norm": 0.8780083757352779, + "kl": 0.0, + "learning_rate": 4.765286503359632e-05, + "logits/chosen": -1350985344.0, + "logits/rejected": -1378038528.0, + "logps/chosen": -540.5239616613419, + "logps/rejected": -729.4434250764526, + "loss": 0.4465, + "rewards/chosen": -2.110323482428115, + "rewards/margins": 2.0257621444832004, + "rewards/rejected": -4.136085626911315, + "step": 1290 + }, + { + "epoch": 0.6804501439413766, + "grad_norm": 0.6875786082085655, + "kl": 0.0, + "learning_rate": 4.758801833689516e-05, + "logits/chosen": -1400478080.0, + "logits/rejected": -1204813824.0, + "logps/chosen": -765.8795180722891, + "logps/rejected": -994.7012987012987, + "loss": 0.4627, + "rewards/chosen": -4.251129518072289, + "rewards/margins": 2.9923769754342047, + "rewards/rejected": -7.2435064935064934, + "step": 1300 + }, + { + "epoch": 0.6804501439413766, + "eval_kl": 0.0, + "eval_logits/chosen": -2942404096.0, + "eval_logits/rejected": -2700332800.0, + "eval_logps/chosen": -705.3933696190005, + "eval_logps/rejected": -1037.3664843361512, + "eval_loss": 0.4564609229564667, + "eval_rewards/chosen": -3.737877288471054, + "eval_rewards/margins": 3.6828089372872754, + "eval_rewards/rejected": -7.420686225758329, + "eval_runtime": 93.4369, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 1300 + }, + { + "epoch": 0.6856843758178487, + "grad_norm": 0.7487850272715161, + "kl": 0.0, + "learning_rate": 4.752233337338423e-05, + "logits/chosen": -1514772864.0, + "logits/rejected": -1181116032.0, + "logps/chosen": -660.6840579710145, + "logps/rejected": -1124.3389830508474, + "loss": 0.4639, + "rewards/chosen": -3.3021739130434784, + "rewards/margins": 4.929182019159912, + "rewards/rejected": -8.23135593220339, + "step": 1310 + }, + { + "epoch": 0.6909186076943209, + "grad_norm": 1.3004359512746912, + "kl": 0.0, + "learning_rate": 4.745581258070654e-05, + "logits/chosen": -1363777920.0, + "logits/rejected": -1418094208.0, + "logps/chosen": -704.4142394822006, + "logps/rejected": -936.02416918429, + "loss": 0.4353, + "rewards/chosen": -3.8309061488673137, + "rewards/margins": 2.5610878088366134, + "rewards/rejected": -6.391993957703927, + "step": 1320 + }, + { + "epoch": 0.696152839570793, + "grad_norm": 0.9380483957803694, + "kl": 0.0, + "learning_rate": 4.738845842752364e-05, + "logits/chosen": -1527146112.0, + "logits/rejected": -1395235200.0, + "logps/chosen": -655.9024390243902, + "logps/rejected": -928.4102564102565, + "loss": 0.4589, + "rewards/chosen": -3.3266006097560976, + "rewards/margins": 2.894553236397748, + "rewards/rejected": -6.221153846153846, + "step": 1330 + }, + { + "epoch": 0.7013870714472651, + "grad_norm": 0.6394650193087535, + "kl": 0.0, + "learning_rate": 4.732027341342405e-05, + "logits/chosen": -1620259584.0, + "logits/rejected": -1330852608.0, + "logps/chosen": -535.6521739130435, + "logps/rejected": -647.0508474576271, + "loss": 0.4685, + "rewards/chosen": -2.1190217391304347, + "rewards/margins": 1.1597918201915993, + "rewards/rejected": -3.278813559322034, + "step": 1340 + }, + { + "epoch": 0.7066213033237373, + "grad_norm": 0.5927846775262273, + "kl": 0.0, + "learning_rate": 4.725126006883046e-05, + "logits/chosen": -1479121280.0, + "logits/rejected": -1497366528.0, + "logps/chosen": -567.9746031746032, + "logps/rejected": -689.8215384615385, + "loss": 0.4429, + "rewards/chosen": -2.175, + "rewards/margins": 1.5423076923076926, + "rewards/rejected": -3.7173076923076924, + "step": 1350 + }, + { + "epoch": 0.7118555352002094, + "grad_norm": 0.4712554320074594, + "kl": 0.0, + "learning_rate": 4.718142095490584e-05, + "logits/chosen": -1507013376.0, + "logits/rejected": -1583978880.0, + "logps/chosen": -856.7272727272727, + "logps/rejected": -1207.0361445783133, + "loss": 0.4461, + "rewards/chosen": -5.242288961038961, + "rewards/margins": 3.8601206775152566, + "rewards/rejected": -9.102409638554217, + "step": 1360 + }, + { + "epoch": 0.7170897670766815, + "grad_norm": 0.8329581163405648, + "kl": 0.0, + "learning_rate": 4.711075866345841e-05, + "logits/chosen": -1503448320.0, + "logits/rejected": -1478911616.0, + "logps/chosen": -964.5419354838709, + "logps/rejected": -1271.2727272727273, + "loss": 0.4472, + "rewards/chosen": -6.430645161290323, + "rewards/margins": 3.249657869012707, + "rewards/rejected": -9.68030303030303, + "step": 1370 + }, + { + "epoch": 0.7223239989531536, + "grad_norm": 1.012785333416532, + "kl": 0.0, + "learning_rate": 4.70392758168454e-05, + "logits/chosen": -1536373504.0, + "logits/rejected": -1538051328.0, + "logps/chosen": -830.8645161290323, + "logps/rejected": -1044.9454545454546, + "loss": 0.4487, + "rewards/chosen": -4.7915322580645165, + "rewards/margins": 2.7804374389051807, + "rewards/rejected": -7.571969696969697, + "step": 1380 + }, + { + "epoch": 0.7275582308296258, + "grad_norm": 0.4696033985185279, + "kl": 0.0, + "learning_rate": 4.696697506787579e-05, + "logits/chosen": -1367343104.0, + "logits/rejected": -1455423488.0, + "logps/chosen": -776.0, + "logps/rejected": -999.6190476190476, + "loss": 0.4437, + "rewards/chosen": -4.519736842105263, + "rewards/margins": 2.3976738721804516, + "rewards/rejected": -6.917410714285714, + "step": 1390 + }, + { + "epoch": 0.7327924627060979, + "grad_norm": 0.94786322002552, + "kl": 0.0, + "learning_rate": 4.689385909971184e-05, + "logits/chosen": -1476185344.0, + "logits/rejected": -1244869376.0, + "logps/chosen": -808.3809523809524, + "logps/rejected": -1248.7368421052631, + "loss": 0.4622, + "rewards/chosen": -4.801153273809524, + "rewards/margins": 4.798764489348372, + "rewards/rejected": -9.599917763157896, + "step": 1400 + }, + { + "epoch": 0.7327924627060979, + "eval_kl": 0.0, + "eval_logits/chosen": -3010711296.0, + "eval_logits/rejected": -2812979968.0, + "eval_logps/chosen": -783.2637308263236, + "eval_logps/rejected": -1089.1775236200895, + "eval_loss": 0.4619843661785126, + "eval_rewards/chosen": -4.518060366155368, + "eval_rewards/margins": 3.4185383409555214, + "eval_rewards/rejected": -7.93659870711089, + "eval_runtime": 93.4326, + "eval_samples_per_second": 42.812, + "eval_steps_per_second": 0.674, + "step": 1400 + }, + { + "epoch": 0.73802669458257, + "grad_norm": 0.6652634253453028, + "kl": 0.0, + "learning_rate": 4.68199306257695e-05, + "logits/chosen": -1382023168.0, + "logits/rejected": -1249483136.0, + "logps/chosen": -1017.8723404255319, + "logps/rejected": -1256.7459807073956, + "loss": 0.471, + "rewards/chosen": -6.800151975683891, + "rewards/margins": 2.8622274455379735, + "rewards/rejected": -9.662379421221864, + "step": 1410 + }, + { + "epoch": 0.7432609264590422, + "grad_norm": 0.6086427236404571, + "kl": 0.0, + "learning_rate": 4.674519238961773e-05, + "logits/chosen": -1209217792.0, + "logits/rejected": -1091777280.0, + "logps/chosen": -781.2515337423313, + "logps/rejected": -999.9490445859873, + "loss": 0.461, + "rewards/chosen": -4.742523006134969, + "rewards/margins": 2.4684642550115274, + "rewards/rejected": -7.210987261146497, + "step": 1420 + }, + { + "epoch": 0.7484951583355143, + "grad_norm": 0.587128760826668, + "kl": 0.0, + "learning_rate": 4.66696471648767e-05, + "logits/chosen": -1229140736.0, + "logits/rejected": -1149658752.0, + "logps/chosen": -623.1898734177215, + "logps/rejected": -915.358024691358, + "loss": 0.4482, + "rewards/chosen": -3.1242088607594938, + "rewards/margins": 2.8715473120800126, + "rewards/rejected": -5.995756172839506, + "step": 1430 + }, + { + "epoch": 0.7537293902119864, + "grad_norm": 0.5455085313383593, + "kl": 0.0, + "learning_rate": 4.659329775511478e-05, + "logits/chosen": -1181954816.0, + "logits/rejected": -1167484544.0, + "logps/chosen": -647.3375796178344, + "logps/rejected": -891.680981595092, + "loss": 0.4553, + "rewards/chosen": -2.915207006369427, + "rewards/margins": 2.9260506623422295, + "rewards/rejected": -5.841257668711656, + "step": 1440 + }, + { + "epoch": 0.7589636220884585, + "grad_norm": 0.6399322898967481, + "kl": 0.0, + "learning_rate": 4.651614699374461e-05, + "logits/chosen": -1192860032.0, + "logits/rejected": -1133930112.0, + "logps/chosen": -622.7924528301887, + "logps/rejected": -1006.3105590062112, + "loss": 0.4463, + "rewards/chosen": -2.8330385220125787, + "rewards/margins": 4.218203714012265, + "rewards/rejected": -7.051242236024844, + "step": 1450 + }, + { + "epoch": 0.7641978539649307, + "grad_norm": 0.5781761467701665, + "kl": 0.0, + "learning_rate": 4.643819774391786e-05, + "logits/chosen": -1172727424.0, + "logits/rejected": -1157627904.0, + "logps/chosen": -750.91961414791, + "logps/rejected": -904.7537993920972, + "loss": 0.4565, + "rewards/chosen": -4.142282958199357, + "rewards/margins": 1.9367443974237437, + "rewards/rejected": -6.079027355623101, + "step": 1460 + }, + { + "epoch": 0.7694320858414028, + "grad_norm": 0.8073221865761616, + "kl": 0.0, + "learning_rate": 4.635945289841902e-05, + "logits/chosen": -1248224896.0, + "logits/rejected": -1091777280.0, + "logps/chosen": -671.1246200607902, + "logps/rejected": -1179.1639871382636, + "loss": 0.458, + "rewards/chosen": -3.5155775075987843, + "rewards/margins": 5.603393553494463, + "rewards/rejected": -9.118971061093248, + "step": 1470 + }, + { + "epoch": 0.7746663177178749, + "grad_norm": 0.731784234486593, + "kl": 0.0, + "learning_rate": 4.6279915379558017e-05, + "logits/chosen": -1293523328.0, + "logits/rejected": -1241094528.0, + "logps/chosen": -677.12, + "logps/rejected": -918.7555555555556, + "loss": 0.4593, + "rewards/chosen": -3.539615384615385, + "rewards/margins": 2.8984798534798535, + "rewards/rejected": -6.438095238095238, + "step": 1480 + }, + { + "epoch": 0.7799005495943471, + "grad_norm": 0.8269303401098701, + "kl": 0.0, + "learning_rate": 4.61995881390618e-05, + "logits/chosen": -1258920320.0, + "logits/rejected": -1162241664.0, + "logps/chosen": -703.2074303405573, + "logps/rejected": -881.8675078864353, + "loss": 0.4587, + "rewards/chosen": -3.490905572755418, + "rewards/margins": 2.262249001377074, + "rewards/rejected": -5.753154574132492, + "step": 1490 + }, + { + "epoch": 0.7851347814708192, + "grad_norm": 0.5998559400896546, + "kl": 0.0, + "learning_rate": 4.6118474157964765e-05, + "logits/chosen": -1203136128.0, + "logits/rejected": -1172937088.0, + "logps/chosen": -643.9245283018868, + "logps/rejected": -819.4782608695652, + "loss": 0.4571, + "rewards/chosen": -3.2344732704402515, + "rewards/margins": 2.1319863568889414, + "rewards/rejected": -5.366459627329193, + "step": 1500 + }, + { + "epoch": 0.7851347814708192, + "eval_kl": 0.0, + "eval_logits/chosen": -2759985152.0, + "eval_logits/rejected": -2638283776.0, + "eval_logps/chosen": -567.8931222167244, + "eval_logps/rejected": -773.6330183988066, + "eval_loss": 0.45720311999320984, + "eval_rewards/chosen": -2.362196932211776, + "eval_rewards/margins": 2.422735937007518, + "eval_rewards/rejected": -4.784932869219294, + "eval_runtime": 93.4354, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 1500 + }, + { + "epoch": 0.7903690133472913, + "grad_norm": 0.5520818527747415, + "kl": 0.0, + "learning_rate": 4.6036576446498124e-05, + "logits/chosen": -1313026816.0, + "logits/rejected": -1107086592.0, + "logps/chosen": -480.0, + "logps/rejected": -601.027027027027, + "loss": 0.4786, + "rewards/chosen": -1.6594295058139534, + "rewards/margins": 1.3794218455373979, + "rewards/rejected": -3.0388513513513513, + "step": 1510 + }, + { + "epoch": 0.7956032452237635, + "grad_norm": 0.6769152223374779, + "kl": 0.0, + "learning_rate": 4.5953898043978244e-05, + "logits/chosen": -1228511616.0, + "logits/rejected": -1322254336.0, + "logps/chosen": -459.7922077922078, + "logps/rejected": -568.9638554216867, + "loss": 0.4482, + "rewards/chosen": -1.322646103896104, + "rewards/margins": 1.6125948599593176, + "rewards/rejected": -2.9352409638554215, + "step": 1520 + }, + { + "epoch": 0.8008374771002356, + "grad_norm": 0.7170707161941443, + "kl": 0.0, + "learning_rate": 4.5870442018693775e-05, + "logits/chosen": -1229350528.0, + "logits/rejected": -1242143104.0, + "logps/chosen": -616.128617363344, + "logps/rejected": -772.6686930091186, + "loss": 0.4464, + "rewards/chosen": -2.6310289389067525, + "rewards/margins": 2.3317370185400557, + "rewards/rejected": -4.962765957446808, + "step": 1530 + }, + { + "epoch": 0.8060717089767077, + "grad_norm": 0.4908931386250085, + "kl": 0.0, + "learning_rate": 4.578621146779186e-05, + "logits/chosen": -1167904000.0, + "logits/rejected": -1179018880.0, + "logps/chosen": -712.6514657980456, + "logps/rejected": -903.1111111111111, + "loss": 0.442, + "rewards/chosen": -3.5916123778501627, + "rewards/margins": 2.4309101446723598, + "rewards/rejected": -6.0225225225225225, + "step": 1540 + }, + { + "epoch": 0.8113059408531798, + "grad_norm": 0.7969496873913599, + "kl": 0.0, + "learning_rate": 4.570120951716312e-05, + "logits/chosen": -1113587712.0, + "logits/rejected": -1260178688.0, + "logps/chosen": -554.1168384879725, + "logps/rejected": -655.8624641833811, + "loss": 0.427, + "rewards/chosen": -2.2758805841924397, + "rewards/margins": 1.3129446307072739, + "rewards/rejected": -3.5888252148997135, + "step": 1550 + }, + { + "epoch": 0.816540172729652, + "grad_norm": 0.7026514692155559, + "kl": 0.0, + "learning_rate": 4.561543932132574e-05, + "logits/chosen": -1231028224.0, + "logits/rejected": -1186988032.0, + "logps/chosen": -514.0125786163522, + "logps/rejected": -615.4534161490683, + "loss": 0.4641, + "rewards/chosen": -1.8867924528301887, + "rewards/margins": 1.039449783194656, + "rewards/rejected": -2.926242236024845, + "step": 1560 + }, + { + "epoch": 0.821774404606124, + "grad_norm": 0.6689197008305775, + "kl": 0.0, + "learning_rate": 4.5528904063308294e-05, + "logits/chosen": -1195586304.0, + "logits/rejected": -1132042624.0, + "logps/chosen": -562.7414330218069, + "logps/rejected": -632.0752351097178, + "loss": 0.4642, + "rewards/chosen": -2.2988707165109035, + "rewards/margins": 1.3555179982226386, + "rewards/rejected": -3.654388714733542, + "step": 1570 + }, + { + "epoch": 0.8270086364825961, + "grad_norm": 0.48187712881639605, + "kl": 0.0, + "learning_rate": 4.544160695453173e-05, + "logits/chosen": -1171259392.0, + "logits/rejected": -1162031872.0, + "logps/chosen": -612.1423948220065, + "logps/rejected": -730.2960725075528, + "loss": 0.4444, + "rewards/chosen": -2.8717637540453076, + "rewards/margins": 1.6622241613625475, + "rewards/rejected": -4.533987915407855, + "step": 1580 + }, + { + "epoch": 0.8322428683590684, + "grad_norm": 0.49689165831090876, + "kl": 0.0, + "learning_rate": 4.535355123469009e-05, + "logits/chosen": -1364616832.0, + "logits/rejected": -1105094272.0, + "logps/chosen": -522.4811594202898, + "logps/rejected": -693.477966101695, + "loss": 0.4636, + "rewards/chosen": -2.0105072463768114, + "rewards/margins": 2.0017808892164086, + "rewards/rejected": -4.01228813559322, + "step": 1590 + }, + { + "epoch": 0.8374771002355405, + "grad_norm": 0.46546847894467513, + "kl": 0.0, + "learning_rate": 4.5264740171630346e-05, + "logits/chosen": -1284505600.0, + "logits/rejected": -1162661120.0, + "logps/chosen": -537.3293051359517, + "logps/rejected": -721.1909385113269, + "loss": 0.4714, + "rewards/chosen": -2.1334969788519635, + "rewards/margins": 2.135515966131855, + "rewards/rejected": -4.269012944983818, + "step": 1600 + }, + { + "epoch": 0.8374771002355405, + "eval_kl": 0.0, + "eval_logits/chosen": -2749732352.0, + "eval_logits/rejected": -2697137152.0, + "eval_logps/chosen": -526.0603661553687, + "eval_logps/rejected": -662.9457981103928, + "eval_loss": 0.4618281126022339, + "eval_rewards/chosen": -1.9415821375556654, + "eval_rewards/margins": 1.7369360126183775, + "eval_rewards/rejected": -3.678518150174043, + "eval_runtime": 93.4309, + "eval_samples_per_second": 42.812, + "eval_steps_per_second": 0.674, + "step": 1600 + }, + { + "epoch": 0.8427113321120125, + "grad_norm": 0.6877692391512474, + "kl": 0.0, + "learning_rate": 4.5175177061231135e-05, + "logits/chosen": -1227463040.0, + "logits/rejected": -1216348160.0, + "logps/chosen": -482.126582278481, + "logps/rejected": -651.0617283950618, + "loss": 0.4595, + "rewards/chosen": -1.8253560126582278, + "rewards/margins": 1.6723291725269573, + "rewards/rejected": -3.497685185185185, + "step": 1610 + }, + { + "epoch": 0.8479455639884846, + "grad_norm": 1.0779043545356655, + "kl": 0.0, + "learning_rate": 4.508486522728037e-05, + "logits/chosen": -1146513024.0, + "logits/rejected": -1204813824.0, + "logps/chosen": -617.6410256410256, + "logps/rejected": -724.0, + "loss": 0.4601, + "rewards/chosen": -3.0234375, + "rewards/margins": 1.2730564024390247, + "rewards/rejected": -4.296493902439025, + "step": 1620 + }, + { + "epoch": 0.8531797958649568, + "grad_norm": 1.8732198687823562, + "kl": 0.0, + "learning_rate": 4.499380802135197e-05, + "logits/chosen": -1244240256.0, + "logits/rejected": -1163919360.0, + "logps/chosen": -569.6, + "logps/rejected": -862.3, + "loss": 0.4386, + "rewards/chosen": -2.205078125, + "rewards/margins": 3.373046875, + "rewards/rejected": -5.578125, + "step": 1630 + }, + { + "epoch": 0.8584140277414289, + "grad_norm": 0.6010892356137254, + "kl": 0.0, + "learning_rate": 4.490200882268142e-05, + "logits/chosen": -1278423808.0, + "logits/rejected": -1088421888.0, + "logps/chosen": -517.7831325301205, + "logps/rejected": -721.8701298701299, + "loss": 0.4642, + "rewards/chosen": -2.1364834337349397, + "rewards/margins": 2.0254483844468787, + "rewards/rejected": -4.161931818181818, + "step": 1640 + }, + { + "epoch": 0.863648259617901, + "grad_norm": 0.6195805031595781, + "kl": 0.0, + "learning_rate": 4.480947103804044e-05, + "logits/chosen": -1189504640.0, + "logits/rejected": -1145464448.0, + "logps/chosen": -610.0934579439253, + "logps/rejected": -731.8871473354232, + "loss": 0.4686, + "rewards/chosen": -2.809968847352025, + "rewards/margins": 1.5720844441840254, + "rewards/rejected": -4.38205329153605, + "step": 1650 + }, + { + "epoch": 0.8688824914943732, + "grad_norm": 0.6162524547197834, + "kl": 0.0, + "learning_rate": 4.471619810161046e-05, + "logits/chosen": -1099956224.0, + "logits/rejected": -1118620928.0, + "logps/chosen": -567.7152103559871, + "logps/rejected": -803.5770392749245, + "loss": 0.4349, + "rewards/chosen": -2.560881877022654, + "rewards/margins": 2.4428945580226635, + "rewards/rejected": -5.003776435045317, + "step": 1660 + }, + { + "epoch": 0.8741167233708453, + "grad_norm": 0.8094087004559215, + "kl": 0.0, + "learning_rate": 4.462219347485523e-05, + "logits/chosen": -1235851648.0, + "logits/rejected": -1014182720.0, + "logps/chosen": -623.0588235294117, + "logps/rejected": -716.3733333333333, + "loss": 0.4825, + "rewards/chosen": -2.572794117647059, + "rewards/margins": 1.730539215686275, + "rewards/rejected": -4.303333333333334, + "step": 1670 + }, + { + "epoch": 0.8793509552473174, + "grad_norm": 2.202531952092164, + "kl": 0.0, + "learning_rate": 4.452746064639239e-05, + "logits/chosen": -1071434944.0, + "logits/rejected": -1215719040.0, + "logps/chosen": -562.1917808219179, + "logps/rejected": -664.183908045977, + "loss": 0.4356, + "rewards/chosen": -2.2741866438356166, + "rewards/margins": 1.2714311722563374, + "rewards/rejected": -3.545617816091954, + "step": 1680 + }, + { + "epoch": 0.8845851871237895, + "grad_norm": 0.7546139807682359, + "kl": 0.0, + "learning_rate": 4.4432003131863906e-05, + "logits/chosen": -1045220544.0, + "logits/rejected": -876609536.0, + "logps/chosen": -607.4146341463414, + "logps/rejected": -712.1025641025641, + "loss": 0.4601, + "rewards/chosen": -2.603849085365854, + "rewards/margins": 1.4995162992495312, + "rewards/rejected": -4.103365384615385, + "step": 1690 + }, + { + "epoch": 0.8898194190002617, + "grad_norm": 0.5459929181429486, + "kl": 0.0, + "learning_rate": 4.4335824473805716e-05, + "logits/chosen": -781503680.0, + "logits/rejected": -809920128.0, + "logps/chosen": -588.6344827586207, + "logps/rejected": -712.5942857142857, + "loss": 0.4266, + "rewards/chosen": -2.5760775862068965, + "rewards/margins": 1.5317795566502466, + "rewards/rejected": -4.107857142857143, + "step": 1700 + }, + { + "epoch": 0.8898194190002617, + "eval_kl": 0.0, + "eval_logits/chosen": -1904080896.0, + "eval_logits/rejected": -1675191680.0, + "eval_logps/chosen": -554.2761009401287, + "eval_logps/rejected": -683.2819492789657, + "eval_loss": 0.45478126406669617, + "eval_rewards/chosen": -2.223899059871351, + "eval_rewards/margins": 1.6572545950267097, + "eval_rewards/rejected": -3.8811536548980605, + "eval_runtime": 93.4412, + "eval_samples_per_second": 42.808, + "eval_steps_per_second": 0.674, + "step": 1700 + }, + { + "epoch": 0.8950536508767338, + "grad_norm": 0.4395457724253016, + "kl": 0.0, + "learning_rate": 4.423892824151616e-05, + "logits/chosen": -925682880.0, + "logits/rejected": -894225600.0, + "logps/chosen": -554.5, + "logps/rejected": -700.1, + "loss": 0.4603, + "rewards/chosen": -2.29921875, + "rewards/margins": 1.616796875, + "rewards/rejected": -3.916015625, + "step": 1710 + }, + { + "epoch": 0.9002878827532059, + "grad_norm": 0.8329962927416344, + "kl": 0.0, + "learning_rate": 4.414131803092362e-05, + "logits/chosen": -1054448000.0, + "logits/rejected": -922327424.0, + "logps/chosen": -504.95412844036696, + "logps/rejected": -647.4632587859425, + "loss": 0.4552, + "rewards/chosen": -2.029625382262997, + "rewards/margins": 1.48335385096384, + "rewards/rejected": -3.512979233226837, + "step": 1720 + }, + { + "epoch": 0.9055221146296781, + "grad_norm": 0.6160573144972731, + "kl": 0.0, + "learning_rate": 4.404299746445295e-05, + "logits/chosen": -1025717056.0, + "logits/rejected": -967521088.0, + "logps/chosen": -504.22429906542055, + "logps/rejected": -682.4326018808778, + "loss": 0.4591, + "rewards/chosen": -1.7044392523364487, + "rewards/margins": 2.07220651568863, + "rewards/rejected": -3.7766457680250785, + "step": 1730 + }, + { + "epoch": 0.9107563465061502, + "grad_norm": 0.6060710943985353, + "kl": 0.0, + "learning_rate": 4.394397019089116e-05, + "logits/chosen": -1029701632.0, + "logits/rejected": -1022781056.0, + "logps/chosen": -575.1715210355987, + "logps/rejected": -696.1691842900302, + "loss": 0.4463, + "rewards/chosen": -2.292677993527508, + "rewards/margins": 1.4610984415178092, + "rewards/rejected": -3.7537764350453173, + "step": 1740 + }, + { + "epoch": 0.9159905783826223, + "grad_norm": 0.8160183432487458, + "kl": 0.0, + "learning_rate": 4.384423988525196e-05, + "logits/chosen": -1208798464.0, + "logits/rejected": -997405504.0, + "logps/chosen": -692.7283582089552, + "logps/rejected": -909.0098360655737, + "loss": 0.463, + "rewards/chosen": -3.575373134328358, + "rewards/margins": 2.482823586983117, + "rewards/rejected": -6.058196721311475, + "step": 1750 + }, + { + "epoch": 0.9212248102590945, + "grad_norm": 0.5247996958339426, + "kl": 0.0, + "learning_rate": 4.3743810248639325e-05, + "logits/chosen": -1090938496.0, + "logits/rejected": -1167904000.0, + "logps/chosen": -586.989898989899, + "logps/rejected": -848.4198250728863, + "loss": 0.434, + "rewards/chosen": -2.617003367003367, + "rewards/margins": 3.081247361859606, + "rewards/rejected": -5.698250728862973, + "step": 1760 + }, + { + "epoch": 0.9264590421355666, + "grad_norm": 0.4876127564448978, + "kl": 0.0, + "learning_rate": 4.364268500811025e-05, + "logits/chosen": -1071225216.0, + "logits/rejected": -1151126784.0, + "logps/chosen": -561.6623376623377, + "logps/rejected": -721.4457831325301, + "loss": 0.4567, + "rewards/chosen": -2.36911525974026, + "rewards/margins": 1.8496347402597402, + "rewards/rejected": -4.21875, + "step": 1770 + }, + { + "epoch": 0.9316932740120387, + "grad_norm": 0.4259995214022993, + "kl": 0.0, + "learning_rate": 4.354086791653633e-05, + "logits/chosen": -1195166976.0, + "logits/rejected": -1017118720.0, + "logps/chosen": -564.1305637982196, + "logps/rejected": -664.7128712871287, + "loss": 0.4818, + "rewards/chosen": -2.4063427299703264, + "rewards/margins": 1.1992678310857792, + "rewards/rejected": -3.6056105610561056, + "step": 1780 + }, + { + "epoch": 0.9369275058885108, + "grad_norm": 0.8417012837062164, + "kl": 0.0, + "learning_rate": 4.343836275246455e-05, + "logits/chosen": -1106876800.0, + "logits/rejected": -1180486912.0, + "logps/chosen": -571.7979797979798, + "logps/rejected": -697.8425655976677, + "loss": 0.4433, + "rewards/chosen": -2.2925084175084174, + "rewards/margins": 1.5478705912379382, + "rewards/rejected": -3.8403790087463556, + "step": 1790 + }, + { + "epoch": 0.942161737764983, + "grad_norm": 0.48599694772283725, + "kl": 0.0, + "learning_rate": 4.333517331997704e-05, + "logits/chosen": -1041865088.0, + "logits/rejected": -1005584384.0, + "logps/chosen": -572.2838709677419, + "logps/rejected": -771.2, + "loss": 0.449, + "rewards/chosen": -2.52258064516129, + "rewards/margins": 1.9986314760508312, + "rewards/rejected": -4.5212121212121215, + "step": 1800 + }, + { + "epoch": 0.942161737764983, + "eval_kl": 0.0, + "eval_logits/chosen": -2063198080.0, + "eval_logits/rejected": -1781980032.0, + "eval_logps/chosen": -630.7530925284512, + "eval_logps/rejected": -780.4753853804077, + "eval_loss": 0.4611523449420929, + "eval_rewards/chosen": -2.9898565066798612, + "eval_rewards/margins": 1.863947570893485, + "eval_rewards/rejected": -4.853804077573346, + "eval_runtime": 93.4415, + "eval_samples_per_second": 42.808, + "eval_steps_per_second": 0.674, + "step": 1800 + }, + { + "epoch": 0.9473959696414551, + "grad_norm": 2.6266758242101504, + "kl": 0.0, + "learning_rate": 4.3231303448549904e-05, + "logits/chosen": -917084544.0, + "logits/rejected": -829423616.0, + "logps/chosen": -644.2264150943396, + "logps/rejected": -748.223602484472, + "loss": 0.463, + "rewards/chosen": -3.0727201257861636, + "rewards/margins": 1.3698264580647677, + "rewards/rejected": -4.442546583850931, + "step": 1810 + }, + { + "epoch": 0.9526302015179272, + "grad_norm": 0.4042034424904704, + "kl": 0.0, + "learning_rate": 4.312675699291109e-05, + "logits/chosen": -1117782016.0, + "logits/rejected": -859937152.0, + "logps/chosen": -590.8502994011976, + "logps/rejected": -735.5816993464052, + "loss": 0.4671, + "rewards/chosen": -2.502994011976048, + "rewards/margins": 1.8932478180893115, + "rewards/rejected": -4.396241830065359, + "step": 1820 + }, + { + "epoch": 0.9578644333943994, + "grad_norm": 0.6023946268417015, + "kl": 0.0, + "learning_rate": 4.3021537832897366e-05, + "logits/chosen": -1201458432.0, + "logits/rejected": -1019006144.0, + "logps/chosen": -544.780487804878, + "logps/rejected": -787.1794871794872, + "loss": 0.4584, + "rewards/chosen": -2.018292682926829, + "rewards/margins": 2.7765791119449656, + "rewards/rejected": -4.794871794871795, + "step": 1830 + }, + { + "epoch": 0.9630986652708715, + "grad_norm": 1.0304378455020775, + "kl": 0.0, + "learning_rate": 4.2915649873310295e-05, + "logits/chosen": -1215928704.0, + "logits/rejected": -1159305600.0, + "logps/chosen": -534.9873417721519, + "logps/rejected": -651.4567901234568, + "loss": 0.4617, + "rewards/chosen": -1.8275316455696202, + "rewards/margins": 1.957190576652602, + "rewards/rejected": -3.7847222222222223, + "step": 1840 + }, + { + "epoch": 0.9683328971473436, + "grad_norm": 1.0604209553629633, + "kl": 0.0, + "learning_rate": 4.2809097043771364e-05, + "logits/chosen": -1071644672.0, + "logits/rejected": -1154482176.0, + "logps/chosen": -656.6864686468647, + "logps/rejected": -724.2255192878338, + "loss": 0.4595, + "rewards/chosen": -3.446575907590759, + "rewards/margins": 0.8627712734181427, + "rewards/rejected": -4.309347181008902, + "step": 1850 + }, + { + "epoch": 0.9735671290238157, + "grad_norm": 0.4643605329633105, + "kl": 0.0, + "learning_rate": 4.270188329857613e-05, + "logits/chosen": -1131203840.0, + "logits/rejected": -1029491904.0, + "logps/chosen": -617.6507936507936, + "logps/rejected": -847.1630769230769, + "loss": 0.446, + "rewards/chosen": -2.8615079365079366, + "rewards/margins": 2.5123382173382174, + "rewards/rejected": -5.373846153846154, + "step": 1860 + }, + { + "epoch": 0.9788013609002879, + "grad_norm": 0.5357454435977848, + "kl": 0.0, + "learning_rate": 4.259401261654746e-05, + "logits/chosen": -1183632640.0, + "logits/rejected": -924424576.0, + "logps/chosen": -689.6457142857142, + "logps/rejected": -862.4551724137931, + "loss": 0.4981, + "rewards/chosen": -3.9153571428571428, + "rewards/margins": 1.6824876847290642, + "rewards/rejected": -5.597844827586207, + "step": 1870 + }, + { + "epoch": 0.98403559277676, + "grad_norm": 0.6560194750573574, + "kl": 0.0, + "learning_rate": 4.248548900088793e-05, + "logits/chosen": -1229979648.0, + "logits/rejected": -977901952.0, + "logps/chosen": -517.6637168141593, + "logps/rejected": -625.5415282392026, + "loss": 0.4687, + "rewards/chosen": -1.6395648967551621, + "rewards/margins": 1.7109334421152695, + "rewards/rejected": -3.3504983388704317, + "step": 1880 + }, + { + "epoch": 0.9892698246532321, + "grad_norm": 0.6967632982234868, + "kl": 0.0, + "learning_rate": 4.2376316479031155e-05, + "logits/chosen": -1214041344.0, + "logits/rejected": -1190133760.0, + "logps/chosen": -488.8488745980707, + "logps/rejected": -551.8784194528876, + "loss": 0.4502, + "rewards/chosen": -1.5696342443729903, + "rewards/margins": 1.1302137799431191, + "rewards/rejected": -2.6998480243161094, + "step": 1890 + }, + { + "epoch": 0.9945040565297043, + "grad_norm": 0.7039052687108839, + "kl": 0.0, + "learning_rate": 4.2266499102492426e-05, + "logits/chosen": -1291216512.0, + "logits/rejected": -957769344.0, + "logps/chosen": -645.5813953488372, + "logps/rejected": -762.0540540540541, + "loss": 0.4773, + "rewards/chosen": -2.9923691860465116, + "rewards/margins": 1.6414990571967314, + "rewards/rejected": -4.633868243243243, + "step": 1900 + }, + { + "epoch": 0.9945040565297043, + "eval_kl": 0.0, + "eval_logits/chosen": -2405932544.0, + "eval_logits/rejected": -2133635840.0, + "eval_logps/chosen": -658.1771400296883, + "eval_logps/rejected": -792.9825957235206, + "eval_loss": 0.46122264862060547, + "eval_rewards/chosen": -3.266081147946561, + "eval_rewards/margins": 1.7145255154050054, + "eval_rewards/rejected": -4.980606663351566, + "eval_runtime": 93.4298, + "eval_samples_per_second": 42.813, + "eval_steps_per_second": 0.674, + "step": 1900 + }, + { + "epoch": 0.9997382884061764, + "grad_norm": 1.64298159620159, + "kl": 0.0, + "learning_rate": 4.215604094671835e-05, + "logits/chosen": -1055286912.0, + "logits/rejected": -1002858112.0, + "logps/chosen": -664.6230529595016, + "logps/rejected": -746.1316614420062, + "loss": 0.4767, + "rewards/chosen": -3.2990654205607477, + "rewards/margins": 1.0191163976210706, + "rewards/rejected": -4.318181818181818, + "step": 1910 + }, + { + "epoch": 1.0049725202826485, + "grad_norm": 1.15544453899332, + "kl": 0.0, + "learning_rate": 4.2044946110935485e-05, + "logits/chosen": -1248015104.0, + "logits/rejected": -1101004800.0, + "logps/chosen": -596.4171779141104, + "logps/rejected": -646.624203821656, + "loss": 0.466, + "rewards/chosen": -2.8730828220858897, + "rewards/margins": 0.8275541205892694, + "rewards/rejected": -3.700636942675159, + "step": 1920 + }, + { + "epoch": 1.0102067521591207, + "grad_norm": 0.7147045746313497, + "kl": 0.0, + "learning_rate": 4.193321871799839e-05, + "logits/chosen": -1404882176.0, + "logits/rejected": -1079613824.0, + "logps/chosen": -597.5141242937854, + "logps/rejected": -692.3636363636364, + "loss": 0.4909, + "rewards/chosen": -2.713276836158192, + "rewards/margins": 1.0104993876180317, + "rewards/rejected": -3.7237762237762237, + "step": 1930 + }, + { + "epoch": 1.0154409840355927, + "grad_norm": 0.3811105029373331, + "kl": 0.0, + "learning_rate": 4.1820862914236495e-05, + "logits/chosen": -1241723648.0, + "logits/rejected": -1168113664.0, + "logps/chosen": -548.6355140186915, + "logps/rejected": -740.4137931034483, + "loss": 0.4364, + "rewards/chosen": -2.3601051401869158, + "rewards/margins": 2.304863511850702, + "rewards/rejected": -4.664968652037618, + "step": 1940 + }, + { + "epoch": 1.020675215912065, + "grad_norm": 0.6064780465316373, + "kl": 0.0, + "learning_rate": 4.170788286930024e-05, + "logits/chosen": -1268567296.0, + "logits/rejected": -1069128064.0, + "logps/chosen": -630.4864864864865, + "logps/rejected": -733.185667752443, + "loss": 0.468, + "rewards/chosen": -2.695846053573104, + "rewards/margins": 1.8546425457754299, + "rewards/rejected": -4.550488599348534, + "step": 1950 + }, + { + "epoch": 1.025909447788537, + "grad_norm": 0.7574936764747783, + "kl": 0.0, + "learning_rate": 4.159428277600641e-05, + "logits/chosen": -1210895616.0, + "logits/rejected": -1113378048.0, + "logps/chosen": -643.8637770897833, + "logps/rejected": -788.3911671924291, + "loss": 0.4486, + "rewards/chosen": -2.7565789473684212, + "rewards/margins": 2.301386352316121, + "rewards/rejected": -5.057965299684542, + "step": 1960 + }, + { + "epoch": 1.031143679665009, + "grad_norm": 0.4407112262867067, + "kl": 0.0, + "learning_rate": 4.1480066850182456e-05, + "logits/chosen": -1225785344.0, + "logits/rejected": -1066401792.0, + "logps/chosen": -553.033033033033, + "logps/rejected": -648.9641693811075, + "loss": 0.4585, + "rewards/chosen": -2.1083881733295797, + "rewards/margins": 1.4005694813935472, + "rewards/rejected": -3.508957654723127, + "step": 1970 + }, + { + "epoch": 1.0363779115414813, + "grad_norm": 0.6105630732861353, + "kl": 0.0, + "learning_rate": 4.1365239330510055e-05, + "logits/chosen": -1200409856.0, + "logits/rejected": -1117152896.0, + "logps/chosen": -577.211356466877, + "logps/rejected": -647.0340557275542, + "loss": 0.4376, + "rewards/chosen": -2.117705047318612, + "rewards/margins": 1.4565983582541433, + "rewards/rejected": -3.5743034055727554, + "step": 1980 + }, + { + "epoch": 1.0416121434179535, + "grad_norm": 0.36902170461352074, + "kl": 0.0, + "learning_rate": 4.1249804478367844e-05, + "logits/chosen": -1086953856.0, + "logits/rejected": -988807168.0, + "logps/chosen": -543.3128834355829, + "logps/rejected": -669.7579617834394, + "loss": 0.4531, + "rewards/chosen": -2.051955521472393, + "rewards/margins": 1.7020253702473522, + "rewards/rejected": -3.753980891719745, + "step": 1990 + }, + { + "epoch": 1.0468463752944255, + "grad_norm": 0.585975080111189, + "kl": 0.0, + "learning_rate": 4.113376657767324e-05, + "logits/chosen": -1178599424.0, + "logits/rejected": -1079823616.0, + "logps/chosen": -552.2674772036474, + "logps/rejected": -588.4501607717042, + "loss": 0.4654, + "rewards/chosen": -2.0672492401215807, + "rewards/margins": 0.9745514029652358, + "rewards/rejected": -3.0418006430868165, + "step": 2000 + }, + { + "epoch": 1.0468463752944255, + "eval_kl": 0.0, + "eval_logits/chosen": -2678495744.0, + "eval_logits/rejected": -2597272832.0, + "eval_logps/chosen": -528.9421078673923, + "eval_logps/rejected": -616.7041272998508, + "eval_loss": 0.4567851424217224, + "eval_rewards/chosen": -1.968951014349332, + "eval_rewards/margins": 1.2454945351285396, + "eval_rewards/rejected": -3.2144455494778716, + "eval_runtime": 93.4154, + "eval_samples_per_second": 42.82, + "eval_steps_per_second": 0.674, + "step": 2000 + }, + { + "epoch": 1.0520806071708977, + "grad_norm": 0.5056456626042156, + "kl": 0.0, + "learning_rate": 4.101712993472348e-05, + "logits/chosen": -1287231872.0, + "logits/rejected": -1250531712.0, + "logps/chosen": -505.0126582278481, + "logps/rejected": -589.0370370370371, + "loss": 0.4375, + "rewards/chosen": -1.6400316455696202, + "rewards/margins": 1.44908872480075, + "rewards/rejected": -3.0891203703703702, + "step": 2010 + }, + { + "epoch": 1.05731483904737, + "grad_norm": 0.5989903113327755, + "kl": 0.0, + "learning_rate": 4.089989887803579e-05, + "logits/chosen": -1364407040.0, + "logits/rejected": -1140011776.0, + "logps/chosen": -436.22028985507245, + "logps/rejected": -529.7898305084746, + "loss": 0.4414, + "rewards/chosen": -1.1168478260869565, + "rewards/margins": 1.1954403095062638, + "rewards/rejected": -2.3122881355932203, + "step": 2020 + }, + { + "epoch": 1.0625490709238419, + "grad_norm": 0.6371162663574992, + "kl": 0.0, + "learning_rate": 4.078207775818677e-05, + "logits/chosen": -1195796096.0, + "logits/rejected": -1192860032.0, + "logps/chosen": -466.33112582781456, + "logps/rejected": -573.6331360946746, + "loss": 0.4137, + "rewards/chosen": -1.283319536423841, + "rewards/margins": 1.705585789019946, + "rewards/rejected": -2.988905325443787, + "step": 2030 + }, + { + "epoch": 1.067783302800314, + "grad_norm": 0.4672351902083702, + "kl": 0.0, + "learning_rate": 4.066367094765091e-05, + "logits/chosen": -1106247680.0, + "logits/rejected": -1035993088.0, + "logps/chosen": -513.1794871794872, + "logps/rejected": -633.560975609756, + "loss": 0.423, + "rewards/chosen": -1.8940304487179487, + "rewards/margins": 1.5206036976235147, + "rewards/rejected": -3.4146341463414633, + "step": 2040 + }, + { + "epoch": 1.073017534676786, + "grad_norm": 0.44456160753496904, + "kl": 0.0, + "learning_rate": 4.054468284063837e-05, + "logits/chosen": -1123758848.0, + "logits/rejected": -1046793408.0, + "logps/chosen": -508.62111801242236, + "logps/rejected": -643.3207547169811, + "loss": 0.4316, + "rewards/chosen": -1.8557841614906831, + "rewards/margins": 1.3828164674401344, + "rewards/rejected": -3.2386006289308176, + "step": 2050 + }, + { + "epoch": 1.0782517665532583, + "grad_norm": 0.5992749413853709, + "kl": 0.0, + "learning_rate": 4.0425117852931854e-05, + "logits/chosen": -1236480768.0, + "logits/rejected": -1104150528.0, + "logps/chosen": -514.0804953560372, + "logps/rejected": -616.378548895899, + "loss": 0.4238, + "rewards/chosen": -1.4851006191950464, + "rewards/margins": 1.8303567940541647, + "rewards/rejected": -3.315457413249211, + "step": 2060 + }, + { + "epoch": 1.0834859984297305, + "grad_norm": 0.7960409622933732, + "kl": 0.0, + "learning_rate": 4.030498042172277e-05, + "logits/chosen": -1191182336.0, + "logits/rejected": -1186358912.0, + "logps/chosen": -446.5407166123779, + "logps/rejected": -573.6936936936937, + "loss": 0.4079, + "rewards/chosen": -1.2974857491856677, + "rewards/margins": 1.6180547913548726, + "rewards/rejected": -2.9155405405405403, + "step": 2070 + }, + { + "epoch": 1.0887202303062025, + "grad_norm": 0.557857832422228, + "kl": 0.0, + "learning_rate": 4.0184275005446536e-05, + "logits/chosen": -1252838656.0, + "logits/rejected": -1044381696.0, + "logps/chosen": -505.81268882175226, + "logps/rejected": -670.6537216828478, + "loss": 0.4405, + "rewards/chosen": -1.959733761329305, + "rewards/margins": 1.5884053972467467, + "rewards/rejected": -3.5481391585760518, + "step": 2080 + }, + { + "epoch": 1.0939544621826747, + "grad_norm": 0.5146387697235528, + "kl": 0.0, + "learning_rate": 4.0063006083617164e-05, + "logits/chosen": -1001390080.0, + "logits/rejected": -814428992.0, + "logps/chosen": -497.4276923076923, + "logps/rejected": -751.2380952380952, + "loss": 0.424, + "rewards/chosen": -2.1334615384615385, + "rewards/margins": 2.7026495726495727, + "rewards/rejected": -4.836111111111111, + "step": 2090 + }, + { + "epoch": 1.0991886940591469, + "grad_norm": 0.7762982381798743, + "kl": 0.0, + "learning_rate": 3.9941178156660956e-05, + "logits/chosen": -943298944.0, + "logits/rejected": -795869184.0, + "logps/chosen": -603.4858934169279, + "logps/rejected": -831.1028037383178, + "loss": 0.4228, + "rewards/chosen": -2.774980407523511, + "rewards/margins": 2.5451130504204142, + "rewards/rejected": -5.320093457943925, + "step": 2100 + }, + { + "epoch": 1.0991886940591469, + "eval_kl": 0.0, + "eval_logits/chosen": -1897423232.0, + "eval_logits/rejected": -1554289280.0, + "eval_logps/chosen": -605.4824344383968, + "eval_logps/rejected": -827.4172053704625, + "eval_loss": 0.4539531171321869, + "eval_rewards/chosen": -2.737877288471054, + "eval_rewards/margins": 2.5883285792564448, + "eval_rewards/rejected": -5.326205867727499, + "eval_runtime": 93.4436, + "eval_samples_per_second": 42.807, + "eval_steps_per_second": 0.674, + "step": 2100 + }, + { + "epoch": 1.1044229259356189, + "grad_norm": 0.5835570550501369, + "kl": 0.0, + "learning_rate": 3.9818795745749544e-05, + "logits/chosen": -910163968.0, + "logits/rejected": -678952960.0, + "logps/chosen": -582.7730061349694, + "logps/rejected": -1006.6751592356688, + "loss": 0.4134, + "rewards/chosen": -2.3824769938650308, + "rewards/margins": 4.701917910593568, + "rewards/rejected": -7.084394904458598, + "step": 2110 + }, + { + "epoch": 1.109657157812091, + "grad_norm": 0.7263499612561425, + "kl": 0.0, + "learning_rate": 3.969586339263209e-05, + "logits/chosen": -940153216.0, + "logits/rejected": -795344896.0, + "logps/chosen": -573.8, + "logps/rejected": -756.2, + "loss": 0.4227, + "rewards/chosen": -2.5900390625, + "rewards/margins": 2.3224609375, + "rewards/rejected": -4.9125, + "step": 2120 + }, + { + "epoch": 1.1148913896885633, + "grad_norm": 0.5624781964454179, + "kl": 0.0, + "learning_rate": 3.9572385659466717e-05, + "logits/chosen": -1081920768.0, + "logits/rejected": -1029491904.0, + "logps/chosen": -538.1132075471698, + "logps/rejected": -698.8322981366459, + "loss": 0.4119, + "rewards/chosen": -2.2749361242138364, + "rewards/margins": 1.6513061118110084, + "rewards/rejected": -3.926242236024845, + "step": 2130 + }, + { + "epoch": 1.1201256215650353, + "grad_norm": 0.42810241625177137, + "kl": 0.0, + "learning_rate": 3.944836712865122e-05, + "logits/chosen": -1106247680.0, + "logits/rejected": -993001472.0, + "logps/chosen": -507.2704402515723, + "logps/rejected": -762.8322981366459, + "loss": 0.3997, + "rewards/chosen": -2.029726808176101, + "rewards/margins": 2.6010961731903586, + "rewards/rejected": -4.630822981366459, + "step": 2140 + }, + { + "epoch": 1.1253598534415075, + "grad_norm": 1.792400030040611, + "kl": 0.0, + "learning_rate": 3.932381240265301e-05, + "logits/chosen": -1087792768.0, + "logits/rejected": -1007681536.0, + "logps/chosen": -620.0774193548388, + "logps/rejected": -853.3333333333334, + "loss": 0.4082, + "rewards/chosen": -2.63125, + "rewards/margins": 2.992234848484848, + "rewards/rejected": -5.623484848484848, + "step": 2150 + }, + { + "epoch": 1.1305940853179797, + "grad_norm": 0.5225809025312501, + "kl": 0.0, + "learning_rate": 3.919872610383831e-05, + "logits/chosen": -1179648000.0, + "logits/rejected": -880384384.0, + "logps/chosen": -584.7447447447447, + "logps/rejected": -784.1563517915309, + "loss": 0.4165, + "rewards/chosen": -2.2351726726726726, + "rewards/margins": 2.7599413338419856, + "rewards/rejected": -4.995114006514658, + "step": 2160 + }, + { + "epoch": 1.1358283171944517, + "grad_norm": 0.5287442378757015, + "kl": 0.0, + "learning_rate": 3.9073112874300574e-05, + "logits/chosen": -1114845952.0, + "logits/rejected": -979369984.0, + "logps/chosen": -434.03095975232196, + "logps/rejected": -636.8706624605678, + "loss": 0.4086, + "rewards/chosen": -1.198843846749226, + "rewards/margins": 1.92773344031702, + "rewards/rejected": -3.126577287066246, + "step": 2170 + }, + { + "epoch": 1.1410625490709239, + "grad_norm": 0.4391631988412161, + "kl": 0.0, + "learning_rate": 3.8946977375688306e-05, + "logits/chosen": -1016384704.0, + "logits/rejected": -800168320.0, + "logps/chosen": -554.1682242990654, + "logps/rejected": -720.3510971786834, + "loss": 0.4097, + "rewards/chosen": -2.169575058411215, + "rewards/margins": 2.07023685381449, + "rewards/rejected": -4.239811912225705, + "step": 2180 + }, + { + "epoch": 1.146296780947396, + "grad_norm": 0.771737814326899, + "kl": 0.0, + "learning_rate": 3.882032428903195e-05, + "logits/chosen": -940992128.0, + "logits/rejected": -778357952.0, + "logps/chosen": -493.84326018808775, + "logps/rejected": -693.4330218068536, + "loss": 0.388, + "rewards/chosen": -1.720807210031348, + "rewards/margins": 2.2090993320247265, + "rewards/rejected": -3.9299065420560746, + "step": 2190 + }, + { + "epoch": 1.151531012823868, + "grad_norm": 0.4987120515896395, + "kl": 0.0, + "learning_rate": 3.869315831457025e-05, + "logits/chosen": -939314368.0, + "logits/rejected": -714289984.0, + "logps/chosen": -538.6504559270517, + "logps/rejected": -637.6334405144695, + "loss": 0.4094, + "rewards/chosen": -1.9659954407294833, + "rewards/margins": 1.4363357489811277, + "rewards/rejected": -3.402331189710611, + "step": 2200 + }, + { + "epoch": 1.151531012823868, + "eval_kl": 0.0, + "eval_logits/chosen": -1938966784.0, + "eval_logits/rejected": -1658281344.0, + "eval_logps/chosen": -495.0578921326076, + "eval_logps/rejected": -585.3565390353058, + "eval_loss": 0.4561484456062317, + "eval_rewards/chosen": -1.6318035625927758, + "eval_rewards/margins": 1.2711054379044893, + "eval_rewards/rejected": -2.902909000497265, + "eval_runtime": 93.3795, + "eval_samples_per_second": 42.836, + "eval_steps_per_second": 0.675, + "step": 2200 + }, + { + "epoch": 1.1567652447003403, + "grad_norm": 0.4808914069280438, + "kl": 0.0, + "learning_rate": 3.856548417157581e-05, + "logits/chosen": -964060800.0, + "logits/rejected": -820196160.0, + "logps/chosen": -439.7037037037037, + "logps/rejected": -554.3291139240506, + "loss": 0.4057, + "rewards/chosen": -1.3296199845679013, + "rewards/margins": 1.5208546989764027, + "rewards/rejected": -2.850474683544304, + "step": 2210 + }, + { + "epoch": 1.1619994765768125, + "grad_norm": 0.5384626355350068, + "kl": 0.0, + "learning_rate": 3.843730659817991e-05, + "logits/chosen": -1157418240.0, + "logits/rejected": -929877184.0, + "logps/chosen": -463.1055900621118, + "logps/rejected": -571.5723270440252, + "loss": 0.3925, + "rewards/chosen": -1.1734763198757765, + "rewards/margins": 1.6975928625141607, + "rewards/rejected": -2.8710691823899372, + "step": 2220 + }, + { + "epoch": 1.1672337084532844, + "grad_norm": 0.9556698752680222, + "kl": 0.0, + "learning_rate": 3.830863035119671e-05, + "logits/chosen": -999502656.0, + "logits/rejected": -930611200.0, + "logps/chosen": -460.6233766233766, + "logps/rejected": -653.2048192771084, + "loss": 0.382, + "rewards/chosen": -1.5374391233766234, + "rewards/margins": 2.0212958163824126, + "rewards/rejected": -3.558734939759036, + "step": 2230 + }, + { + "epoch": 1.1724679403297567, + "grad_norm": 0.6610739790203071, + "kl": 0.0, + "learning_rate": 3.8179460205946717e-05, + "logits/chosen": -844418240.0, + "logits/rejected": -707683968.0, + "logps/chosen": -561.4, + "logps/rejected": -765.4, + "loss": 0.4029, + "rewards/chosen": -2.3671875, + "rewards/margins": 2.4789062499999996, + "rewards/rejected": -4.84609375, + "step": 2240 + }, + { + "epoch": 1.1777021722062289, + "grad_norm": 0.5001407476527855, + "kl": 0.0, + "learning_rate": 3.804980095607955e-05, + "logits/chosen": -905445376.0, + "logits/rejected": -802789760.0, + "logps/chosen": -514.6081504702195, + "logps/rejected": -698.018691588785, + "loss": 0.3878, + "rewards/chosen": -1.7934952978056427, + "rewards/margins": 2.1161620230666314, + "rewards/rejected": -3.9096573208722742, + "step": 2250 + }, + { + "epoch": 1.1829364040827008, + "grad_norm": 0.47433364516008364, + "kl": 0.0, + "learning_rate": 3.791965741339607e-05, + "logits/chosen": -893072192.0, + "logits/rejected": -958503296.0, + "logps/chosen": -432.2787456445993, + "logps/rejected": -619.4220963172804, + "loss": 0.3568, + "rewards/chosen": -0.9600664198606271, + "rewards/margins": 2.1079222486946136, + "rewards/rejected": -3.0679886685552407, + "step": 2260 + }, + { + "epoch": 1.188170635959173, + "grad_norm": 0.5473442952438966, + "kl": 0.0, + "learning_rate": 3.7789034407669754e-05, + "logits/chosen": -1082549888.0, + "logits/rejected": -838441344.0, + "logps/chosen": -452.55727554179566, + "logps/rejected": -616.378548895899, + "loss": 0.3847, + "rewards/chosen": -1.1921439628482973, + "rewards/margins": 2.1154270150696837, + "rewards/rejected": -3.307570977917981, + "step": 2270 + }, + { + "epoch": 1.193404867835645, + "grad_norm": 0.5456062661969855, + "kl": 0.0, + "learning_rate": 3.7657936786467526e-05, + "logits/chosen": -1233335040.0, + "logits/rejected": -1010198144.0, + "logps/chosen": -454.24624624624624, + "logps/rejected": -569.4332247557003, + "loss": 0.3936, + "rewards/chosen": -1.1121199324324325, + "rewards/margins": 1.6916259959063296, + "rewards/rejected": -2.803745928338762, + "step": 2280 + }, + { + "epoch": 1.1986390997121172, + "grad_norm": 0.48831991548633896, + "kl": 0.0, + "learning_rate": 3.752636941496981e-05, + "logits/chosen": -1165177600.0, + "logits/rejected": -1014602112.0, + "logps/chosen": -387.0691823899371, + "logps/rejected": -613.664596273292, + "loss": 0.3634, + "rewards/chosen": -0.813298447327044, + "rewards/margins": 2.093533850809602, + "rewards/rejected": -2.906832298136646, + "step": 2290 + }, + { + "epoch": 1.2038733315885894, + "grad_norm": 1.0066995567188157, + "kl": 0.0, + "learning_rate": 3.739433717578999e-05, + "logits/chosen": -1147351808.0, + "logits/rejected": -1030540480.0, + "logps/chosen": -441.68152866242036, + "logps/rejected": -643.0429447852761, + "loss": 0.3779, + "rewards/chosen": -1.0644904458598725, + "rewards/margins": 2.300540228986753, + "rewards/rejected": -3.3650306748466257, + "step": 2300 + }, + { + "epoch": 1.2038733315885894, + "eval_kl": 0.0, + "eval_logits/chosen": -2399674624.0, + "eval_logits/rejected": -2125380352.0, + "eval_logps/chosen": -514.6284017812964, + "eval_logps/rejected": -627.5882645450025, + "eval_loss": 0.45635154843330383, + "eval_rewards/chosen": -1.8295398317664522, + "eval_rewards/margins": 1.4960444546582123, + "eval_rewards/rejected": -3.3255842864246645, + "eval_runtime": 93.3818, + "eval_samples_per_second": 42.835, + "eval_steps_per_second": 0.675, + "step": 2300 + }, + { + "epoch": 1.2091075634650614, + "grad_norm": 0.5113334468273057, + "kl": 0.0, + "learning_rate": 3.726184496879323e-05, + "logits/chosen": -1007891264.0, + "logits/rejected": -1058013184.0, + "logps/chosen": -467.43624161073825, + "logps/rejected": -666.8538011695906, + "loss": 0.3791, + "rewards/chosen": -1.4866820469798658, + "rewards/margins": 2.1664612278739352, + "rewards/rejected": -3.653143274853801, + "step": 2310 + }, + { + "epoch": 1.2143417953415336, + "grad_norm": 0.6051314218968836, + "kl": 0.0, + "learning_rate": 3.71288977109146e-05, + "logits/chosen": -1114845952.0, + "logits/rejected": -938056064.0, + "logps/chosen": -475.3865030674847, + "logps/rejected": -625.6305732484077, + "loss": 0.4018, + "rewards/chosen": -1.461320935582822, + "rewards/margins": 1.8372459433980697, + "rewards/rejected": -3.2985668789808917, + "step": 2320 + }, + { + "epoch": 1.2195760272180058, + "grad_norm": 0.5399080960512563, + "kl": 0.0, + "learning_rate": 3.699550033597663e-05, + "logits/chosen": -1155950208.0, + "logits/rejected": -1095132800.0, + "logps/chosen": -430.9808917197452, + "logps/rejected": -603.2883435582822, + "loss": 0.358, + "rewards/chosen": -0.7718016023089171, + "rewards/margins": 2.3060358210039666, + "rewards/rejected": -3.0778374233128836, + "step": 2330 + }, + { + "epoch": 1.2248102590944778, + "grad_norm": 0.48019340691437284, + "kl": 0.0, + "learning_rate": 3.686165779450619e-05, + "logits/chosen": -1158047360.0, + "logits/rejected": -988387712.0, + "logps/chosen": -422.5853658536585, + "logps/rejected": -529.7435897435897, + "loss": 0.4038, + "rewards/chosen": -0.8891482469512195, + "rewards/margins": 1.569986368433396, + "rewards/rejected": -2.4591346153846154, + "step": 2340 + }, + { + "epoch": 1.23004449097095, + "grad_norm": 0.5240411276344981, + "kl": 0.0, + "learning_rate": 3.672737505355081e-05, + "logits/chosen": -998873472.0, + "logits/rejected": -825334144.0, + "logps/chosen": -420.0487804878049, + "logps/rejected": -590.0512820512821, + "loss": 0.3863, + "rewards/chosen": -1.0591773056402438, + "rewards/margins": 1.87591884820591, + "rewards/rejected": -2.9350961538461537, + "step": 2350 + }, + { + "epoch": 1.235278722847422, + "grad_norm": 0.5381425730614176, + "kl": 0.0, + "learning_rate": 3.659265709649428e-05, + "logits/chosen": -919391424.0, + "logits/rejected": -660707712.0, + "logps/chosen": -423.07788161993767, + "logps/rejected": -651.6363636363636, + "loss": 0.3566, + "rewards/chosen": -0.8701263142523364, + "rewards/margins": 2.8444113033025222, + "rewards/rejected": -3.714537617554859, + "step": 2360 + }, + { + "epoch": 1.2405129547238942, + "grad_norm": 0.5783093754455919, + "kl": 0.0, + "learning_rate": 3.645750892287178e-05, + "logits/chosen": -942669824.0, + "logits/rejected": -837812224.0, + "logps/chosen": -405.0331125827815, + "logps/rejected": -571.7396449704142, + "loss": 0.3374, + "rewards/chosen": -0.5006467301324503, + "rewards/margins": 2.3114834473823427, + "rewards/rejected": -2.812130177514793, + "step": 2370 + }, + { + "epoch": 1.2457471866003664, + "grad_norm": 0.5404535850119857, + "kl": 0.0, + "learning_rate": 3.632193554818429e-05, + "logits/chosen": -935329792.0, + "logits/rejected": -691640704.0, + "logps/chosen": -462.44025157232704, + "logps/rejected": -624.695652173913, + "loss": 0.369, + "rewards/chosen": -0.691011056960004, + "rewards/margins": 2.7880262101207416, + "rewards/rejected": -3.4790372670807455, + "step": 2380 + }, + { + "epoch": 1.2509814184768384, + "grad_norm": 0.4591882116536945, + "kl": 0.0, + "learning_rate": 3.6185942003712515e-05, + "logits/chosen": -927150912.0, + "logits/rejected": -722468864.0, + "logps/chosen": -416.6808510638298, + "logps/rejected": -663.7684887459807, + "loss": 0.3684, + "rewards/chosen": -0.9160809270516718, + "rewards/margins": 2.778452835006206, + "rewards/rejected": -3.694533762057878, + "step": 2390 + }, + { + "epoch": 1.2562156503533106, + "grad_norm": 0.6102727110509801, + "kl": 0.0, + "learning_rate": 3.604953333633009e-05, + "logits/chosen": -955252736.0, + "logits/rejected": -769235328.0, + "logps/chosen": -464.283185840708, + "logps/rejected": -666.6843853820598, + "loss": 0.3731, + "rewards/chosen": -1.2830360435103245, + "rewards/margins": 2.393458973100971, + "rewards/rejected": -3.6764950166112955, + "step": 2400 + }, + { + "epoch": 1.2562156503533106, + "eval_kl": 0.0, + "eval_logits/chosen": -1886105216.0, + "eval_logits/rejected": -1661210752.0, + "eval_logps/chosen": -496.7679366650173, + "eval_logps/rejected": -605.2153157633019, + "eval_loss": 0.4578281342983246, + "eval_rewards/chosen": -1.6493072736269174, + "eval_rewards/margins": 1.4530050088196265, + "eval_rewards/rejected": -3.102312282446544, + "eval_runtime": 93.4183, + "eval_samples_per_second": 42.818, + "eval_steps_per_second": 0.674, + "step": 2400 + }, + { + "epoch": 1.2614498822297828, + "grad_norm": 0.5419994441020899, + "kl": 0.0, + "learning_rate": 3.5912714608316346e-05, + "logits/chosen": -832464512.0, + "logits/rejected": -718484288.0, + "logps/chosen": -455.2207792207792, + "logps/rejected": -698.5060240963855, + "loss": 0.3477, + "rewards/chosen": -1.0582830255681819, + "rewards/margins": 3.069729022624589, + "rewards/rejected": -4.128012048192771, + "step": 2410 + }, + { + "epoch": 1.2666841141062548, + "grad_norm": 1.1041391753234233, + "kl": 0.0, + "learning_rate": 3.577549089716845e-05, + "logits/chosen": -842425984.0, + "logits/rejected": -619184128.0, + "logps/chosen": -402.8553846153846, + "logps/rejected": -700.4444444444445, + "loss": 0.3471, + "rewards/chosen": -0.9087079326923077, + "rewards/margins": 3.2742285752442, + "rewards/rejected": -4.182936507936508, + "step": 2420 + }, + { + "epoch": 1.271918345982727, + "grad_norm": 0.4627953236624642, + "kl": 0.0, + "learning_rate": 3.56378672954129e-05, + "logits/chosen": -896532480.0, + "logits/rejected": -730123456.0, + "logps/chosen": -487.7770897832817, + "logps/rejected": -696.429022082019, + "loss": 0.3557, + "rewards/chosen": -1.3471120356037152, + "rewards/margins": 2.7372728224404486, + "rewards/rejected": -4.084384858044164, + "step": 2430 + }, + { + "epoch": 1.2771525778591992, + "grad_norm": 0.5819819288924131, + "kl": 0.0, + "learning_rate": 3.5499848910416646e-05, + "logits/chosen": -1001390080.0, + "logits/rejected": -815267840.0, + "logps/chosen": -421.08868501529054, + "logps/rejected": -678.5431309904153, + "loss": 0.3453, + "rewards/chosen": -0.9145940844801224, + "rewards/margins": 3.003137544912849, + "rewards/rejected": -3.9177316293929714, + "step": 2440 + }, + { + "epoch": 1.2823868097356712, + "grad_norm": 1.2888270938532067, + "kl": 0.0, + "learning_rate": 3.536144086419744e-05, + "logits/chosen": -1004326080.0, + "logits/rejected": -833775232.0, + "logps/chosen": -406.111801242236, + "logps/rejected": -582.5408805031446, + "loss": 0.3863, + "rewards/chosen": -1.0681774068322982, + "rewards/margins": 1.7191653604632993, + "rewards/rejected": -2.7873427672955975, + "step": 2450 + }, + { + "epoch": 1.2876210416121434, + "grad_norm": 0.8224405943194626, + "kl": 0.0, + "learning_rate": 3.522264829323381e-05, + "logits/chosen": -865494656.0, + "logits/rejected": -794296320.0, + "logps/chosen": -459.9225806451613, + "logps/rejected": -565.7212121212121, + "loss": 0.4008, + "rewards/chosen": -1.0576612903225806, + "rewards/margins": 1.7438538611925711, + "rewards/rejected": -2.8015151515151517, + "step": 2460 + }, + { + "epoch": 1.2928552734886156, + "grad_norm": 0.6298545868105269, + "kl": 0.0, + "learning_rate": 3.5083476348274454e-05, + "logits/chosen": -741657792.0, + "logits/rejected": -663958336.0, + "logps/chosen": -455.0769230769231, + "logps/rejected": -694.6341463414634, + "loss": 0.3556, + "rewards/chosen": -1.2873347355769231, + "rewards/margins": 2.8254701424718576, + "rewards/rejected": -4.112804878048781, + "step": 2470 + }, + { + "epoch": 1.2980895053650876, + "grad_norm": 1.120319476468608, + "kl": 0.0, + "learning_rate": 3.494393019414704e-05, + "logits/chosen": -760217600.0, + "logits/rejected": -618187968.0, + "logps/chosen": -449.5686274509804, + "logps/rejected": -713.4850299401197, + "loss": 0.3475, + "rewards/chosen": -1.2301368464052287, + "rewards/margins": 3.027348183534891, + "rewards/rejected": -4.25748502994012, + "step": 2480 + }, + { + "epoch": 1.3033237372415598, + "grad_norm": 0.8833961844092323, + "kl": 0.0, + "learning_rate": 3.480401500956657e-05, + "logits/chosen": -747529856.0, + "logits/rejected": -612892672.0, + "logps/chosen": -461.81366459627327, + "logps/rejected": -668.5786163522013, + "loss": 0.368, + "rewards/chosen": -1.3041537267080745, + "rewards/margins": 2.2583462732919255, + "rewards/rejected": -3.5625, + "step": 2490 + }, + { + "epoch": 1.308557969118032, + "grad_norm": 0.7959973438288277, + "kl": 0.0, + "learning_rate": 3.4663735986943194e-05, + "logits/chosen": -723622272.0, + "logits/rejected": -588146304.0, + "logps/chosen": -441.44, + "logps/rejected": -676.8941176470588, + "loss": 0.3538, + "rewards/chosen": -1.0112760416666666, + "rewards/margins": 2.8769592524509804, + "rewards/rejected": -3.888235294117647, + "step": 2500 + }, + { + "epoch": 1.308557969118032, + "eval_kl": 0.0, + "eval_logits/chosen": -1523131520.0, + "eval_logits/rejected": -1173539584.0, + "eval_logps/chosen": -520.2968827313211, + "eval_logps/rejected": -641.4639482844356, + "eval_loss": 0.45775389671325684, + "eval_rewards/chosen": -1.8840920336467095, + "eval_rewards/margins": 1.5788617207043598, + "eval_rewards/rejected": -3.4629537543510693, + "eval_runtime": 93.4179, + "eval_samples_per_second": 42.818, + "eval_steps_per_second": 0.674, + "step": 2500 + }, + { + "epoch": 1.313792200994504, + "grad_norm": 0.6111285044739223, + "kl": 0.0, + "learning_rate": 3.452309833218948e-05, + "logits/chosen": -867696640.0, + "logits/rejected": -645083968.0, + "logps/chosen": -516.1823708206687, + "logps/rejected": -749.9935691318328, + "loss": 0.3563, + "rewards/chosen": -1.453647416413374, + "rewards/margins": 2.949889561078587, + "rewards/rejected": -4.403536977491961, + "step": 2510 + }, + { + "epoch": 1.3190264328709762, + "grad_norm": 1.1367797934531445, + "kl": 0.0, + "learning_rate": 3.438210726452724e-05, + "logits/chosen": -850499968.0, + "logits/rejected": -654416256.0, + "logps/chosen": -392.9051987767584, + "logps/rejected": -761.0479233226837, + "loss": 0.3216, + "rewards/chosen": -0.7394997133027523, + "rewards/margins": 3.7509156221605062, + "rewards/rejected": -4.4904153354632586, + "step": 2520 + }, + { + "epoch": 1.3242606647474484, + "grad_norm": 0.6046826859018436, + "kl": 0.0, + "learning_rate": 3.424076801629387e-05, + "logits/chosen": -920649728.0, + "logits/rejected": -852072832.0, + "logps/chosen": -417.80204778156997, + "logps/rejected": -634.3746397694524, + "loss": 0.3214, + "rewards/chosen": -0.8054074232081911, + "rewards/margins": 2.649923988895555, + "rewards/rejected": -3.4553314121037464, + "step": 2530 + }, + { + "epoch": 1.3294948966239204, + "grad_norm": 0.6290047351078182, + "kl": 0.0, + "learning_rate": 3.4099085832748095e-05, + "logits/chosen": -1099327104.0, + "logits/rejected": -1017118720.0, + "logps/chosen": -407.2025723472669, + "logps/rejected": -598.0790273556231, + "loss": 0.3594, + "rewards/chosen": -0.7732993368167203, + "rewards/margins": 2.3102872893230977, + "rewards/rejected": -3.0835866261398177, + "step": 2540 + }, + { + "epoch": 1.3347291285003926, + "grad_norm": 0.7278685039353052, + "kl": 0.0, + "learning_rate": 3.395706597187538e-05, + "logits/chosen": -1117572352.0, + "logits/rejected": -899887936.0, + "logps/chosen": -432.34890965732086, + "logps/rejected": -595.8620689655172, + "loss": 0.3615, + "rewards/chosen": -0.8768983644859814, + "rewards/margins": 2.3605624505610407, + "rewards/rejected": -3.237460815047022, + "step": 2550 + }, + { + "epoch": 1.3399633603768648, + "grad_norm": 0.5868735607220685, + "kl": 0.0, + "learning_rate": 3.381471370419278e-05, + "logits/chosen": -1081501312.0, + "logits/rejected": -898944192.0, + "logps/chosen": -441.9240506329114, + "logps/rejected": -697.4814814814815, + "loss": 0.3362, + "rewards/chosen": -0.9524698378164557, + "rewards/margins": 2.9780857177391, + "rewards/rejected": -3.9305555555555554, + "step": 2560 + }, + { + "epoch": 1.3451975922533368, + "grad_norm": 1.3113360831377077, + "kl": 0.0, + "learning_rate": 3.3672034312553326e-05, + "logits/chosen": -1024039296.0, + "logits/rejected": -698561344.0, + "logps/chosen": -476.35692307692307, + "logps/rejected": -833.3206349206349, + "loss": 0.3546, + "rewards/chosen": -1.4344711538461539, + "rewards/margins": 4.108782814407815, + "rewards/rejected": -5.5432539682539685, + "step": 2570 + }, + { + "epoch": 1.350431824129809, + "grad_norm": 0.5501904923995546, + "kl": 0.0, + "learning_rate": 3.352903309194999e-05, + "logits/chosen": -974022272.0, + "logits/rejected": -782657152.0, + "logps/chosen": -445.72168284789643, + "logps/rejected": -795.6495468277946, + "loss": 0.3294, + "rewards/chosen": -1.3428398058252426, + "rewards/margins": 3.5846526413046664, + "rewards/rejected": -4.927492447129909, + "step": 2580 + }, + { + "epoch": 1.3556660560062812, + "grad_norm": 0.5726718370323468, + "kl": 0.0, + "learning_rate": 3.338571534931919e-05, + "logits/chosen": -1016804160.0, + "logits/rejected": -867172352.0, + "logps/chosen": -430.8724832214765, + "logps/rejected": -744.6081871345029, + "loss": 0.3272, + "rewards/chosen": -1.143089345637584, + "rewards/margins": 3.473869718689901, + "rewards/rejected": -4.616959064327485, + "step": 2590 + }, + { + "epoch": 1.3609002878827532, + "grad_norm": 0.5016181108370353, + "kl": 0.0, + "learning_rate": 3.324208640334383e-05, + "logits/chosen": -1298556544.0, + "logits/rejected": -899258752.0, + "logps/chosen": -419.6507042253521, + "logps/rejected": -713.5438596491229, + "loss": 0.3699, + "rewards/chosen": -0.999768926056338, + "rewards/margins": 3.238827565171732, + "rewards/rejected": -4.23859649122807, + "step": 2600 + }, + { + "epoch": 1.3609002878827532, + "eval_kl": 0.0, + "eval_logits/chosen": -2620041984.0, + "eval_logits/rejected": -2228706560.0, + "eval_logps/chosen": -464.24542305789214, + "eval_logps/rejected": -571.5126802585778, + "eval_loss": 0.4552265703678131, + "eval_rewards/chosen": -1.3249628896585848, + "eval_rewards/margins": 1.4383389502220716, + "eval_rewards/rejected": -2.7633018398806564, + "eval_runtime": 93.4503, + "eval_samples_per_second": 42.804, + "eval_steps_per_second": 0.674, + "step": 2600 + }, + { + "epoch": 1.3661345197592254, + "grad_norm": 0.5441867816476207, + "kl": 0.0, + "learning_rate": 3.309815158425591e-05, + "logits/chosen": -1281359872.0, + "logits/rejected": -1104989440.0, + "logps/chosen": -386.2085889570552, + "logps/rejected": -610.0382165605096, + "loss": 0.3322, + "rewards/chosen": -0.46640145705521474, + "rewards/margins": 2.5964966321167595, + "rewards/rejected": -3.0628980891719744, + "step": 2610 + }, + { + "epoch": 1.3713687516356974, + "grad_norm": 0.6345683057432078, + "kl": 0.0, + "learning_rate": 3.295391623363874e-05, + "logits/chosen": -1280730752.0, + "logits/rejected": -1039243648.0, + "logps/chosen": -412.21183800623055, + "logps/rejected": -631.9749216300941, + "loss": 0.3526, + "rewards/chosen": -0.6753796728971962, + "rewards/margins": 2.6251689164444967, + "rewards/rejected": -3.300548589341693, + "step": 2620 + }, + { + "epoch": 1.3766029835121696, + "grad_norm": 0.5674215327512067, + "kl": 0.0, + "learning_rate": 3.280938570422869e-05, + "logits/chosen": -1282198784.0, + "logits/rejected": -923376000.0, + "logps/chosen": -377.22352941176473, + "logps/rejected": -641.6, + "loss": 0.3667, + "rewards/chosen": -0.6543198529411764, + "rewards/margins": 2.8973468137254903, + "rewards/rejected": -3.5516666666666667, + "step": 2630 + }, + { + "epoch": 1.3818372153886418, + "grad_norm": 0.5205232170690738, + "kl": 0.0, + "learning_rate": 3.266456535971654e-05, + "logits/chosen": -1084856704.0, + "logits/rejected": -985241984.0, + "logps/chosen": -458.17704918032786, + "logps/rejected": -665.9820895522388, + "loss": 0.3498, + "rewards/chosen": -0.9981045081967214, + "rewards/margins": 2.5884626559823833, + "rewards/rejected": -3.5865671641791046, + "step": 2640 + }, + { + "epoch": 1.3870714472651138, + "grad_norm": 0.6619442824934767, + "kl": 0.0, + "learning_rate": 3.2519460574548435e-05, + "logits/chosen": -1018586752.0, + "logits/rejected": -875770688.0, + "logps/chosen": -436.0883280757098, + "logps/rejected": -744.5201238390093, + "loss": 0.3679, + "rewards/chosen": -0.9814175867507886, + "rewards/margins": 3.5220653853854342, + "rewards/rejected": -4.503482972136223, + "step": 2650 + }, + { + "epoch": 1.392305679141586, + "grad_norm": 0.48346469367063, + "kl": 0.0, + "learning_rate": 3.237407673372644e-05, + "logits/chosen": -1107086592.0, + "logits/rejected": -822922432.0, + "logps/chosen": -461.2544378698225, + "logps/rejected": -731.2317880794702, + "loss": 0.3577, + "rewards/chosen": -1.0839959319526626, + "rewards/margins": 3.2388517501665426, + "rewards/rejected": -4.322847682119205, + "step": 2660 + }, + { + "epoch": 1.397539911018058, + "grad_norm": 0.8678937627006628, + "kl": 0.0, + "learning_rate": 3.2228419232608695e-05, + "logits/chosen": -1117991680.0, + "logits/rejected": -744593792.0, + "logps/chosen": -416.094674556213, + "logps/rejected": -673.1655629139073, + "loss": 0.3578, + "rewards/chosen": -0.8156954974112426, + "rewards/margins": 3.1851323171582937, + "rewards/rejected": -4.000827814569536, + "step": 2670 + }, + { + "epoch": 1.4027741428945302, + "grad_norm": 0.658458342613859, + "kl": 0.0, + "learning_rate": 3.208249347670917e-05, + "logits/chosen": -944347520.0, + "logits/rejected": -722993152.0, + "logps/chosen": -423.54430379746833, + "logps/rejected": -716.4444444444445, + "loss": 0.3536, + "rewards/chosen": -1.0082946004746836, + "rewards/margins": 3.232446140266057, + "rewards/rejected": -4.2407407407407405, + "step": 2680 + }, + { + "epoch": 1.4080083747710024, + "grad_norm": 0.4856064293626971, + "kl": 0.0, + "learning_rate": 3.1936304881497084e-05, + "logits/chosen": -867801472.0, + "logits/rejected": -710410240.0, + "logps/chosen": -476.8888888888889, + "logps/rejected": -697.7215189873418, + "loss": 0.3519, + "rewards/chosen": -1.3734809027777777, + "rewards/margins": 2.6538134010196908, + "rewards/rejected": -4.0272943037974684, + "step": 2690 + }, + { + "epoch": 1.4132426066474744, + "grad_norm": 0.5589508309469322, + "kl": 0.0, + "learning_rate": 3.178985887219589e-05, + "logits/chosen": -790836032.0, + "logits/rejected": -627992192.0, + "logps/chosen": -495.3268608414239, + "logps/rejected": -932.9305135951662, + "loss": 0.3293, + "rewards/chosen": -1.562626415857605, + "rewards/margins": 4.543869052420341, + "rewards/rejected": -6.106495468277946, + "step": 2700 + }, + { + "epoch": 1.4132426066474744, + "eval_kl": 0.0, + "eval_logits/chosen": -1547698176.0, + "eval_logits/rejected": -1145178112.0, + "eval_logps/chosen": -559.4695695200396, + "eval_logps/rejected": -805.1397314768772, + "eval_loss": 0.45078906416893005, + "eval_rewards/chosen": -2.276039089559624, + "eval_rewards/margins": 2.8229166538516144, + "eval_rewards/rejected": -5.0989557434112385, + "eval_runtime": 93.4389, + "eval_samples_per_second": 42.809, + "eval_steps_per_second": 0.674, + "step": 2700 + }, + { + "epoch": 1.4184768385239466, + "grad_norm": 0.47458066615925315, + "kl": 0.0, + "learning_rate": 3.164316088358201e-05, + "logits/chosen": -779511424.0, + "logits/rejected": -455029568.0, + "logps/chosen": -558.780487804878, + "logps/rejected": -872.3076923076923, + "loss": 0.3588, + "rewards/chosen": -2.2006538205030486, + "rewards/margins": 4.011685923086695, + "rewards/rejected": -6.212339743589744, + "step": 2710 + }, + { + "epoch": 1.4237110704004188, + "grad_norm": 0.4805620768604118, + "kl": 0.0, + "learning_rate": 3.149621635978309e-05, + "logits/chosen": -705901376.0, + "logits/rejected": -523239424.0, + "logps/chosen": -486.81290322580645, + "logps/rejected": -1012.6545454545454, + "loss": 0.3358, + "rewards/chosen": -1.6076108870967742, + "rewards/margins": 5.625722446236559, + "rewards/rejected": -7.233333333333333, + "step": 2720 + }, + { + "epoch": 1.4289453022768908, + "grad_norm": 0.8604064532109258, + "kl": 0.0, + "learning_rate": 3.134903075407594e-05, + "logits/chosen": -661232000.0, + "logits/rejected": -641099392.0, + "logps/chosen": -477.7304964539007, + "logps/rejected": -869.8994413407821, + "loss": 0.3028, + "rewards/chosen": -1.3523936170212767, + "rewards/margins": 4.299142695827885, + "rewards/rejected": -5.651536312849162, + "step": 2730 + }, + { + "epoch": 1.434179534153363, + "grad_norm": 0.5740359557647091, + "kl": 0.0, + "learning_rate": 3.120160952868424e-05, + "logits/chosen": -802999488.0, + "logits/rejected": -629722304.0, + "logps/chosen": -500.75949367088606, + "logps/rejected": -868.3456790123457, + "loss": 0.3432, + "rewards/chosen": -1.3324515427215189, + "rewards/margins": 4.25859783999453, + "rewards/rejected": -5.591049382716049, + "step": 2740 + }, + { + "epoch": 1.4394137660298352, + "grad_norm": 0.8150577225591913, + "kl": 0.0, + "learning_rate": 3.1053958154575743e-05, + "logits/chosen": -833093632.0, + "logits/rejected": -635437056.0, + "logps/chosen": -506.3619047619048, + "logps/rejected": -867.0523076923076, + "loss": 0.3312, + "rewards/chosen": -1.4729910714285714, + "rewards/margins": 4.43931662087912, + "rewards/rejected": -5.912307692307692, + "step": 2750 + }, + { + "epoch": 1.4446479979063072, + "grad_norm": 0.4811412127606036, + "kl": 0.0, + "learning_rate": 3.090608211125931e-05, + "logits/chosen": -878496960.0, + "logits/rejected": -652424000.0, + "logps/chosen": -534.9380530973451, + "logps/rejected": -947.3488372093024, + "loss": 0.3638, + "rewards/chosen": -1.9941694321533923, + "rewards/margins": 4.019119604391458, + "rewards/rejected": -6.01328903654485, + "step": 2760 + }, + { + "epoch": 1.4498822297827794, + "grad_norm": 0.5558741046389223, + "kl": 0.0, + "learning_rate": 3.0757986886581506e-05, + "logits/chosen": -793247744.0, + "logits/rejected": -655150272.0, + "logps/chosen": -435.3605015673981, + "logps/rejected": -711.3769470404984, + "loss": 0.3391, + "rewards/chosen": -0.9830899133951313, + "rewards/margins": 3.080773014953779, + "rewards/rejected": -4.06386292834891, + "step": 2770 + }, + { + "epoch": 1.4551164616592516, + "grad_norm": 0.6805099501290808, + "kl": 0.0, + "learning_rate": 3.060967797652299e-05, + "logits/chosen": -764516736.0, + "logits/rejected": -620652160.0, + "logps/chosen": -382.8930817610063, + "logps/rejected": -623.0062111801242, + "loss": 0.3268, + "rewards/chosen": -0.38918779481132076, + "rewards/margins": 3.0409364287911633, + "rewards/rejected": -3.4301242236024843, + "step": 2780 + }, + { + "epoch": 1.4603506935357236, + "grad_norm": 0.4985713683174277, + "kl": 0.0, + "learning_rate": 3.046116088499449e-05, + "logits/chosen": -821035008.0, + "logits/rejected": -718589120.0, + "logps/chosen": -373.3003095975232, + "logps/rejected": -633.2365930599369, + "loss": 0.3369, + "rewards/chosen": -0.4190208978328173, + "rewards/margins": 2.7556636447539335, + "rewards/rejected": -3.1746845425867507, + "step": 2790 + }, + { + "epoch": 1.4655849254121958, + "grad_norm": 0.5357398173690253, + "kl": 0.0, + "learning_rate": 3.0312441123632607e-05, + "logits/chosen": -841691968.0, + "logits/rejected": -659973760.0, + "logps/chosen": -414.20512820512823, + "logps/rejected": -630.5365853658536, + "loss": 0.3376, + "rewards/chosen": -0.7527919671474359, + "rewards/margins": 2.662604374315979, + "rewards/rejected": -3.4153963414634148, + "step": 2800 + }, + { + "epoch": 1.4655849254121958, + "eval_kl": 0.0, + "eval_logits/chosen": -1543770240.0, + "eval_logits/rejected": -1201035648.0, + "eval_logps/chosen": -495.72290945076696, + "eval_logps/rejected": -643.6280457483839, + "eval_loss": 0.45756250619888306, + "eval_rewards/chosen": -1.6387926768926273, + "eval_rewards/margins": 1.8452948417548116, + "eval_rewards/rejected": -3.484087518647439, + "eval_runtime": 93.4359, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 2800 + }, + { + "epoch": 1.470819157288668, + "grad_norm": 1.0632107189982902, + "kl": 0.0, + "learning_rate": 3.0163524211595257e-05, + "logits/chosen": -742706368.0, + "logits/rejected": -504207776.0, + "logps/chosen": -471.7546012269939, + "logps/rejected": -858.3949044585987, + "loss": 0.3515, + "rewards/chosen": -1.2275450536809815, + "rewards/margins": 4.451595073707553, + "rewards/rejected": -5.679140127388535, + "step": 2810 + }, + { + "epoch": 1.47605338916514, + "grad_norm": 0.6066090073258897, + "kl": 0.0, + "learning_rate": 3.001441567535681e-05, + "logits/chosen": -518992704.0, + "logits/rejected": -481768256.0, + "logps/chosen": -463.1225806451613, + "logps/rejected": -838.7878787878788, + "loss": 0.3447, + "rewards/chosen": -1.4911290322580646, + "rewards/margins": 3.7975073313782994, + "rewards/rejected": -5.288636363636364, + "step": 2820 + }, + { + "epoch": 1.4812876210416122, + "grad_norm": 0.5724306368429418, + "kl": 0.0, + "learning_rate": 2.9865121048503052e-05, + "logits/chosen": -565706752.0, + "logits/rejected": -314074720.0, + "logps/chosen": -527.854103343465, + "logps/rejected": -860.1929260450161, + "loss": 0.3691, + "rewards/chosen": -2.0718085106382977, + "rewards/margins": 3.8060049941848533, + "rewards/rejected": -5.877813504823151, + "step": 2830 + }, + { + "epoch": 1.4865218529180844, + "grad_norm": 0.8133276454135758, + "kl": 0.0, + "learning_rate": 2.971564587152579e-05, + "logits/chosen": -629669888.0, + "logits/rejected": -469054272.0, + "logps/chosen": -481.6385542168675, + "logps/rejected": -840.9350649350649, + "loss": 0.3583, + "rewards/chosen": -1.8020519578313252, + "rewards/margins": 3.5843116785323117, + "rewards/rejected": -5.386363636363637, + "step": 2840 + }, + { + "epoch": 1.4917560847945563, + "grad_norm": 0.6631153686584916, + "kl": 0.0, + "learning_rate": 2.9565995691617242e-05, + "logits/chosen": -809290944.0, + "logits/rejected": -725247616.0, + "logps/chosen": -451.44370860927154, + "logps/rejected": -709.207100591716, + "loss": 0.3432, + "rewards/chosen": -1.1947304428807948, + "rewards/margins": 3.027163048243465, + "rewards/rejected": -4.22189349112426, + "step": 2850 + }, + { + "epoch": 1.4969903166710286, + "grad_norm": 0.5188461507645532, + "kl": 0.0, + "learning_rate": 2.9416176062464207e-05, + "logits/chosen": -850185408.0, + "logits/rejected": -705377088.0, + "logps/chosen": -502.1575757575758, + "logps/rejected": -739.0451612903225, + "loss": 0.3496, + "rewards/chosen": -1.5047585227272726, + "rewards/margins": 3.0883866385630494, + "rewards/rejected": -4.593145161290322, + "step": 2860 + }, + { + "epoch": 1.5022245485475008, + "grad_norm": 0.5151500356381172, + "kl": 0.0, + "learning_rate": 2.9266192544041916e-05, + "logits/chosen": -836973376.0, + "logits/rejected": -678848128.0, + "logps/chosen": -388.55757575757576, + "logps/rejected": -746.0129032258064, + "loss": 0.3352, + "rewards/chosen": -0.7599668560606061, + "rewards/margins": 4.0009202407135875, + "rewards/rejected": -4.760887096774193, + "step": 2870 + }, + { + "epoch": 1.5074587804239727, + "grad_norm": 0.8504634704689563, + "kl": 0.0, + "learning_rate": 2.9116050702407703e-05, + "logits/chosen": -775736512.0, + "logits/rejected": -702231360.0, + "logps/chosen": -488.4025157232704, + "logps/rejected": -753.6894409937888, + "loss": 0.352, + "rewards/chosen": -1.600051591981132, + "rewards/margins": 3.076190644043712, + "rewards/rejected": -4.676242236024844, + "step": 2880 + }, + { + "epoch": 1.512693012300445, + "grad_norm": 0.7155406298950329, + "kl": 0.0, + "learning_rate": 2.8965756109494485e-05, + "logits/chosen": -772695680.0, + "logits/rejected": -694891328.0, + "logps/chosen": -536.4528301886793, + "logps/rejected": -841.4409937888199, + "loss": 0.3432, + "rewards/chosen": -1.9663423742138364, + "rewards/margins": 3.4738750170905113, + "rewards/rejected": -5.440217391304348, + "step": 2890 + }, + { + "epoch": 1.5179272441769172, + "grad_norm": 0.6035091088536375, + "kl": 0.0, + "learning_rate": 2.8815314342903948e-05, + "logits/chosen": -787270848.0, + "logits/rejected": -597321344.0, + "logps/chosen": -602.1204819277109, + "logps/rejected": -968.5194805194806, + "loss": 0.3545, + "rewards/chosen": -2.746046686746988, + "rewards/margins": 4.1265182483179474, + "rewards/rejected": -6.872564935064935, + "step": 2900 + }, + { + "epoch": 1.5179272441769172, + "eval_kl": 0.0, + "eval_logits/chosen": -1516407296.0, + "eval_logits/rejected": -1264482816.0, + "eval_logps/chosen": -736.205838693716, + "eval_logps/rejected": -1167.2123321730483, + "eval_loss": 0.45225000381469727, + "eval_rewards/chosen": -4.043295398317665, + "eval_rewards/margins": 4.673263527589843, + "eval_rewards/rejected": -8.716558925907508, + "eval_runtime": 93.433, + "eval_samples_per_second": 42.811, + "eval_steps_per_second": 0.674, + "step": 2900 + }, + { + "epoch": 1.5231614760533891, + "grad_norm": 0.6807872514271341, + "kl": 0.0, + "learning_rate": 2.8664730985699534e-05, + "logits/chosen": -689438720.0, + "logits/rejected": -629669888.0, + "logps/chosen": -672.8971962616822, + "logps/rejected": -1098.5329153605016, + "loss": 0.352, + "rewards/chosen": -3.6897877725856696, + "rewards/margins": 4.076669907665114, + "rewards/rejected": -7.766457680250784, + "step": 2910 + }, + { + "epoch": 1.5283957079298613, + "grad_norm": 1.004264687962567, + "kl": 0.0, + "learning_rate": 2.851401162619929e-05, + "logits/chosen": -740504384.0, + "logits/rejected": -671612928.0, + "logps/chosen": -581.166144200627, + "logps/rejected": -1205.632398753894, + "loss": 0.34, + "rewards/chosen": -2.5907866379310347, + "rewards/margins": 6.76785822188205, + "rewards/rejected": -9.358644859813085, + "step": 2920 + }, + { + "epoch": 1.5336299398063336, + "grad_norm": 0.5686639443419601, + "kl": 0.0, + "learning_rate": 2.836316185776846e-05, + "logits/chosen": -855742848.0, + "logits/rejected": -828794496.0, + "logps/chosen": -669.1282051282051, + "logps/rejected": -1336.3902439024391, + "loss": 0.341, + "rewards/chosen": -3.5234375, + "rewards/margins": 6.859184451219512, + "rewards/rejected": -10.382621951219512, + "step": 2930 + }, + { + "epoch": 1.5388641716828055, + "grad_norm": 1.0145324073512791, + "kl": 0.0, + "learning_rate": 2.8212187278611906e-05, + "logits/chosen": -950429312.0, + "logits/rejected": -835610240.0, + "logps/chosen": -832.0984615384615, + "logps/rejected": -1331.911111111111, + "loss": 0.392, + "rewards/chosen": -5.184615384615385, + "rewards/margins": 5.361813186813186, + "rewards/rejected": -10.54642857142857, + "step": 2940 + }, + { + "epoch": 1.5440984035592775, + "grad_norm": 0.6405221763005252, + "kl": 0.0, + "learning_rate": 2.8061093491566364e-05, + "logits/chosen": -961124736.0, + "logits/rejected": -918972032.0, + "logps/chosen": -732.5749235474007, + "logps/rejected": -1326.3130990415336, + "loss": 0.3448, + "rewards/chosen": -3.912461773700306, + "rewards/margins": 6.49728263524538, + "rewards/rejected": -10.409744408945686, + "step": 2950 + }, + { + "epoch": 1.54933263543575, + "grad_norm": 0.54622495653271, + "kl": 0.0, + "learning_rate": 2.7909886103892508e-05, + "logits/chosen": -992372352.0, + "logits/rejected": -994469504.0, + "logps/chosen": -473.14285714285717, + "logps/rejected": -818.3138461538462, + "loss": 0.3316, + "rewards/chosen": -1.2914186507936507, + "rewards/margins": 4.099735195360195, + "rewards/rejected": -5.391153846153846, + "step": 2960 + }, + { + "epoch": 1.554566867312222, + "grad_norm": 0.5518051451207503, + "kl": 0.0, + "learning_rate": 2.775857072706684e-05, + "logits/chosen": -1027185024.0, + "logits/rejected": -1086324736.0, + "logps/chosen": -461.2987012987013, + "logps/rejected": -758.3614457831326, + "loss": 0.3422, + "rewards/chosen": -1.1163250811688312, + "rewards/margins": 3.468012268228759, + "rewards/rejected": -4.5843373493975905, + "step": 2970 + }, + { + "epoch": 1.559801099188694, + "grad_norm": 0.9684158437169723, + "kl": 0.0, + "learning_rate": 2.7607152976573485e-05, + "logits/chosen": -1112119680.0, + "logits/rejected": -1002858112.0, + "logps/chosen": -538.3809523809524, + "logps/rejected": -908.421052631579, + "loss": 0.3674, + "rewards/chosen": -2.1278831845238093, + "rewards/margins": 3.815373394423559, + "rewards/rejected": -5.943256578947368, + "step": 2980 + }, + { + "epoch": 1.5650353310651663, + "grad_norm": 0.4735573674201958, + "kl": 0.0, + "learning_rate": 2.745563847169577e-05, + "logits/chosen": -1047946880.0, + "logits/rejected": -963431616.0, + "logps/chosen": -543.1272727272727, + "logps/rejected": -919.9483870967742, + "loss": 0.351, + "rewards/chosen": -2.0688920454545454, + "rewards/margins": 4.311753115835778, + "rewards/rejected": -6.380645161290323, + "step": 2990 + }, + { + "epoch": 1.5702695629416383, + "grad_norm": 0.5342128661104129, + "kl": 0.0, + "learning_rate": 2.730403283530767e-05, + "logits/chosen": -1017957568.0, + "logits/rejected": -1011246720.0, + "logps/chosen": -481.62025316455697, + "logps/rejected": -906.4691358024692, + "loss": 0.3399, + "rewards/chosen": -1.7228540348101267, + "rewards/margins": 4.141343496054072, + "rewards/rejected": -5.864197530864198, + "step": 3000 + }, + { + "epoch": 1.5702695629416383, + "eval_kl": 0.0, + "eval_logits/chosen": -2308331776.0, + "eval_logits/rejected": -2173847808.0, + "eval_logps/chosen": -634.2681840672934, + "eval_logps/rejected": -983.0730979612133, + "eval_loss": 0.44966405630111694, + "eval_rewards/chosen": -3.022389905987135, + "eval_rewards/margins": 3.854785628572785, + "eval_rewards/rejected": -6.87717553455992, + "eval_runtime": 93.4395, + "eval_samples_per_second": 42.808, + "eval_steps_per_second": 0.674, + "step": 3000 + }, + { + "epoch": 1.5755037948181103, + "grad_norm": 0.7511793567904532, + "kl": 0.0, + "learning_rate": 2.7152341693665157e-05, + "logits/chosen": -1030540480.0, + "logits/rejected": -1036202816.0, + "logps/chosen": -629.9733333333334, + "logps/rejected": -1026.070588235294, + "loss": 0.3273, + "rewards/chosen": -2.8218229166666666, + "rewards/margins": 4.504647671568627, + "rewards/rejected": -7.326470588235294, + "step": 3010 + }, + { + "epoch": 1.5807380266945825, + "grad_norm": 0.7390161397159264, + "kl": 0.0, + "learning_rate": 2.700057067619741e-05, + "logits/chosen": -1011351552.0, + "logits/rejected": -1045220544.0, + "logps/chosen": -522.3741935483871, + "logps/rejected": -1024.7757575757576, + "loss": 0.3266, + "rewards/chosen": -2.0369329637096776, + "rewards/margins": 5.278218551441838, + "rewards/rejected": -7.315151515151515, + "step": 3020 + }, + { + "epoch": 1.5859722585710547, + "grad_norm": 0.4565756908370157, + "kl": 0.0, + "learning_rate": 2.6848725415297887e-05, + "logits/chosen": -1065982336.0, + "logits/rejected": -926941184.0, + "logps/chosen": -619.4922600619195, + "logps/rejected": -1133.1230283911673, + "loss": 0.3562, + "rewards/chosen": -2.773171439628483, + "rewards/margins": 5.447648749645966, + "rewards/rejected": -8.220820189274448, + "step": 3030 + }, + { + "epoch": 1.5912064904475267, + "grad_norm": 0.4524101511224128, + "kl": 0.0, + "learning_rate": 2.6696811546115296e-05, + "logits/chosen": -977063104.0, + "logits/rejected": -975699968.0, + "logps/chosen": -612.5566343042071, + "logps/rejected": -1335.4924471299094, + "loss": 0.343, + "rewards/chosen": -3.0539037216828477, + "rewards/margins": 7.356217124238602, + "rewards/rejected": -10.41012084592145, + "step": 3040 + }, + { + "epoch": 1.596440722323999, + "grad_norm": 0.46571336835876526, + "kl": 0.0, + "learning_rate": 2.6544834706344478e-05, + "logits/chosen": -1069128064.0, + "logits/rejected": -1019425600.0, + "logps/chosen": -542.6876971608833, + "logps/rejected": -1155.9628482972137, + "loss": 0.334, + "rewards/chosen": -2.3101340694006307, + "rewards/margins": 6.342342710785127, + "rewards/rejected": -8.652476780185758, + "step": 3050 + }, + { + "epoch": 1.6016749542004711, + "grad_norm": 0.49672912228082994, + "kl": 0.0, + "learning_rate": 2.6392800536017187e-05, + "logits/chosen": -1069757248.0, + "logits/rejected": -947493248.0, + "logps/chosen": -577.8650306748466, + "logps/rejected": -1023.1847133757962, + "loss": 0.3375, + "rewards/chosen": -2.266823236196319, + "rewards/margins": 4.895597145969286, + "rewards/rejected": -7.162420382165605, + "step": 3060 + }, + { + "epoch": 1.606909186076943, + "grad_norm": 0.45133476462395533, + "kl": 0.0, + "learning_rate": 2.6240714677292765e-05, + "logits/chosen": -1005164928.0, + "logits/rejected": -1023200448.0, + "logps/chosen": -487.73856209150324, + "logps/rejected": -825.3892215568862, + "loss": 0.3276, + "rewards/chosen": -1.6173917483660132, + "rewards/margins": 3.4671890899573405, + "rewards/rejected": -5.084580838323354, + "step": 3070 + }, + { + "epoch": 1.6121434179534153, + "grad_norm": 0.5101297454841202, + "kl": 0.0, + "learning_rate": 2.60885827742488e-05, + "logits/chosen": -1089470464.0, + "logits/rejected": -903033664.0, + "logps/chosen": -445.44785276073617, + "logps/rejected": -756.28025477707, + "loss": 0.3426, + "rewards/chosen": -0.963957055214724, + "rewards/margins": 3.73269899574069, + "rewards/rejected": -4.696656050955414, + "step": 3080 + }, + { + "epoch": 1.6173776498298875, + "grad_norm": 0.6709928988481924, + "kl": 0.0, + "learning_rate": 2.5936410472671603e-05, + "logits/chosen": -1081081856.0, + "logits/rejected": -948122432.0, + "logps/chosen": -527.4, + "logps/rejected": -908.6, + "loss": 0.3432, + "rewards/chosen": -1.796337890625, + "rewards/margins": 4.245458984375, + "rewards/rejected": -6.041796875, + "step": 3090 + }, + { + "epoch": 1.6226118817063595, + "grad_norm": 0.525355527958997, + "kl": 0.0, + "learning_rate": 2.5784203419846742e-05, + "logits/chosen": -939314368.0, + "logits/rejected": -838651072.0, + "logps/chosen": -531.1168831168832, + "logps/rejected": -942.0722891566265, + "loss": 0.3429, + "rewards/chosen": -1.7687702922077921, + "rewards/margins": 4.598323081286184, + "rewards/rejected": -6.367093373493976, + "step": 3100 + }, + { + "epoch": 1.6226118817063595, + "eval_kl": 0.0, + "eval_logits/chosen": -2065728000.0, + "eval_logits/rejected": -1775322368.0, + "eval_logps/chosen": -641.7100445324097, + "eval_logps/rejected": -945.901541521631, + "eval_loss": 0.4497109353542328, + "eval_rewards/chosen": -3.100816427511133, + "eval_rewards/margins": 3.405150752996077, + "eval_rewards/rejected": -6.50596718050721, + "eval_runtime": 94.217, + "eval_samples_per_second": 42.455, + "eval_steps_per_second": 0.669, + "step": 3100 + }, + { + "epoch": 1.6278461135828317, + "grad_norm": 0.35519814669807354, + "kl": 0.0, + "learning_rate": 2.5631967264349423e-05, + "logits/chosen": -952107008.0, + "logits/rejected": -813904704.0, + "logps/chosen": -604.4012158054711, + "logps/rejected": -835.8070739549839, + "loss": 0.3653, + "rewards/chosen": -2.898936170212766, + "rewards/margins": 2.683057398919067, + "rewards/rejected": -5.581993569131833, + "step": 3110 + }, + { + "epoch": 1.633080345459304, + "grad_norm": 0.6880022249793109, + "kl": 0.0, + "learning_rate": 2.5479707655834912e-05, + "logits/chosen": -1054448000.0, + "logits/rejected": -871786112.0, + "logps/chosen": -546.5919003115265, + "logps/rejected": -1034.833855799373, + "loss": 0.3434, + "rewards/chosen": -2.2242017133956384, + "rewards/margins": 5.286770073438217, + "rewards/rejected": -7.5109717868338555, + "step": 3120 + }, + { + "epoch": 1.638314577335776, + "grad_norm": 0.5447475519276055, + "kl": 0.0, + "learning_rate": 2.5327430244828815e-05, + "logits/chosen": -1043333120.0, + "logits/rejected": -910583424.0, + "logps/chosen": -572.4668769716088, + "logps/rejected": -853.8947368421053, + "loss": 0.3362, + "rewards/chosen": -2.3772180599369084, + "rewards/margins": 3.363494014366497, + "rewards/rejected": -5.7407120743034055, + "step": 3130 + }, + { + "epoch": 1.643548809212248, + "grad_norm": 0.6308208204607763, + "kl": 0.0, + "learning_rate": 2.517514068251743e-05, + "logits/chosen": -1020054720.0, + "logits/rejected": -832988800.0, + "logps/chosen": -511.7037037037037, + "logps/rejected": -914.8354430379746, + "loss": 0.3374, + "rewards/chosen": -1.7274305555555556, + "rewards/margins": 4.221936533052039, + "rewards/rejected": -5.949367088607595, + "step": 3140 + }, + { + "epoch": 1.6487830410887203, + "grad_norm": 0.6056564587784243, + "kl": 0.0, + "learning_rate": 2.5022844620537988e-05, + "logits/chosen": -913729152.0, + "logits/rejected": -808347264.0, + "logps/chosen": -477.1003236245955, + "logps/rejected": -800.3867069486405, + "loss": 0.3335, + "rewards/chosen": -1.2809971682847896, + "rewards/margins": 3.9659816836789563, + "rewards/rejected": -5.246978851963746, + "step": 3150 + }, + { + "epoch": 1.6540172729651923, + "grad_norm": 0.4584347654412988, + "kl": 0.0, + "learning_rate": 2.487054771076893e-05, + "logits/chosen": -985661440.0, + "logits/rejected": -875980416.0, + "logps/chosen": -404.6792452830189, + "logps/rejected": -746.0372670807453, + "loss": 0.3279, + "rewards/chosen": -0.7281102594339622, + "rewards/margins": 3.5040325977088944, + "rewards/rejected": -4.232142857142857, + "step": 3160 + }, + { + "epoch": 1.6592515048416645, + "grad_norm": 0.950559110059267, + "kl": 0.0, + "learning_rate": 2.4718255605120185e-05, + "logits/chosen": -984612864.0, + "logits/rejected": -867801472.0, + "logps/chosen": -429.6149068322981, + "logps/rejected": -722.2138364779875, + "loss": 0.3212, + "rewards/chosen": -0.7463485054347826, + "rewards/margins": 3.5897364002255947, + "rewards/rejected": -4.336084905660377, + "step": 3170 + }, + { + "epoch": 1.6644857367181367, + "grad_norm": 0.5945415890746549, + "kl": 0.0, + "learning_rate": 2.456597395532338e-05, + "logits/chosen": -1107505920.0, + "logits/rejected": -908276544.0, + "logps/chosen": -428.4179104477612, + "logps/rejected": -751.1081967213115, + "loss": 0.3311, + "rewards/chosen": -0.7750932835820895, + "rewards/margins": 3.832693601663812, + "rewards/rejected": -4.607786885245901, + "step": 3180 + }, + { + "epoch": 1.6697199685946087, + "grad_norm": 0.6507764293651568, + "kl": 0.0, + "learning_rate": 2.4413708412722084e-05, + "logits/chosen": -1060739456.0, + "logits/rejected": -942879552.0, + "logps/chosen": -403.55555555555554, + "logps/rejected": -760.3037974683544, + "loss": 0.3207, + "rewards/chosen": -0.513695987654321, + "rewards/margins": 4.264785025003906, + "rewards/rejected": -4.7784810126582276, + "step": 3190 + }, + { + "epoch": 1.674954200471081, + "grad_norm": 0.6257739087742488, + "kl": 0.0, + "learning_rate": 2.4261464628062143e-05, + "logits/chosen": -1028862784.0, + "logits/rejected": -1017538176.0, + "logps/chosen": -414.36421725239614, + "logps/rejected": -792.269113149847, + "loss": 0.3005, + "rewards/chosen": -0.7960637979233227, + "rewards/margins": 3.907300116449766, + "rewards/rejected": -4.703363914373089, + "step": 3200 + }, + { + "epoch": 1.674954200471081, + "eval_kl": 0.0, + "eval_logits/chosen": -2172250112.0, + "eval_logits/rejected": -2024051328.0, + "eval_logps/chosen": -515.8317664522514, + "eval_logps/rejected": -719.3396320238687, + "eval_loss": 0.44920703768730164, + "eval_rewards/chosen": -1.8422191984166254, + "eval_rewards/margins": 2.399824560907094, + "eval_rewards/rejected": -4.242043759323719, + "eval_runtime": 93.4548, + "eval_samples_per_second": 42.801, + "eval_steps_per_second": 0.674, + "step": 3200 + }, + { + "epoch": 1.680188432347553, + "grad_norm": 1.0328000427510753, + "kl": 0.0, + "learning_rate": 2.410924825128195e-05, + "logits/chosen": -1078565248.0, + "logits/rejected": -850290304.0, + "logps/chosen": -457.85714285714283, + "logps/rejected": -658.5263157894736, + "loss": 0.3694, + "rewards/chosen": -1.2154947916666667, + "rewards/margins": 2.6738966557017543, + "rewards/rejected": -3.8893914473684212, + "step": 3210 + }, + { + "epoch": 1.685422664224025, + "grad_norm": 0.5208279156175792, + "kl": 0.0, + "learning_rate": 2.395706493130274e-05, + "logits/chosen": -1064094912.0, + "logits/rejected": -793457472.0, + "logps/chosen": -413.5420289855073, + "logps/rejected": -859.8779661016949, + "loss": 0.3487, + "rewards/chosen": -0.7851222826086957, + "rewards/margins": 4.839877717391304, + "rewards/rejected": -5.625, + "step": 3220 + }, + { + "epoch": 1.6906568961004973, + "grad_norm": 0.6530601088926155, + "kl": 0.0, + "learning_rate": 2.380492031581897e-05, + "logits/chosen": -927360640.0, + "logits/rejected": -874722112.0, + "logps/chosen": -456.7741935483871, + "logps/rejected": -829.2848484848485, + "loss": 0.3305, + "rewards/chosen": -1.384375, + "rewards/margins": 3.946685606060606, + "rewards/rejected": -5.331060606060606, + "step": 3230 + }, + { + "epoch": 1.6958911279769695, + "grad_norm": 0.7857055522080945, + "kl": 0.0, + "learning_rate": 2.365282005108875e-05, + "logits/chosen": -938056064.0, + "logits/rejected": -797232320.0, + "logps/chosen": -475.49847094801225, + "logps/rejected": -878.3130990415335, + "loss": 0.3524, + "rewards/chosen": -1.4733252102446484, + "rewards/margins": 4.301435173141933, + "rewards/rejected": -5.774760383386582, + "step": 3240 + }, + { + "epoch": 1.7011253598534415, + "grad_norm": 1.3501713758787808, + "kl": 0.0, + "learning_rate": 2.3500769781724256e-05, + "logits/chosen": -1067764928.0, + "logits/rejected": -812226944.0, + "logps/chosen": -481.3872832369942, + "logps/rejected": -860.1904761904761, + "loss": 0.3503, + "rewards/chosen": -1.6231484826589595, + "rewards/margins": 3.704232469721993, + "rewards/rejected": -5.3273809523809526, + "step": 3250 + }, + { + "epoch": 1.7063595917299135, + "grad_norm": 0.7060478361762077, + "kl": 0.0, + "learning_rate": 2.334877515048231e-05, + "logits/chosen": -1088002432.0, + "logits/rejected": -968255104.0, + "logps/chosen": -501.98083067092654, + "logps/rejected": -789.4311926605504, + "loss": 0.3216, + "rewards/chosen": -1.5131165135782747, + "rewards/margins": 3.3890241592046, + "rewards/rejected": -4.902140672782875, + "step": 3260 + }, + { + "epoch": 1.711593823606386, + "grad_norm": 0.7469032114353387, + "kl": 0.0, + "learning_rate": 2.319684179805491e-05, + "logits/chosen": -1021732480.0, + "logits/rejected": -1027709312.0, + "logps/chosen": -580.3540983606557, + "logps/rejected": -993.6238805970149, + "loss": 0.3386, + "rewards/chosen": -2.4837090163934428, + "rewards/margins": 4.50733575972596, + "rewards/rejected": -6.991044776119403, + "step": 3270 + }, + { + "epoch": 1.7168280554828579, + "grad_norm": 0.8433420902642595, + "kl": 0.0, + "learning_rate": 2.304497536285996e-05, + "logits/chosen": -988387712.0, + "logits/rejected": -944137856.0, + "logps/chosen": -700.6984126984127, + "logps/rejected": -1110.843076923077, + "loss": 0.3698, + "rewards/chosen": -3.7896329365079366, + "rewards/margins": 4.231905525030525, + "rewards/rejected": -8.021538461538462, + "step": 3280 + }, + { + "epoch": 1.7220622873593299, + "grad_norm": 0.684092268488129, + "kl": 0.0, + "learning_rate": 2.289318148083196e-05, + "logits/chosen": -1000551232.0, + "logits/rejected": -938685248.0, + "logps/chosen": -844.1165048543689, + "logps/rejected": -1529.0392749244713, + "loss": 0.3529, + "rewards/chosen": -4.929510517799352, + "rewards/margins": 7.559160177064696, + "rewards/rejected": -12.488670694864048, + "step": 3290 + }, + { + "epoch": 1.7272965192358023, + "grad_norm": 0.6694204511833904, + "kl": 0.0, + "learning_rate": 2.2741465785212905e-05, + "logits/chosen": -991323776.0, + "logits/rejected": -970352256.0, + "logps/chosen": -771.3795379537954, + "logps/rejected": -1320.0712166172107, + "loss": 0.3468, + "rewards/chosen": -4.449824669966997, + "rewards/margins": 5.689641205403923, + "rewards/rejected": -10.13946587537092, + "step": 3300 + }, + { + "epoch": 1.7272965192358023, + "eval_kl": 0.0, + "eval_logits/chosen": -2299277568.0, + "eval_logits/rejected": -2052279552.0, + "eval_logps/chosen": -650.1019297377536, + "eval_logps/rejected": -1102.9895574341124, + "eval_loss": 0.4503124952316284, + "eval_rewards/chosen": -3.182088075210292, + "eval_rewards/margins": 4.895236638862309, + "eval_rewards/rejected": -8.077324714072601, + "eval_runtime": 93.468, + "eval_samples_per_second": 42.795, + "eval_steps_per_second": 0.674, + "step": 3300 + }, + { + "epoch": 1.7325307511122743, + "grad_norm": 0.5936884040448765, + "kl": 0.0, + "learning_rate": 2.2589833906343182e-05, + "logits/chosen": -1130155264.0, + "logits/rejected": -893386752.0, + "logps/chosen": -524.2017804154302, + "logps/rejected": -1324.039603960396, + "loss": 0.3446, + "rewards/chosen": -1.9497751298219586, + "rewards/margins": 8.34436678436946, + "rewards/rejected": -10.29414191419142, + "step": 3310 + }, + { + "epoch": 1.7377649829887463, + "grad_norm": 0.8087649219172015, + "kl": 0.0, + "learning_rate": 2.2438291471452667e-05, + "logits/chosen": -1134559232.0, + "logits/rejected": -969093952.0, + "logps/chosen": -652.5679758308157, + "logps/rejected": -1052.2718446601941, + "loss": 0.3799, + "rewards/chosen": -3.147092145015106, + "rewards/margins": 4.507438599321464, + "rewards/rejected": -7.6545307443365695, + "step": 3320 + }, + { + "epoch": 1.7429992148652187, + "grad_norm": 0.6066142385094927, + "kl": 0.0, + "learning_rate": 2.2286844104451846e-05, + "logits/chosen": -1056125760.0, + "logits/rejected": -890136192.0, + "logps/chosen": -528.9268292682926, + "logps/rejected": -1025.948717948718, + "loss": 0.353, + "rewards/chosen": -2.246189024390244, + "rewards/margins": 5.228971232020013, + "rewards/rejected": -7.475160256410256, + "step": 3330 + }, + { + "epoch": 1.7482334467416907, + "grad_norm": 0.6316286497085969, + "kl": 0.0, + "learning_rate": 2.213549742572314e-05, + "logits/chosen": -1060110336.0, + "logits/rejected": -960915072.0, + "logps/chosen": -551.7435897435897, + "logps/rejected": -958.829268292683, + "loss": 0.3362, + "rewards/chosen": -2.3900741185897436, + "rewards/margins": 4.085916735068793, + "rewards/rejected": -6.475990853658536, + "step": 3340 + }, + { + "epoch": 1.7534676786181627, + "grad_norm": 0.6928407887366415, + "kl": 0.0, + "learning_rate": 2.1984257051912326e-05, + "logits/chosen": -974756224.0, + "logits/rejected": -900936512.0, + "logps/chosen": -547.4760383386581, + "logps/rejected": -911.65749235474, + "loss": 0.3315, + "rewards/chosen": -2.032947284345048, + "rewards/margins": 3.9081842141258996, + "rewards/rejected": -5.941131498470948, + "step": 3350 + }, + { + "epoch": 1.7587019104946349, + "grad_norm": 0.6287104207098294, + "kl": 0.0, + "learning_rate": 2.183312859572008e-05, + "logits/chosen": -924057600.0, + "logits/rejected": -831940224.0, + "logps/chosen": -547.1898734177215, + "logps/rejected": -1009.3827160493827, + "loss": 0.3402, + "rewards/chosen": -1.9961926424050633, + "rewards/margins": 5.153498715619628, + "rewards/rejected": -7.1496913580246915, + "step": 3360 + }, + { + "epoch": 1.763936142371107, + "grad_norm": 1.309187284373212, + "kl": 0.0, + "learning_rate": 2.1682117665693663e-05, + "logits/chosen": -941306688.0, + "logits/rejected": -866333504.0, + "logps/chosen": -594.6750788643533, + "logps/rejected": -968.8173374613003, + "loss": 0.3435, + "rewards/chosen": -2.580515575709779, + "rewards/margins": 4.067317241627682, + "rewards/rejected": -6.647832817337461, + "step": 3370 + }, + { + "epoch": 1.769170374247579, + "grad_norm": 0.8941842404807043, + "kl": 0.0, + "learning_rate": 2.1531229866018832e-05, + "logits/chosen": -918762304.0, + "logits/rejected": -699819648.0, + "logps/chosen": -514.4539877300614, + "logps/rejected": -1172.7898089171974, + "loss": 0.3531, + "rewards/chosen": -1.924079754601227, + "rewards/margins": 7.1101559141885815, + "rewards/rejected": -9.034235668789808, + "step": 3380 + }, + { + "epoch": 1.7744046061240513, + "grad_norm": 0.7075290388583574, + "kl": 0.0, + "learning_rate": 2.1380470796311843e-05, + "logits/chosen": -899258752.0, + "logits/rejected": -794086592.0, + "logps/chosen": -649.0864197530864, + "logps/rejected": -1344.607594936709, + "loss": 0.3448, + "rewards/chosen": -3.271291473765432, + "rewards/margins": 7.443898399652289, + "rewards/rejected": -10.715189873417721, + "step": 3390 + }, + { + "epoch": 1.7796388380005235, + "grad_norm": 0.784185262384793, + "kl": 0.0, + "learning_rate": 2.1229846051411624e-05, + "logits/chosen": -947702976.0, + "logits/rejected": -786851456.0, + "logps/chosen": -541.416149068323, + "logps/rejected": -1070.691823899371, + "loss": 0.3361, + "rewards/chosen": -1.9438082298136645, + "rewards/margins": 5.612009380249228, + "rewards/rejected": -7.555817610062893, + "step": 3400 + }, + { + "epoch": 1.7796388380005235, + "eval_kl": 0.0, + "eval_logits/chosen": -1953080960.0, + "eval_logits/rejected": -1726854912.0, + "eval_logps/chosen": -586.8936170212766, + "eval_logps/rejected": -953.3485827946296, + "eval_loss": 0.44880467653274536, + "eval_rewards/chosen": -2.550377288471054, + "eval_rewards/margins": 4.0314228109819545, + "eval_rewards/rejected": -6.581800099453009, + "eval_runtime": 93.4545, + "eval_samples_per_second": 42.802, + "eval_steps_per_second": 0.674, + "step": 3400 + }, + { + "epoch": 1.7848730698769955, + "grad_norm": 0.6417707088331657, + "kl": 0.0, + "learning_rate": 2.1079361221172168e-05, + "logits/chosen": -876924096.0, + "logits/rejected": -800273216.0, + "logps/chosen": -555.4069400630915, + "logps/rejected": -1011.8142414860681, + "loss": 0.3365, + "rewards/chosen": -2.269469637223975, + "rewards/margins": 5.012264108906056, + "rewards/rejected": -7.281733746130031, + "step": 3410 + }, + { + "epoch": 1.7901073017534677, + "grad_norm": 0.6562900236984517, + "kl": 0.0, + "learning_rate": 2.092902189025507e-05, + "logits/chosen": -968674496.0, + "logits/rejected": -715653120.0, + "logps/chosen": -531.7701149425287, + "logps/rejected": -1166.6301369863013, + "loss": 0.3455, + "rewards/chosen": -2.1996901939655173, + "rewards/margins": 6.43986460055503, + "rewards/rejected": -8.639554794520548, + "step": 3420 + }, + { + "epoch": 1.7953415336299399, + "grad_norm": 0.8218911355557875, + "kl": 0.0, + "learning_rate": 2.0778833637922277e-05, + "logits/chosen": -863607168.0, + "logits/rejected": -816840704.0, + "logps/chosen": -521.4545454545455, + "logps/rejected": -1002.2168674698795, + "loss": 0.3291, + "rewards/chosen": -1.9117352374188312, + "rewards/margins": 5.367632232460687, + "rewards/rejected": -7.279367469879518, + "step": 3430 + }, + { + "epoch": 1.8005757655064119, + "grad_norm": 0.584855714512457, + "kl": 0.0, + "learning_rate": 2.0628802037829047e-05, + "logits/chosen": -910583424.0, + "logits/rejected": -787166016.0, + "logps/chosen": -515.0868167202573, + "logps/rejected": -1014.468085106383, + "loss": 0.3294, + "rewards/chosen": -1.6524567926045015, + "rewards/margins": 5.78219366332255, + "rewards/rejected": -7.434650455927052, + "step": 3440 + }, + { + "epoch": 1.805809997382884, + "grad_norm": 0.5836628556756525, + "kl": 0.0, + "learning_rate": 2.0478932657817105e-05, + "logits/chosen": -852702016.0, + "logits/rejected": -765250752.0, + "logps/chosen": -664.28664495114, + "logps/rejected": -1122.4024024024025, + "loss": 0.3339, + "rewards/chosen": -3.09441164495114, + "rewards/margins": 5.129312078772584, + "rewards/rejected": -8.223723723723724, + "step": 3450 + }, + { + "epoch": 1.8110442292593563, + "grad_norm": 0.5760216909652937, + "kl": 0.0, + "learning_rate": 2.0329231059707986e-05, + "logits/chosen": -732325504.0, + "logits/rejected": -773219968.0, + "logps/chosen": -588.6896551724138, + "logps/rejected": -990.72, + "loss": 0.3215, + "rewards/chosen": -2.666971982758621, + "rewards/margins": 4.308028017241378, + "rewards/rejected": -6.975, + "step": 3460 + }, + { + "epoch": 1.8162784611358282, + "grad_norm": 0.7342083500831641, + "kl": 0.0, + "learning_rate": 2.017970279909667e-05, + "logits/chosen": -782552256.0, + "logits/rejected": -720476544.0, + "logps/chosen": -540.6792452830189, + "logps/rejected": -1125.5652173913043, + "loss": 0.3382, + "rewards/chosen": -2.103355935534591, + "rewards/margins": 5.864035368813235, + "rewards/rejected": -7.967391304347826, + "step": 3470 + }, + { + "epoch": 1.8215126930123005, + "grad_norm": 1.3753140883742832, + "kl": 0.0, + "learning_rate": 2.0030353425145378e-05, + "logits/chosen": -791255424.0, + "logits/rejected": -629565056.0, + "logps/chosen": -527.0, + "logps/rejected": -930.8, + "loss": 0.351, + "rewards/chosen": -1.91419677734375, + "rewards/margins": 4.7514282226562505, + "rewards/rejected": -6.665625, + "step": 3480 + }, + { + "epoch": 1.8267469248887727, + "grad_norm": 0.6736862587298565, + "kl": 0.0, + "learning_rate": 1.9881188480377632e-05, + "logits/chosen": -787795136.0, + "logits/rejected": -726243712.0, + "logps/chosen": -518.7313915857605, + "logps/rejected": -847.2749244712991, + "loss": 0.3406, + "rewards/chosen": -1.9428094660194175, + "rewards/margins": 3.7622509569413074, + "rewards/rejected": -5.705060422960725, + "step": 3490 + }, + { + "epoch": 1.8319811567652446, + "grad_norm": 1.106559380657363, + "kl": 0.0, + "learning_rate": 1.9732213500472605e-05, + "logits/chosen": -802999488.0, + "logits/rejected": -650012288.0, + "logps/chosen": -464.7463556851312, + "logps/rejected": -869.0639730639731, + "loss": 0.3405, + "rewards/chosen": -1.4682944606413995, + "rewards/margins": 4.312850320503381, + "rewards/rejected": -5.781144781144781, + "step": 3500 + }, + { + "epoch": 1.8319811567652446, + "eval_kl": 0.0, + "eval_logits/chosen": -1600359936.0, + "eval_logits/rejected": -1368940928.0, + "eval_logps/chosen": -595.8555170707571, + "eval_logps/rejected": -917.2272501243162, + "eval_loss": 0.447328120470047, + "eval_rewards/chosen": -2.6416996536368136, + "eval_rewards/margins": 3.5805778202567713, + "eval_rewards/rejected": -6.222277473893585, + "eval_runtime": 93.4479, + "eval_samples_per_second": 42.805, + "eval_steps_per_second": 0.674, + "step": 3500 + }, + { + "epoch": 1.8372153886417169, + "grad_norm": 0.4772534610253572, + "kl": 0.0, + "learning_rate": 1.9583434014059638e-05, + "logits/chosen": -800797504.0, + "logits/rejected": -647705408.0, + "logps/chosen": -516.1904761904761, + "logps/rejected": -1060.6315789473683, + "loss": 0.3433, + "rewards/chosen": -1.9356631324404763, + "rewards/margins": 5.672067130717418, + "rewards/rejected": -7.607730263157895, + "step": 3510 + }, + { + "epoch": 1.842449620518189, + "grad_norm": 0.4797616930935301, + "kl": 0.0, + "learning_rate": 1.9434855542513106e-05, + "logits/chosen": -823236992.0, + "logits/rejected": -705796480.0, + "logps/chosen": -444.1150159744409, + "logps/rejected": -825.5412844036697, + "loss": 0.34, + "rewards/chosen": -1.3486796126198084, + "rewards/margins": 3.9609534149031274, + "rewards/rejected": -5.309633027522936, + "step": 3520 + }, + { + "epoch": 1.847683852394661, + "grad_norm": 0.5695842896989155, + "kl": 0.0, + "learning_rate": 1.9286483599747475e-05, + "logits/chosen": -722259136.0, + "logits/rejected": -758015616.0, + "logps/chosen": -514.8354430379746, + "logps/rejected": -884.5432098765432, + "loss": 0.351, + "rewards/chosen": -2.095010878164557, + "rewards/margins": 3.8100817144280357, + "rewards/rejected": -5.905092592592593, + "step": 3530 + }, + { + "epoch": 1.8529180842711332, + "grad_norm": 0.6900510116894868, + "kl": 0.0, + "learning_rate": 1.9138323692012737e-05, + "logits/chosen": -784439680.0, + "logits/rejected": -683252096.0, + "logps/chosen": -511.39622641509436, + "logps/rejected": -1062.1614906832299, + "loss": 0.3223, + "rewards/chosen": -1.662441037735849, + "rewards/margins": 5.90471734735732, + "rewards/rejected": -7.567158385093168, + "step": 3540 + }, + { + "epoch": 1.8581523161476055, + "grad_norm": 0.9385097332615309, + "kl": 0.0, + "learning_rate": 1.8990381317689958e-05, + "logits/chosen": -824809856.0, + "logits/rejected": -664063168.0, + "logps/chosen": -437.96941896024464, + "logps/rejected": -783.8466453674122, + "loss": 0.3498, + "rewards/chosen": -1.254730504587156, + "rewards/margins": 3.7029372270422374, + "rewards/rejected": -4.957667731629393, + "step": 3550 + }, + { + "epoch": 1.8633865480240774, + "grad_norm": 0.8998105100017849, + "kl": 0.0, + "learning_rate": 1.8842661967087353e-05, + "logits/chosen": -801007232.0, + "logits/rejected": -725719424.0, + "logps/chosen": -453.1358024691358, + "logps/rejected": -819.746835443038, + "loss": 0.3526, + "rewards/chosen": -1.3312596450617284, + "rewards/margins": 3.7272846587357398, + "rewards/rejected": -5.0585443037974684, + "step": 3560 + }, + { + "epoch": 1.8686207799005496, + "grad_norm": 0.6166221805440664, + "kl": 0.0, + "learning_rate": 1.8695171122236444e-05, + "logits/chosen": -735156608.0, + "logits/rejected": -726872896.0, + "logps/chosen": -470.7752442996743, + "logps/rejected": -846.3183183183183, + "loss": 0.3509, + "rewards/chosen": -1.5342528501628665, + "rewards/margins": 3.9041855882755714, + "rewards/rejected": -5.438438438438438, + "step": 3570 + }, + { + "epoch": 1.8738550117770219, + "grad_norm": 0.9064425131607848, + "kl": 0.0, + "learning_rate": 1.8547914256688663e-05, + "logits/chosen": -914987392.0, + "logits/rejected": -619288960.0, + "logps/chosen": -581.008695652174, + "logps/rejected": -964.2305084745763, + "loss": 0.3741, + "rewards/chosen": -2.208786231884058, + "rewards/margins": 4.6098578359125515, + "rewards/rejected": -6.81864406779661, + "step": 3580 + }, + { + "epoch": 1.8790892436534938, + "grad_norm": 0.4694968385256991, + "kl": 0.0, + "learning_rate": 1.8400896835312208e-05, + "logits/chosen": -803314048.0, + "logits/rejected": -782342528.0, + "logps/chosen": -627.4711864406779, + "logps/rejected": -1034.3884057971015, + "loss": 0.3389, + "rewards/chosen": -2.8996822033898306, + "rewards/margins": 4.351767071972488, + "rewards/rejected": -7.251449275362319, + "step": 3590 + }, + { + "epoch": 1.8843234755299658, + "grad_norm": 0.5385083556993564, + "kl": 0.0, + "learning_rate": 1.8254124314089223e-05, + "logits/chosen": -786012544.0, + "logits/rejected": -727921472.0, + "logps/chosen": -592.3975155279503, + "logps/rejected": -968.3522012578617, + "loss": 0.362, + "rewards/chosen": -2.4558423913043477, + "rewards/margins": 4.180950061525841, + "rewards/rejected": -6.636792452830188, + "step": 3600 + }, + { + "epoch": 1.8843234755299658, + "eval_kl": 0.0, + "eval_logits/chosen": -1622263552.0, + "eval_logits/rejected": -1444837888.0, + "eval_logps/chosen": -594.8738248391885, + "eval_logps/rejected": -933.2352063649926, + "eval_loss": 0.4472343623638153, + "eval_rewards/chosen": -2.6311541316180107, + "eval_rewards/margins": 3.747761830589846, + "eval_rewards/rejected": -6.378915962207857, + "eval_runtime": 93.4509, + "eval_samples_per_second": 42.803, + "eval_steps_per_second": 0.674, + "step": 3600 + }, + { + "epoch": 1.8895577074064382, + "grad_norm": 0.7316473583926785, + "kl": 0.0, + "learning_rate": 1.810760213991332e-05, + "logits/chosen": -709885952.0, + "logits/rejected": -708417920.0, + "logps/chosen": -562.8745762711865, + "logps/rejected": -1033.4608695652173, + "loss": 0.3495, + "rewards/chosen": -2.3506091101694917, + "rewards/margins": 4.948303933308769, + "rewards/rejected": -7.298913043478261, + "step": 3610 + }, + { + "epoch": 1.8947919392829102, + "grad_norm": 0.7645475500978628, + "kl": 0.0, + "learning_rate": 1.796133575038748e-05, + "logits/chosen": -663434048.0, + "logits/rejected": -674339200.0, + "logps/chosen": -638.6709265175718, + "logps/rejected": -1222.7522935779816, + "loss": 0.3553, + "rewards/chosen": -3.1315894568690097, + "rewards/margins": 6.036606261785424, + "rewards/rejected": -9.168195718654435, + "step": 3620 + }, + { + "epoch": 1.9000261711593822, + "grad_norm": 0.49257289540325766, + "kl": 0.0, + "learning_rate": 1.781533057362221e-05, + "logits/chosen": -701497344.0, + "logits/rejected": -594228032.0, + "logps/chosen": -531.6923076923077, + "logps/rejected": -1016.3809523809524, + "loss": 0.3361, + "rewards/chosen": -2.2409375, + "rewards/margins": 4.999935515873016, + "rewards/rejected": -7.2408730158730155, + "step": 3630 + }, + { + "epoch": 1.9052604030358546, + "grad_norm": 0.8198249770364769, + "kl": 0.0, + "learning_rate": 1.7669592028034116e-05, + "logits/chosen": -755289280.0, + "logits/rejected": -648386944.0, + "logps/chosen": -473.2682926829268, + "logps/rejected": -1005.5384615384615, + "loss": 0.3421, + "rewards/chosen": -1.4224942835365855, + "rewards/margins": 5.542649947232645, + "rewards/rejected": -6.965144230769231, + "step": 3640 + }, + { + "epoch": 1.9104946349123266, + "grad_norm": 0.9264357764615075, + "kl": 0.0, + "learning_rate": 1.7524125522144826e-05, + "logits/chosen": -701602176.0, + "logits/rejected": -645922816.0, + "logps/chosen": -545.9804560260586, + "logps/rejected": -965.957957957958, + "loss": 0.3238, + "rewards/chosen": -2.004911441368078, + "rewards/margins": 4.538632102175465, + "rewards/rejected": -6.543543543543543, + "step": 3650 + }, + { + "epoch": 1.9157288667887986, + "grad_norm": 0.47892161569097425, + "kl": 0.0, + "learning_rate": 1.7378936454380276e-05, + "logits/chosen": -742601536.0, + "logits/rejected": -597478592.0, + "logps/chosen": -566.4191616766467, + "logps/rejected": -995.7647058823529, + "loss": 0.351, + "rewards/chosen": -2.3877245508982035, + "rewards/margins": 4.476654534069116, + "rewards/rejected": -6.86437908496732, + "step": 3660 + }, + { + "epoch": 1.9209630986652708, + "grad_norm": 0.5930608926989867, + "kl": 0.0, + "learning_rate": 1.7234030212870334e-05, + "logits/chosen": -670564352.0, + "logits/rejected": -624427008.0, + "logps/chosen": -476.1471571906354, + "logps/rejected": -1045.208211143695, + "loss": 0.3098, + "rewards/chosen": -1.4977529264214047, + "rewards/margins": 6.181865841907041, + "rewards/rejected": -7.679618768328446, + "step": 3670 + }, + { + "epoch": 1.926197330541743, + "grad_norm": 0.4877603718353012, + "kl": 0.0, + "learning_rate": 1.7089412175248896e-05, + "logits/chosen": -703070208.0, + "logits/rejected": -640575104.0, + "logps/chosen": -491.5032679738562, + "logps/rejected": -966.8023952095808, + "loss": 0.3282, + "rewards/chosen": -1.5397263071895424, + "rewards/margins": 5.183327585026027, + "rewards/rejected": -6.723053892215569, + "step": 3680 + }, + { + "epoch": 1.931431562418215, + "grad_norm": 0.7344807590979141, + "kl": 0.0, + "learning_rate": 1.694508770845427e-05, + "logits/chosen": -740084928.0, + "logits/rejected": -586835584.0, + "logps/chosen": -526.8571428571429, + "logps/rejected": -891.2631578947369, + "loss": 0.3558, + "rewards/chosen": -2.110932849702381, + "rewards/margins": 3.780925702929198, + "rewards/rejected": -5.891858552631579, + "step": 3690 + }, + { + "epoch": 1.9366657942946872, + "grad_norm": 0.5681820063641687, + "kl": 0.0, + "learning_rate": 1.680106216853003e-05, + "logits/chosen": -702545920.0, + "logits/rejected": -728865152.0, + "logps/chosen": -476.7676767676768, + "logps/rejected": -914.0991253644315, + "loss": 0.3153, + "rewards/chosen": -1.3383838383838385, + "rewards/margins": 4.622257561033071, + "rewards/rejected": -5.960641399416909, + "step": 3700 + }, + { + "epoch": 1.9366657942946872, + "eval_kl": 0.0, + "eval_logits/chosen": -1449431680.0, + "eval_logits/rejected": -1236653952.0, + "eval_logps/chosen": -548.1009401286492, + "eval_logps/rejected": -828.2764793635007, + "eval_loss": 0.4481757879257202, + "eval_rewards/chosen": -2.1611207323107373, + "eval_rewards/margins": 3.175030436262112, + "eval_rewards/rejected": -5.336151168572849, + "eval_runtime": 93.4468, + "eval_samples_per_second": 42.805, + "eval_steps_per_second": 0.674, + "step": 3700 + }, + { + "epoch": 1.9419000261711594, + "grad_norm": 0.7862073138396892, + "kl": 0.0, + "learning_rate": 1.665734090042622e-05, + "logits/chosen": -694681600.0, + "logits/rejected": -642829504.0, + "logps/chosen": -481.1210191082803, + "logps/rejected": -1028.3190184049079, + "loss": 0.3552, + "rewards/chosen": -1.5101574940286624, + "rewards/margins": 5.586468272842504, + "rewards/rejected": -7.096625766871166, + "step": 3710 + }, + { + "epoch": 1.9471342580476314, + "grad_norm": 0.8285344647326756, + "kl": 0.0, + "learning_rate": 1.651392923780105e-05, + "logits/chosen": -724251456.0, + "logits/rejected": -629145600.0, + "logps/chosen": -501.58255451713393, + "logps/rejected": -835.3103448275862, + "loss": 0.3722, + "rewards/chosen": -1.7075879818925233, + "rewards/margins": 3.5847317673237775, + "rewards/rejected": -5.2923197492163006, + "step": 3720 + }, + { + "epoch": 1.9523684899241036, + "grad_norm": 0.6838021947504493, + "kl": 0.0, + "learning_rate": 1.637083250282288e-05, + "logits/chosen": -716387136.0, + "logits/rejected": -572889472.0, + "logps/chosen": -504.85626911314984, + "logps/rejected": -945.1757188498402, + "loss": 0.3516, + "rewards/chosen": -1.6721139143730888, + "rewards/margins": 4.77756659680902, + "rewards/rejected": -6.449680511182109, + "step": 3730 + }, + { + "epoch": 1.9576027218005758, + "grad_norm": 0.49973063687202535, + "kl": 0.0, + "learning_rate": 1.6228056005972762e-05, + "logits/chosen": -754450432.0, + "logits/rejected": -652738560.0, + "logps/chosen": -499.238670694864, + "logps/rejected": -1019.4433656957929, + "loss": 0.3441, + "rewards/chosen": -1.5205343655589123, + "rewards/margins": 5.6631225923698905, + "rewards/rejected": -7.183656957928803, + "step": 3740 + }, + { + "epoch": 1.9628369536770478, + "grad_norm": 0.7913259869671393, + "kl": 0.0, + "learning_rate": 1.6085605045847367e-05, + "logits/chosen": -836553920.0, + "logits/rejected": -634178752.0, + "logps/chosen": -443.83492063492065, + "logps/rejected": -1056.0984615384616, + "loss": 0.3281, + "rewards/chosen": -0.9071180555555556, + "rewards/margins": 6.879035790598291, + "rewards/rejected": -7.786153846153846, + "step": 3750 + }, + { + "epoch": 1.96807118555352, + "grad_norm": 0.6164908641832416, + "kl": 0.0, + "learning_rate": 1.5943484908962325e-05, + "logits/chosen": -712192832.0, + "logits/rejected": -696044736.0, + "logps/chosen": -521.3465346534654, + "logps/rejected": -953.9228486646884, + "loss": 0.35, + "rewards/chosen": -2.1199431466584158, + "rewards/margins": 4.52916664562645, + "rewards/rejected": -6.649109792284866, + "step": 3760 + }, + { + "epoch": 1.9733054174299922, + "grad_norm": 0.9149146447875485, + "kl": 0.0, + "learning_rate": 1.580170086955603e-05, + "logits/chosen": -780769664.0, + "logits/rejected": -724041728.0, + "logps/chosen": -506.03821656050957, + "logps/rejected": -1002.7975460122699, + "loss": 0.3521, + "rewards/chosen": -1.7051652070063694, + "rewards/margins": 5.1656170015825875, + "rewards/rejected": -6.870782208588957, + "step": 3770 + }, + { + "epoch": 1.9785396493064642, + "grad_norm": 0.5360239915386263, + "kl": 0.0, + "learning_rate": 1.5660258189393946e-05, + "logits/chosen": -726872896.0, + "logits/rejected": -551079104.0, + "logps/chosen": -592.5014245014245, + "logps/rejected": -1048.3598615916956, + "loss": 0.4024, + "rewards/chosen": -2.969128383190883, + "rewards/margins": 4.530871616809117, + "rewards/rejected": -7.5, + "step": 3780 + }, + { + "epoch": 1.9837738811829364, + "grad_norm": 0.7520642077552421, + "kl": 0.0, + "learning_rate": 1.551916211757326e-05, + "logits/chosen": -776470528.0, + "logits/rejected": -559992000.0, + "logps/chosen": -461.6804733727811, + "logps/rejected": -912.635761589404, + "loss": 0.3518, + "rewards/chosen": -1.1292991863905326, + "rewards/margins": 5.128978959304831, + "rewards/rejected": -6.258278145695364, + "step": 3790 + }, + { + "epoch": 1.9890081130594086, + "grad_norm": 1.159247733239679, + "kl": 0.0, + "learning_rate": 1.537841789032819e-05, + "logits/chosen": -741028672.0, + "logits/rejected": -635856512.0, + "logps/chosen": -490.70031545741324, + "logps/rejected": -917.5975232198142, + "loss": 0.332, + "rewards/chosen": -1.5408246253943219, + "rewards/margins": 4.827596427237257, + "rewards/rejected": -6.368421052631579, + "step": 3800 + }, + { + "epoch": 1.9890081130594086, + "eval_kl": 0.0, + "eval_logits/chosen": -1527192704.0, + "eval_logits/rejected": -1300766848.0, + "eval_logps/chosen": -517.2568035625927, + "eval_logps/rejected": -750.3689706613625, + "eval_loss": 0.4465000033378601, + "eval_rewards/chosen": -1.8533523008411676, + "eval_rewards/margins": 2.699855058681458, + "eval_rewards/rejected": -4.5532073595226255, + "eval_runtime": 93.4594, + "eval_samples_per_second": 42.799, + "eval_steps_per_second": 0.674, + "step": 3800 + }, + { + "epoch": 1.9942423449358806, + "grad_norm": 0.9057885208631024, + "kl": 0.0, + "learning_rate": 1.523803073083558e-05, + "logits/chosen": -846620288.0, + "logits/rejected": -680001536.0, + "logps/chosen": -472.8358208955224, + "logps/rejected": -827.5934426229509, + "loss": 0.3806, + "rewards/chosen": -1.304764750466418, + "rewards/margins": 3.919005741336861, + "rewards/rejected": -5.223770491803279, + "step": 3810 + }, + { + "epoch": 1.9994765768123528, + "grad_norm": 0.5119873172602645, + "kl": 0.0, + "learning_rate": 1.509800584902108e-05, + "logits/chosen": -803838336.0, + "logits/rejected": -722573696.0, + "logps/chosen": -474.0, + "logps/rejected": -756.7179487179487, + "loss": 0.3808, + "rewards/chosen": -1.3775724085365855, + "rewards/margins": 3.1476679760787993, + "rewards/rejected": -4.525240384615385, + "step": 3820 + }, + { + "epoch": 2.004710808688825, + "grad_norm": 1.0538275818801006, + "kl": 0.0, + "learning_rate": 1.4958348441365826e-05, + "logits/chosen": -836973376.0, + "logits/rejected": -691431040.0, + "logps/chosen": -462.88145896656533, + "logps/rejected": -709.1961414790997, + "loss": 0.3741, + "rewards/chosen": -1.5843643142097263, + "rewards/margins": 2.6957803803240354, + "rewards/rejected": -4.280144694533762, + "step": 3830 + }, + { + "epoch": 2.009945040565297, + "grad_norm": 0.8750158141328271, + "kl": 0.0, + "learning_rate": 1.4819063690713565e-05, + "logits/chosen": -916035968.0, + "logits/rejected": -644769408.0, + "logps/chosen": -470.2808022922636, + "logps/rejected": -773.3883161512027, + "loss": 0.3592, + "rewards/chosen": -1.3557093929083095, + "rewards/margins": 3.066971019462825, + "rewards/rejected": -4.422680412371134, + "step": 3840 + }, + { + "epoch": 2.015179272441769, + "grad_norm": 0.867120838419022, + "kl": 0.0, + "learning_rate": 1.4680156766078312e-05, + "logits/chosen": -785593152.0, + "logits/rejected": -728393344.0, + "logps/chosen": -384.1006289308176, + "logps/rejected": -811.9254658385094, + "loss": 0.3292, + "rewards/chosen": -0.747248427672956, + "rewards/margins": 4.652596292823938, + "rewards/rejected": -5.399844720496894, + "step": 3850 + }, + { + "epoch": 2.0204135043182414, + "grad_norm": 0.6042760418475094, + "kl": 0.0, + "learning_rate": 1.4541632822452546e-05, + "logits/chosen": -842216256.0, + "logits/rejected": -607964352.0, + "logps/chosen": -556.9139465875371, + "logps/rejected": -836.013201320132, + "loss": 0.3501, + "rewards/chosen": -1.8699041983494065, + "rewards/margins": 3.69610240231066, + "rewards/rejected": -5.566006600660066, + "step": 3860 + }, + { + "epoch": 2.0256477361947134, + "grad_norm": 0.6030811889034846, + "kl": 0.0, + "learning_rate": 1.4403497000615885e-05, + "logits/chosen": -897476224.0, + "logits/rejected": -684510400.0, + "logps/chosen": -514.7329192546584, + "logps/rejected": -893.2830188679245, + "loss": 0.342, + "rewards/chosen": -1.5113062888198758, + "rewards/margins": 4.606618239482011, + "rewards/rejected": -6.117924528301887, + "step": 3870 + }, + { + "epoch": 2.0308819680711854, + "grad_norm": 0.7459007455056504, + "kl": 0.0, + "learning_rate": 1.4265754426944322e-05, + "logits/chosen": -860461440.0, + "logits/rejected": -671403200.0, + "logps/chosen": -484.3987915407855, + "logps/rejected": -980.9190938511326, + "loss": 0.3352, + "rewards/chosen": -1.3766509917567504, + "rewards/margins": 5.371730885265904, + "rewards/rejected": -6.748381877022654, + "step": 3880 + }, + { + "epoch": 2.036116199947658, + "grad_norm": 0.7604713023578963, + "kl": 0.0, + "learning_rate": 1.4128410213219942e-05, + "logits/chosen": -869688960.0, + "logits/rejected": -683986112.0, + "logps/chosen": -546.6, + "logps/rejected": -1040.2, + "loss": 0.3211, + "rewards/chosen": -1.85484619140625, + "rewards/margins": 5.72406005859375, + "rewards/rejected": -7.57890625, + "step": 3890 + }, + { + "epoch": 2.04135043182413, + "grad_norm": 1.0804174673014937, + "kl": 0.0, + "learning_rate": 1.3991469456441273e-05, + "logits/chosen": -826173056.0, + "logits/rejected": -657824128.0, + "logps/chosen": -501.8452012383901, + "logps/rejected": -1085.8801261829653, + "loss": 0.3281, + "rewards/chosen": -1.6408184984520124, + "rewards/margins": 6.257840807541679, + "rewards/rejected": -7.898659305993691, + "step": 3900 + }, + { + "epoch": 2.04135043182413, + "eval_kl": 0.0, + "eval_logits/chosen": -1679585792.0, + "eval_logits/rejected": -1427328256.0, + "eval_logps/chosen": -596.9638792676892, + "eval_logps/rejected": -954.6852312282447, + "eval_loss": 0.4462812542915344, + "eval_rewards/chosen": -2.651657595249876, + "eval_rewards/margins": 3.942698446520387, + "eval_rewards/rejected": -6.594356041770263, + "eval_runtime": 93.447, + "eval_samples_per_second": 42.805, + "eval_steps_per_second": 0.674, + "step": 3900 + }, + { + "epoch": 2.0465846637006018, + "grad_norm": 0.5340218287321821, + "kl": 0.0, + "learning_rate": 1.3854937238634077e-05, + "logits/chosen": -826907008.0, + "logits/rejected": -684720128.0, + "logps/chosen": -579.2492307692307, + "logps/rejected": -1005.815873015873, + "loss": 0.3338, + "rewards/chosen": -2.297403846153846, + "rewards/margins": 4.872437423687424, + "rewards/rejected": -7.16984126984127, + "step": 3910 + }, + { + "epoch": 2.051818895577074, + "grad_norm": 1.1950327463899646, + "kl": 0.0, + "learning_rate": 1.3718818626662776e-05, + "logits/chosen": -840538496.0, + "logits/rejected": -627782464.0, + "logps/chosen": -479.9, + "logps/rejected": -971.6, + "loss": 0.3238, + "rewards/chosen": -1.4367919921875, + "rewards/margins": 5.5171142578125, + "rewards/rejected": -6.95390625, + "step": 3920 + }, + { + "epoch": 2.057053127453546, + "grad_norm": 0.42341628215112187, + "kl": 0.0, + "learning_rate": 1.3583118672042442e-05, + "logits/chosen": -825019584.0, + "logits/rejected": -621595840.0, + "logps/chosen": -464.4651162790698, + "logps/rejected": -945.6216216216217, + "loss": 0.3096, + "rewards/chosen": -1.3582394622093024, + "rewards/margins": 5.080105132385293, + "rewards/rejected": -6.438344594594595, + "step": 3930 + }, + { + "epoch": 2.062287359330018, + "grad_norm": 1.9842269461022177, + "kl": 0.0, + "learning_rate": 1.3447842410751255e-05, + "logits/chosen": -761161344.0, + "logits/rejected": -688599872.0, + "logps/chosen": -393.63157894736844, + "logps/rejected": -860.7619047619048, + "loss": 0.2949, + "rewards/chosen": -0.6329538445723685, + "rewards/margins": 5.192939012570489, + "rewards/rejected": -5.825892857142857, + "step": 3940 + }, + { + "epoch": 2.0675215912064906, + "grad_norm": 0.6398178755738954, + "kl": 0.0, + "learning_rate": 1.331299486304371e-05, + "logits/chosen": -761685632.0, + "logits/rejected": -684929856.0, + "logps/chosen": -501.4185303514377, + "logps/rejected": -804.1100917431193, + "loss": 0.3164, + "rewards/chosen": -1.7560652955271565, + "rewards/margins": 3.414424001108929, + "rewards/rejected": -5.170489296636085, + "step": 3950 + }, + { + "epoch": 2.0727558230829626, + "grad_norm": 0.5001389075478716, + "kl": 0.0, + "learning_rate": 1.3178581033264218e-05, + "logits/chosen": -784754304.0, + "logits/rejected": -674653824.0, + "logps/chosen": -535.4733542319749, + "logps/rejected": -945.2461059190031, + "loss": 0.2987, + "rewards/chosen": -2.061030564263323, + "rewards/margins": 4.187411803337923, + "rewards/rejected": -6.248442367601246, + "step": 3960 + }, + { + "epoch": 2.0779900549594346, + "grad_norm": 1.3573404725613187, + "kl": 0.0, + "learning_rate": 1.3044605909661434e-05, + "logits/chosen": -831101312.0, + "logits/rejected": -637534208.0, + "logps/chosen": -511.0617283950617, + "logps/rejected": -930.9367088607595, + "loss": 0.2924, + "rewards/chosen": -1.5159866898148149, + "rewards/margins": 4.905690525375059, + "rewards/rejected": -6.421677215189874, + "step": 3970 + }, + { + "epoch": 2.083224286835907, + "grad_norm": 0.7052536575350146, + "kl": 0.0, + "learning_rate": 1.2911074464203157e-05, + "logits/chosen": -691378560.0, + "logits/rejected": -608383808.0, + "logps/chosen": -434.2931596091205, + "logps/rejected": -906.8588588588589, + "loss": 0.2845, + "rewards/chosen": -1.1614413680781759, + "rewards/margins": 5.0990691424323344, + "rewards/rejected": -6.26051051051051, + "step": 3980 + }, + { + "epoch": 2.088458518712379, + "grad_norm": 0.5194332110158825, + "kl": 0.0, + "learning_rate": 1.2777991652391758e-05, + "logits/chosen": -745747264.0, + "logits/rejected": -601043776.0, + "logps/chosen": -453.30120481927713, + "logps/rejected": -860.8831168831168, + "loss": 0.3133, + "rewards/chosen": -1.4601609563253013, + "rewards/margins": 3.994384498220153, + "rewards/rejected": -5.454545454545454, + "step": 3990 + }, + { + "epoch": 2.093692750588851, + "grad_norm": 1.005682372476424, + "kl": 0.0, + "learning_rate": 1.2645362413080342e-05, + "logits/chosen": -702650752.0, + "logits/rejected": -508926368.0, + "logps/chosen": -389.91515151515154, + "logps/rejected": -866.2709677419355, + "loss": 0.3181, + "rewards/chosen": -1.0228811553030304, + "rewards/margins": 4.944860780180841, + "rewards/rejected": -5.967741935483871, + "step": 4000 + }, + { + "epoch": 2.093692750588851, + "eval_kl": 0.0, + "eval_logits/chosen": -1560081408.0, + "eval_logits/rejected": -1263351040.0, + "eval_logps/chosen": -525.110341415141, + "eval_logps/rejected": -727.0094480358031, + "eval_loss": 0.4486015737056732, + "eval_rewards/chosen": -1.932149925779317, + "eval_rewards/margins": 2.384732222405665, + "eval_rewards/rejected": -4.316882148184982, + "eval_runtime": 93.4499, + "eval_samples_per_second": 42.804, + "eval_steps_per_second": 0.674, + "step": 4000 + }, + { + "epoch": 2.0989269824653234, + "grad_norm": 0.8653491127894726, + "kl": 0.0, + "learning_rate": 1.2513191668289393e-05, + "logits/chosen": -744803520.0, + "logits/rejected": -595119296.0, + "logps/chosen": -400.85804416403784, + "logps/rejected": -879.3560371517028, + "loss": 0.291, + "rewards/chosen": -0.7524645110410094, + "rewards/margins": 5.041653136017814, + "rewards/rejected": -5.794117647058823, + "step": 4010 + }, + { + "epoch": 2.1041612143417954, + "grad_norm": 0.5023478537850743, + "kl": 0.0, + "learning_rate": 1.2381484323024178e-05, + "logits/chosen": -772276224.0, + "logits/rejected": -564238720.0, + "logps/chosen": -368.2507836990596, + "logps/rejected": -883.8380062305296, + "loss": 0.273, + "rewards/chosen": -0.25505118534482757, + "rewards/margins": 5.663173113720593, + "rewards/rejected": -5.918224299065421, + "step": 4020 + }, + { + "epoch": 2.1093954462182674, + "grad_norm": 0.47927941821650943, + "kl": 0.0, + "learning_rate": 1.2250245265092666e-05, + "logits/chosen": -683147264.0, + "logits/rejected": -500118336.0, + "logps/chosen": -428.57142857142856, + "logps/rejected": -802.7169811320755, + "loss": 0.3035, + "rewards/chosen": -1.0651446040372672, + "rewards/margins": 4.261899421119965, + "rewards/rejected": -5.327044025157233, + "step": 4030 + }, + { + "epoch": 2.11462967809474, + "grad_norm": 0.43364669877586093, + "kl": 0.0, + "learning_rate": 1.2119479364924148e-05, + "logits/chosen": -692479616.0, + "logits/rejected": -560359040.0, + "logps/chosen": -441.1640866873065, + "logps/rejected": -814.1324921135647, + "loss": 0.3, + "rewards/chosen": -1.3454914860681115, + "rewards/margins": 3.826432804152709, + "rewards/rejected": -5.1719242902208205, + "step": 4040 + }, + { + "epoch": 2.1198639099712118, + "grad_norm": 0.4900123761137033, + "kl": 0.0, + "learning_rate": 1.1989191475388516e-05, + "logits/chosen": -708732544.0, + "logits/rejected": -572103040.0, + "logps/chosen": -385.9746835443038, + "logps/rejected": -871.8024691358024, + "loss": 0.2711, + "rewards/chosen": -0.8139092167721519, + "rewards/margins": 4.855843869647601, + "rewards/rejected": -5.669753086419753, + "step": 4050 + }, + { + "epoch": 2.1250981418476838, + "grad_norm": 0.5886859527298862, + "kl": 0.0, + "learning_rate": 1.1859386431616157e-05, + "logits/chosen": -776785088.0, + "logits/rejected": -623797888.0, + "logps/chosen": -431.7928802588997, + "logps/rejected": -838.9607250755287, + "loss": 0.2749, + "rewards/chosen": -0.7912874190938511, + "rewards/margins": 4.660374212326089, + "rewards/rejected": -5.45166163141994, + "step": 4060 + }, + { + "epoch": 2.130332373724156, + "grad_norm": 1.0109121847189841, + "kl": 0.0, + "learning_rate": 1.173006905081847e-05, + "logits/chosen": -857944896.0, + "logits/rejected": -517000384.0, + "logps/chosen": -453.10843373493975, + "logps/rejected": -853.2987012987013, + "loss": 0.2734, + "rewards/chosen": -0.9301816641566265, + "rewards/margins": 4.767058595583633, + "rewards/rejected": -5.697240259740259, + "step": 4070 + }, + { + "epoch": 2.135566605600628, + "grad_norm": 1.0500135128137795, + "kl": 0.0, + "learning_rate": 1.160124413210918e-05, + "logits/chosen": -787900032.0, + "logits/rejected": -592130880.0, + "logps/chosen": -349.6369230769231, + "logps/rejected": -861.4603174603175, + "loss": 0.2678, + "rewards/chosen": -0.2839783653846154, + "rewards/margins": 5.0350692536630035, + "rewards/rejected": -5.319047619047619, + "step": 4080 + }, + { + "epoch": 2.1408008374771, + "grad_norm": 0.6560418400369694, + "kl": 0.0, + "learning_rate": 1.1472916456326146e-05, + "logits/chosen": -784020288.0, + "logits/rejected": -531942592.0, + "logps/chosen": -444.8, + "logps/rejected": -822.6, + "loss": 0.2778, + "rewards/chosen": -1.0637939453125, + "rewards/margins": 4.2182373046875, + "rewards/rejected": -5.28203125, + "step": 4090 + }, + { + "epoch": 2.146035069353572, + "grad_norm": 0.7551742469602785, + "kl": 0.0, + "learning_rate": 1.1345090785853999e-05, + "logits/chosen": -776260800.0, + "logits/rejected": -585734528.0, + "logps/chosen": -369.46835443037975, + "logps/rejected": -818.8641975308642, + "loss": 0.2603, + "rewards/chosen": -0.4557456487341772, + "rewards/margins": 4.676970400648539, + "rewards/rejected": -5.132716049382716, + "step": 4100 + }, + { + "epoch": 2.146035069353572, + "eval_kl": 0.0, + "eval_logits/chosen": -1459418112.0, + "eval_logits/rejected": -1138054528.0, + "eval_logps/chosen": -524.6036615536863, + "eval_logps/rejected": -721.5355544505221, + "eval_loss": 0.451171875, + "eval_rewards/chosen": -1.927325581395349, + "eval_rewards/margins": 2.337592369872677, + "eval_rewards/rejected": -4.264917951268026, + "eval_runtime": 93.4435, + "eval_samples_per_second": 42.807, + "eval_steps_per_second": 0.674, + "step": 4100 + }, + { + "epoch": 2.1512693012300446, + "grad_norm": 0.39235017704762193, + "kl": 0.0, + "learning_rate": 1.1217771864447396e-05, + "logits/chosen": -765565312.0, + "logits/rejected": -523894784.0, + "logps/chosen": -407.58409785932724, + "logps/rejected": -776.7923322683706, + "loss": 0.2826, + "rewards/chosen": -0.7646215596330275, + "rewards/margins": 4.039691539408507, + "rewards/rejected": -4.804313099041534, + "step": 4110 + }, + { + "epoch": 2.1565035331065165, + "grad_norm": 0.9980238027631436, + "kl": 0.0, + "learning_rate": 1.1090964417054946e-05, + "logits/chosen": -705691648.0, + "logits/rejected": -469028032.0, + "logps/chosen": -406.66261398176295, + "logps/rejected": -768.5144694533763, + "loss": 0.2883, + "rewards/chosen": -0.9441845554711246, + "rewards/margins": 4.141828306265209, + "rewards/rejected": -5.086012861736334, + "step": 4120 + }, + { + "epoch": 2.1617377649829885, + "grad_norm": 0.44846759594413, + "kl": 0.0, + "learning_rate": 1.0964673149643911e-05, + "logits/chosen": -777099648.0, + "logits/rejected": -531523168.0, + "logps/chosen": -414.9433962264151, + "logps/rejected": -828.5217391304348, + "loss": 0.2407, + "rewards/chosen": -0.6657576650943396, + "rewards/margins": 4.735639850433611, + "rewards/rejected": -5.40139751552795, + "step": 4130 + }, + { + "epoch": 2.166971996859461, + "grad_norm": 1.0494481926845889, + "kl": 0.0, + "learning_rate": 1.08389027490255e-05, + "logits/chosen": -704118784.0, + "logits/rejected": -602092352.0, + "logps/chosen": -371.94805194805195, + "logps/rejected": -816.9638554216867, + "loss": 0.2649, + "rewards/chosen": -0.5836038961038961, + "rewards/margins": 4.6212153810045375, + "rewards/rejected": -5.204819277108434, + "step": 4140 + }, + { + "epoch": 2.172206228735933, + "grad_norm": 0.48196554804324737, + "kl": 0.0, + "learning_rate": 1.0713657882680975e-05, + "logits/chosen": -682518144.0, + "logits/rejected": -520827712.0, + "logps/chosen": -418.4691358024691, + "logps/rejected": -798.5822784810126, + "loss": 0.2906, + "rewards/chosen": -1.0041473765432098, + "rewards/margins": 4.1097766740897015, + "rewards/rejected": -5.113924050632911, + "step": 4150 + }, + { + "epoch": 2.177440460612405, + "grad_norm": 1.416871559890211, + "kl": 0.0, + "learning_rate": 1.0588943198588456e-05, + "logits/chosen": -716072576.0, + "logits/rejected": -583637376.0, + "logps/chosen": -383.3, + "logps/rejected": -802.5, + "loss": 0.2669, + "rewards/chosen": -0.545361328125, + "rewards/margins": 4.497607421875, + "rewards/rejected": -5.04296875, + "step": 4160 + }, + { + "epoch": 2.1826746924888774, + "grad_norm": 0.6456767473107083, + "kl": 0.0, + "learning_rate": 1.0464763325050358e-05, + "logits/chosen": -603979776.0, + "logits/rejected": -628568896.0, + "logps/chosen": -347.06849315068496, + "logps/rejected": -838.6206896551724, + "loss": 0.2265, + "rewards/chosen": -0.028842037671232876, + "rewards/margins": 5.19529589336325, + "rewards/rejected": -5.224137931034483, + "step": 4170 + }, + { + "epoch": 2.1879089243653493, + "grad_norm": 0.3442641001225459, + "kl": 0.0, + "learning_rate": 1.0341122870521725e-05, + "logits/chosen": -679372416.0, + "logits/rejected": -476525376.0, + "logps/chosen": -351.39240506329116, + "logps/rejected": -795.1604938271605, + "loss": 0.2384, + "rewards/chosen": -0.20490506329113925, + "rewards/margins": 4.890002344116268, + "rewards/rejected": -5.094907407407407, + "step": 4180 + }, + { + "epoch": 2.1931431562418213, + "grad_norm": 0.4512805022045456, + "kl": 0.0, + "learning_rate": 1.0218026423439101e-05, + "logits/chosen": -709047104.0, + "logits/rejected": -486696544.0, + "logps/chosen": -384.0955223880597, + "logps/rejected": -794.6491803278689, + "loss": 0.2544, + "rewards/chosen": -0.3564365671641791, + "rewards/margins": 4.705038842671886, + "rewards/rejected": -5.061475409836065, + "step": 4190 + }, + { + "epoch": 2.1983773881182938, + "grad_norm": 0.407756567161035, + "kl": 0.0, + "learning_rate": 1.0095478552050347e-05, + "logits/chosen": -640994496.0, + "logits/rejected": -545416832.0, + "logps/chosen": -307.7115987460815, + "logps/rejected": -814.8535825545172, + "loss": 0.2388, + "rewards/chosen": -0.040727860501567396, + "rewards/margins": 4.7482129494672805, + "rewards/rejected": -4.788940809968848, + "step": 4200 + }, + { + "epoch": 2.1983773881182938, + "eval_kl": 0.0, + "eval_logits/chosen": -1377462656.0, + "eval_logits/rejected": -1113288064.0, + "eval_logps/chosen": -484.2592775853538, + "eval_logps/rejected": -613.2352063649926, + "eval_loss": 0.4555937647819519, + "eval_rewards/chosen": -1.5242454230578921, + "eval_rewards/margins": 1.6601155913627943, + "eval_rewards/rejected": -3.1843610144206864, + "eval_runtime": 93.4488, + "eval_samples_per_second": 42.804, + "eval_steps_per_second": 0.674, + "step": 4200 + }, + { + "epoch": 2.2036116199947657, + "grad_norm": 0.5169381215562495, + "kl": 0.0, + "learning_rate": 9.973483804245033e-06, + "logits/chosen": -717960000.0, + "logits/rejected": -570005888.0, + "logps/chosen": -330.7524115755627, + "logps/rejected": -749.4224924012158, + "loss": 0.2471, + "rewards/chosen": 0.02110128617363344, + "rewards/margins": 4.477028337845366, + "rewards/rejected": -4.455927051671733, + "step": 4210 + }, + { + "epoch": 2.2088458518712377, + "grad_norm": 0.6406300733522271, + "kl": 0.0, + "learning_rate": 9.85204670738569e-06, + "logits/chosen": -620022976.0, + "logits/rejected": -565287296.0, + "logps/chosen": -323.8933333333333, + "logps/rejected": -758.0235294117647, + "loss": 0.2546, + "rewards/chosen": -0.04555338541666667, + "rewards/margins": 4.550034849877451, + "rewards/rejected": -4.595588235294118, + "step": 4220 + }, + { + "epoch": 2.21408008374771, + "grad_norm": 1.0622337513292255, + "kl": 0.0, + "learning_rate": 9.731171768139807e-06, + "logits/chosen": -662700032.0, + "logits/rejected": -499227040.0, + "logps/chosen": -358.0246913580247, + "logps/rejected": -760.6075949367089, + "loss": 0.2706, + "rewards/chosen": -0.2875855999228395, + "rewards/margins": 4.2859903494442495, + "rewards/rejected": -4.573575949367089, + "step": 4230 + }, + { + "epoch": 2.219314315624182, + "grad_norm": 0.6021455816373862, + "kl": 0.0, + "learning_rate": 9.610863472312582e-06, + "logits/chosen": -704223616.0, + "logits/rejected": -538129216.0, + "logps/chosen": -331.6923076923077, + "logps/rejected": -764.0, + "loss": 0.2176, + "rewards/chosen": 0.22345753205128205, + "rewards/margins": 4.911719727173233, + "rewards/rejected": -4.688262195121951, + "step": 4240 + }, + { + "epoch": 2.224548547500654, + "grad_norm": 0.4635125876528428, + "kl": 0.0, + "learning_rate": 9.491126284680398e-06, + "logits/chosen": -614884992.0, + "logits/rejected": -477521504.0, + "logps/chosen": -333.1480362537764, + "logps/rejected": -717.3592233009708, + "loss": 0.2614, + "rewards/chosen": -0.05881797583081571, + "rewards/margins": 4.218690114784071, + "rewards/rejected": -4.277508090614886, + "step": 4250 + }, + { + "epoch": 2.2297827793771265, + "grad_norm": 0.47303646752910194, + "kl": 0.0, + "learning_rate": 9.371964648825221e-06, + "logits/chosen": -663853440.0, + "logits/rejected": -537237888.0, + "logps/chosen": -320.62111801242236, + "logps/rejected": -752.3018867924528, + "loss": 0.2374, + "rewards/chosen": 0.059819002329192544, + "rewards/margins": 4.639221518052463, + "rewards/rejected": -4.57940251572327, + "step": 4260 + }, + { + "epoch": 2.2350170112535985, + "grad_norm": 0.39987970021496416, + "kl": 0.0, + "learning_rate": 9.253382986969578e-06, + "logits/chosen": -710619968.0, + "logits/rejected": -506226272.0, + "logps/chosen": -294.41975308641975, + "logps/rejected": -773.4683544303797, + "loss": 0.227, + "rewards/chosen": 0.24508101851851852, + "rewards/margins": 5.129179119784341, + "rewards/rejected": -4.884098101265823, + "step": 4270 + }, + { + "epoch": 2.2402512431300705, + "grad_norm": 0.7503840516133481, + "kl": 0.0, + "learning_rate": 9.135385699812558e-06, + "logits/chosen": -779930816.0, + "logits/rejected": -603665216.0, + "logps/chosen": -301.1948051948052, + "logps/rejected": -752.5783132530121, + "loss": 0.205, + "rewards/chosen": 0.578023538961039, + "rewards/margins": 5.206788599202003, + "rewards/rejected": -4.628765060240964, + "step": 4280 + }, + { + "epoch": 2.245485475006543, + "grad_norm": 0.4724610462486881, + "kl": 0.0, + "learning_rate": 9.017977166366445e-06, + "logits/chosen": -787061120.0, + "logits/rejected": -559992000.0, + "logps/chosen": -327.8490566037736, + "logps/rejected": -717.7142857142857, + "loss": 0.2269, + "rewards/chosen": 0.601754866306137, + "rewards/margins": 5.016351139598062, + "rewards/rejected": -4.4145962732919255, + "step": 4290 + }, + { + "epoch": 2.250719706883015, + "grad_norm": 0.49216136291730395, + "kl": 0.0, + "learning_rate": 8.901161743794175e-06, + "logits/chosen": -734108032.0, + "logits/rejected": -551865536.0, + "logps/chosen": -282.2716049382716, + "logps/rejected": -743.8987341772151, + "loss": 0.224, + "rewards/chosen": 0.4676408179012346, + "rewards/margins": 4.9668496786607275, + "rewards/rejected": -4.499208860759493, + "step": 4300 + }, + { + "epoch": 2.250719706883015, + "eval_kl": 0.0, + "eval_logits/chosen": -1391976320.0, + "eval_logits/rejected": -1133061248.0, + "eval_logps/chosen": -479.9841662543295, + "eval_logps/rejected": -590.8304326205867, + "eval_loss": 0.45759373903274536, + "eval_rewards/chosen": -1.481321128154379, + "eval_rewards/margins": 1.4770329245557154, + "eval_rewards/rejected": -2.9583540527100944, + "eval_runtime": 93.45, + "eval_samples_per_second": 42.804, + "eval_steps_per_second": 0.674, + "step": 4300 + }, + { + "epoch": 2.255953938759487, + "grad_norm": 0.6632663517963511, + "kl": 0.0, + "learning_rate": 8.784943767247714e-06, + "logits/chosen": -755813568.0, + "logits/rejected": -542638080.0, + "logps/chosen": -350.7294117647059, + "logps/rejected": -749.8666666666667, + "loss": 0.2505, + "rewards/chosen": -0.1849264705882353, + "rewards/margins": 4.332573529411765, + "rewards/rejected": -4.5175, + "step": 4310 + }, + { + "epoch": 2.2611881706359593, + "grad_norm": 0.7391169731532689, + "kl": 0.0, + "learning_rate": 8.669327549707096e-06, + "logits/chosen": -655569728.0, + "logits/rejected": -531103744.0, + "logps/chosen": -336.8126984126984, + "logps/rejected": -768.0, + "loss": 0.2342, + "rewards/chosen": 0.10716765873015872, + "rewards/margins": 4.954090735653236, + "rewards/rejected": -4.846923076923077, + "step": 4320 + }, + { + "epoch": 2.2664224025124313, + "grad_norm": 0.6386360163671007, + "kl": 0.0, + "learning_rate": 8.554317381820411e-06, + "logits/chosen": -667104064.0, + "logits/rejected": -506986496.0, + "logps/chosen": -301.05806451612904, + "logps/rejected": -754.4242424242424, + "loss": 0.232, + "rewards/chosen": 0.19148185483870966, + "rewards/margins": 4.9043606427174975, + "rewards/rejected": -4.712878787878788, + "step": 4330 + }, + { + "epoch": 2.2716566343889033, + "grad_norm": 0.600238772093869, + "kl": 0.0, + "learning_rate": 8.439917531744587e-06, + "logits/chosen": -666894336.0, + "logits/rejected": -518520832.0, + "logps/chosen": -323.95180722891564, + "logps/rejected": -758.6493506493506, + "loss": 0.2381, + "rewards/chosen": 0.22035015060240964, + "rewards/margins": 4.907444306446565, + "rewards/rejected": -4.6870941558441555, + "step": 4340 + }, + { + "epoch": 2.2768908662653757, + "grad_norm": 0.8954410797785683, + "kl": 0.0, + "learning_rate": 8.326132244986932e-06, + "logits/chosen": -701707072.0, + "logits/rejected": -517367392.0, + "logps/chosen": -290.51851851851853, + "logps/rejected": -748.7594936708861, + "loss": 0.2228, + "rewards/chosen": 0.3577594521604938, + "rewards/margins": 4.953487300261759, + "rewards/rejected": -4.595727848101266, + "step": 4350 + }, + { + "epoch": 2.2821250981418477, + "grad_norm": 0.6812826415405142, + "kl": 0.0, + "learning_rate": 8.212965744247652e-06, + "logits/chosen": -646709248.0, + "logits/rejected": -484101312.0, + "logps/chosen": -325.4787878787879, + "logps/rejected": -740.2322580645161, + "loss": 0.2575, + "rewards/chosen": -0.15175189393939395, + "rewards/margins": 4.116796493157381, + "rewards/rejected": -4.268548387096774, + "step": 4360 + }, + { + "epoch": 2.2873593300183197, + "grad_norm": 0.5151716397306011, + "kl": 0.0, + "learning_rate": 8.100422229263077e-06, + "logits/chosen": -604399232.0, + "logits/rejected": -566650496.0, + "logps/chosen": -358.19867549668874, + "logps/rejected": -766.1065088757397, + "loss": 0.24, + "rewards/chosen": -0.015883692052980132, + "rewards/margins": 4.736335242858263, + "rewards/rejected": -4.752218934911243, + "step": 4370 + }, + { + "epoch": 2.292593561894792, + "grad_norm": 1.135779708785077, + "kl": 0.0, + "learning_rate": 7.988505876649863e-06, + "logits/chosen": -608174080.0, + "logits/rejected": -524956480.0, + "logps/chosen": -331.6687898089172, + "logps/rejected": -709.398773006135, + "loss": 0.2588, + "rewards/chosen": -0.05260997213375796, + "rewards/margins": 4.220396162835567, + "rewards/rejected": -4.273006134969325, + "step": 4380 + }, + { + "epoch": 2.297827793771264, + "grad_norm": 0.5072498511179133, + "kl": 0.0, + "learning_rate": 7.877220839749939e-06, + "logits/chosen": -597111616.0, + "logits/rejected": -512281792.0, + "logps/chosen": -310.92459016393445, + "logps/rejected": -764.5611940298508, + "loss": 0.2456, + "rewards/chosen": 0.0782530737704918, + "rewards/margins": 4.849894864815268, + "rewards/rejected": -4.771641791044776, + "step": 4390 + }, + { + "epoch": 2.303062025647736, + "grad_norm": 0.3652767871003493, + "kl": 0.0, + "learning_rate": 7.766571248476399e-06, + "logits/chosen": -577136256.0, + "logits/rejected": -415983200.0, + "logps/chosen": -382.37538461538463, + "logps/rejected": -836.4698412698413, + "loss": 0.26, + "rewards/chosen": -0.4841105769230769, + "rewards/margins": 4.81271481990232, + "rewards/rejected": -5.296825396825397, + "step": 4400 + }, + { + "epoch": 2.303062025647736, + "eval_kl": 0.0, + "eval_logits/chosen": -1215682432.0, + "eval_logits/rejected": -961427712.0, + "eval_logps/chosen": -496.70460168233546, + "eval_logps/rejected": -633.5713575335654, + "eval_loss": 0.4561718702316284, + "eval_rewards/chosen": -1.6511009401286492, + "eval_rewards/margins": 1.7327876725018827, + "eval_rewards/rejected": -3.383888612630532, + "eval_runtime": 93.4509, + "eval_samples_per_second": 42.803, + "eval_steps_per_second": 0.674, + "step": 4400 + }, + { + "epoch": 2.3082962575242085, + "grad_norm": 0.7225883147344231, + "kl": 0.0, + "learning_rate": 7.656561209160248e-06, + "logits/chosen": -582431552.0, + "logits/rejected": -499017312.0, + "logps/chosen": -314.61333333333334, + "logps/rejected": -786.4470588235295, + "loss": 0.2242, + "rewards/chosen": 0.25373046875, + "rewards/margins": 5.158142233455882, + "rewards/rejected": -4.904411764705882, + "step": 4410 + }, + { + "epoch": 2.3135304894006805, + "grad_norm": 0.5150878842633106, + "kl": 0.0, + "learning_rate": 7.547194804398e-06, + "logits/chosen": -703279936.0, + "logits/rejected": -511082496.0, + "logps/chosen": -382.17846153846153, + "logps/rejected": -845.3079365079365, + "loss": 0.2401, + "rewards/chosen": -0.09444411057692308, + "rewards/margins": 5.292063825931014, + "rewards/rejected": -5.386507936507937, + "step": 4420 + }, + { + "epoch": 2.3187647212771525, + "grad_norm": 0.44831570510738245, + "kl": 0.0, + "learning_rate": 7.43847609290014e-06, + "logits/chosen": -621491008.0, + "logits/rejected": -489894720.0, + "logps/chosen": -278.4773413897281, + "logps/rejected": -815.8446601941747, + "loss": 0.2211, + "rewards/chosen": 0.3919290502265861, + "rewards/margins": 5.424291509773512, + "rewards/rejected": -5.032362459546926, + "step": 4430 + }, + { + "epoch": 2.323998953153625, + "grad_norm": 0.31782004144491344, + "kl": 0.0, + "learning_rate": 7.330409109340563e-06, + "logits/chosen": -515217824.0, + "logits/rejected": -545993536.0, + "logps/chosen": -315.5342465753425, + "logps/rejected": -770.5747126436781, + "loss": 0.2086, + "rewards/chosen": 0.1988073897688356, + "rewards/margins": 5.010588998964238, + "rewards/rejected": -4.811781609195402, + "step": 4440 + }, + { + "epoch": 2.329233185030097, + "grad_norm": 1.2202322471583766, + "kl": 0.0, + "learning_rate": 7.222997864206757e-06, + "logits/chosen": -604346752.0, + "logits/rejected": -504784480.0, + "logps/chosen": -316.65594855305466, + "logps/rejected": -817.2158054711247, + "loss": 0.215, + "rewards/chosen": 0.21764469453376206, + "rewards/margins": 5.486641655020084, + "rewards/rejected": -5.268996960486322, + "step": 4450 + }, + { + "epoch": 2.334467416906569, + "grad_norm": 0.6017860477665279, + "kl": 0.0, + "learning_rate": 7.1162463436510615e-06, + "logits/chosen": -627625152.0, + "logits/rejected": -399297728.0, + "logps/chosen": -349.1446153846154, + "logps/rejected": -789.8412698412699, + "loss": 0.2337, + "rewards/chosen": -0.11662860576923077, + "rewards/margins": 5.069085679945055, + "rewards/rejected": -5.185714285714286, + "step": 4460 + }, + { + "epoch": 2.3397016487830413, + "grad_norm": 0.4804667120654089, + "kl": 0.0, + "learning_rate": 7.010158509342682e-06, + "logits/chosen": -588775424.0, + "logits/rejected": -481977952.0, + "logps/chosen": -353.53846153846155, + "logps/rejected": -810.6341463414634, + "loss": 0.2194, + "rewards/chosen": -0.06151091746794872, + "rewards/margins": 5.045196399605222, + "rewards/rejected": -5.1067073170731705, + "step": 4470 + }, + { + "epoch": 2.3449358806595133, + "grad_norm": 0.9272819424930441, + "kl": 0.0, + "learning_rate": 6.904738298320665e-06, + "logits/chosen": -649173376.0, + "logits/rejected": -448423520.0, + "logps/chosen": -336.9846153846154, + "logps/rejected": -809.7523809523809, + "loss": 0.2483, + "rewards/chosen": -0.031105769230769232, + "rewards/margins": 5.2395291514041515, + "rewards/rejected": -5.270634920634921, + "step": 4480 + }, + { + "epoch": 2.3501701125359853, + "grad_norm": 0.45114391479169963, + "kl": 0.0, + "learning_rate": 6.799989622847827e-06, + "logits/chosen": -559860928.0, + "logits/rejected": -491362720.0, + "logps/chosen": -313.53846153846155, + "logps/rejected": -871.8048780487804, + "loss": 0.2363, + "rewards/chosen": -0.07071314102564102, + "rewards/margins": 5.6548966150719195, + "rewards/rejected": -5.725609756097561, + "step": 4490 + }, + { + "epoch": 2.3554043444124577, + "grad_norm": 0.5010732836904357, + "kl": 0.0, + "learning_rate": 6.695916370265528e-06, + "logits/chosen": -522138432.0, + "logits/rejected": -414030240.0, + "logps/chosen": -343.3762711864407, + "logps/rejected": -907.6869565217391, + "loss": 0.2234, + "rewards/chosen": -0.18509004237288135, + "rewards/margins": 5.982301261974945, + "rewards/rejected": -6.167391304347826, + "step": 4500 + }, + { + "epoch": 2.3554043444124577, + "eval_kl": 0.0, + "eval_logits/chosen": -1145577600.0, + "eval_logits/rejected": -884432256.0, + "eval_logps/chosen": -540.8807521029194, + "eval_logps/rejected": -735.6340129288911, + "eval_loss": 0.4556874930858612, + "eval_rewards/chosen": -2.091848095002474, + "eval_rewards/margins": 2.313795863227263, + "eval_rewards/rejected": -4.405643958229737, + "eval_runtime": 93.4327, + "eval_samples_per_second": 42.812, + "eval_steps_per_second": 0.674, + "step": 4500 + }, + { + "epoch": 2.3606385762889297, + "grad_norm": 1.0333751484523532, + "kl": 0.0, + "learning_rate": 6.592522402849421e-06, + "logits/chosen": -636590464.0, + "logits/rejected": -421606208.0, + "logps/chosen": -373.8426966292135, + "logps/rejected": -919.3239436619718, + "loss": 0.2598, + "rewards/chosen": -0.5181377282303371, + "rewards/margins": 5.887672130924593, + "rewards/rejected": -6.40580985915493, + "step": 4510 + }, + { + "epoch": 2.3658728081654017, + "grad_norm": 0.6399952632365049, + "kl": 0.0, + "learning_rate": 6.489811557666137e-06, + "logits/chosen": -648177280.0, + "logits/rejected": -545574080.0, + "logps/chosen": -350.32615384615383, + "logps/rejected": -833.5238095238095, + "loss": 0.2274, + "rewards/chosen": -0.19805288461538462, + "rewards/margins": 5.143216956654456, + "rewards/rejected": -5.341269841269841, + "step": 4520 + }, + { + "epoch": 2.3711070400418737, + "grad_norm": 0.6465757772054535, + "kl": 0.0, + "learning_rate": 6.387787646430854e-06, + "logits/chosen": -707946112.0, + "logits/rejected": -566702912.0, + "logps/chosen": -314.63291139240505, + "logps/rejected": -818.8641975308642, + "loss": 0.22, + "rewards/chosen": 0.3127472310126582, + "rewards/margins": 5.450864514963275, + "rewards/rejected": -5.138117283950617, + "step": 4530 + }, + { + "epoch": 2.376341271918346, + "grad_norm": 0.4214262313722552, + "kl": 0.0, + "learning_rate": 6.286454455365875e-06, + "logits/chosen": -754555264.0, + "logits/rejected": -540016640.0, + "logps/chosen": -294.6900584795322, + "logps/rejected": -800.5369127516778, + "loss": 0.2453, + "rewards/chosen": 0.23325566520467836, + "rewards/margins": 5.291980497419444, + "rewards/rejected": -5.058724832214765, + "step": 4540 + }, + { + "epoch": 2.381575503794818, + "grad_norm": 0.4238912364274616, + "kl": 0.0, + "learning_rate": 6.1858157450600775e-06, + "logits/chosen": -721210560.0, + "logits/rejected": -625999872.0, + "logps/chosen": -364.0774193548387, + "logps/rejected": -827.1515151515151, + "loss": 0.2304, + "rewards/chosen": -0.1208921370967742, + "rewards/margins": 5.11395634775171, + "rewards/rejected": -5.234848484848484, + "step": 4550 + }, + { + "epoch": 2.38680973567129, + "grad_norm": 1.2001050025077662, + "kl": 0.0, + "learning_rate": 6.085875250329401e-06, + "logits/chosen": -656828032.0, + "logits/rejected": -566702912.0, + "logps/chosen": -330.1635220125786, + "logps/rejected": -768.5962732919255, + "loss": 0.2435, + "rewards/chosen": 0.12917649371069181, + "rewards/margins": 4.847344195574046, + "rewards/rejected": -4.718167701863354, + "step": 4560 + }, + { + "epoch": 2.3920439675477625, + "grad_norm": 0.9922339601859416, + "kl": 0.0, + "learning_rate": 5.9866366800782e-06, + "logits/chosen": -736310080.0, + "logits/rejected": -497549312.0, + "logps/chosen": -365.87951807228916, + "logps/rejected": -852.0519480519481, + "loss": 0.2309, + "rewards/chosen": -0.16735692771084337, + "rewards/margins": 5.382967747613832, + "rewards/rejected": -5.550324675324675, + "step": 4570 + }, + { + "epoch": 2.3972781994242345, + "grad_norm": 0.8372891321444087, + "kl": 0.0, + "learning_rate": 5.888103717161619e-06, + "logits/chosen": -734632320.0, + "logits/rejected": -425354848.0, + "logps/chosen": -326.99708454810497, + "logps/rejected": -892.7676767676768, + "loss": 0.2377, + "rewards/chosen": 0.01731049562682216, + "rewards/margins": 6.149465377781705, + "rewards/rejected": -6.132154882154882, + "step": 4580 + }, + { + "epoch": 2.4025124313007065, + "grad_norm": 1.1722616912692914, + "kl": 0.0, + "learning_rate": 5.790280018248939e-06, + "logits/chosen": -658610560.0, + "logits/rejected": -490733568.0, + "logps/chosen": -332.5678233438486, + "logps/rejected": -859.1455108359133, + "loss": 0.2372, + "rewards/chosen": -0.004115733438485805, + "rewards/margins": 5.72034247089588, + "rewards/rejected": -5.724458204334366, + "step": 4590 + }, + { + "epoch": 2.407746663177179, + "grad_norm": 0.7180486773047849, + "kl": 0.0, + "learning_rate": 5.693169213687824e-06, + "logits/chosen": -655569728.0, + "logits/rejected": -571211776.0, + "logps/chosen": -352.59442724458205, + "logps/rejected": -779.205047318612, + "loss": 0.235, + "rewards/chosen": -0.15576020704334365, + "rewards/margins": 4.647867553209022, + "rewards/rejected": -4.803627760252366, + "step": 4600 + }, + { + "epoch": 2.407746663177179, + "eval_kl": 0.0, + "eval_logits/chosen": -1404625792.0, + "eval_logits/rejected": -1132328960.0, + "eval_logps/chosen": -488.93023255813955, + "eval_logps/rejected": -651.3615116857285, + "eval_loss": 0.455929696559906, + "eval_rewards/chosen": -1.5702003958436417, + "eval_rewards/margins": 1.9910875206158312, + "eval_rewards/rejected": -3.561287916459473, + "eval_runtime": 93.4284, + "eval_samples_per_second": 42.814, + "eval_steps_per_second": 0.674, + "step": 4600 + }, + { + "epoch": 2.412980895053651, + "grad_norm": 0.42584673850291915, + "kl": 0.0, + "learning_rate": 5.596774907369659e-06, + "logits/chosen": -692269888.0, + "logits/rejected": -605133184.0, + "logps/chosen": -334.1758957654723, + "logps/rejected": -871.7837837837837, + "loss": 0.2151, + "rewards/chosen": -0.0889149022801303, + "rewards/margins": 5.420094106728879, + "rewards/rejected": -5.509009009009009, + "step": 4610 + }, + { + "epoch": 2.418215126930123, + "grad_norm": 0.5818582065995188, + "kl": 0.0, + "learning_rate": 5.501100676595761e-06, + "logits/chosen": -740714112.0, + "logits/rejected": -470758208.0, + "logps/chosen": -391.3394495412844, + "logps/rejected": -817.891373801917, + "loss": 0.2548, + "rewards/chosen": -0.3944954128440367, + "rewards/margins": 5.256463053609638, + "rewards/rejected": -5.650958466453674, + "step": 4620 + }, + { + "epoch": 2.4234493588065953, + "grad_norm": 1.3819770535923588, + "kl": 0.0, + "learning_rate": 5.406150071944604e-06, + "logits/chosen": -700763328.0, + "logits/rejected": -526280288.0, + "logps/chosen": -326.3647798742138, + "logps/rejected": -896.5962732919255, + "loss": 0.2412, + "rewards/chosen": 0.03156937893081761, + "rewards/margins": 6.1480290062600105, + "rewards/rejected": -6.116459627329193, + "step": 4630 + }, + { + "epoch": 2.4286835906830673, + "grad_norm": 0.4913752890099384, + "kl": 0.0, + "learning_rate": 5.311926617140122e-06, + "logits/chosen": -603350656.0, + "logits/rejected": -634807936.0, + "logps/chosen": -343.64963503649636, + "logps/rejected": -894.4262295081967, + "loss": 0.2068, + "rewards/chosen": -0.08704949817518248, + "rewards/margins": 5.797513343354872, + "rewards/rejected": -5.884562841530054, + "step": 4640 + }, + { + "epoch": 2.4339178225595393, + "grad_norm": 0.721220360194313, + "kl": 0.0, + "learning_rate": 5.218433808920884e-06, + "logits/chosen": -737778048.0, + "logits/rejected": -527958016.0, + "logps/chosen": -374.52037617554856, + "logps/rejected": -881.8442367601247, + "loss": 0.2279, + "rewards/chosen": -0.04217280564263323, + "rewards/margins": 5.678232178781043, + "rewards/rejected": -5.720404984423676, + "step": 4650 + }, + { + "epoch": 2.4391520544360117, + "grad_norm": 0.45581445352448063, + "kl": 0.0, + "learning_rate": 5.125675116910325e-06, + "logits/chosen": -658925184.0, + "logits/rejected": -453980992.0, + "logps/chosen": -415.0886075949367, + "logps/rejected": -960.395061728395, + "loss": 0.2374, + "rewards/chosen": -0.5506081882911392, + "rewards/margins": 6.295070824054539, + "rewards/rejected": -6.845679012345679, + "step": 4660 + }, + { + "epoch": 2.4443862863124837, + "grad_norm": 2.000847358239112, + "kl": 0.0, + "learning_rate": 5.033653983488029e-06, + "logits/chosen": -733111936.0, + "logits/rejected": -521614144.0, + "logps/chosen": -416.0955223880597, + "logps/rejected": -1016.4459016393442, + "loss": 0.2485, + "rewards/chosen": -0.7988339552238806, + "rewards/margins": 5.922477520185955, + "rewards/rejected": -6.721311475409836, + "step": 4670 + }, + { + "epoch": 2.4496205181889557, + "grad_norm": 1.0399985853719926, + "kl": 0.0, + "learning_rate": 4.942373823661927e-06, + "logits/chosen": -693528192.0, + "logits/rejected": -522977280.0, + "logps/chosen": -377.45, + "logps/rejected": -891.3, + "loss": 0.2406, + "rewards/chosen": -0.49594554901123045, + "rewards/margins": 5.42124195098877, + "rewards/rejected": -5.9171875, + "step": 4680 + }, + { + "epoch": 2.454854750065428, + "grad_norm": 0.5943719797905609, + "kl": 0.0, + "learning_rate": 4.85183802494159e-06, + "logits/chosen": -666370048.0, + "logits/rejected": -534931040.0, + "logps/chosen": -309.0658307210031, + "logps/rejected": -861.1090342679128, + "loss": 0.2172, + "rewards/chosen": 0.3622159090909091, + "rewards/margins": 6.108321828094025, + "rewards/rejected": -5.746105919003115, + "step": 4690 + }, + { + "epoch": 2.4600889819419, + "grad_norm": 0.9426707645421406, + "kl": 0.0, + "learning_rate": 4.762049947212521e-06, + "logits/chosen": -638687616.0, + "logits/rejected": -570530176.0, + "logps/chosen": -326.69182389937106, + "logps/rejected": -930.1863354037267, + "loss": 0.2246, + "rewards/chosen": 0.051788522012578615, + "rewards/margins": 6.245887900894566, + "rewards/rejected": -6.194099378881988, + "step": 4700 + }, + { + "epoch": 2.4600889819419, + "eval_kl": 0.0, + "eval_logits/chosen": -1378527872.0, + "eval_logits/rejected": -1094446976.0, + "eval_logps/chosen": -506.0781791192479, + "eval_logps/rejected": -691.9065141720537, + "eval_loss": 0.45228123664855957, + "eval_rewards/chosen": -1.7423305294408709, + "eval_rewards/margins": 2.2244770289877716, + "eval_rewards/rejected": -3.9668075584286426, + "eval_runtime": 93.451, + "eval_samples_per_second": 42.803, + "eval_steps_per_second": 0.674, + "step": 4700 + }, + { + "epoch": 2.465323213818372, + "grad_norm": 0.48273618157451387, + "kl": 0.0, + "learning_rate": 4.673012922611436e-06, + "logits/chosen": -747215232.0, + "logits/rejected": -566309696.0, + "logps/chosen": -337.8793650793651, + "logps/rejected": -835.3476923076923, + "loss": 0.2229, + "rewards/chosen": -0.011532738095238096, + "rewards/margins": 5.370774954212454, + "rewards/rejected": -5.382307692307692, + "step": 4710 + }, + { + "epoch": 2.470557445694844, + "grad_norm": 1.1484291534793738, + "kl": 0.0, + "learning_rate": 4.584730255402647e-06, + "logits/chosen": -764307072.0, + "logits/rejected": -553333568.0, + "logps/chosen": -317.72121212121215, + "logps/rejected": -895.174193548387, + "loss": 0.2187, + "rewards/chosen": 0.34360795454545456, + "rewards/margins": 6.343607954545455, + "rewards/rejected": -6.0, + "step": 4720 + }, + { + "epoch": 2.4757916775713165, + "grad_norm": 0.5790162958462677, + "kl": 0.0, + "learning_rate": 4.497205221855386e-06, + "logits/chosen": -617611264.0, + "logits/rejected": -612473216.0, + "logps/chosen": -287.1551155115512, + "logps/rejected": -787.1810089020771, + "loss": 0.2337, + "rewards/chosen": 0.18550175330033003, + "rewards/margins": 4.99336525478401, + "rewards/rejected": -4.80786350148368, + "step": 4730 + }, + { + "epoch": 2.4810259094477884, + "grad_norm": 0.5496024948748146, + "kl": 0.0, + "learning_rate": 4.41044107012227e-06, + "logits/chosen": -723465024.0, + "logits/rejected": -491048128.0, + "logps/chosen": -358.67069486404836, + "logps/rejected": -776.4919093851132, + "loss": 0.2634, + "rewards/chosen": -0.27346110271903323, + "rewards/margins": 4.758092295339219, + "rewards/rejected": -5.031553398058253, + "step": 4740 + }, + { + "epoch": 2.4862601413242604, + "grad_norm": 0.8519585979935906, + "kl": 0.0, + "learning_rate": 4.324441020118722e-06, + "logits/chosen": -624951296.0, + "logits/rejected": -460744288.0, + "logps/chosen": -340.8862275449102, + "logps/rejected": -869.0196078431372, + "loss": 0.263, + "rewards/chosen": -0.4779425523952096, + "rewards/margins": 5.2352927417224375, + "rewards/rejected": -5.713235294117647, + "step": 4750 + }, + { + "epoch": 2.491494373200733, + "grad_norm": 0.9981597518915051, + "kl": 0.0, + "learning_rate": 4.2392082634034825e-06, + "logits/chosen": -665531200.0, + "logits/rejected": -534039744.0, + "logps/chosen": -379.18954248366015, + "logps/rejected": -900.9820359281437, + "loss": 0.2408, + "rewards/chosen": -0.45895884395424835, + "rewards/margins": 5.738645946464913, + "rewards/rejected": -6.197604790419161, + "step": 4760 + }, + { + "epoch": 2.496728605077205, + "grad_norm": 0.784519279781345, + "kl": 0.0, + "learning_rate": 4.154745963060197e-06, + "logits/chosen": -680420992.0, + "logits/rejected": -558261888.0, + "logps/chosen": -400.24767801857587, + "logps/rejected": -837.9558359621451, + "loss": 0.24, + "rewards/chosen": -0.4515770123839009, + "rewards/margins": 5.036593334619253, + "rewards/rejected": -5.488170347003154, + "step": 4770 + }, + { + "epoch": 2.501962836953677, + "grad_norm": 0.9234334650905448, + "kl": 0.0, + "learning_rate": 4.071057253579979e-06, + "logits/chosen": -700763328.0, + "logits/rejected": -511705088.0, + "logps/chosen": -306.5619335347432, + "logps/rejected": -808.5954692556634, + "loss": 0.236, + "rewards/chosen": 0.07057212990936555, + "rewards/margins": 5.465394136381858, + "rewards/rejected": -5.394822006472492, + "step": 4780 + }, + { + "epoch": 2.5071970688301493, + "grad_norm": 0.6296245262199346, + "kl": 0.0, + "learning_rate": 3.988145240745136e-06, + "logits/chosen": -675597504.0, + "logits/rejected": -546937216.0, + "logps/chosen": -355.64307692307693, + "logps/rejected": -811.479365079365, + "loss": 0.2441, + "rewards/chosen": -0.27162259615384615, + "rewards/margins": 4.9458377213064715, + "rewards/rejected": -5.217460317460318, + "step": 4790 + }, + { + "epoch": 2.5124313007066212, + "grad_norm": 0.6754890512905317, + "kl": 0.0, + "learning_rate": 3.906013001513886e-06, + "logits/chosen": -699400192.0, + "logits/rejected": -578394496.0, + "logps/chosen": -377.0769230769231, + "logps/rejected": -790.2439024390244, + "loss": 0.24, + "rewards/chosen": -0.38168569711538464, + "rewards/margins": 4.640417961421201, + "rewards/rejected": -5.022103658536586, + "step": 4800 + }, + { + "epoch": 2.5124313007066212, + "eval_kl": 0.0, + "eval_logits/chosen": -1400231808.0, + "eval_logits/rejected": -1111490560.0, + "eval_logps/chosen": -509.52993567540824, + "eval_logps/rejected": -701.2312282446544, + "eval_loss": 0.45185938477516174, + "eval_rewards/chosen": -1.7776472043542801, + "eval_rewards/margins": 2.2837650283657593, + "eval_rewards/rejected": -4.0614122327200395, + "eval_runtime": 93.4505, + "eval_samples_per_second": 42.803, + "eval_steps_per_second": 0.674, + "step": 4800 + }, + { + "epoch": 2.5176655325830932, + "grad_norm": 0.800808631316141, + "kl": 0.0, + "learning_rate": 3.824663583906144e-06, + "logits/chosen": -750675584.0, + "logits/rejected": -554801536.0, + "logps/chosen": -339.219512195122, + "logps/rejected": -819.3846153846154, + "loss": 0.2421, + "rewards/chosen": -0.08836699695121951, + "rewards/margins": 5.197690695356473, + "rewards/rejected": -5.2860576923076925, + "step": 4810 + }, + { + "epoch": 2.5228997644595657, + "grad_norm": 0.8762123307256493, + "kl": 0.0, + "learning_rate": 3.744100006890461e-06, + "logits/chosen": -683776384.0, + "logits/rejected": -558156992.0, + "logps/chosen": -339.73006134969324, + "logps/rejected": -753.8343949044586, + "loss": 0.2553, + "rewards/chosen": -0.3314081671779141, + "rewards/margins": 4.196458074860303, + "rewards/rejected": -4.5278662420382165, + "step": 4820 + }, + { + "epoch": 2.5281339963360376, + "grad_norm": 0.8555920028342421, + "kl": 0.0, + "learning_rate": 3.664325260271953e-06, + "logits/chosen": -740294656.0, + "logits/rejected": -543162368.0, + "logps/chosen": -314.2153846153846, + "logps/rejected": -803.1492063492063, + "loss": 0.246, + "rewards/chosen": 0.09826923076923078, + "rewards/margins": 5.433189865689865, + "rewards/rejected": -5.334920634920635, + "step": 4830 + }, + { + "epoch": 2.5333682282125096, + "grad_norm": 0.752604945442186, + "kl": 0.0, + "learning_rate": 3.5853423045813377e-06, + "logits/chosen": -632134016.0, + "logits/rejected": -568537920.0, + "logps/chosen": -365.74426229508197, + "logps/rejected": -893.3253731343284, + "loss": 0.2529, + "rewards/chosen": -0.5505635245901639, + "rewards/margins": 5.412123042574016, + "rewards/rejected": -5.962686567164179, + "step": 4840 + }, + { + "epoch": 2.538602460088982, + "grad_norm": 0.8106638168857512, + "kl": 0.0, + "learning_rate": 3.507154070965099e-06, + "logits/chosen": -654730880.0, + "logits/rejected": -509398208.0, + "logps/chosen": -379.9388379204893, + "logps/rejected": -792.2300319488818, + "loss": 0.2938, + "rewards/chosen": -0.660431001529052, + "rewards/margins": 4.429025867480533, + "rewards/rejected": -5.0894568690095845, + "step": 4850 + }, + { + "epoch": 2.543836691965454, + "grad_norm": 0.5194989581627086, + "kl": 0.0, + "learning_rate": 3.4297634610766765e-06, + "logits/chosen": -698771072.0, + "logits/rejected": -520355840.0, + "logps/chosen": -403.9384615384615, + "logps/rejected": -867.3523809523809, + "loss": 0.2571, + "rewards/chosen": -0.6559375, + "rewards/margins": 5.1821577380952375, + "rewards/rejected": -5.838095238095238, + "step": 4860 + }, + { + "epoch": 2.549070923841926, + "grad_norm": 2.654714248081575, + "kl": 0.0, + "learning_rate": 3.3531733469687855e-06, + "logits/chosen": -668886656.0, + "logits/rejected": -503578624.0, + "logps/chosen": -348.48902821316614, + "logps/rejected": -929.993769470405, + "loss": 0.2305, + "rewards/chosen": -0.058752938871473356, + "rewards/margins": 6.480187871097375, + "rewards/rejected": -6.538940809968848, + "step": 4870 + }, + { + "epoch": 2.5543051557183984, + "grad_norm": 0.4859429710310574, + "kl": 0.0, + "learning_rate": 3.277386570986868e-06, + "logits/chosen": -697407872.0, + "logits/rejected": -608803200.0, + "logps/chosen": -364.1038961038961, + "logps/rejected": -910.843373493976, + "loss": 0.2223, + "rewards/chosen": -0.05125050730519481, + "rewards/margins": 6.076761540887576, + "rewards/rejected": -6.128012048192771, + "step": 4880 + }, + { + "epoch": 2.5595393875948704, + "grad_norm": 0.4463890469673047, + "kl": 0.0, + "learning_rate": 3.2024059456635558e-06, + "logits/chosen": -721420288.0, + "logits/rejected": -572207936.0, + "logps/chosen": -330.7951807228916, + "logps/rejected": -956.7792207792207, + "loss": 0.251, + "rewards/chosen": -0.08921427899096386, + "rewards/margins": 6.315006500229815, + "rewards/rejected": -6.404220779220779, + "step": 4890 + }, + { + "epoch": 2.5647736194713424, + "grad_norm": 0.6907246619558275, + "kl": 0.0, + "learning_rate": 3.128234253614343e-06, + "logits/chosen": -705377088.0, + "logits/rejected": -538758336.0, + "logps/chosen": -374.12345679012344, + "logps/rejected": -850.632911392405, + "loss": 0.2557, + "rewards/chosen": -0.3543836805555556, + "rewards/margins": 5.324413787798875, + "rewards/rejected": -5.67879746835443, + "step": 4900 + }, + { + "epoch": 2.5647736194713424, + "eval_kl": 0.0, + "eval_logits/chosen": -1397235840.0, + "eval_logits/rejected": -1123008256.0, + "eval_logps/chosen": -528.5304304799604, + "eval_logps/rejected": -749.2869219293883, + "eval_loss": 0.4514218866825104, + "eval_rewards/chosen": -1.962333003463632, + "eval_rewards/margins": 2.5779454649600373, + "eval_rewards/rejected": -4.5402784684236694, + "eval_runtime": 93.432, + "eval_samples_per_second": 42.812, + "eval_steps_per_second": 0.674, + "step": 4900 + }, + { + "epoch": 2.570007851347815, + "grad_norm": 0.7318646432841975, + "kl": 0.0, + "learning_rate": 3.054874247434278e-06, + "logits/chosen": -659659136.0, + "logits/rejected": -554539392.0, + "logps/chosen": -352.96894409937886, + "logps/rejected": -883.4213836477987, + "loss": 0.2456, + "rewards/chosen": -0.4120305221273292, + "rewards/margins": 5.228692748312922, + "rewards/rejected": -5.640723270440252, + "step": 4910 + }, + { + "epoch": 2.575242083224287, + "grad_norm": 1.2433568467442586, + "kl": 0.0, + "learning_rate": 2.9823286495958558e-06, + "logits/chosen": -699819648.0, + "logits/rejected": -545364352.0, + "logps/chosen": -384.69565217391306, + "logps/rejected": -883.4252199413489, + "loss": 0.2342, + "rewards/chosen": -0.39778428093645485, + "rewards/margins": 5.525236246922782, + "rewards/rejected": -5.923020527859237, + "step": 4920 + }, + { + "epoch": 2.580476315100759, + "grad_norm": 0.4162299263993999, + "kl": 0.0, + "learning_rate": 2.9106001523479364e-06, + "logits/chosen": -648124800.0, + "logits/rejected": -592655168.0, + "logps/chosen": -363.44303797468353, + "logps/rejected": -929.1851851851852, + "loss": 0.2375, + "rewards/chosen": -0.4677734375, + "rewards/margins": 5.842411747685185, + "rewards/rejected": -6.310185185185185, + "step": 4930 + }, + { + "epoch": 2.5857105469772312, + "grad_norm": 1.2374037693342654, + "kl": 0.0, + "learning_rate": 2.8396914176158694e-06, + "logits/chosen": -693790336.0, + "logits/rejected": -532047456.0, + "logps/chosen": -407.1446540880503, + "logps/rejected": -952.4472049689441, + "loss": 0.2642, + "rewards/chosen": -0.6344462460691824, + "rewards/margins": 5.9152431949246065, + "rewards/rejected": -6.549689440993789, + "step": 4940 + }, + { + "epoch": 2.5909447788537032, + "grad_norm": 0.5581455511945357, + "kl": 0.0, + "learning_rate": 2.7696050769026956e-06, + "logits/chosen": -608383808.0, + "logits/rejected": -527171584.0, + "logps/chosen": -404.16720257234726, + "logps/rejected": -1129.240121580547, + "loss": 0.2551, + "rewards/chosen": -0.9647558279742765, + "rewards/margins": 7.3635116492293715, + "rewards/rejected": -8.328267477203648, + "step": 4950 + }, + { + "epoch": 2.596179010730175, + "grad_norm": 0.4543139629422976, + "kl": 0.0, + "learning_rate": 2.7003437311914766e-06, + "logits/chosen": -705377088.0, + "logits/rejected": -622854144.0, + "logps/chosen": -373.4177215189873, + "logps/rejected": -959.4074074074074, + "loss": 0.243, + "rewards/chosen": -0.6134295886075949, + "rewards/margins": 6.047064238552899, + "rewards/rejected": -6.660493827160494, + "step": 4960 + }, + { + "epoch": 2.6014132426066476, + "grad_norm": 0.5615407102756935, + "kl": 0.0, + "learning_rate": 2.631909950848793e-06, + "logits/chosen": -709990784.0, + "logits/rejected": -555535552.0, + "logps/chosen": -470.8430769230769, + "logps/rejected": -1022.984126984127, + "loss": 0.2512, + "rewards/chosen": -1.1778846153846154, + "rewards/margins": 5.991956654456654, + "rewards/rejected": -7.16984126984127, + "step": 4970 + }, + { + "epoch": 2.6066474744831196, + "grad_norm": 0.4780257246681461, + "kl": 0.0, + "learning_rate": 2.5643062755293407e-06, + "logits/chosen": -697512768.0, + "logits/rejected": -619813248.0, + "logps/chosen": -387.7281553398058, + "logps/rejected": -971.7945619335347, + "loss": 0.2401, + "rewards/chosen": -0.6754778519417476, + "rewards/margins": 5.8902321178467725, + "rewards/rejected": -6.56570996978852, + "step": 4980 + }, + { + "epoch": 2.6118817063595916, + "grad_norm": 0.699923487248618, + "kl": 0.0, + "learning_rate": 2.4975352140816615e-06, + "logits/chosen": -778253120.0, + "logits/rejected": -530894016.0, + "logps/chosen": -379.28834355828224, + "logps/rejected": -936.968152866242, + "loss": 0.2341, + "rewards/chosen": -0.31894651073619634, + "rewards/margins": 6.252709540219218, + "rewards/rejected": -6.571656050955414, + "step": 4990 + }, + { + "epoch": 2.617115938236064, + "grad_norm": 0.5015730941076766, + "kl": 0.0, + "learning_rate": 2.4315992444550824e-06, + "logits/chosen": -789577728.0, + "logits/rejected": -634598208.0, + "logps/chosen": -394.1366459627329, + "logps/rejected": -975.2955974842768, + "loss": 0.2413, + "rewards/chosen": -0.46241750776397517, + "rewards/margins": 6.148431548839798, + "rewards/rejected": -6.610849056603773, + "step": 5000 + }, + { + "epoch": 2.617115938236064, + "eval_kl": 0.0, + "eval_logits/chosen": -1477659904.0, + "eval_logits/rejected": -1194644352.0, + "eval_logps/chosen": -527.0420583869371, + "eval_logps/rejected": -771.0870213823969, + "eval_loss": 0.45064061880111694, + "eval_rewards/chosen": -1.9520658090054428, + "eval_rewards/margins": 2.808749705663876, + "eval_rewards/rejected": -4.760815514669319, + "eval_runtime": 93.4433, + "eval_samples_per_second": 42.807, + "eval_steps_per_second": 0.674, + "step": 5000 + }, + { + "epoch": 2.622350170112536, + "grad_norm": 0.9405868473590856, + "kl": 0.0, + "learning_rate": 2.3665008136077334e-06, + "logits/chosen": -703489664.0, + "logits/rejected": -601148608.0, + "logps/chosen": -385.5114006514658, + "logps/rejected": -1009.4894894894895, + "loss": 0.2443, + "rewards/chosen": -0.36499898208469056, + "rewards/margins": 6.6905565734708645, + "rewards/rejected": -7.055555555555555, + "step": 5010 + }, + { + "epoch": 2.627584401989008, + "grad_norm": 1.0482270197343795, + "kl": 0.0, + "learning_rate": 2.3022423374157135e-06, + "logits/chosen": -744803520.0, + "logits/rejected": -540855488.0, + "logps/chosen": -420.9079754601227, + "logps/rejected": -787.2611464968153, + "loss": 0.2767, + "rewards/chosen": -0.9802051380368099, + "rewards/margins": 4.1161324415810245, + "rewards/rejected": -5.096337579617835, + "step": 5020 + }, + { + "epoch": 2.6328186338654804, + "grad_norm": 0.4804923751480013, + "kl": 0.0, + "learning_rate": 2.2388262005834852e-06, + "logits/chosen": -752458112.0, + "logits/rejected": -549768384.0, + "logps/chosen": -353.7177914110429, + "logps/rejected": -899.8726114649681, + "loss": 0.2598, + "rewards/chosen": -0.32867618865030673, + "rewards/margins": 5.797916168037592, + "rewards/rejected": -6.126592356687898, + "step": 5030 + }, + { + "epoch": 2.6380528657419524, + "grad_norm": 0.4130460548051139, + "kl": 0.0, + "learning_rate": 2.1762547565553292e-06, + "logits/chosen": -732010880.0, + "logits/rejected": -620547264.0, + "logps/chosen": -420.453074433657, + "logps/rejected": -821.558912386707, + "loss": 0.2426, + "rewards/chosen": -0.8308555825242718, + "rewards/margins": 4.620806048895668, + "rewards/rejected": -5.45166163141994, + "step": 5040 + }, + { + "epoch": 2.6432870976184244, + "grad_norm": 2.0506539104559693, + "kl": 0.0, + "learning_rate": 2.114530327428027e-06, + "logits/chosen": -746795840.0, + "logits/rejected": -582588800.0, + "logps/chosen": -391.55828220858893, + "logps/rejected": -943.4904458598726, + "loss": 0.2477, + "rewards/chosen": -0.5600076687116564, + "rewards/margins": 5.672476407721464, + "rewards/rejected": -6.232484076433121, + "step": 5050 + }, + { + "epoch": 2.648521329494897, + "grad_norm": 0.8035699204629465, + "kl": 0.0, + "learning_rate": 2.0536552038646963e-06, + "logits/chosen": -715443392.0, + "logits/rejected": -582798528.0, + "logps/chosen": -408.33656957928804, + "logps/rejected": -902.3806646525679, + "loss": 0.2451, + "rewards/chosen": -0.5545610841423948, + "rewards/margins": 5.68788604576697, + "rewards/rejected": -6.242447129909365, + "step": 5060 + }, + { + "epoch": 2.653755561371369, + "grad_norm": 0.4585898319323983, + "kl": 0.0, + "learning_rate": 1.993631645009747e-06, + "logits/chosen": -766089600.0, + "logits/rejected": -619603584.0, + "logps/chosen": -348.60681114551085, + "logps/rejected": -976.0504731861199, + "loss": 0.2299, + "rewards/chosen": -0.15412151702786378, + "rewards/margins": 6.421588262151947, + "rewards/rejected": -6.575709779179811, + "step": 5070 + }, + { + "epoch": 2.658989793247841, + "grad_norm": 0.5318634490809688, + "kl": 0.0, + "learning_rate": 1.9344618784050887e-06, + "logits/chosen": -704433344.0, + "logits/rejected": -550292672.0, + "logps/chosen": -412.3154574132492, + "logps/rejected": -909.4736842105264, + "loss": 0.2276, + "rewards/chosen": -0.5821205885252366, + "rewards/margins": 5.635371671536683, + "rewards/rejected": -6.21749226006192, + "step": 5080 + }, + { + "epoch": 2.6642240251243132, + "grad_norm": 0.41118095505831503, + "kl": 0.0, + "learning_rate": 1.8761480999074126e-06, + "logits/chosen": -799119744.0, + "logits/rejected": -529740608.0, + "logps/chosen": -380.5679758308157, + "logps/rejected": -929.7605177993528, + "loss": 0.2336, + "rewards/chosen": -0.2652709592145015, + "rewards/margins": 6.10123389515443, + "rewards/rejected": -6.366504854368932, + "step": 5090 + }, + { + "epoch": 2.669458257000785, + "grad_norm": 0.5504088625457714, + "kl": 0.0, + "learning_rate": 1.8186924736067479e-06, + "logits/chosen": -722259136.0, + "logits/rejected": -525179296.0, + "logps/chosen": -355.1111111111111, + "logps/rejected": -964.9620253164557, + "loss": 0.2273, + "rewards/chosen": -0.06327160493827161, + "rewards/margins": 6.755557508985779, + "rewards/rejected": -6.818829113924051, + "step": 5100 + }, + { + "epoch": 2.669458257000785, + "eval_kl": 0.0, + "eval_logits/chosen": -1420737280.0, + "eval_logits/rejected": -1144246016.0, + "eval_logps/chosen": -524.350321622959, + "eval_logps/rejected": -753.7105917454003, + "eval_loss": 0.4495312571525574, + "eval_rewards/chosen": -1.927078179119248, + "eval_rewards/margins": 2.660937733362104, + "eval_rewards/rejected": -4.588015912481352, + "eval_runtime": 93.4354, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 5100 + }, + { + "epoch": 2.674692488877257, + "grad_norm": 0.4037617876587109, + "kl": 0.0, + "learning_rate": 1.7620971317461182e-06, + "logits/chosen": -730018624.0, + "logits/rejected": -624112448.0, + "logps/chosen": -335.34591194968556, + "logps/rejected": -958.6086956521739, + "loss": 0.2173, + "rewards/chosen": -0.051088345125786166, + "rewards/margins": 6.337886810153718, + "rewards/rejected": -6.388975155279503, + "step": 5110 + }, + { + "epoch": 2.6799267207537296, + "grad_norm": 0.47909127799613394, + "kl": 0.0, + "learning_rate": 1.7063641746424164e-06, + "logits/chosen": -762839040.0, + "logits/rejected": -514326528.0, + "logps/chosen": -370.7951807228916, + "logps/rejected": -817.038961038961, + "loss": 0.2518, + "rewards/chosen": -0.3368552334337349, + "rewards/margins": 5.094962948384447, + "rewards/rejected": -5.431818181818182, + "step": 5120 + }, + { + "epoch": 2.6851609526302016, + "grad_norm": 0.651180944659031, + "kl": 0.0, + "learning_rate": 1.6514956706084883e-06, + "logits/chosen": -767872192.0, + "logits/rejected": -499856192.0, + "logps/chosen": -337.3953488372093, + "logps/rejected": -988.2162162162163, + "loss": 0.2433, + "rewards/chosen": 0.0436500726744186, + "rewards/margins": 6.9997311537555, + "rewards/rejected": -6.956081081081081, + "step": 5130 + }, + { + "epoch": 2.6903951845066736, + "grad_norm": 1.0432660455153437, + "kl": 0.0, + "learning_rate": 1.5974936558763226e-06, + "logits/chosen": -713975424.0, + "logits/rejected": -539911808.0, + "logps/chosen": -363.3482428115016, + "logps/rejected": -944.7339449541284, + "loss": 0.2372, + "rewards/chosen": -0.45554612619808305, + "rewards/margins": 6.005463048113843, + "rewards/rejected": -6.4610091743119265, + "step": 5140 + }, + { + "epoch": 2.695629416383146, + "grad_norm": 0.8243520837524749, + "kl": 0.0, + "learning_rate": 1.5443601345215358e-06, + "logits/chosen": -716282240.0, + "logits/rejected": -531261024.0, + "logps/chosen": -353.57055214723925, + "logps/rejected": -889.5796178343949, + "loss": 0.2588, + "rewards/chosen": -0.29694689417177916, + "rewards/margins": 5.6075117045543355, + "rewards/rejected": -5.904458598726115, + "step": 5150 + }, + { + "epoch": 2.700863648259618, + "grad_norm": 0.6082955016572004, + "kl": 0.0, + "learning_rate": 1.4920970783889737e-06, + "logits/chosen": -789368000.0, + "logits/rejected": -548405248.0, + "logps/chosen": -392.90909090909093, + "logps/rejected": -871.3333333333334, + "loss": 0.2667, + "rewards/chosen": -0.6789994673295454, + "rewards/margins": 4.7966949771149, + "rewards/rejected": -5.475694444444445, + "step": 5160 + }, + { + "epoch": 2.70609788013609, + "grad_norm": 1.3862654743582614, + "kl": 0.0, + "learning_rate": 1.4407064270195253e-06, + "logits/chosen": -744279232.0, + "logits/rejected": -584476288.0, + "logps/chosen": -407.30546623794214, + "logps/rejected": -871.2948328267477, + "loss": 0.2287, + "rewards/chosen": -0.5827220659163987, + "rewards/margins": 5.0753326453298016, + "rewards/rejected": -5.6580547112462005, + "step": 5170 + }, + { + "epoch": 2.7113321120125624, + "grad_norm": 0.47285633180777387, + "kl": 0.0, + "learning_rate": 1.390190087578161e-06, + "logits/chosen": -686817280.0, + "logits/rejected": -630194176.0, + "logps/chosen": -394.38961038961037, + "logps/rejected": -863.6144578313254, + "loss": 0.2427, + "rewards/chosen": -0.6700994318181818, + "rewards/margins": 5.0279427368565175, + "rewards/rejected": -5.698042168674699, + "step": 5180 + }, + { + "epoch": 2.7165663438890344, + "grad_norm": 0.6246600302967272, + "kl": 0.0, + "learning_rate": 1.3405499347831641e-06, + "logits/chosen": -671298368.0, + "logits/rejected": -661966016.0, + "logps/chosen": -415.588424437299, + "logps/rejected": -817.4103343465046, + "loss": 0.2685, + "rewards/chosen": -0.8964529742765274, + "rewards/margins": 4.246404168580616, + "rewards/rejected": -5.142857142857143, + "step": 5190 + }, + { + "epoch": 2.7218005757655064, + "grad_norm": 0.6070359162128525, + "kl": 0.0, + "learning_rate": 1.2917878108365229e-06, + "logits/chosen": -691535872.0, + "logits/rejected": -587412288.0, + "logps/chosen": -435.1392405063291, + "logps/rejected": -877.0370370370371, + "loss": 0.2645, + "rewards/chosen": -0.9455102848101266, + "rewards/margins": 5.006650209017034, + "rewards/rejected": -5.952160493827161, + "step": 5200 + }, + { + "epoch": 2.7218005757655064, + "eval_kl": 0.0, + "eval_logits/chosen": -1426662528.0, + "eval_logits/rejected": -1147708032.0, + "eval_logps/chosen": -529.068777832756, + "eval_logps/rejected": -767.9681750372948, + "eval_loss": 0.450070321559906, + "eval_rewards/chosen": -1.9729713013359722, + "eval_rewards/margins": 2.7549004042831227, + "eval_rewards/rejected": -4.727871705619095, + "eval_runtime": 93.4352, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 5200 + }, + { + "epoch": 2.727034807641979, + "grad_norm": 1.0037145493476631, + "kl": 0.0, + "learning_rate": 1.2439055253556015e-06, + "logits/chosen": -641309056.0, + "logits/rejected": -601987456.0, + "logps/chosen": -424.1898305084746, + "logps/rejected": -1009.7159420289855, + "loss": 0.2554, + "rewards/chosen": -0.9086864406779661, + "rewards/margins": 6.078270081061164, + "rewards/rejected": -6.98695652173913, + "step": 5210 + }, + { + "epoch": 2.732269039518451, + "grad_norm": 0.6312057167352639, + "kl": 0.0, + "learning_rate": 1.1969048553059609e-06, + "logits/chosen": -784544576.0, + "logits/rejected": -514588672.0, + "logps/chosen": -355.0320699708455, + "logps/rejected": -1084.7676767676767, + "loss": 0.2584, + "rewards/chosen": -0.23856596209912537, + "rewards/margins": 7.694935721402558, + "rewards/rejected": -7.933501683501683, + "step": 5220 + }, + { + "epoch": 2.7375032713949228, + "grad_norm": 0.4977395450990626, + "kl": 0.0, + "learning_rate": 1.1507875449354166e-06, + "logits/chosen": -731591488.0, + "logits/rejected": -560411456.0, + "logps/chosen": -418.01846153846157, + "logps/rejected": -895.3904761904762, + "loss": 0.2805, + "rewards/chosen": -0.9154807692307693, + "rewards/margins": 5.0964239926739925, + "rewards/rejected": -6.011904761904762, + "step": 5230 + }, + { + "epoch": 2.7427375032713948, + "grad_norm": 0.5728940069453299, + "kl": 0.0, + "learning_rate": 1.1055553057093215e-06, + "logits/chosen": -733688640.0, + "logits/rejected": -545364352.0, + "logps/chosen": -341.25373134328356, + "logps/rejected": -818.7803278688525, + "loss": 0.2671, + "rewards/chosen": -0.31609141791044776, + "rewards/margins": 5.065875795204306, + "rewards/rejected": -5.381967213114754, + "step": 5240 + }, + { + "epoch": 2.747971735147867, + "grad_norm": 0.4250248012002315, + "kl": 0.0, + "learning_rate": 1.06120981624703e-06, + "logits/chosen": -722993152.0, + "logits/rejected": -619341440.0, + "logps/chosen": -366.1237785016287, + "logps/rejected": -946.9309309309309, + "loss": 0.2349, + "rewards/chosen": -0.4523082820134365, + "rewards/margins": 5.939583609878456, + "rewards/rejected": -6.391891891891892, + "step": 5250 + }, + { + "epoch": 2.753205967024339, + "grad_norm": 0.5454111392127353, + "kl": 0.0, + "learning_rate": 1.017752722259624e-06, + "logits/chosen": -730018624.0, + "logits/rejected": -651899712.0, + "logps/chosen": -406.984126984127, + "logps/rejected": -901.5138461538462, + "loss": 0.2371, + "rewards/chosen": -0.650719246031746, + "rewards/margins": 5.203126907814408, + "rewards/rejected": -5.8538461538461535, + "step": 5260 + }, + { + "epoch": 2.758440198900811, + "grad_norm": 0.9196034243834265, + "kl": 0.0, + "learning_rate": 9.751856364888178e-07, + "logits/chosen": -695363200.0, + "logits/rejected": -569376768.0, + "logps/chosen": -404.28205128205127, + "logps/rejected": -996.8780487804878, + "loss": 0.2357, + "rewards/chosen": -0.558769030448718, + "rewards/margins": 6.532694384185429, + "rewards/rejected": -7.091463414634147, + "step": 5270 + }, + { + "epoch": 2.7636744307772836, + "grad_norm": 0.6213552424976277, + "kl": 0.0, + "learning_rate": 9.335101386471285e-07, + "logits/chosen": -686607552.0, + "logits/rejected": -656828032.0, + "logps/chosen": -461.7721518987342, + "logps/rejected": -922.5679012345679, + "loss": 0.2452, + "rewards/chosen": -1.2786787974683544, + "rewards/margins": 4.898018733395843, + "rewards/rejected": -6.176697530864198, + "step": 5280 + }, + { + "epoch": 2.7689086626537556, + "grad_norm": 0.6679904514162947, + "kl": 0.0, + "learning_rate": 8.927277753592339e-07, + "logits/chosen": -717435712.0, + "logits/rejected": -489842272.0, + "logps/chosen": -387.56656346749224, + "logps/rejected": -1071.6466876971608, + "loss": 0.2605, + "rewards/chosen": -0.5881990131578947, + "rewards/margins": 7.4181101351070895, + "rewards/rejected": -8.006309148264984, + "step": 5290 + }, + { + "epoch": 2.7741428945302276, + "grad_norm": 0.5523711430994795, + "kl": 0.0, + "learning_rate": 8.528400601045816e-07, + "logits/chosen": -682308416.0, + "logits/rejected": -570530176.0, + "logps/chosen": -432.96676737160124, + "logps/rejected": -1099.495145631068, + "loss": 0.2637, + "rewards/chosen": -1.1677799282477341, + "rewards/margins": 7.145326867868771, + "rewards/rejected": -8.313106796116505, + "step": 5300 + }, + { + "epoch": 2.7741428945302276, + "eval_kl": 0.0, + "eval_logits/chosen": -1414479104.0, + "eval_logits/rejected": -1139319424.0, + "eval_logps/chosen": -551.0460168233548, + "eval_logps/rejected": -835.7553455992044, + "eval_loss": 0.4497031271457672, + "eval_rewards/chosen": -2.193159327065809, + "eval_rewards/margins": 3.214598007593564, + "eval_rewards/rejected": -5.407757334659373, + "eval_runtime": 93.4396, + "eval_samples_per_second": 42.808, + "eval_steps_per_second": 0.674, + "step": 5300 + }, + { + "epoch": 2.7793771264067, + "grad_norm": 0.47999789112467145, + "kl": 0.0, + "learning_rate": 8.138484731612272e-07, + "logits/chosen": -720686272.0, + "logits/rejected": -552914112.0, + "logps/chosen": -408.576802507837, + "logps/rejected": -1018.018691588785, + "loss": 0.2428, + "rewards/chosen": -0.6440536833855799, + "rewards/margins": 6.398002391380776, + "rewards/rejected": -7.042056074766355, + "step": 5310 + }, + { + "epoch": 2.784611358283172, + "grad_norm": 0.8354416613963258, + "kl": 0.0, + "learning_rate": 7.757544615508927e-07, + "logits/chosen": -685873536.0, + "logits/rejected": -544420672.0, + "logps/chosen": -448.6771159874608, + "logps/rejected": -1034.766355140187, + "loss": 0.2554, + "rewards/chosen": -1.1204692398119123, + "rewards/margins": 6.40445287856815, + "rewards/rejected": -7.524922118380062, + "step": 5320 + }, + { + "epoch": 2.789845590159644, + "grad_norm": 0.5173887570787987, + "kl": 0.0, + "learning_rate": 7.385594389852674e-07, + "logits/chosen": -755499008.0, + "logits/rejected": -526490016.0, + "logps/chosen": -370.35158501440924, + "logps/rejected": -963.6587030716723, + "loss": 0.2679, + "rewards/chosen": -0.6374279538904899, + "rewards/margins": 5.954295595597564, + "rewards/rejected": -6.591723549488054, + "step": 5330 + }, + { + "epoch": 2.795079822036116, + "grad_norm": 0.4805949533097288, + "kl": 0.0, + "learning_rate": 7.0226478581355e-07, + "logits/chosen": -667418624.0, + "logits/rejected": -534459200.0, + "logps/chosen": -417.87012987012986, + "logps/rejected": -965.2048192771084, + "loss": 0.2464, + "rewards/chosen": -0.8856026785714286, + "rewards/margins": 5.993162381669535, + "rewards/rejected": -6.878765060240964, + "step": 5340 + }, + { + "epoch": 2.8003140539125884, + "grad_norm": 0.4980762598857717, + "kl": 0.0, + "learning_rate": 6.668718489712039e-07, + "logits/chosen": -703594496.0, + "logits/rejected": -536399040.0, + "logps/chosen": -437.4019292604502, + "logps/rejected": -1035.5744680851064, + "loss": 0.2438, + "rewards/chosen": -0.9048118468649518, + "rewards/margins": 6.747923715445078, + "rewards/rejected": -7.65273556231003, + "step": 5350 + }, + { + "epoch": 2.8055482857890603, + "grad_norm": 0.6042457279492705, + "kl": 0.0, + "learning_rate": 6.323819419299992e-07, + "logits/chosen": -694471872.0, + "logits/rejected": -554591872.0, + "logps/chosen": -529.083870967742, + "logps/rejected": -1095.7575757575758, + "loss": 0.2427, + "rewards/chosen": -1.7447580645161291, + "rewards/margins": 6.24463587487781, + "rewards/rejected": -7.989393939393939, + "step": 5360 + }, + { + "epoch": 2.8107825176655323, + "grad_norm": 0.5542724861237701, + "kl": 0.0, + "learning_rate": 5.987963446492384e-07, + "logits/chosen": -647128704.0, + "logits/rejected": -598422336.0, + "logps/chosen": -427.97241379310344, + "logps/rejected": -950.8571428571429, + "loss": 0.2395, + "rewards/chosen": -0.9704067887931035, + "rewards/margins": 5.628164639778325, + "rewards/rejected": -6.598571428571429, + "step": 5370 + }, + { + "epoch": 2.8160167495420048, + "grad_norm": 0.5420236989736987, + "kl": 0.0, + "learning_rate": 5.661163035282802e-07, + "logits/chosen": -674129536.0, + "logits/rejected": -598212608.0, + "logps/chosen": -419.10828025477707, + "logps/rejected": -1006.4294478527607, + "loss": 0.2559, + "rewards/chosen": -0.90734474522293, + "rewards/margins": 5.825784089132899, + "rewards/rejected": -6.733128834355829, + "step": 5380 + }, + { + "epoch": 2.8212509814184767, + "grad_norm": 0.562792644335705, + "kl": 0.0, + "learning_rate": 5.343430313602738e-07, + "logits/chosen": -708522816.0, + "logits/rejected": -509660352.0, + "logps/chosen": -383.055900621118, + "logps/rejected": -904.8553459119497, + "loss": 0.2659, + "rewards/chosen": -0.43432162267080743, + "rewards/margins": 5.90608718236064, + "rewards/rejected": -6.340408805031447, + "step": 5390 + }, + { + "epoch": 2.8264852132949487, + "grad_norm": 0.5976247674593289, + "kl": 0.0, + "learning_rate": 5.034777072871394e-07, + "logits/chosen": -699505024.0, + "logits/rejected": -571159360.0, + "logps/chosen": -456.67973856209153, + "logps/rejected": -919.5688622754491, + "loss": 0.2683, + "rewards/chosen": -1.327777139501634, + "rewards/margins": 5.115336632953456, + "rewards/rejected": -6.4431137724550895, + "step": 5400 + }, + { + "epoch": 2.8264852132949487, + "eval_kl": 0.0, + "eval_logits/chosen": -1420271232.0, + "eval_logits/rejected": -1144445824.0, + "eval_logps/chosen": -552.9777337951509, + "eval_logps/rejected": -847.9124813525609, + "eval_loss": 0.4496484398841858, + "eval_rewards/chosen": -2.211652647204354, + "eval_rewards/margins": 3.318680520373965, + "eval_rewards/rejected": -5.530333167578319, + "eval_runtime": 93.4275, + "eval_samples_per_second": 42.814, + "eval_steps_per_second": 0.674, + "step": 5400 + }, + { + "epoch": 2.831719445171421, + "grad_norm": 0.8906081192471247, + "kl": 0.0, + "learning_rate": 4.735214767558338e-07, + "logits/chosen": -713451136.0, + "logits/rejected": -585105408.0, + "logps/chosen": -354.5263157894737, + "logps/rejected": -850.6845637583892, + "loss": 0.2582, + "rewards/chosen": -0.46142292580409355, + "rewards/margins": 5.177838819162349, + "rewards/rejected": -5.639261744966443, + "step": 5410 + }, + { + "epoch": 2.836953677047893, + "grad_norm": 0.4724582634761599, + "kl": 0.0, + "learning_rate": 4.444754514758231e-07, + "logits/chosen": -735785792.0, + "logits/rejected": -530736736.0, + "logps/chosen": -417.6190476190476, + "logps/rejected": -1088.421052631579, + "loss": 0.2558, + "rewards/chosen": -0.9201078869047619, + "rewards/margins": 6.978740797305765, + "rewards/rejected": -7.8988486842105265, + "step": 5420 + }, + { + "epoch": 2.842187908924365, + "grad_norm": 0.5395820316831023, + "kl": 0.0, + "learning_rate": 4.163407093778243e-07, + "logits/chosen": -717121152.0, + "logits/rejected": -543739072.0, + "logps/chosen": -373.1482649842271, + "logps/rejected": -935.9256965944272, + "loss": 0.2558, + "rewards/chosen": -0.5599122634069401, + "rewards/margins": 5.833276591082224, + "rewards/rejected": -6.393188854489164, + "step": 5430 + }, + { + "epoch": 2.8474221408008376, + "grad_norm": 0.7642717712653311, + "kl": 0.0, + "learning_rate": 3.891182945738259e-07, + "logits/chosen": -648124800.0, + "logits/rejected": -659030016.0, + "logps/chosen": -400.1025641025641, + "logps/rejected": -936.6829268292682, + "loss": 0.254, + "rewards/chosen": -0.9886067708333334, + "rewards/margins": 5.395539570630081, + "rewards/rejected": -6.384146341463414, + "step": 5440 + }, + { + "epoch": 2.8526563726773095, + "grad_norm": 0.4376405764138052, + "kl": 0.0, + "learning_rate": 3.628092173183023e-07, + "logits/chosen": -712612224.0, + "logits/rejected": -543109952.0, + "logps/chosen": -414.125, + "logps/rejected": -1204.4, + "loss": 0.2247, + "rewards/chosen": -0.70224609375, + "rewards/margins": 8.368847656249999, + "rewards/rejected": -9.07109375, + "step": 5450 + }, + { + "epoch": 2.8578906045537815, + "grad_norm": 0.41737288250225, + "kl": 0.0, + "learning_rate": 3.37414453970758e-07, + "logits/chosen": -698456448.0, + "logits/rejected": -527433728.0, + "logps/chosen": -375.7546012269939, + "logps/rejected": -1011.3630573248407, + "loss": 0.2532, + "rewards/chosen": -0.6709403757668712, + "rewards/margins": 6.528900388564339, + "rewards/rejected": -7.19984076433121, + "step": 5460 + }, + { + "epoch": 2.863124836430254, + "grad_norm": 0.623828872587112, + "kl": 0.0, + "learning_rate": 3.129349469594728e-07, + "logits/chosen": -694052480.0, + "logits/rejected": -591711424.0, + "logps/chosen": -393.32515337423314, + "logps/rejected": -972.8407643312102, + "loss": 0.2531, + "rewards/chosen": -0.6528230444785276, + "rewards/margins": 5.910871223037397, + "rewards/rejected": -6.563694267515924, + "step": 5470 + }, + { + "epoch": 2.868359068306726, + "grad_norm": 0.5135357902714759, + "kl": 0.0, + "learning_rate": 2.8937160474652725e-07, + "logits/chosen": -613049984.0, + "logits/rejected": -571473920.0, + "logps/chosen": -415.1111111111111, + "logps/rejected": -963.1616766467066, + "loss": 0.2537, + "rewards/chosen": -0.9870046977124183, + "rewards/margins": 5.6424863202516535, + "rewards/rejected": -6.629491017964072, + "step": 5480 + }, + { + "epoch": 2.873593300183198, + "grad_norm": 0.7684511654598384, + "kl": 0.0, + "learning_rate": 2.667253017941018e-07, + "logits/chosen": -783810560.0, + "logits/rejected": -508873920.0, + "logps/chosen": -450.54117647058825, + "logps/rejected": -1056.96, + "loss": 0.2651, + "rewards/chosen": -1.0138327205882354, + "rewards/margins": 6.628667279411765, + "rewards/rejected": -7.6425, + "step": 5490 + }, + { + "epoch": 2.8788275320596703, + "grad_norm": 0.6529707604480456, + "kl": 0.0, + "learning_rate": 2.449968785320139e-07, + "logits/chosen": -714289984.0, + "logits/rejected": -590715264.0, + "logps/chosen": -452.7840531561462, + "logps/rejected": -955.3746312684366, + "loss": 0.2551, + "rewards/chosen": -1.0597487541528239, + "rewards/margins": 5.417389888915023, + "rewards/rejected": -6.477138643067847, + "step": 5500 + }, + { + "epoch": 2.8788275320596703, + "eval_kl": 0.0, + "eval_logits/chosen": -1425663872.0, + "eval_logits/rejected": -1148307200.0, + "eval_logps/chosen": -551.4576942107867, + "eval_logps/rejected": -846.35305818001, + "eval_loss": 0.44935154914855957, + "eval_rewards/chosen": -2.1969322117763483, + "eval_rewards/margins": 3.312391507766168, + "eval_rewards/rejected": -5.509323719542516, + "eval_runtime": 93.4457, + "eval_samples_per_second": 42.806, + "eval_steps_per_second": 0.674, + "step": 5500 + }, + { + "epoch": 2.8840617639361423, + "grad_norm": 0.7418538204206191, + "kl": 0.0, + "learning_rate": 2.2418714132653173e-07, + "logits/chosen": -696569024.0, + "logits/rejected": -590453120.0, + "logps/chosen": -439.5974842767296, + "logps/rejected": -989.1180124223603, + "loss": 0.2709, + "rewards/chosen": -0.9837608097484277, + "rewards/margins": 5.791860308263995, + "rewards/rejected": -6.775621118012422, + "step": 5510 + }, + { + "epoch": 2.8892959958126143, + "grad_norm": 0.4173372869938946, + "kl": 0.0, + "learning_rate": 2.0429686245045098e-07, + "logits/chosen": -672556672.0, + "logits/rejected": -566545600.0, + "logps/chosen": -474.6842105263158, + "logps/rejected": -996.5714285714286, + "loss": 0.2616, + "rewards/chosen": -1.384534333881579, + "rewards/margins": 5.601328761356516, + "rewards/rejected": -6.985863095238095, + "step": 5520 + }, + { + "epoch": 2.8945302276890867, + "grad_norm": 1.1299589811683035, + "kl": 0.0, + "learning_rate": 1.853267800544234e-07, + "logits/chosen": -631767040.0, + "logits/rejected": -611581952.0, + "logps/chosen": -452.3076923076923, + "logps/rejected": -1065.3658536585365, + "loss": 0.2683, + "rewards/chosen": -1.2772435897435896, + "rewards/margins": 6.299738117573484, + "rewards/rejected": -7.576981707317073, + "step": 5530 + }, + { + "epoch": 2.8997644595655587, + "grad_norm": 0.8935381297411652, + "kl": 0.0, + "learning_rate": 1.6727759813958965e-07, + "logits/chosen": -684615296.0, + "logits/rejected": -543948800.0, + "logps/chosen": -414.6953846153846, + "logps/rejected": -994.1333333333333, + "loss": 0.2516, + "rewards/chosen": -1.030673076923077, + "rewards/margins": 5.983612637362637, + "rewards/rejected": -7.014285714285714, + "step": 5540 + }, + { + "epoch": 2.9049986914420307, + "grad_norm": 0.48082815632583886, + "kl": 0.0, + "learning_rate": 1.501499865314171e-07, + "logits/chosen": -717540544.0, + "logits/rejected": -618921984.0, + "logps/chosen": -390.4891640866873, + "logps/rejected": -1102.1324921135647, + "loss": 0.2538, + "rewards/chosen": -0.614454334365325, + "rewards/margins": 7.245955760271899, + "rewards/rejected": -7.860410094637224, + "step": 5550 + }, + { + "epoch": 2.910232923318503, + "grad_norm": 1.2162251721623474, + "kl": 0.0, + "learning_rate": 1.3394458085487505e-07, + "logits/chosen": -692374720.0, + "logits/rejected": -560516288.0, + "logps/chosen": -422.4935064935065, + "logps/rejected": -964.2409638554217, + "loss": 0.2427, + "rewards/chosen": -0.7687195616883117, + "rewards/margins": 5.697394896143013, + "rewards/rejected": -6.466114457831325, + "step": 5560 + }, + { + "epoch": 2.915467155194975, + "grad_norm": 0.977753969363026, + "kl": 0.0, + "learning_rate": 1.1866198251082594e-07, + "logits/chosen": -727606912.0, + "logits/rejected": -533830048.0, + "logps/chosen": -468.3987915407855, + "logps/rejected": -1069.7734627831715, + "loss": 0.2706, + "rewards/chosen": -1.368202416918429, + "rewards/margins": 6.212703731948885, + "rewards/rejected": -7.580906148867314, + "step": 5570 + }, + { + "epoch": 2.920701387071447, + "grad_norm": 0.48217186745357443, + "kl": 0.0, + "learning_rate": 1.0430275865371265e-07, + "logits/chosen": -637009920.0, + "logits/rejected": -540121472.0, + "logps/chosen": -410.9306930693069, + "logps/rejected": -1008.9970326409496, + "loss": 0.2447, + "rewards/chosen": -0.8706683168316832, + "rewards/margins": 6.483931089696507, + "rewards/rejected": -7.35459940652819, + "step": 5580 + }, + { + "epoch": 2.9259356189479195, + "grad_norm": 0.3671802124859091, + "kl": 0.0, + "learning_rate": 9.086744217050857e-08, + "logits/chosen": -659030016.0, + "logits/rejected": -551341248.0, + "logps/chosen": -413.5424836601307, + "logps/rejected": -1029.748502994012, + "loss": 0.2476, + "rewards/chosen": -0.7669015522875817, + "rewards/margins": 6.527260124359125, + "rewards/rejected": -7.294161676646707, + "step": 5590 + }, + { + "epoch": 2.9311698508243915, + "grad_norm": 0.8070041617835978, + "kl": 0.0, + "learning_rate": 7.835653166094747e-08, + "logits/chosen": -722888320.0, + "logits/rejected": -539806912.0, + "logps/chosen": -418.1951219512195, + "logps/rejected": -940.6153846153846, + "loss": 0.2695, + "rewards/chosen": -1.0270400628810976, + "rewards/margins": 5.376405449939416, + "rewards/rejected": -6.403445512820513, + "step": 5600 + }, + { + "epoch": 2.9311698508243915, + "eval_kl": 0.0, + "eval_logits/chosen": -1428859520.0, + "eval_logits/rejected": -1154099328.0, + "eval_logps/chosen": -548.8292924294904, + "eval_logps/rejected": -841.0064644455495, + "eval_loss": 0.449031263589859, + "eval_rewards/chosen": -2.172068283028204, + "eval_rewards/margins": 3.2870316672452917, + "eval_rewards/rejected": -5.459099950273496, + "eval_runtime": 93.4114, + "eval_samples_per_second": 42.821, + "eval_steps_per_second": 0.674, + "step": 5600 + }, + { + "epoch": 2.9364040827008635, + "grad_norm": 0.567629032177363, + "kl": 0.0, + "learning_rate": 6.677049141901315e-08, + "logits/chosen": -703279936.0, + "logits/rejected": -649383104.0, + "logps/chosen": -395.5880398671096, + "logps/rejected": -1018.3362831858407, + "loss": 0.2348, + "rewards/chosen": -0.4837001661129568, + "rewards/margins": 6.577509273415067, + "rewards/rejected": -7.061209439528024, + "step": 5610 + }, + { + "epoch": 2.941638314577336, + "grad_norm": 0.6635829541099674, + "kl": 0.0, + "learning_rate": 5.610975141571162e-08, + "logits/chosen": -723622272.0, + "logits/rejected": -624531840.0, + "logps/chosen": -416.86624203821657, + "logps/rejected": -1150.8220858895706, + "loss": 0.2657, + "rewards/chosen": -0.8588525079617835, + "rewards/margins": 7.375043197559689, + "rewards/rejected": -8.233895705521473, + "step": 5620 + }, + { + "epoch": 2.946872546453808, + "grad_norm": 0.6391268824424916, + "kl": 0.0, + "learning_rate": 4.6374707283117215e-08, + "logits/chosen": -744069504.0, + "logits/rejected": -576349824.0, + "logps/chosen": -394.24, + "logps/rejected": -878.0190476190476, + "loss": 0.2767, + "rewards/chosen": -0.7142788461538462, + "rewards/margins": 5.07143543956044, + "rewards/rejected": -5.785714285714286, + "step": 5630 + }, + { + "epoch": 2.95210677833028, + "grad_norm": 0.4791240509178076, + "kl": 0.0, + "learning_rate": 3.7565720299687076e-08, + "logits/chosen": -732849792.0, + "logits/rejected": -558478144.0, + "logps/chosen": -396.319018404908, + "logps/rejected": -951.4394904458599, + "loss": 0.2625, + "rewards/chosen": -0.5491875958588958, + "rewards/margins": 5.981863359555117, + "rewards/rejected": -6.531050955414012, + "step": 5640 + }, + { + "epoch": 2.9573410102067523, + "grad_norm": 0.41528798410488044, + "kl": 0.0, + "learning_rate": 2.9683117376852475e-08, + "logits/chosen": -745537536.0, + "logits/rejected": -595486336.0, + "logps/chosen": -381.840490797546, + "logps/rejected": -956.7388535031847, + "loss": 0.2575, + "rewards/chosen": -0.3813266871165644, + "rewards/margins": 6.192717898870697, + "rewards/rejected": -6.574044585987261, + "step": 5650 + }, + { + "epoch": 2.9625752420832243, + "grad_norm": 0.5510367626492982, + "kl": 0.0, + "learning_rate": 2.272719104688403e-08, + "logits/chosen": -772171392.0, + "logits/rejected": -536870912.0, + "logps/chosen": -351.65, + "logps/rejected": -1027.6, + "loss": 0.2431, + "rewards/chosen": 0.03939208984375, + "rewards/margins": 7.49798583984375, + "rewards/rejected": -7.45859375, + "step": 5660 + }, + { + "epoch": 2.9678094739596963, + "grad_norm": 0.8534857961161093, + "kl": 0.0, + "learning_rate": 1.6698199452053198e-08, + "logits/chosen": -638346880.0, + "logits/rejected": -599523328.0, + "logps/chosen": -422.4533333333333, + "logps/rejected": -949.2705882352941, + "loss": 0.2679, + "rewards/chosen": -1.1184375, + "rewards/margins": 5.462444852941177, + "rewards/rejected": -6.580882352941177, + "step": 5670 + }, + { + "epoch": 2.9730437058361687, + "grad_norm": 0.679885460906798, + "kl": 0.0, + "learning_rate": 1.1596366335023257e-08, + "logits/chosen": -714499712.0, + "logits/rejected": -607125504.0, + "logps/chosen": -410.2180685358255, + "logps/rejected": -1051.6865203761756, + "loss": 0.2684, + "rewards/chosen": -0.8111005646417445, + "rewards/margins": 6.545482507458569, + "rewards/rejected": -7.356583072100314, + "step": 5680 + }, + { + "epoch": 2.9782779377126407, + "grad_norm": 1.111508955638191, + "kl": 0.0, + "learning_rate": 7.42188103057262e-09, + "logits/chosen": -697198208.0, + "logits/rejected": -521194688.0, + "logps/chosen": -484.1025641025641, + "logps/rejected": -1003.2941176470588, + "loss": 0.3154, + "rewards/chosen": -1.8350249287749287, + "rewards/margins": 5.181843583335798, + "rewards/rejected": -7.016868512110727, + "step": 5690 + }, + { + "epoch": 2.9835121695891127, + "grad_norm": 1.4290512414061876, + "kl": 0.0, + "learning_rate": 4.174898458556009e-09, + "logits/chosen": -756652416.0, + "logits/rejected": -551550976.0, + "logps/chosen": -395.7317073170732, + "logps/rejected": -899.8974358974359, + "loss": 0.2664, + "rewards/chosen": -0.5129215891768293, + "rewards/margins": 5.686597641592401, + "rewards/rejected": -6.199519230769231, + "step": 5700 + }, + { + "epoch": 2.9835121695891127, + "eval_kl": 0.0, + "eval_logits/chosen": -1428926080.0, + "eval_logits/rejected": -1154165888.0, + "eval_logps/chosen": -548.5442850074221, + "eval_logps/rejected": -840.7200397812034, + "eval_loss": 0.4491328001022339, + "eval_rewards/chosen": -2.1686046511627906, + "eval_rewards/margins": 3.2858955974697306, + "eval_rewards/rejected": -5.454500248632521, + "eval_runtime": 93.4352, + "eval_samples_per_second": 42.81, + "eval_steps_per_second": 0.674, + "step": 5700 + }, + { + "epoch": 2.988746401465585, + "grad_norm": 0.7011755361178991, + "kl": 0.0, + "learning_rate": 1.8555391181507288e-09, + "logits/chosen": -752353280.0, + "logits/rejected": -584895680.0, + "logps/chosen": -422.3105590062112, + "logps/rejected": -1009.5094339622641, + "loss": 0.2625, + "rewards/chosen": -0.750873447204969, + "rewards/margins": 6.488120263486856, + "rewards/rejected": -7.238993710691824, + "step": 5710 + }, + { + "epoch": 2.993980633342057, + "grad_norm": 1.9855747771297168, + "kl": 0.0, + "learning_rate": 4.638890833991161e-10, + "logits/chosen": -771856768.0, + "logits/rejected": -604137088.0, + "logps/chosen": -429.2048192771084, + "logps/rejected": -1076.5714285714287, + "loss": 0.2949, + "rewards/chosen": -0.9644672439759037, + "rewards/margins": 6.747383405374746, + "rewards/rejected": -7.71185064935065, + "step": 5720 + }, + { + "epoch": 2.999214865218529, + "grad_norm": 0.5949657119573064, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": -709256832.0, + "logits/rejected": -596089216.0, + "logps/chosen": -497.5903614457831, + "logps/rejected": -1091.844155844156, + "loss": 0.2986, + "rewards/chosen": -1.5481574736445782, + "rewards/margins": 6.373108760121656, + "rewards/rejected": -7.921266233766234, + "step": 5730 + }, + { + "epoch": 2.999214865218529, + "step": 5730, + "total_flos": 0.0, + "train_loss": 0.3599983148757908, + "train_runtime": 21481.0681, + "train_samples_per_second": 17.076, + "train_steps_per_second": 0.267 + } + ], + "logging_steps": 10, + "max_steps": 5730, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}