{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 4168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002399232245681382, "grad_norm": 4.854994271032306, "learning_rate": 1.199040767386091e-09, "logits/chosen": -0.3870464563369751, "logits/rejected": -0.3449973464012146, "logps/chosen": -161.37554931640625, "logps/rejected": -150.78668212890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0023992322456813818, "grad_norm": 5.007999188048557, "learning_rate": 1.199040767386091e-08, "logits/chosen": -0.38755929470062256, "logits/rejected": -0.40367352962493896, "logps/chosen": -389.556640625, "logps/rejected": -313.19439697265625, "loss": 0.6932, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.000382223108317703, "rewards/margins": 0.000817837193608284, "rewards/rejected": -0.0004356140270829201, "step": 10 }, { "epoch": 0.0047984644913627635, "grad_norm": 5.4251682517128685, "learning_rate": 2.398081534772182e-08, "logits/chosen": -0.40382856130599976, "logits/rejected": -0.4116736352443695, "logps/chosen": -253.2971649169922, "logps/rejected": -222.39187622070312, "loss": 0.6934, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0011010636808350682, "rewards/margins": 0.0007788551738485694, "rewards/rejected": -0.0018799189710989594, "step": 20 }, { "epoch": 0.007197696737044146, "grad_norm": 4.855409710988802, "learning_rate": 3.597122302158273e-08, "logits/chosen": -0.37188905477523804, "logits/rejected": -0.41493088006973267, "logps/chosen": -264.1092834472656, "logps/rejected": -276.79327392578125, "loss": 0.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0006352605996653438, "rewards/margins": 0.0009311493486166, "rewards/rejected": -0.001566409831866622, "step": 30 }, { "epoch": 0.009596928982725527, "grad_norm": 4.773650762501925, "learning_rate": 4.796163069544364e-08, "logits/chosen": -0.4388062357902527, "logits/rejected": -0.4551084041595459, "logps/chosen": -283.5164489746094, "logps/rejected": -264.680419921875, "loss": 0.6931, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.967745866859332e-05, "rewards/margins": 0.0002998415438923985, "rewards/rejected": -0.0003695189952850342, "step": 40 }, { "epoch": 0.01199616122840691, "grad_norm": 5.249008388011441, "learning_rate": 5.995203836930455e-08, "logits/chosen": -0.44028440117836, "logits/rejected": -0.41670793294906616, "logps/chosen": -289.8959045410156, "logps/rejected": -249.32876586914062, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00012158080789959058, "rewards/margins": 0.0013795382110401988, "rewards/rejected": -0.0012579575413838029, "step": 50 }, { "epoch": 0.014395393474088292, "grad_norm": 5.451149941549421, "learning_rate": 7.194244604316546e-08, "logits/chosen": -0.41042596101760864, "logits/rejected": -0.39021745324134827, "logps/chosen": -293.70989990234375, "logps/rejected": -274.8219909667969, "loss": 0.6932, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00042934497469104826, "rewards/margins": 5.6701617722865194e-05, "rewards/rejected": 0.00037264340789988637, "step": 60 }, { "epoch": 0.016794625719769675, "grad_norm": 4.9176821013212955, "learning_rate": 8.393285371702638e-08, "logits/chosen": -0.37676185369491577, "logits/rejected": -0.3675565719604492, "logps/chosen": -300.6470031738281, "logps/rejected": -285.7118225097656, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0005408605793491006, "rewards/margins": -0.0005098087713122368, "rewards/rejected": -3.105172800133005e-05, "step": 70 }, { "epoch": 0.019193857965451054, "grad_norm": 5.324153930467038, "learning_rate": 9.592326139088728e-08, "logits/chosen": -0.3833390474319458, "logits/rejected": -0.3485874533653259, "logps/chosen": -202.55172729492188, "logps/rejected": -266.27801513671875, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": -0.0002449182793498039, "rewards/margins": -0.00045424007112160325, "rewards/rejected": 0.00020932205370627344, "step": 80 }, { "epoch": 0.021593090211132437, "grad_norm": 4.902493446985089, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -0.42354243993759155, "logits/rejected": -0.4241662621498108, "logps/chosen": -345.2079162597656, "logps/rejected": -297.83392333984375, "loss": 0.6931, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.00027865776792168617, "rewards/margins": -7.414491847157478e-05, "rewards/rejected": -0.00020451273303478956, "step": 90 }, { "epoch": 0.02399232245681382, "grad_norm": 5.335283741962904, "learning_rate": 1.199040767386091e-07, "logits/chosen": -0.3970239758491516, "logits/rejected": -0.36344924569129944, "logps/chosen": -279.8683166503906, "logps/rejected": -301.5334167480469, "loss": 0.6926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0004452617431525141, "rewards/margins": 0.0009495856938883662, "rewards/rejected": -0.0005043239216320217, "step": 100 }, { "epoch": 0.026391554702495202, "grad_norm": 4.56016585237767, "learning_rate": 1.3189448441247004e-07, "logits/chosen": -0.3945187032222748, "logits/rejected": -0.40393415093421936, "logps/chosen": -245.4661102294922, "logps/rejected": -244.88955688476562, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.0013859450118616223, "rewards/margins": -9.293450420955196e-05, "rewards/rejected": -0.0012930103112012148, "step": 110 }, { "epoch": 0.028790786948176585, "grad_norm": 5.117639055978934, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -0.3993036150932312, "logits/rejected": -0.413900762796402, "logps/chosen": -301.570068359375, "logps/rejected": -287.91998291015625, "loss": 0.6919, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0009120000759139657, "rewards/margins": 0.002243091817945242, "rewards/rejected": -0.003155091777443886, "step": 120 }, { "epoch": 0.031190019193857964, "grad_norm": 4.533876465192458, "learning_rate": 1.5587529976019183e-07, "logits/chosen": -0.40214699506759644, "logits/rejected": -0.4113968014717102, "logps/chosen": -219.7006072998047, "logps/rejected": -321.1871337890625, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008342149667441845, "rewards/margins": 0.0029901477973908186, "rewards/rejected": -0.0038243632297962904, "step": 130 }, { "epoch": 0.03358925143953935, "grad_norm": 5.008152220168436, "learning_rate": 1.6786570743405277e-07, "logits/chosen": -0.33501917123794556, "logits/rejected": -0.3522827625274658, "logps/chosen": -314.4662780761719, "logps/rejected": -304.34869384765625, "loss": 0.6911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0010583497351035476, "rewards/margins": 0.004176832735538483, "rewards/rejected": -0.005235183052718639, "step": 140 }, { "epoch": 0.03598848368522073, "grad_norm": 5.230326203624785, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -0.3923170566558838, "logits/rejected": -0.3953098952770233, "logps/chosen": -236.24685668945312, "logps/rejected": -234.350830078125, "loss": 0.6911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0009860789868980646, "rewards/margins": 0.004025029484182596, "rewards/rejected": -0.0050111087039113045, "step": 150 }, { "epoch": 0.03838771593090211, "grad_norm": 5.013822574107228, "learning_rate": 1.9184652278177456e-07, "logits/chosen": -0.31543251872062683, "logits/rejected": -0.3109430968761444, "logps/chosen": -316.65985107421875, "logps/rejected": -250.0377960205078, "loss": 0.6906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0033806234132498503, "rewards/margins": 0.0072640664875507355, "rewards/rejected": -0.010644689202308655, "step": 160 }, { "epoch": 0.040786948176583494, "grad_norm": 4.673205835529891, "learning_rate": 2.038369304556355e-07, "logits/chosen": -0.3539220094680786, "logits/rejected": -0.3610088527202606, "logps/chosen": -352.55316162109375, "logps/rejected": -340.3455810546875, "loss": 0.689, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0013684859732165933, "rewards/margins": 0.009599483571946621, "rewards/rejected": -0.01096796989440918, "step": 170 }, { "epoch": 0.04318618042226487, "grad_norm": 5.280440292000614, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -0.414604127407074, "logits/rejected": -0.40983182191848755, "logps/chosen": -251.31930541992188, "logps/rejected": -246.71249389648438, "loss": 0.6893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0037162289954721928, "rewards/margins": 0.016576949506998062, "rewards/rejected": -0.020293179899454117, "step": 180 }, { "epoch": 0.04558541266794626, "grad_norm": 6.110987926729073, "learning_rate": 2.278177458033573e-07, "logits/chosen": -0.37201982736587524, "logits/rejected": -0.3684994578361511, "logps/chosen": -334.75994873046875, "logps/rejected": -276.002197265625, "loss": 0.6885, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0038164921570569277, "rewards/margins": 0.0039043619763106108, "rewards/rejected": -0.0077208541333675385, "step": 190 }, { "epoch": 0.04798464491362764, "grad_norm": 4.693128517095156, "learning_rate": 2.398081534772182e-07, "logits/chosen": -0.3985927700996399, "logits/rejected": -0.3652064800262451, "logps/chosen": -327.9397888183594, "logps/rejected": -314.77703857421875, "loss": 0.6871, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0021692051086574793, "rewards/margins": 0.02031012810766697, "rewards/rejected": -0.022479332983493805, "step": 200 }, { "epoch": 0.05038387715930902, "grad_norm": 4.82808785130016, "learning_rate": 2.517985611510791e-07, "logits/chosen": -0.416436105966568, "logits/rejected": -0.42406004667282104, "logps/chosen": -256.06634521484375, "logps/rejected": -279.0437316894531, "loss": 0.6868, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.006927967071533203, "rewards/margins": 0.01621558703482151, "rewards/rejected": -0.023143552243709564, "step": 210 }, { "epoch": 0.052783109404990404, "grad_norm": 4.930218028749106, "learning_rate": 2.637889688249401e-07, "logits/chosen": -0.4129602015018463, "logits/rejected": -0.42116695642471313, "logps/chosen": -326.42987060546875, "logps/rejected": -336.3708801269531, "loss": 0.6874, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011486181057989597, "rewards/margins": 0.01089237816631794, "rewards/rejected": -0.02237856015563011, "step": 220 }, { "epoch": 0.05518234165067178, "grad_norm": 5.209995561674771, "learning_rate": 2.7577937649880093e-07, "logits/chosen": -0.39636367559432983, "logits/rejected": -0.37121134996414185, "logps/chosen": -249.68606567382812, "logps/rejected": -287.6152038574219, "loss": 0.6826, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.014097055420279503, "rewards/margins": 0.01727898046374321, "rewards/rejected": -0.031376034021377563, "step": 230 }, { "epoch": 0.05758157389635317, "grad_norm": 5.637285290487847, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -0.3950192332267761, "logits/rejected": -0.38908690214157104, "logps/chosen": -302.5347595214844, "logps/rejected": -257.3473205566406, "loss": 0.6811, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.006957252975553274, "rewards/margins": 0.02547440305352211, "rewards/rejected": -0.03243165463209152, "step": 240 }, { "epoch": 0.05998080614203455, "grad_norm": 5.523154228755914, "learning_rate": 2.997601918465228e-07, "logits/chosen": -0.3644653558731079, "logits/rejected": -0.35463160276412964, "logps/chosen": -244.20401000976562, "logps/rejected": -235.6724395751953, "loss": 0.6804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01351804006844759, "rewards/margins": 0.01623005047440529, "rewards/rejected": -0.029748091474175453, "step": 250 }, { "epoch": 0.06238003838771593, "grad_norm": 4.946428082161042, "learning_rate": 3.1175059952038366e-07, "logits/chosen": -0.44501185417175293, "logits/rejected": -0.39841514825820923, "logps/chosen": -285.33624267578125, "logps/rejected": -289.7425842285156, "loss": 0.6786, "rewards/accuracies": 0.625, "rewards/chosen": -0.024796243757009506, "rewards/margins": 0.03235085308551788, "rewards/rejected": -0.05714709684252739, "step": 260 }, { "epoch": 0.0647792706333973, "grad_norm": 5.212561603527658, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -0.3279297947883606, "logits/rejected": -0.38405635952949524, "logps/chosen": -295.61566162109375, "logps/rejected": -237.0383758544922, "loss": 0.676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.021820012480020523, "rewards/margins": 0.008004983887076378, "rewards/rejected": -0.029824992641806602, "step": 270 }, { "epoch": 0.0671785028790787, "grad_norm": 5.188924605918914, "learning_rate": 3.3573141486810554e-07, "logits/chosen": -0.3912425935268402, "logits/rejected": -0.38816994428634644, "logps/chosen": -309.39617919921875, "logps/rejected": -302.08001708984375, "loss": 0.6706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01541377604007721, "rewards/margins": 0.04586619883775711, "rewards/rejected": -0.06127997115254402, "step": 280 }, { "epoch": 0.06957773512476008, "grad_norm": 4.7601490852750326, "learning_rate": 3.477218225419664e-07, "logits/chosen": -0.3547287583351135, "logits/rejected": -0.32382625341415405, "logps/chosen": -301.15460205078125, "logps/rejected": -276.5646667480469, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": -0.021407341584563255, "rewards/margins": 0.03352125734090805, "rewards/rejected": -0.05492859333753586, "step": 290 }, { "epoch": 0.07197696737044146, "grad_norm": 5.589212470867438, "learning_rate": 3.597122302158273e-07, "logits/chosen": -0.4183027744293213, "logits/rejected": -0.4140304923057556, "logps/chosen": -278.68707275390625, "logps/rejected": -302.8797912597656, "loss": 0.6729, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03756223991513252, "rewards/margins": 0.041960276663303375, "rewards/rejected": -0.0795225277543068, "step": 300 }, { "epoch": 0.07437619961612284, "grad_norm": 5.107027000775508, "learning_rate": 3.7170263788968827e-07, "logits/chosen": -0.38045555353164673, "logits/rejected": -0.3847430646419525, "logps/chosen": -288.02801513671875, "logps/rejected": -258.888671875, "loss": 0.6756, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.043108247220516205, "rewards/margins": 0.04828093200922012, "rewards/rejected": -0.09138917922973633, "step": 310 }, { "epoch": 0.07677543186180422, "grad_norm": 4.7093696590111955, "learning_rate": 3.836930455635491e-07, "logits/chosen": -0.3643363118171692, "logits/rejected": -0.35694578289985657, "logps/chosen": -284.8294372558594, "logps/rejected": -257.43133544921875, "loss": 0.6681, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04265808314085007, "rewards/margins": 0.04907030984759331, "rewards/rejected": -0.09172839671373367, "step": 320 }, { "epoch": 0.07917466410748561, "grad_norm": 4.910514032068699, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -0.31883081793785095, "logits/rejected": -0.28674525022506714, "logps/chosen": -272.1695251464844, "logps/rejected": -318.9271240234375, "loss": 0.6639, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08635957539081573, "rewards/margins": 0.05848981812596321, "rewards/rejected": -0.14484938979148865, "step": 330 }, { "epoch": 0.08157389635316699, "grad_norm": 5.0332599907751545, "learning_rate": 4.07673860911271e-07, "logits/chosen": -0.3443170189857483, "logits/rejected": -0.35223323106765747, "logps/chosen": -254.35726928710938, "logps/rejected": -281.94195556640625, "loss": 0.6642, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.04620979353785515, "rewards/margins": 0.08755537122488022, "rewards/rejected": -0.13376514613628387, "step": 340 }, { "epoch": 0.08397312859884837, "grad_norm": 5.508072548843049, "learning_rate": 4.1966426858513185e-07, "logits/chosen": -0.3824247717857361, "logits/rejected": -0.3841249346733093, "logps/chosen": -319.01995849609375, "logps/rejected": -318.2255554199219, "loss": 0.6667, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08988650888204575, "rewards/margins": 0.05337555333971977, "rewards/rejected": -0.1432620733976364, "step": 350 }, { "epoch": 0.08637236084452975, "grad_norm": 5.385600660244676, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -0.36046355962753296, "logits/rejected": -0.3866155743598938, "logps/chosen": -274.48846435546875, "logps/rejected": -234.24191284179688, "loss": 0.6672, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0927540734410286, "rewards/margins": 0.045326970517635345, "rewards/rejected": -0.13808102905750275, "step": 360 }, { "epoch": 0.08877159309021113, "grad_norm": 5.5450235472250675, "learning_rate": 4.436450839328537e-07, "logits/chosen": -0.3677825331687927, "logits/rejected": -0.3560819625854492, "logps/chosen": -265.2862243652344, "logps/rejected": -291.4950256347656, "loss": 0.6608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1145174503326416, "rewards/margins": 0.10159511864185333, "rewards/rejected": -0.21611256897449493, "step": 370 }, { "epoch": 0.09117082533589252, "grad_norm": 4.681358528597076, "learning_rate": 4.556354916067146e-07, "logits/chosen": -0.39217817783355713, "logits/rejected": -0.36524298787117004, "logps/chosen": -255.4202117919922, "logps/rejected": -269.22540283203125, "loss": 0.65, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0964360311627388, "rewards/margins": 0.10729198157787323, "rewards/rejected": -0.20372800529003143, "step": 380 }, { "epoch": 0.0935700575815739, "grad_norm": 5.3926706949666325, "learning_rate": 4.676258992805755e-07, "logits/chosen": -0.31998729705810547, "logits/rejected": -0.30327945947647095, "logps/chosen": -294.6556396484375, "logps/rejected": -271.11346435546875, "loss": 0.6514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11293824017047882, "rewards/margins": 0.07754337787628174, "rewards/rejected": -0.19048163294792175, "step": 390 }, { "epoch": 0.09596928982725528, "grad_norm": 5.420438925013907, "learning_rate": 4.796163069544364e-07, "logits/chosen": -0.35674285888671875, "logits/rejected": -0.3888497054576874, "logps/chosen": -275.8892517089844, "logps/rejected": -273.18157958984375, "loss": 0.6491, "rewards/accuracies": 0.75, "rewards/chosen": -0.1275942027568817, "rewards/margins": 0.13519003987312317, "rewards/rejected": -0.26278427243232727, "step": 400 }, { "epoch": 0.09836852207293666, "grad_norm": 5.2798127437891775, "learning_rate": 4.916067146282974e-07, "logits/chosen": -0.362439900636673, "logits/rejected": -0.3545471131801605, "logps/chosen": -278.9376525878906, "logps/rejected": -321.1399841308594, "loss": 0.6355, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15209965407848358, "rewards/margins": 0.1249600201845169, "rewards/rejected": -0.2770597040653229, "step": 410 }, { "epoch": 0.10076775431861804, "grad_norm": 5.259650468657089, "learning_rate": 4.999992108529978e-07, "logits/chosen": -0.31682300567626953, "logits/rejected": -0.3138789236545563, "logps/chosen": -353.08843994140625, "logps/rejected": -335.11993408203125, "loss": 0.6426, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1913050264120102, "rewards/margins": 0.15328899025917053, "rewards/rejected": -0.3445940315723419, "step": 420 }, { "epoch": 0.10316698656429943, "grad_norm": 5.7901843585434305, "learning_rate": 4.999851817115532e-07, "logits/chosen": -0.4503898620605469, "logits/rejected": -0.406587690114975, "logps/chosen": -276.3069152832031, "logps/rejected": -298.9613342285156, "loss": 0.6437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1920853555202484, "rewards/margins": 0.21766385436058044, "rewards/rejected": -0.40974926948547363, "step": 430 }, { "epoch": 0.10556621880998081, "grad_norm": 5.5732351937176325, "learning_rate": 4.999536171027889e-07, "logits/chosen": -0.3798277974128723, "logits/rejected": -0.38922780752182007, "logps/chosen": -321.9649658203125, "logps/rejected": -325.6181640625, "loss": 0.6342, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23407897353172302, "rewards/margins": 0.13595230877399445, "rewards/rejected": -0.37003129720687866, "step": 440 }, { "epoch": 0.10796545105566219, "grad_norm": 6.182065387597852, "learning_rate": 4.999045192408369e-07, "logits/chosen": -0.28310176730155945, "logits/rejected": -0.28306809067726135, "logps/chosen": -274.2243347167969, "logps/rejected": -265.6375732421875, "loss": 0.6339, "rewards/accuracies": 0.625, "rewards/chosen": -0.2778427004814148, "rewards/margins": 0.08695127815008163, "rewards/rejected": -0.36479395627975464, "step": 450 }, { "epoch": 0.11036468330134357, "grad_norm": 5.712218150903385, "learning_rate": 4.998378915697171e-07, "logits/chosen": -0.3742767870426178, "logits/rejected": -0.3694307804107666, "logps/chosen": -301.56683349609375, "logps/rejected": -318.60382080078125, "loss": 0.6177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2279098480939865, "rewards/margins": 0.23911185562610626, "rewards/rejected": -0.4670217037200928, "step": 460 }, { "epoch": 0.11276391554702495, "grad_norm": 5.207613785560333, "learning_rate": 4.997537387630958e-07, "logits/chosen": -0.30902743339538574, "logits/rejected": -0.3139379322528839, "logps/chosen": -238.130859375, "logps/rejected": -261.97259521484375, "loss": 0.6099, "rewards/accuracies": 0.625, "rewards/chosen": -0.21456749737262726, "rewards/margins": 0.2124086171388626, "rewards/rejected": -0.42697611451148987, "step": 470 }, { "epoch": 0.11516314779270634, "grad_norm": 6.373579896233462, "learning_rate": 4.996520667239582e-07, "logits/chosen": -0.44946223497390747, "logits/rejected": -0.4517344534397125, "logps/chosen": -263.78277587890625, "logps/rejected": -343.26947021484375, "loss": 0.6111, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2376352846622467, "rewards/margins": 0.2664358913898468, "rewards/rejected": -0.5040711164474487, "step": 480 }, { "epoch": 0.11756238003838772, "grad_norm": 6.494273680385882, "learning_rate": 4.995328825841939e-07, "logits/chosen": -0.32751840353012085, "logits/rejected": -0.33484649658203125, "logps/chosen": -246.5781707763672, "logps/rejected": -297.93353271484375, "loss": 0.6083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.27829456329345703, "rewards/margins": 0.44562092423439026, "rewards/rejected": -0.7239154577255249, "step": 490 }, { "epoch": 0.1199616122840691, "grad_norm": 6.156723989230307, "learning_rate": 4.993961947040967e-07, "logits/chosen": -0.3556281626224518, "logits/rejected": -0.37649574875831604, "logps/chosen": -335.5538024902344, "logps/rejected": -312.1348876953125, "loss": 0.6281, "rewards/accuracies": 0.625, "rewards/chosen": -0.4268563687801361, "rewards/margins": 0.14374002814292908, "rewards/rejected": -0.5705963373184204, "step": 500 }, { "epoch": 0.12236084452975048, "grad_norm": 5.5439607137629, "learning_rate": 4.992420126717784e-07, "logits/chosen": -0.39222702383995056, "logits/rejected": -0.3753407597541809, "logps/chosen": -280.47393798828125, "logps/rejected": -329.1144104003906, "loss": 0.6064, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.24301853775978088, "rewards/margins": 0.4458102285861969, "rewards/rejected": -0.6888288259506226, "step": 510 }, { "epoch": 0.12476007677543186, "grad_norm": 6.961165174392256, "learning_rate": 4.990703473024958e-07, "logits/chosen": -0.3275012969970703, "logits/rejected": -0.35214173793792725, "logps/chosen": -332.95355224609375, "logps/rejected": -348.1015930175781, "loss": 0.6268, "rewards/accuracies": 0.625, "rewards/chosen": -0.42038726806640625, "rewards/margins": 0.20834532380104065, "rewards/rejected": -0.6287325620651245, "step": 520 }, { "epoch": 0.12715930902111325, "grad_norm": 6.451198360785239, "learning_rate": 4.98881210637893e-07, "logits/chosen": -0.35043126344680786, "logits/rejected": -0.3229239583015442, "logps/chosen": -256.13128662109375, "logps/rejected": -325.2506408691406, "loss": 0.6186, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.35000094771385193, "rewards/margins": 0.2998683452606201, "rewards/rejected": -0.6498693227767944, "step": 530 }, { "epoch": 0.1295585412667946, "grad_norm": 5.138402880291277, "learning_rate": 4.986746159451553e-07, "logits/chosen": -0.2927325963973999, "logits/rejected": -0.2958449721336365, "logps/chosen": -296.5050964355469, "logps/rejected": -315.69061279296875, "loss": 0.6091, "rewards/accuracies": 0.75, "rewards/chosen": -0.2853540778160095, "rewards/margins": 0.2909182012081146, "rewards/rejected": -0.5762723088264465, "step": 540 }, { "epoch": 0.131957773512476, "grad_norm": 5.337841343228059, "learning_rate": 4.984505777160795e-07, "logits/chosen": -0.2978525757789612, "logits/rejected": -0.3084144592285156, "logps/chosen": -360.9128112792969, "logps/rejected": -391.237060546875, "loss": 0.6252, "rewards/accuracies": 0.625, "rewards/chosen": -0.40219253301620483, "rewards/margins": 0.2555919587612152, "rewards/rejected": -0.6577844619750977, "step": 550 }, { "epoch": 0.1343570057581574, "grad_norm": 5.899251959062901, "learning_rate": 4.982091116660574e-07, "logits/chosen": -0.44975343346595764, "logits/rejected": -0.4612964689731598, "logps/chosen": -247.9381561279297, "logps/rejected": -239.43057250976562, "loss": 0.6346, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4517020285129547, "rewards/margins": 0.14873093366622925, "rewards/rejected": -0.6004330515861511, "step": 560 }, { "epoch": 0.13675623800383876, "grad_norm": 6.842342229687067, "learning_rate": 4.979502347329732e-07, "logits/chosen": -0.3259963393211365, "logits/rejected": -0.32738104462623596, "logps/chosen": -363.0848083496094, "logps/rejected": -425.85650634765625, "loss": 0.6152, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.53682541847229, "rewards/margins": 0.35178542137145996, "rewards/rejected": -0.88861083984375, "step": 570 }, { "epoch": 0.13915547024952016, "grad_norm": 7.709446348310538, "learning_rate": 4.976739650760151e-07, "logits/chosen": -0.4362337589263916, "logits/rejected": -0.4145421087741852, "logps/chosen": -322.2565002441406, "logps/rejected": -323.690673828125, "loss": 0.6084, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.503047525882721, "rewards/margins": 0.2108907401561737, "rewards/rejected": -0.7139382362365723, "step": 580 }, { "epoch": 0.14155470249520152, "grad_norm": 7.389107341643373, "learning_rate": 4.97380322074402e-07, "logits/chosen": -0.3247602880001068, "logits/rejected": -0.33087000250816345, "logps/chosen": -276.3142395019531, "logps/rejected": -307.705078125, "loss": 0.6167, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4834076464176178, "rewards/margins": 0.29755571484565735, "rewards/rejected": -0.7809633016586304, "step": 590 }, { "epoch": 0.14395393474088292, "grad_norm": 6.3757200147393585, "learning_rate": 4.970693263260237e-07, "logits/chosen": -0.34789201617240906, "logits/rejected": -0.3619535267353058, "logps/chosen": -332.65240478515625, "logps/rejected": -348.0079650878906, "loss": 0.6153, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4377606511116028, "rewards/margins": 0.43048110604286194, "rewards/rejected": -0.8682417869567871, "step": 600 }, { "epoch": 0.1463531669865643, "grad_norm": 6.992245943787356, "learning_rate": 4.967409996459966e-07, "logits/chosen": -0.40890535712242126, "logits/rejected": -0.41852784156799316, "logps/chosen": -343.47113037109375, "logps/rejected": -352.51739501953125, "loss": 0.6028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.44202518463134766, "rewards/margins": 0.35426777601242065, "rewards/rejected": -0.7962929010391235, "step": 610 }, { "epoch": 0.14875239923224567, "grad_norm": 5.871485349553309, "learning_rate": 4.963953650651326e-07, "logits/chosen": -0.3329642117023468, "logits/rejected": -0.33251506090164185, "logps/chosen": -415.4046936035156, "logps/rejected": -351.1783447265625, "loss": 0.5993, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.527837336063385, "rewards/margins": 0.2701931595802307, "rewards/rejected": -0.798030436038971, "step": 620 }, { "epoch": 0.15115163147792707, "grad_norm": 6.6189704018639155, "learning_rate": 4.960324468283248e-07, "logits/chosen": -0.4594503343105316, "logits/rejected": -0.4743649363517761, "logps/chosen": -287.46697998046875, "logps/rejected": -318.0199279785156, "loss": 0.5864, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5368043184280396, "rewards/margins": 0.2990504205226898, "rewards/rejected": -0.8358548283576965, "step": 630 }, { "epoch": 0.15355086372360843, "grad_norm": 6.440240303252386, "learning_rate": 4.956522703928451e-07, "logits/chosen": -0.4017508625984192, "logits/rejected": -0.3634760081768036, "logps/chosen": -301.4740295410156, "logps/rejected": -330.0491943359375, "loss": 0.5777, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5480097532272339, "rewards/margins": 0.28381380438804626, "rewards/rejected": -0.8318235278129578, "step": 640 }, { "epoch": 0.15595009596928983, "grad_norm": 8.733595781101256, "learning_rate": 4.952548624265606e-07, "logits/chosen": -0.33765482902526855, "logits/rejected": -0.3307989537715912, "logps/chosen": -368.00457763671875, "logps/rejected": -375.5353088378906, "loss": 0.6137, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6871958374977112, "rewards/margins": 0.2888778746128082, "rewards/rejected": -0.9760736227035522, "step": 650 }, { "epoch": 0.15834932821497122, "grad_norm": 6.492977447810359, "learning_rate": 4.948402508060607e-07, "logits/chosen": -0.41905927658081055, "logits/rejected": -0.4047884941101074, "logps/chosen": -299.0521240234375, "logps/rejected": -338.4973449707031, "loss": 0.6203, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5227022767066956, "rewards/margins": 0.4564470648765564, "rewards/rejected": -0.979149341583252, "step": 660 }, { "epoch": 0.16074856046065258, "grad_norm": 7.072838685765256, "learning_rate": 4.944084646147038e-07, "logits/chosen": -0.3967028260231018, "logits/rejected": -0.3915463387966156, "logps/chosen": -393.5112609863281, "logps/rejected": -390.46771240234375, "loss": 0.6382, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6279814839363098, "rewards/margins": 0.20776453614234924, "rewards/rejected": -0.8357461094856262, "step": 670 }, { "epoch": 0.16314779270633398, "grad_norm": 7.229422542340886, "learning_rate": 4.939595341405754e-07, "logits/chosen": -0.4420618414878845, "logits/rejected": -0.47058361768722534, "logps/chosen": -320.97833251953125, "logps/rejected": -354.74896240234375, "loss": 0.6077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5401324033737183, "rewards/margins": 0.39541110396385193, "rewards/rejected": -0.935543417930603, "step": 680 }, { "epoch": 0.16554702495201534, "grad_norm": 6.416644260001778, "learning_rate": 4.93493490874365e-07, "logits/chosen": -0.3352740705013275, "logits/rejected": -0.3355199694633484, "logps/chosen": -320.6838073730469, "logps/rejected": -349.72637939453125, "loss": 0.5704, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6109887361526489, "rewards/margins": 0.24212434887886047, "rewards/rejected": -0.8531131744384766, "step": 690 }, { "epoch": 0.16794625719769674, "grad_norm": 8.587832635723272, "learning_rate": 4.93010367507156e-07, "logits/chosen": -0.4223472476005554, "logits/rejected": -0.40697455406188965, "logps/chosen": -279.3757629394531, "logps/rejected": -302.53887939453125, "loss": 0.5809, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5754938125610352, "rewards/margins": 0.49033960700035095, "rewards/rejected": -1.065833330154419, "step": 700 }, { "epoch": 0.17034548944337813, "grad_norm": 8.280473447647031, "learning_rate": 4.925101979281332e-07, "logits/chosen": -0.308775395154953, "logits/rejected": -0.3469308018684387, "logps/chosen": -367.63006591796875, "logps/rejected": -369.24102783203125, "loss": 0.5982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.54417484998703, "rewards/margins": 0.5548971891403198, "rewards/rejected": -1.0990720987319946, "step": 710 }, { "epoch": 0.1727447216890595, "grad_norm": 7.262952451036594, "learning_rate": 4.919930172222054e-07, "logits/chosen": -0.40510883927345276, "logits/rejected": -0.4301750659942627, "logps/chosen": -333.3446350097656, "logps/rejected": -367.443359375, "loss": 0.562, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6565228700637817, "rewards/margins": 0.41670140624046326, "rewards/rejected": -1.0732243061065674, "step": 720 }, { "epoch": 0.1751439539347409, "grad_norm": 7.527915939450412, "learning_rate": 4.914588616675445e-07, "logits/chosen": -0.5176496505737305, "logits/rejected": -0.5337257981300354, "logps/chosen": -276.7431945800781, "logps/rejected": -333.2959899902344, "loss": 0.5984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5182946920394897, "rewards/margins": 0.4113486707210541, "rewards/rejected": -0.9296433329582214, "step": 730 }, { "epoch": 0.17754318618042225, "grad_norm": 7.676539707780335, "learning_rate": 4.909077687330404e-07, "logits/chosen": -0.38037022948265076, "logits/rejected": -0.3775717318058014, "logps/chosen": -361.13360595703125, "logps/rejected": -350.4933166503906, "loss": 0.5737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6827607154846191, "rewards/margins": 0.23238630592823029, "rewards/rejected": -0.9151470065116882, "step": 740 }, { "epoch": 0.17994241842610365, "grad_norm": 7.668861235099345, "learning_rate": 4.903397770756729e-07, "logits/chosen": -0.4016490876674652, "logits/rejected": -0.41701728105545044, "logps/chosen": -345.77093505859375, "logps/rejected": -386.59478759765625, "loss": 0.5849, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6805583834648132, "rewards/margins": 0.4540184438228607, "rewards/rejected": -1.1345769166946411, "step": 750 }, { "epoch": 0.18234165067178504, "grad_norm": 5.8948610663264756, "learning_rate": 4.897549265378004e-07, "logits/chosen": -0.39282411336898804, "logits/rejected": -0.40464964509010315, "logps/chosen": -409.1741638183594, "logps/rejected": -438.93896484375, "loss": 0.5763, "rewards/accuracies": 0.75, "rewards/chosen": -0.6418434381484985, "rewards/margins": 0.33645665645599365, "rewards/rejected": -0.9782999753952026, "step": 760 }, { "epoch": 0.1847408829174664, "grad_norm": 7.575295799966419, "learning_rate": 4.891532581443643e-07, "logits/chosen": -0.4237458109855652, "logits/rejected": -0.43563684821128845, "logps/chosen": -363.11199951171875, "logps/rejected": -443.242919921875, "loss": 0.5533, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.48444193601608276, "rewards/margins": 0.8247998356819153, "rewards/rejected": -1.309241771697998, "step": 770 }, { "epoch": 0.1871401151631478, "grad_norm": 8.560608469390486, "learning_rate": 4.885348141000122e-07, "logits/chosen": -0.37164923548698425, "logits/rejected": -0.38602423667907715, "logps/chosen": -325.58111572265625, "logps/rejected": -402.8592224121094, "loss": 0.5743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6478859186172485, "rewards/margins": 0.5546708106994629, "rewards/rejected": -1.202556848526001, "step": 780 }, { "epoch": 0.18953934740882916, "grad_norm": 6.7270126026080295, "learning_rate": 4.878996377861367e-07, "logits/chosen": -0.46751460433006287, "logits/rejected": -0.5031236410140991, "logps/chosen": -311.3717346191406, "logps/rejected": -359.74786376953125, "loss": 0.5381, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8375557065010071, "rewards/margins": 0.37298810482025146, "rewards/rejected": -1.2105437517166138, "step": 790 }, { "epoch": 0.19193857965451055, "grad_norm": 8.21855435965664, "learning_rate": 4.872477737578327e-07, "logits/chosen": -0.42351236939430237, "logits/rejected": -0.37146827578544617, "logps/chosen": -370.0285339355469, "logps/rejected": -442.88092041015625, "loss": 0.544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8096299171447754, "rewards/margins": 0.7599128484725952, "rewards/rejected": -1.569542646408081, "step": 800 }, { "epoch": 0.19433781190019195, "grad_norm": 11.575192192623518, "learning_rate": 4.865792677407718e-07, "logits/chosen": -0.4782884120941162, "logits/rejected": -0.48071250319480896, "logps/chosen": -352.44830322265625, "logps/rejected": -357.5110778808594, "loss": 0.5849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.840336799621582, "rewards/margins": 0.3332519829273224, "rewards/rejected": -1.173588752746582, "step": 810 }, { "epoch": 0.1967370441458733, "grad_norm": 10.038226850588464, "learning_rate": 4.858941666279955e-07, "logits/chosen": -0.4984146058559418, "logits/rejected": -0.5030771493911743, "logps/chosen": -356.491455078125, "logps/rejected": -367.9295959472656, "loss": 0.5948, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6749507784843445, "rewards/margins": 0.2715102434158325, "rewards/rejected": -0.946461021900177, "step": 820 }, { "epoch": 0.1991362763915547, "grad_norm": 9.05117258722249, "learning_rate": 4.851925184766247e-07, "logits/chosen": -0.4640856683254242, "logits/rejected": -0.4853819012641907, "logps/chosen": -357.955078125, "logps/rejected": -393.90478515625, "loss": 0.5816, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8941922187805176, "rewards/margins": 0.591671347618103, "rewards/rejected": -1.4858636856079102, "step": 830 }, { "epoch": 0.20153550863723607, "grad_norm": 9.681421628090424, "learning_rate": 4.844743725044897e-07, "logits/chosen": -0.4659281373023987, "logits/rejected": -0.5088318586349487, "logps/chosen": -329.5724182128906, "logps/rejected": -351.2308044433594, "loss": 0.5644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7480869293212891, "rewards/margins": 0.4603498876094818, "rewards/rejected": -1.2084368467330933, "step": 840 }, { "epoch": 0.20393474088291746, "grad_norm": 9.601153198405655, "learning_rate": 4.837397790866774e-07, "logits/chosen": -0.47188258171081543, "logits/rejected": -0.4746321141719818, "logps/chosen": -362.03021240234375, "logps/rejected": -425.6300354003906, "loss": 0.5614, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.596829354763031, "rewards/margins": 0.8421809077262878, "rewards/rejected": -1.4390103816986084, "step": 850 }, { "epoch": 0.20633397312859886, "grad_norm": 8.338440429434868, "learning_rate": 4.829887897519974e-07, "logits/chosen": -0.4840044379234314, "logits/rejected": -0.4713994860649109, "logps/chosen": -310.5907897949219, "logps/rejected": -378.32049560546875, "loss": 0.5839, "rewards/accuracies": 0.75, "rewards/chosen": -0.6725698709487915, "rewards/margins": 0.4422430992126465, "rewards/rejected": -1.114812970161438, "step": 860 }, { "epoch": 0.20873320537428022, "grad_norm": 7.422844726445461, "learning_rate": 4.82221457179368e-07, "logits/chosen": -0.4798775613307953, "logits/rejected": -0.4771656095981598, "logps/chosen": -346.35284423828125, "logps/rejected": -404.0704040527344, "loss": 0.5487, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.653545081615448, "rewards/margins": 0.7137434482574463, "rewards/rejected": -1.36728835105896, "step": 870 }, { "epoch": 0.21113243761996162, "grad_norm": 7.498203140238108, "learning_rate": 4.814378351941206e-07, "logits/chosen": -0.4903596341609955, "logits/rejected": -0.4909774363040924, "logps/chosen": -333.675048828125, "logps/rejected": -356.37615966796875, "loss": 0.5783, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7229236960411072, "rewards/margins": 0.3018752932548523, "rewards/rejected": -1.0247989892959595, "step": 880 }, { "epoch": 0.21353166986564298, "grad_norm": 8.122602026965641, "learning_rate": 4.806379787642241e-07, "logits/chosen": -0.46273237466812134, "logits/rejected": -0.45851221680641174, "logps/chosen": -316.8045959472656, "logps/rejected": -386.3130798339844, "loss": 0.5996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6047753095626831, "rewards/margins": 0.5871869921684265, "rewards/rejected": -1.1919623613357544, "step": 890 }, { "epoch": 0.21593090211132437, "grad_norm": 7.941797087490691, "learning_rate": 4.798219439964293e-07, "logits/chosen": -0.5175309777259827, "logits/rejected": -0.5462228059768677, "logps/chosen": -322.66717529296875, "logps/rejected": -351.2758483886719, "loss": 0.56, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7025772333145142, "rewards/margins": 0.17370259761810303, "rewards/rejected": -0.8762798309326172, "step": 900 }, { "epoch": 0.21833013435700577, "grad_norm": 11.409843882847035, "learning_rate": 4.78989788132333e-07, "logits/chosen": -0.5460485219955444, "logits/rejected": -0.5394560098648071, "logps/chosen": -274.5609130859375, "logps/rejected": -357.4576416015625, "loss": 0.5144, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.47791242599487305, "rewards/margins": 0.7380608320236206, "rewards/rejected": -1.215973138809204, "step": 910 }, { "epoch": 0.22072936660268713, "grad_norm": 7.234637207163524, "learning_rate": 4.781415695443631e-07, "logits/chosen": -0.47741183638572693, "logits/rejected": -0.46385058760643005, "logps/chosen": -409.548583984375, "logps/rejected": -444.76080322265625, "loss": 0.5728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9967001080513, "rewards/margins": 0.2708914577960968, "rewards/rejected": -1.2675915956497192, "step": 920 }, { "epoch": 0.22312859884836853, "grad_norm": 7.053932572600293, "learning_rate": 4.772773477316836e-07, "logits/chosen": -0.4638640284538269, "logits/rejected": -0.4659477174282074, "logps/chosen": -399.1170654296875, "logps/rejected": -439.843994140625, "loss": 0.5505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0153203010559082, "rewards/margins": 0.4187864661216736, "rewards/rejected": -1.434106707572937, "step": 930 }, { "epoch": 0.2255278310940499, "grad_norm": 8.914653285618083, "learning_rate": 4.7639718331602117e-07, "logits/chosen": -0.4590983986854553, "logits/rejected": -0.45087337493896484, "logps/chosen": -361.59393310546875, "logps/rejected": -440.941650390625, "loss": 0.5377, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7911983728408813, "rewards/margins": 0.8489359617233276, "rewards/rejected": -1.6401344537734985, "step": 940 }, { "epoch": 0.22792706333973128, "grad_norm": 9.061462258268621, "learning_rate": 4.7550113803741275e-07, "logits/chosen": -0.4630160331726074, "logits/rejected": -0.5072802305221558, "logps/chosen": -387.6048278808594, "logps/rejected": -365.64532470703125, "loss": 0.5737, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9304320216178894, "rewards/margins": 0.43897590041160583, "rewards/rejected": -1.3694080114364624, "step": 950 }, { "epoch": 0.23032629558541268, "grad_norm": 8.866753051371537, "learning_rate": 4.7458927474987454e-07, "logits/chosen": -0.4371200501918793, "logits/rejected": -0.43364763259887695, "logps/chosen": -409.8465881347656, "logps/rejected": -384.90234375, "loss": 0.5325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7246421575546265, "rewards/margins": 0.3903957009315491, "rewards/rejected": -1.1150379180908203, "step": 960 }, { "epoch": 0.23272552783109404, "grad_norm": 7.704129412482537, "learning_rate": 4.7366165741699347e-07, "logits/chosen": -0.5143966674804688, "logits/rejected": -0.5406373143196106, "logps/chosen": -423.09014892578125, "logps/rejected": -443.0372619628906, "loss": 0.557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8278281092643738, "rewards/margins": 0.45336437225341797, "rewards/rejected": -1.2811925411224365, "step": 970 }, { "epoch": 0.23512476007677544, "grad_norm": 7.64309056534733, "learning_rate": 4.727183511074401e-07, "logits/chosen": -0.5884715914726257, "logits/rejected": -0.5859401822090149, "logps/chosen": -379.3627624511719, "logps/rejected": -392.21710205078125, "loss": 0.5616, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.920864462852478, "rewards/margins": 0.23082181811332703, "rewards/rejected": -1.151686191558838, "step": 980 }, { "epoch": 0.2375239923224568, "grad_norm": 8.955148603646292, "learning_rate": 4.717594219904043e-07, "logits/chosen": -0.5043666958808899, "logits/rejected": -0.5165797472000122, "logps/chosen": -397.373046875, "logps/rejected": -394.7057189941406, "loss": 0.5553, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1149301528930664, "rewards/margins": 0.4297923147678375, "rewards/rejected": -1.544722318649292, "step": 990 }, { "epoch": 0.2399232245681382, "grad_norm": 9.68741680379089, "learning_rate": 4.7078493733095393e-07, "logits/chosen": -0.5751169919967651, "logits/rejected": -0.5922902822494507, "logps/chosen": -351.0492858886719, "logps/rejected": -421.7478942871094, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": -0.8745172619819641, "rewards/margins": 0.5707848072052002, "rewards/rejected": -1.445302128791809, "step": 1000 }, { "epoch": 0.2423224568138196, "grad_norm": 7.552967651026979, "learning_rate": 4.6979496548531614e-07, "logits/chosen": -0.5039399862289429, "logits/rejected": -0.49812453985214233, "logps/chosen": -357.84130859375, "logps/rejected": -476.9107360839844, "loss": 0.5415, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8784712553024292, "rewards/margins": 0.6386504173278809, "rewards/rejected": -1.5171215534210205, "step": 1010 }, { "epoch": 0.24472168905950095, "grad_norm": 7.847866875926576, "learning_rate": 4.6878957589608293e-07, "logits/chosen": -0.5514234304428101, "logits/rejected": -0.5487276911735535, "logps/chosen": -358.778076171875, "logps/rejected": -490.60150146484375, "loss": 0.5386, "rewards/accuracies": 0.75, "rewards/chosen": -0.7840293645858765, "rewards/margins": 0.8377860188484192, "rewards/rejected": -1.6218153238296509, "step": 1020 }, { "epoch": 0.24712092130518235, "grad_norm": 7.46939724134765, "learning_rate": 4.6776883908733956e-07, "logits/chosen": -0.5816242098808289, "logits/rejected": -0.5803698897361755, "logps/chosen": -391.3583679199219, "logps/rejected": -390.5106506347656, "loss": 0.5283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8500774502754211, "rewards/margins": 0.6333375573158264, "rewards/rejected": -1.4834150075912476, "step": 1030 }, { "epoch": 0.2495201535508637, "grad_norm": 9.499290159289549, "learning_rate": 4.667328266597178e-07, "logits/chosen": -0.5638601779937744, "logits/rejected": -0.5747939348220825, "logps/chosen": -380.7355651855469, "logps/rejected": -426.9833984375, "loss": 0.5105, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9775155782699585, "rewards/margins": 0.575149416923523, "rewards/rejected": -1.5526649951934814, "step": 1040 }, { "epoch": 0.2519193857965451, "grad_norm": 7.863264950292173, "learning_rate": 4.6568161128537354e-07, "logits/chosen": -0.5107079148292542, "logits/rejected": -0.5302263498306274, "logps/chosen": -362.51165771484375, "logps/rejected": -357.8714294433594, "loss": 0.5418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9620344042778015, "rewards/margins": 0.46561723947525024, "rewards/rejected": -1.4276517629623413, "step": 1050 }, { "epoch": 0.2543186180422265, "grad_norm": 9.95847080901471, "learning_rate": 4.6461526670288877e-07, "logits/chosen": -0.5152772068977356, "logits/rejected": -0.5086151957511902, "logps/chosen": -378.06707763671875, "logps/rejected": -412.7275390625, "loss": 0.5773, "rewards/accuracies": 0.75, "rewards/chosen": -0.7655351758003235, "rewards/margins": 0.6718277335166931, "rewards/rejected": -1.4373629093170166, "step": 1060 }, { "epoch": 0.2567178502879079, "grad_norm": 6.795589468002909, "learning_rate": 4.635338677120994e-07, "logits/chosen": -0.6026760935783386, "logits/rejected": -0.605171799659729, "logps/chosen": -363.8572998046875, "logps/rejected": -459.39764404296875, "loss": 0.5105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9281944036483765, "rewards/margins": 0.7557355761528015, "rewards/rejected": -1.6839300394058228, "step": 1070 }, { "epoch": 0.2591170825335892, "grad_norm": 8.768106225235062, "learning_rate": 4.6243749016884835e-07, "logits/chosen": -0.4381980001926422, "logits/rejected": -0.4824441969394684, "logps/chosen": -396.9549255371094, "logps/rejected": -550.8186645507812, "loss": 0.5355, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1090790033340454, "rewards/margins": 0.9858474731445312, "rewards/rejected": -2.094926357269287, "step": 1080 }, { "epoch": 0.2615163147792706, "grad_norm": 12.341914291404834, "learning_rate": 4.613262109796645e-07, "logits/chosen": -0.5834716558456421, "logits/rejected": -0.5614360570907593, "logps/chosen": -384.7580261230469, "logps/rejected": -524.4951171875, "loss": 0.5445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0333119630813599, "rewards/margins": 0.9544838666915894, "rewards/rejected": -1.9877955913543701, "step": 1090 }, { "epoch": 0.263915547024952, "grad_norm": 7.9475026844690975, "learning_rate": 4.602001080963678e-07, "logits/chosen": -0.5507039427757263, "logits/rejected": -0.5546278953552246, "logps/chosen": -414.7740783691406, "logps/rejected": -445.2301330566406, "loss": 0.5363, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1491403579711914, "rewards/margins": 0.5966090559959412, "rewards/rejected": -1.7457494735717773, "step": 1100 }, { "epoch": 0.2663147792706334, "grad_norm": 8.769481722876568, "learning_rate": 4.590592605106017e-07, "logits/chosen": -0.6439992189407349, "logits/rejected": -0.650866687297821, "logps/chosen": -414.07147216796875, "logps/rejected": -447.4894104003906, "loss": 0.5726, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8819205164909363, "rewards/margins": 0.6305669546127319, "rewards/rejected": -1.5124876499176025, "step": 1110 }, { "epoch": 0.2687140115163148, "grad_norm": 8.351140167515311, "learning_rate": 4.5790374824829165e-07, "logits/chosen": -0.5511302947998047, "logits/rejected": -0.5896275639533997, "logps/chosen": -292.4022216796875, "logps/rejected": -368.3529052734375, "loss": 0.5342, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9656060934066772, "rewards/margins": 0.5942641496658325, "rewards/rejected": -1.5598702430725098, "step": 1120 }, { "epoch": 0.27111324376199614, "grad_norm": 8.859939366313267, "learning_rate": 4.5673365236403216e-07, "logits/chosen": -0.6408380270004272, "logits/rejected": -0.6961749792098999, "logps/chosen": -286.7148132324219, "logps/rejected": -403.5390625, "loss": 0.521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8016033172607422, "rewards/margins": 0.8615763783454895, "rewards/rejected": -1.6631797552108765, "step": 1130 }, { "epoch": 0.27351247600767753, "grad_norm": 8.034697339520084, "learning_rate": 4.5554905493540075e-07, "logits/chosen": -0.6324433088302612, "logits/rejected": -0.6281362175941467, "logps/chosen": -320.374755859375, "logps/rejected": -417.0606994628906, "loss": 0.5135, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8844473958015442, "rewards/margins": 0.9001449346542358, "rewards/rejected": -1.7845920324325562, "step": 1140 }, { "epoch": 0.2759117082533589, "grad_norm": 8.194750667905803, "learning_rate": 4.5435003905720074e-07, "logits/chosen": -0.5551676750183105, "logits/rejected": -0.5857855081558228, "logps/chosen": -384.4462585449219, "logps/rejected": -426.1817321777344, "loss": 0.5131, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8710935711860657, "rewards/margins": 0.703058660030365, "rewards/rejected": -1.5741522312164307, "step": 1150 }, { "epoch": 0.2783109404990403, "grad_norm": 9.865565713899901, "learning_rate": 4.531366888356324e-07, "logits/chosen": -0.6332504153251648, "logits/rejected": -0.6072624325752258, "logps/chosen": -294.1646423339844, "logps/rejected": -434.31024169921875, "loss": 0.5061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0756479501724243, "rewards/margins": 0.913143515586853, "rewards/rejected": -1.9887917041778564, "step": 1160 }, { "epoch": 0.2807101727447217, "grad_norm": 10.884390589564477, "learning_rate": 4.519090893823931e-07, "logits/chosen": -0.6246575117111206, "logits/rejected": -0.6475778818130493, "logps/chosen": -372.89215087890625, "logps/rejected": -443.2525329589844, "loss": 0.5261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2081364393234253, "rewards/margins": 0.7674375772476196, "rewards/rejected": -1.9755741357803345, "step": 1170 }, { "epoch": 0.28310940499040305, "grad_norm": 7.778148160208306, "learning_rate": 4.5066732680870734e-07, "logits/chosen": -0.568785548210144, "logits/rejected": -0.608304500579834, "logps/chosen": -348.26702880859375, "logps/rejected": -393.5550231933594, "loss": 0.5014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8094667196273804, "rewards/margins": 0.8615515828132629, "rewards/rejected": -1.671018362045288, "step": 1180 }, { "epoch": 0.28550863723608444, "grad_norm": 8.167033695573148, "learning_rate": 4.494114882192862e-07, "logits/chosen": -0.659604012966156, "logits/rejected": -0.6383468508720398, "logps/chosen": -355.82061767578125, "logps/rejected": -419.57440185546875, "loss": 0.4999, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7848014831542969, "rewards/margins": 0.9513505697250366, "rewards/rejected": -1.7361520528793335, "step": 1190 }, { "epoch": 0.28790786948176583, "grad_norm": 8.531778053136817, "learning_rate": 4.4814166170621735e-07, "logits/chosen": -0.6497922539710999, "logits/rejected": -0.6769246459007263, "logps/chosen": -341.2376708984375, "logps/rejected": -416.5484924316406, "loss": 0.5219, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8200058937072754, "rewards/margins": 0.9894447326660156, "rewards/rejected": -1.8094505071640015, "step": 1200 }, { "epoch": 0.2903071017274472, "grad_norm": 9.115422018620432, "learning_rate": 4.468579363427858e-07, "logits/chosen": -0.6323500871658325, "logits/rejected": -0.6400243043899536, "logps/chosen": -378.551025390625, "logps/rejected": -431.50628662109375, "loss": 0.5342, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1116712093353271, "rewards/margins": 0.7979139089584351, "rewards/rejected": -1.9095849990844727, "step": 1210 }, { "epoch": 0.2927063339731286, "grad_norm": 10.369810004009349, "learning_rate": 4.4556040217722555e-07, "logits/chosen": -0.7057371735572815, "logits/rejected": -0.6883876919746399, "logps/chosen": -335.6352233886719, "logps/rejected": -475.59588623046875, "loss": 0.5004, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9118591547012329, "rewards/margins": 0.989273190498352, "rewards/rejected": -1.9011322259902954, "step": 1220 }, { "epoch": 0.29510556621880996, "grad_norm": 8.78559151507744, "learning_rate": 4.442491502264033e-07, "logits/chosen": -0.6325684785842896, "logits/rejected": -0.6488875150680542, "logps/chosen": -321.8913879394531, "logps/rejected": -360.14776611328125, "loss": 0.5278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9378086924552917, "rewards/margins": 0.4965507388114929, "rewards/rejected": -1.4343595504760742, "step": 1230 }, { "epoch": 0.29750479846449135, "grad_norm": 7.996922099069886, "learning_rate": 4.429242724694338e-07, "logits/chosen": -0.7028544545173645, "logits/rejected": -0.6892791390419006, "logps/chosen": -346.9255676269531, "logps/rejected": -447.5694274902344, "loss": 0.5193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8944141268730164, "rewards/margins": 0.8071637153625488, "rewards/rejected": -1.7015777826309204, "step": 1240 }, { "epoch": 0.29990403071017274, "grad_norm": 8.834617984631441, "learning_rate": 4.4158586184122817e-07, "logits/chosen": -0.6356642246246338, "logits/rejected": -0.6852391362190247, "logps/chosen": -391.93524169921875, "logps/rejected": -455.7276306152344, "loss": 0.4991, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8954633474349976, "rewards/margins": 0.9801417589187622, "rewards/rejected": -1.8756049871444702, "step": 1250 }, { "epoch": 0.30230326295585414, "grad_norm": 10.623650311123297, "learning_rate": 4.4023401222597443e-07, "logits/chosen": -0.6282129287719727, "logits/rejected": -0.7119131088256836, "logps/chosen": -408.7945251464844, "logps/rejected": -447.96533203125, "loss": 0.5039, "rewards/accuracies": 0.75, "rewards/chosen": -1.1023727655410767, "rewards/margins": 0.621015191078186, "rewards/rejected": -1.7233880758285522, "step": 1260 }, { "epoch": 0.30470249520153553, "grad_norm": 9.541014472218308, "learning_rate": 4.3886881845055235e-07, "logits/chosen": -0.6741599440574646, "logits/rejected": -0.7359042167663574, "logps/chosen": -343.3130187988281, "logps/rejected": -435.29681396484375, "loss": 0.5013, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8718856573104858, "rewards/margins": 1.0113645792007446, "rewards/rejected": -1.8832504749298096, "step": 1270 }, { "epoch": 0.30710172744721687, "grad_norm": 9.400894165088678, "learning_rate": 4.374903762778814e-07, "logits/chosen": -0.7107304334640503, "logits/rejected": -0.7214982509613037, "logps/chosen": -361.5565185546875, "logps/rejected": -422.3573303222656, "loss": 0.4996, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9224702715873718, "rewards/margins": 0.8412786722183228, "rewards/rejected": -1.7637488842010498, "step": 1280 }, { "epoch": 0.30950095969289826, "grad_norm": 8.935621537238024, "learning_rate": 4.3609878240020356e-07, "logits/chosen": -0.6704460978507996, "logits/rejected": -0.7258785963058472, "logps/chosen": -425.38690185546875, "logps/rejected": -457.9751892089844, "loss": 0.4947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0847482681274414, "rewards/margins": 0.9454424977302551, "rewards/rejected": -2.0301907062530518, "step": 1290 }, { "epoch": 0.31190019193857965, "grad_norm": 9.26956553898586, "learning_rate": 4.346941344323005e-07, "logits/chosen": -0.7358589768409729, "logits/rejected": -0.8038908243179321, "logps/chosen": -376.4468688964844, "logps/rejected": -392.2120361328125, "loss": 0.5534, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.231223464012146, "rewards/margins": 0.6490092277526855, "rewards/rejected": -1.880232572555542, "step": 1300 }, { "epoch": 0.31429942418426104, "grad_norm": 8.919630982293898, "learning_rate": 4.332765309046467e-07, "logits/chosen": -0.6560064554214478, "logits/rejected": -0.6517816781997681, "logps/chosen": -403.7508239746094, "logps/rejected": -460.4183654785156, "loss": 0.5389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1548296213150024, "rewards/margins": 0.9737479090690613, "rewards/rejected": -2.128577470779419, "step": 1310 }, { "epoch": 0.31669865642994244, "grad_norm": 10.42362251711121, "learning_rate": 4.3184607125649754e-07, "logits/chosen": -0.700032114982605, "logits/rejected": -0.7237606048583984, "logps/chosen": -376.8808288574219, "logps/rejected": -488.0193786621094, "loss": 0.5239, "rewards/accuracies": 0.75, "rewards/chosen": -0.9241411089897156, "rewards/margins": 0.9761545062065125, "rewards/rejected": -1.900295615196228, "step": 1320 }, { "epoch": 0.3190978886756238, "grad_norm": 8.823907511461925, "learning_rate": 4.304028558289141e-07, "logits/chosen": -0.7183485627174377, "logits/rejected": -0.7400873899459839, "logps/chosen": -375.0442810058594, "logps/rejected": -453.552978515625, "loss": 0.4986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7883042693138123, "rewards/margins": 1.0131797790527344, "rewards/rejected": -1.8014838695526123, "step": 1330 }, { "epoch": 0.32149712092130517, "grad_norm": 9.056325230872618, "learning_rate": 4.28946985857725e-07, "logits/chosen": -0.7189252972602844, "logits/rejected": -0.698843777179718, "logps/chosen": -391.2516174316406, "logps/rejected": -495.9942932128906, "loss": 0.5001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0153273344039917, "rewards/margins": 1.1640372276306152, "rewards/rejected": -2.1793646812438965, "step": 1340 }, { "epoch": 0.32389635316698656, "grad_norm": 9.219431888564412, "learning_rate": 4.2747856346642445e-07, "logits/chosen": -0.720720648765564, "logits/rejected": -0.7227288484573364, "logps/chosen": -323.8959045410156, "logps/rejected": -411.42633056640625, "loss": 0.4889, "rewards/accuracies": 0.875, "rewards/chosen": -0.8822928667068481, "rewards/margins": 0.907570481300354, "rewards/rejected": -1.7898629903793335, "step": 1350 }, { "epoch": 0.32629558541266795, "grad_norm": 9.219391981411931, "learning_rate": 4.2599769165900933e-07, "logits/chosen": -0.7076966166496277, "logits/rejected": -0.7374303340911865, "logps/chosen": -400.1958923339844, "logps/rejected": -457.45538330078125, "loss": 0.5265, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.270545482635498, "rewards/margins": 0.8403311967849731, "rewards/rejected": -2.1108765602111816, "step": 1360 }, { "epoch": 0.32869481765834935, "grad_norm": 7.961461255371797, "learning_rate": 4.245044743127535e-07, "logits/chosen": -0.8138734698295593, "logits/rejected": -0.8004827499389648, "logps/chosen": -375.55255126953125, "logps/rejected": -462.57452392578125, "loss": 0.51, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1249182224273682, "rewards/margins": 0.7031105160713196, "rewards/rejected": -1.828028678894043, "step": 1370 }, { "epoch": 0.3310940499040307, "grad_norm": 17.415326900164427, "learning_rate": 4.229990161709214e-07, "logits/chosen": -0.7364221811294556, "logits/rejected": -0.6838979721069336, "logps/chosen": -354.063720703125, "logps/rejected": -506.49468994140625, "loss": 0.533, "rewards/accuracies": 0.875, "rewards/chosen": -1.1404192447662354, "rewards/margins": 1.2287505865097046, "rewards/rejected": -2.3691699504852295, "step": 1380 }, { "epoch": 0.3334932821497121, "grad_norm": 8.490137472524392, "learning_rate": 4.214814228354204e-07, "logits/chosen": -0.7031981348991394, "logits/rejected": -0.7180779576301575, "logps/chosen": -381.91839599609375, "logps/rejected": -509.70391845703125, "loss": 0.4942, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9766290783882141, "rewards/margins": 1.4473177194595337, "rewards/rejected": -2.4239466190338135, "step": 1390 }, { "epoch": 0.33589251439539347, "grad_norm": 9.025625908308374, "learning_rate": 4.1995180075939375e-07, "logits/chosen": -0.7361186742782593, "logits/rejected": -0.7329140901565552, "logps/chosen": -412.9568786621094, "logps/rejected": -463.9566955566406, "loss": 0.4945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0486990213394165, "rewards/margins": 0.7866090536117554, "rewards/rejected": -1.8353080749511719, "step": 1400 }, { "epoch": 0.33829174664107486, "grad_norm": 9.968153480159424, "learning_rate": 4.1841025723975297e-07, "logits/chosen": -0.685051441192627, "logits/rejected": -0.6909801959991455, "logps/chosen": -381.12054443359375, "logps/rejected": -472.12628173828125, "loss": 0.4854, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7610518932342529, "rewards/margins": 1.0710885524749756, "rewards/rejected": -1.832140326499939, "step": 1410 }, { "epoch": 0.34069097888675626, "grad_norm": 10.86480858776822, "learning_rate": 4.168569004096516e-07, "logits/chosen": -0.6658666133880615, "logits/rejected": -0.6583417654037476, "logps/chosen": -361.16693115234375, "logps/rejected": -498.03997802734375, "loss": 0.4812, "rewards/accuracies": 0.75, "rewards/chosen": -1.107450246810913, "rewards/margins": 1.1987017393112183, "rewards/rejected": -2.306152105331421, "step": 1420 }, { "epoch": 0.3430902111324376, "grad_norm": 9.37088980367882, "learning_rate": 4.152918392308997e-07, "logits/chosen": -0.8107253313064575, "logits/rejected": -0.792646050453186, "logps/chosen": -417.71197509765625, "logps/rejected": -471.83721923828125, "loss": 0.4813, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6705278158187866, "rewards/margins": 0.6656574010848999, "rewards/rejected": -2.3361852169036865, "step": 1430 }, { "epoch": 0.345489443378119, "grad_norm": 13.27114348199083, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.7099133729934692, "logits/rejected": -0.6920545697212219, "logps/chosen": -407.37493896484375, "logps/rejected": -562.7499389648438, "loss": 0.5386, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6011251211166382, "rewards/margins": 1.1472370624542236, "rewards/rejected": -2.7483620643615723, "step": 1440 }, { "epoch": 0.3478886756238004, "grad_norm": 12.106940133149287, "learning_rate": 4.121270437720526e-07, "logits/chosen": -0.6640886068344116, "logits/rejected": -0.6298462748527527, "logps/chosen": -366.8916931152344, "logps/rejected": -493.5521545410156, "loss": 0.5162, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4960647821426392, "rewards/margins": 0.7681604623794556, "rewards/rejected": -2.2642252445220947, "step": 1450 }, { "epoch": 0.3502879078694818, "grad_norm": 8.795405985800107, "learning_rate": 4.105275314897852e-07, "logits/chosen": -0.6949892640113831, "logits/rejected": -0.7056195139884949, "logps/chosen": -351.6402282714844, "logps/rejected": -530.5101318359375, "loss": 0.5051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.323290467262268, "rewards/margins": 1.3428999185562134, "rewards/rejected": -2.6661901473999023, "step": 1460 }, { "epoch": 0.35268714011516317, "grad_norm": 9.62012729406584, "learning_rate": 4.089167588389508e-07, "logits/chosen": -0.6170503497123718, "logits/rejected": -0.6489865183830261, "logps/chosen": -479.70001220703125, "logps/rejected": -550.5726318359375, "loss": 0.4903, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2569714784622192, "rewards/margins": 1.206158995628357, "rewards/rejected": -2.463130235671997, "step": 1470 }, { "epoch": 0.3550863723608445, "grad_norm": 11.864875238047302, "learning_rate": 4.072948388088515e-07, "logits/chosen": -0.5827468037605286, "logits/rejected": -0.58921217918396, "logps/chosen": -419.8984375, "logps/rejected": -524.0806274414062, "loss": 0.5201, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3793574571609497, "rewards/margins": 1.0044056177139282, "rewards/rejected": -2.383762836456299, "step": 1480 }, { "epoch": 0.3574856046065259, "grad_norm": 10.119374435549082, "learning_rate": 4.056618851707334e-07, "logits/chosen": -0.6013773679733276, "logits/rejected": -0.6345557570457458, "logps/chosen": -384.5671691894531, "logps/rejected": -508.9178771972656, "loss": 0.4721, "rewards/accuracies": 0.875, "rewards/chosen": -1.068169355392456, "rewards/margins": 1.1706373691558838, "rewards/rejected": -2.238806962966919, "step": 1490 }, { "epoch": 0.3598848368522073, "grad_norm": 10.659792949010905, "learning_rate": 4.0401801246980675e-07, "logits/chosen": -0.7585668563842773, "logits/rejected": -0.7779415249824524, "logps/chosen": -384.1558532714844, "logps/rejected": -441.42010498046875, "loss": 0.5083, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.535270094871521, "rewards/margins": 0.7871710062026978, "rewards/rejected": -2.3224408626556396, "step": 1500 }, { "epoch": 0.3622840690978887, "grad_norm": 10.123601349785917, "learning_rate": 4.0236333601721043e-07, "logits/chosen": -0.6317464709281921, "logits/rejected": -0.6271511316299438, "logps/chosen": -450.973876953125, "logps/rejected": -524.8987426757812, "loss": 0.5378, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4312914609909058, "rewards/margins": 0.6362205147743225, "rewards/rejected": -2.067512035369873, "step": 1510 }, { "epoch": 0.3646833013435701, "grad_norm": 10.190915974376356, "learning_rate": 4.0069797188192364e-07, "logits/chosen": -0.6999167203903198, "logits/rejected": -0.6949875354766846, "logps/chosen": -410.099853515625, "logps/rejected": -510.3067321777344, "loss": 0.4944, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1585137844085693, "rewards/margins": 1.2204173803329468, "rewards/rejected": -2.3789315223693848, "step": 1520 }, { "epoch": 0.3670825335892514, "grad_norm": 10.837356546075833, "learning_rate": 3.9902203688262417e-07, "logits/chosen": -0.6491087675094604, "logits/rejected": -0.681550145149231, "logps/chosen": -389.38946533203125, "logps/rejected": -480.93701171875, "loss": 0.4764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0993043184280396, "rewards/margins": 1.0439014434814453, "rewards/rejected": -2.1432056427001953, "step": 1530 }, { "epoch": 0.3694817658349328, "grad_norm": 9.953869181979533, "learning_rate": 3.9733564857949365e-07, "logits/chosen": -0.637142539024353, "logits/rejected": -0.6458380222320557, "logps/chosen": -504.4761657714844, "logps/rejected": -539.9736938476562, "loss": 0.502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6645132303237915, "rewards/margins": 0.8288125991821289, "rewards/rejected": -2.493325710296631, "step": 1540 }, { "epoch": 0.3718809980806142, "grad_norm": 11.690441286702658, "learning_rate": 3.9563892526597177e-07, "logits/chosen": -0.6920310258865356, "logits/rejected": -0.6713690161705017, "logps/chosen": -376.0501708984375, "logps/rejected": -492.35382080078125, "loss": 0.4707, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3901115655899048, "rewards/margins": 0.6692919731140137, "rewards/rejected": -2.059403419494629, "step": 1550 }, { "epoch": 0.3742802303262956, "grad_norm": 9.497977320733629, "learning_rate": 3.9393198596045795e-07, "logits/chosen": -0.7644148468971252, "logits/rejected": -0.7483991384506226, "logps/chosen": -376.76885986328125, "logps/rejected": -497.29595947265625, "loss": 0.543, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3605177402496338, "rewards/margins": 0.9189019203186035, "rewards/rejected": -2.279419422149658, "step": 1560 }, { "epoch": 0.376679462571977, "grad_norm": 7.866666448729003, "learning_rate": 3.922149503979628e-07, "logits/chosen": -0.6804630160331726, "logits/rejected": -0.7361734509468079, "logps/chosen": -405.5047912597656, "logps/rejected": -583.3118286132812, "loss": 0.4749, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.131927728652954, "rewards/margins": 1.7377452850341797, "rewards/rejected": -2.869673013687134, "step": 1570 }, { "epoch": 0.3790786948176583, "grad_norm": 9.850531494942707, "learning_rate": 3.904879390217095e-07, "logits/chosen": -0.8319008946418762, "logits/rejected": -0.8503821492195129, "logps/chosen": -379.04827880859375, "logps/rejected": -460.70184326171875, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1876747608184814, "rewards/margins": 0.9654358625411987, "rewards/rejected": -2.1531107425689697, "step": 1580 }, { "epoch": 0.3814779270633397, "grad_norm": 9.904464624440974, "learning_rate": 3.8875107297468463e-07, "logits/chosen": -0.7686917781829834, "logits/rejected": -0.7551219463348389, "logps/chosen": -396.49847412109375, "logps/rejected": -589.2566528320312, "loss": 0.5065, "rewards/accuracies": 0.875, "rewards/chosen": -1.386002540588379, "rewards/margins": 1.4360870122909546, "rewards/rejected": -2.822089195251465, "step": 1590 }, { "epoch": 0.3838771593090211, "grad_norm": 9.880008920909678, "learning_rate": 3.87004474091141e-07, "logits/chosen": -0.621880829334259, "logits/rejected": -0.6349480152130127, "logps/chosen": -381.6803894042969, "logps/rejected": -495.33538818359375, "loss": 0.5038, "rewards/accuracies": 0.75, "rewards/chosen": -1.3982946872711182, "rewards/margins": 0.9522615671157837, "rewards/rejected": -2.3505563735961914, "step": 1600 }, { "epoch": 0.3862763915547025, "grad_norm": 9.893501375511553, "learning_rate": 3.8524826488805114e-07, "logits/chosen": -0.7854813933372498, "logits/rejected": -0.7525703310966492, "logps/chosen": -448.2474670410156, "logps/rejected": -512.2816162109375, "loss": 0.5077, "rewards/accuracies": 0.75, "rewards/chosen": -1.513240098953247, "rewards/margins": 1.0552384853363037, "rewards/rejected": -2.5684781074523926, "step": 1610 }, { "epoch": 0.3886756238003839, "grad_norm": 11.71127933022176, "learning_rate": 3.834825685565133e-07, "logits/chosen": -0.7181990146636963, "logits/rejected": -0.7670043706893921, "logps/chosen": -360.92669677734375, "logps/rejected": -421.00372314453125, "loss": 0.4668, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1134910583496094, "rewards/margins": 0.9747357368469238, "rewards/rejected": -2.088226795196533, "step": 1620 }, { "epoch": 0.39107485604606523, "grad_norm": 10.523368241496188, "learning_rate": 3.8170750895311007e-07, "logits/chosen": -0.7231374979019165, "logits/rejected": -0.7205518484115601, "logps/chosen": -411.99884033203125, "logps/rejected": -491.25201416015625, "loss": 0.4704, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.136505126953125, "rewards/margins": 1.0490738153457642, "rewards/rejected": -2.185579299926758, "step": 1630 }, { "epoch": 0.3934740882917466, "grad_norm": 9.516920015983152, "learning_rate": 3.7992321059122045e-07, "logits/chosen": -0.6618058681488037, "logits/rejected": -0.7061210870742798, "logps/chosen": -389.2023010253906, "logps/rejected": -476.97998046875, "loss": 0.5002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4337702989578247, "rewards/margins": 1.021848201751709, "rewards/rejected": -2.455618381500244, "step": 1640 }, { "epoch": 0.395873320537428, "grad_norm": 9.385395418767361, "learning_rate": 3.7812979863228576e-07, "logits/chosen": -0.7980898022651672, "logits/rejected": -0.8222333788871765, "logps/chosen": -364.509521484375, "logps/rejected": -491.23175048828125, "loss": 0.4547, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4786386489868164, "rewards/margins": 1.1454684734344482, "rewards/rejected": -2.6241071224212646, "step": 1650 }, { "epoch": 0.3982725527831094, "grad_norm": 10.324873859774062, "learning_rate": 3.763273988770296e-07, "logits/chosen": -0.6266960501670837, "logits/rejected": -0.6783492565155029, "logps/chosen": -393.72125244140625, "logps/rejected": -528.2474365234375, "loss": 0.4626, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2447704076766968, "rewards/margins": 1.3596256971359253, "rewards/rejected": -2.604396104812622, "step": 1660 }, { "epoch": 0.4006717850287908, "grad_norm": 11.992709858901152, "learning_rate": 3.7451613775663405e-07, "logits/chosen": -0.7533406615257263, "logits/rejected": -0.7324401140213013, "logps/chosen": -424.1239318847656, "logps/rejected": -602.9987182617188, "loss": 0.5282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7245795726776123, "rewards/margins": 1.6655222177505493, "rewards/rejected": -3.390101671218872, "step": 1670 }, { "epoch": 0.40307101727447214, "grad_norm": 10.879572547394886, "learning_rate": 3.726961423238706e-07, "logits/chosen": -0.7729811668395996, "logits/rejected": -0.7984837293624878, "logps/chosen": -386.8953552246094, "logps/rejected": -555.2294311523438, "loss": 0.4942, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5044174194335938, "rewards/margins": 1.4247596263885498, "rewards/rejected": -2.9291768074035645, "step": 1680 }, { "epoch": 0.40547024952015354, "grad_norm": 11.218700739770185, "learning_rate": 3.708675402441882e-07, "logits/chosen": -0.6574599742889404, "logits/rejected": -0.6961637139320374, "logps/chosen": -434.22650146484375, "logps/rejected": -494.91607666015625, "loss": 0.5091, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3373010158538818, "rewards/margins": 0.9446905255317688, "rewards/rejected": -2.281991481781006, "step": 1690 }, { "epoch": 0.40786948176583493, "grad_norm": 8.667993544637785, "learning_rate": 3.6903045978675775e-07, "logits/chosen": -0.7062468528747559, "logits/rejected": -0.7464607954025269, "logps/chosen": -384.9060363769531, "logps/rejected": -543.77685546875, "loss": 0.4989, "rewards/accuracies": 0.75, "rewards/chosen": -1.4272969961166382, "rewards/margins": 1.7380192279815674, "rewards/rejected": -3.165316343307495, "step": 1700 }, { "epoch": 0.4102687140115163, "grad_norm": 8.468965922840697, "learning_rate": 3.6718502981547474e-07, "logits/chosen": -0.7246867418289185, "logits/rejected": -0.7426190376281738, "logps/chosen": -419.00439453125, "logps/rejected": -550.689453125, "loss": 0.5076, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4041268825531006, "rewards/margins": 0.8987758755683899, "rewards/rejected": -2.3029026985168457, "step": 1710 }, { "epoch": 0.4126679462571977, "grad_norm": 9.099426609662448, "learning_rate": 3.6533137977991986e-07, "logits/chosen": -0.7021734118461609, "logits/rejected": -0.7220349311828613, "logps/chosen": -433.3681640625, "logps/rejected": -527.8214111328125, "loss": 0.5319, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4180127382278442, "rewards/margins": 0.7351676225662231, "rewards/rejected": -2.1531801223754883, "step": 1720 }, { "epoch": 0.41506717850287905, "grad_norm": 8.435202779591016, "learning_rate": 3.6346963970627865e-07, "logits/chosen": -0.639062762260437, "logits/rejected": -0.6104099154472351, "logps/chosen": -357.98175048828125, "logps/rejected": -512.7109375, "loss": 0.4567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9540430307388306, "rewards/margins": 1.3174707889556885, "rewards/rejected": -2.2715137004852295, "step": 1730 }, { "epoch": 0.41746641074856045, "grad_norm": 11.561877784247002, "learning_rate": 3.615999401882207e-07, "logits/chosen": -0.7886170148849487, "logits/rejected": -0.7725807428359985, "logps/chosen": -376.1795959472656, "logps/rejected": -529.3326416015625, "loss": 0.484, "rewards/accuracies": 0.75, "rewards/chosen": -1.531253457069397, "rewards/margins": 1.2733750343322754, "rewards/rejected": -2.804628610610962, "step": 1740 }, { "epoch": 0.41986564299424184, "grad_norm": 10.686288534465577, "learning_rate": 3.597224123777389e-07, "logits/chosen": -0.6760295629501343, "logits/rejected": -0.6590694785118103, "logps/chosen": -399.1770324707031, "logps/rejected": -554.7936401367188, "loss": 0.4812, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3701995611190796, "rewards/margins": 1.3539022207260132, "rewards/rejected": -2.7241015434265137, "step": 1750 }, { "epoch": 0.42226487523992323, "grad_norm": 9.333815894429002, "learning_rate": 3.5783718797595e-07, "logits/chosen": -0.759990394115448, "logits/rejected": -0.777604877948761, "logps/chosen": -457.8511657714844, "logps/rejected": -525.8488159179688, "loss": 0.5006, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.607553482055664, "rewards/margins": 1.082852840423584, "rewards/rejected": -2.690406322479248, "step": 1760 }, { "epoch": 0.4246641074856046, "grad_norm": 10.014756401369624, "learning_rate": 3.559443992238558e-07, "logits/chosen": -0.7365792393684387, "logits/rejected": -0.7805954217910767, "logps/chosen": -389.59808349609375, "logps/rejected": -584.1541137695312, "loss": 0.5067, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.296053171157837, "rewards/margins": 1.6537158489227295, "rewards/rejected": -2.9497690200805664, "step": 1770 }, { "epoch": 0.42706333973128596, "grad_norm": 10.070273415353826, "learning_rate": 3.540441788930673e-07, "logits/chosen": -0.6368024945259094, "logits/rejected": -0.6715587377548218, "logps/chosen": -434.8896484375, "logps/rejected": -555.1212158203125, "loss": 0.4727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2840174436569214, "rewards/margins": 1.5494660139083862, "rewards/rejected": -2.833483934402466, "step": 1780 }, { "epoch": 0.42946257197696736, "grad_norm": 9.237418058304717, "learning_rate": 3.5213666027649123e-07, "logits/chosen": -0.7330187559127808, "logits/rejected": -0.7538883686065674, "logps/chosen": -488.0216369628906, "logps/rejected": -523.1275024414062, "loss": 0.4906, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0400612354278564, "rewards/margins": 0.7529541850090027, "rewards/rejected": -2.793015241622925, "step": 1790 }, { "epoch": 0.43186180422264875, "grad_norm": 10.85433893317022, "learning_rate": 3.5022197717898017e-07, "logits/chosen": -0.7300796508789062, "logits/rejected": -0.7841044664382935, "logps/chosen": -403.67730712890625, "logps/rejected": -509.16558837890625, "loss": 0.4465, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7776187658309937, "rewards/margins": 1.2936731576919556, "rewards/rejected": -3.07129168510437, "step": 1800 }, { "epoch": 0.43426103646833014, "grad_norm": 9.714614186627546, "learning_rate": 3.4830026390794633e-07, "logits/chosen": -0.7365170121192932, "logits/rejected": -0.7708272337913513, "logps/chosen": -494.6708984375, "logps/rejected": -567.5706176757812, "loss": 0.4518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.851345419883728, "rewards/margins": 1.2546782493591309, "rewards/rejected": -3.1060233116149902, "step": 1810 }, { "epoch": 0.43666026871401153, "grad_norm": 7.553890272409832, "learning_rate": 3.4637165526394104e-07, "logits/chosen": -0.7593089938163757, "logits/rejected": -0.7586512565612793, "logps/chosen": -378.30596923828125, "logps/rejected": -479.9606018066406, "loss": 0.4896, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3854007720947266, "rewards/margins": 0.8888666033744812, "rewards/rejected": -2.2742676734924316, "step": 1820 }, { "epoch": 0.43905950095969287, "grad_norm": 8.674418910529546, "learning_rate": 3.4443628653119814e-07, "logits/chosen": -0.6358439922332764, "logits/rejected": -0.6491922736167908, "logps/chosen": -425.07568359375, "logps/rejected": -638.0093994140625, "loss": 0.4962, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3985021114349365, "rewards/margins": 1.68317449092865, "rewards/rejected": -3.081676959991455, "step": 1830 }, { "epoch": 0.44145873320537427, "grad_norm": 9.311707071153172, "learning_rate": 3.424942934681453e-07, "logits/chosen": -0.7188653349876404, "logits/rejected": -0.7695431709289551, "logps/chosen": -372.1746520996094, "logps/rejected": -530.4584350585938, "loss": 0.4715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1345326900482178, "rewards/margins": 1.6814968585968018, "rewards/rejected": -2.8160295486450195, "step": 1840 }, { "epoch": 0.44385796545105566, "grad_norm": 12.609351544952487, "learning_rate": 3.405458122978804e-07, "logits/chosen": -0.7544587850570679, "logits/rejected": -0.7642985582351685, "logps/chosen": -424.3369140625, "logps/rejected": -506.177978515625, "loss": 0.4883, "rewards/accuracies": 0.75, "rewards/chosen": -1.232379674911499, "rewards/margins": 1.0791466236114502, "rewards/rejected": -2.3115265369415283, "step": 1850 }, { "epoch": 0.44625719769673705, "grad_norm": 10.750691343448636, "learning_rate": 3.3859097969861633e-07, "logits/chosen": -0.6986510157585144, "logits/rejected": -0.6798522472381592, "logps/chosen": -440.4366149902344, "logps/rejected": -503.4756774902344, "loss": 0.4636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.483278512954712, "rewards/margins": 1.0428184270858765, "rewards/rejected": -2.526096820831299, "step": 1860 }, { "epoch": 0.44865642994241844, "grad_norm": 10.209697273000886, "learning_rate": 3.366299327940936e-07, "logits/chosen": -0.7111358046531677, "logits/rejected": -0.6854827404022217, "logps/chosen": -484.577392578125, "logps/rejected": -609.7952880859375, "loss": 0.4982, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7832101583480835, "rewards/margins": 1.0032484531402588, "rewards/rejected": -2.7864584922790527, "step": 1870 }, { "epoch": 0.4510556621880998, "grad_norm": 10.739925690898465, "learning_rate": 3.3466280914396117e-07, "logits/chosen": -0.6824935078620911, "logits/rejected": -0.7022455334663391, "logps/chosen": -411.59375, "logps/rejected": -576.3400268554688, "loss": 0.4636, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5962189435958862, "rewards/margins": 1.4433557987213135, "rewards/rejected": -3.0395750999450684, "step": 1880 }, { "epoch": 0.4534548944337812, "grad_norm": 15.164146543864618, "learning_rate": 3.326897467341281e-07, "logits/chosen": -0.7167527079582214, "logits/rejected": -0.74461829662323, "logps/chosen": -349.6092529296875, "logps/rejected": -494.6625061035156, "loss": 0.4884, "rewards/accuracies": 0.75, "rewards/chosen": -1.3855880498886108, "rewards/margins": 1.2441637516021729, "rewards/rejected": -2.629751682281494, "step": 1890 }, { "epoch": 0.45585412667946257, "grad_norm": 12.377059840959534, "learning_rate": 3.3071088396708335e-07, "logits/chosen": -0.7990108132362366, "logits/rejected": -0.7646141052246094, "logps/chosen": -343.55450439453125, "logps/rejected": -517.2105712890625, "loss": 0.489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3536632061004639, "rewards/margins": 1.533830165863037, "rewards/rejected": -2.88749361038208, "step": 1900 }, { "epoch": 0.45825335892514396, "grad_norm": 11.613569882909587, "learning_rate": 3.2872635965218824e-07, "logits/chosen": -0.5556444525718689, "logits/rejected": -0.5901921391487122, "logps/chosen": -421.22119140625, "logps/rejected": -584.3763427734375, "loss": 0.5208, "rewards/accuracies": 0.75, "rewards/chosen": -1.6016429662704468, "rewards/margins": 1.2570686340332031, "rewards/rejected": -2.8587117195129395, "step": 1910 }, { "epoch": 0.46065259117082535, "grad_norm": 9.955391170965928, "learning_rate": 3.2673631299593905e-07, "logits/chosen": -0.658098578453064, "logits/rejected": -0.7359055876731873, "logps/chosen": -450.887451171875, "logps/rejected": -559.0867919921875, "loss": 0.4893, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6648181676864624, "rewards/margins": 1.1666862964630127, "rewards/rejected": -2.8315043449401855, "step": 1920 }, { "epoch": 0.4630518234165067, "grad_norm": 11.201011724113032, "learning_rate": 3.247408835922024e-07, "logits/chosen": -0.6952091455459595, "logits/rejected": -0.6913400292396545, "logps/chosen": -496.1107482910156, "logps/rejected": -632.0260009765625, "loss": 0.4992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7909389734268188, "rewards/margins": 1.2750978469848633, "rewards/rejected": -3.0660367012023926, "step": 1930 }, { "epoch": 0.4654510556621881, "grad_norm": 9.304186497298465, "learning_rate": 3.2274021141242306e-07, "logits/chosen": -0.6521833539009094, "logits/rejected": -0.6770762205123901, "logps/chosen": -436.94500732421875, "logps/rejected": -563.1138916015625, "loss": 0.452, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.661968469619751, "rewards/margins": 1.2037100791931152, "rewards/rejected": -2.865678548812866, "step": 1940 }, { "epoch": 0.4678502879078695, "grad_norm": 14.57039869779609, "learning_rate": 3.2073443679580613e-07, "logits/chosen": -0.710097074508667, "logits/rejected": -0.7277542352676392, "logps/chosen": -424.69091796875, "logps/rejected": -525.2821044921875, "loss": 0.4701, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3532383441925049, "rewards/margins": 0.9686284065246582, "rewards/rejected": -2.321866750717163, "step": 1950 }, { "epoch": 0.47024952015355087, "grad_norm": 8.6232802821884, "learning_rate": 3.1872370043947194e-07, "logits/chosen": -0.7797672152519226, "logits/rejected": -0.8235223889350891, "logps/chosen": -389.4978332519531, "logps/rejected": -578.5491333007812, "loss": 0.4749, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0760996341705322, "rewards/margins": 1.9451076984405518, "rewards/rejected": -3.021207809448242, "step": 1960 }, { "epoch": 0.47264875239923226, "grad_norm": 10.70725522123182, "learning_rate": 3.167081433885874e-07, "logits/chosen": -0.5467191338539124, "logits/rejected": -0.573945164680481, "logps/chosen": -495.3836975097656, "logps/rejected": -635.158203125, "loss": 0.44, "rewards/accuracies": 0.75, "rewards/chosen": -1.5290435552597046, "rewards/margins": 0.9863445162773132, "rewards/rejected": -2.515388011932373, "step": 1970 }, { "epoch": 0.4750479846449136, "grad_norm": 12.578146181704888, "learning_rate": 3.14687907026472e-07, "logits/chosen": -0.6268805265426636, "logits/rejected": -0.6679359674453735, "logps/chosen": -384.5243835449219, "logps/rejected": -525.4344482421875, "loss": 0.4628, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4070085287094116, "rewards/margins": 1.292812705039978, "rewards/rejected": -2.6998214721679688, "step": 1980 }, { "epoch": 0.477447216890595, "grad_norm": 9.948295850627304, "learning_rate": 3.126631330646801e-07, "logits/chosen": -0.635405421257019, "logits/rejected": -0.6675763726234436, "logps/chosen": -497.21466064453125, "logps/rejected": -574.4089965820312, "loss": 0.4961, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.751558542251587, "rewards/margins": 0.7816027402877808, "rewards/rejected": -2.533161163330078, "step": 1990 }, { "epoch": 0.4798464491362764, "grad_norm": 10.378623964749199, "learning_rate": 3.1063396353306097e-07, "logits/chosen": -0.698126494884491, "logits/rejected": -0.7467767000198364, "logps/chosen": -417.2310485839844, "logps/rejected": -496.03173828125, "loss": 0.4627, "rewards/accuracies": 0.75, "rewards/chosen": -1.2678186893463135, "rewards/margins": 1.3022050857543945, "rewards/rejected": -2.570024013519287, "step": 2000 }, { "epoch": 0.4798464491362764, "eval_logits/chosen": -0.704944372177124, "eval_logits/rejected": -0.7196417450904846, "eval_logps/chosen": -413.8262023925781, "eval_logps/rejected": -571.4524536132812, "eval_loss": 0.46414685249328613, "eval_rewards/accuracies": 0.8035714030265808, "eval_rewards/chosen": -1.4788715839385986, "eval_rewards/margins": 1.4836254119873047, "eval_rewards/rejected": -2.9624969959259033, "eval_runtime": 234.411, "eval_samples_per_second": 19.031, "eval_steps_per_second": 0.299, "step": 2000 }, { "epoch": 0.4822456813819578, "grad_norm": 10.645209258851928, "learning_rate": 3.0860054076979535e-07, "logits/chosen": -0.6905248761177063, "logits/rejected": -0.681174635887146, "logps/chosen": -447.33978271484375, "logps/rejected": -541.9505004882812, "loss": 0.4867, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6291488409042358, "rewards/margins": 1.1908628940582275, "rewards/rejected": -2.820011615753174, "step": 2010 }, { "epoch": 0.4846449136276392, "grad_norm": 12.91171406230696, "learning_rate": 3.065630074114115e-07, "logits/chosen": -0.7098181843757629, "logits/rejected": -0.7330686450004578, "logps/chosen": -465.2738342285156, "logps/rejected": -591.0724487304688, "loss": 0.4784, "rewards/accuracies": 0.75, "rewards/chosen": -1.6661344766616821, "rewards/margins": 1.6645488739013672, "rewards/rejected": -3.330683469772339, "step": 2020 }, { "epoch": 0.4870441458733205, "grad_norm": 11.6716276344079, "learning_rate": 3.0452150638277947e-07, "logits/chosen": -0.6548662185668945, "logits/rejected": -0.6267608404159546, "logps/chosen": -399.82501220703125, "logps/rejected": -510.04425048828125, "loss": 0.5167, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6882944107055664, "rewards/margins": 0.975196361541748, "rewards/rejected": -2.6634907722473145, "step": 2030 }, { "epoch": 0.4894433781190019, "grad_norm": 8.381022963011278, "learning_rate": 3.024761808870856e-07, "logits/chosen": -0.7615236043930054, "logits/rejected": -0.7732762098312378, "logps/chosen": -385.1295471191406, "logps/rejected": -582.0843505859375, "loss": 0.462, "rewards/accuracies": 0.875, "rewards/chosen": -1.4341049194335938, "rewards/margins": 1.9455454349517822, "rewards/rejected": -3.379650592803955, "step": 2040 }, { "epoch": 0.4918426103646833, "grad_norm": 14.700023421409464, "learning_rate": 3.004271743957875e-07, "logits/chosen": -0.6434902548789978, "logits/rejected": -0.6391478180885315, "logps/chosen": -473.94256591796875, "logps/rejected": -567.1292114257812, "loss": 0.5136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.904388666152954, "rewards/margins": 0.7521687746047974, "rewards/rejected": -2.656557559967041, "step": 2050 }, { "epoch": 0.4942418426103647, "grad_norm": 9.948082556212318, "learning_rate": 2.983746306385499e-07, "logits/chosen": -0.8040687441825867, "logits/rejected": -0.750956118106842, "logps/chosen": -404.974365234375, "logps/rejected": -571.2884521484375, "loss": 0.4606, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5249295234680176, "rewards/margins": 1.4364125728607178, "rewards/rejected": -2.9613418579101562, "step": 2060 }, { "epoch": 0.4966410748560461, "grad_norm": 10.22438537397434, "learning_rate": 2.963186935931628e-07, "logits/chosen": -0.7241095304489136, "logits/rejected": -0.6997084021568298, "logps/chosen": -448.89300537109375, "logps/rejected": -555.0241088867188, "loss": 0.4714, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5664496421813965, "rewards/margins": 1.0874204635620117, "rewards/rejected": -2.653870105743408, "step": 2070 }, { "epoch": 0.4990403071017274, "grad_norm": 8.109581248601415, "learning_rate": 2.9425950747544176e-07, "logits/chosen": -0.641141414642334, "logits/rejected": -0.7013910412788391, "logps/chosen": -510.6063537597656, "logps/rejected": -640.6654052734375, "loss": 0.4353, "rewards/accuracies": 0.75, "rewards/chosen": -1.8512099981307983, "rewards/margins": 1.588141679763794, "rewards/rejected": -3.4393515586853027, "step": 2080 }, { "epoch": 0.5014395393474088, "grad_norm": 12.084926653784903, "learning_rate": 2.921972167291119e-07, "logits/chosen": -0.7068333625793457, "logits/rejected": -0.7458164691925049, "logps/chosen": -449.1051330566406, "logps/rejected": -605.0886840820312, "loss": 0.4457, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4715017080307007, "rewards/margins": 1.2597967386245728, "rewards/rejected": -2.7312982082366943, "step": 2090 }, { "epoch": 0.5038387715930902, "grad_norm": 10.473433486601849, "learning_rate": 2.9013196601567567e-07, "logits/chosen": -0.672719419002533, "logits/rejected": -0.6805760264396667, "logps/chosen": -399.2666931152344, "logps/rejected": -524.7140502929688, "loss": 0.5356, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.396222472190857, "rewards/margins": 1.1300289630889893, "rewards/rejected": -2.5262515544891357, "step": 2100 }, { "epoch": 0.5062380038387716, "grad_norm": 8.259116612360256, "learning_rate": 2.8806390020426555e-07, "logits/chosen": -0.7717374563217163, "logits/rejected": -0.7531148195266724, "logps/chosen": -406.16351318359375, "logps/rejected": -557.4243774414062, "loss": 0.4464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2984120845794678, "rewards/margins": 1.5021198987960815, "rewards/rejected": -2.800532102584839, "step": 2110 }, { "epoch": 0.508637236084453, "grad_norm": 12.480170050902744, "learning_rate": 2.8599316436148187e-07, "logits/chosen": -0.6736984252929688, "logits/rejected": -0.6641879081726074, "logps/chosen": -438.26092529296875, "logps/rejected": -534.37890625, "loss": 0.4671, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.752355933189392, "rewards/margins": 0.9608189463615417, "rewards/rejected": -2.713174819946289, "step": 2120 }, { "epoch": 0.5110364683301344, "grad_norm": 13.113194375488565, "learning_rate": 2.8391990374121723e-07, "logits/chosen": -0.7215433120727539, "logits/rejected": -0.7145394086837769, "logps/chosen": -429.87689208984375, "logps/rejected": -588.1317138671875, "loss": 0.5056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8144502639770508, "rewards/margins": 1.295245885848999, "rewards/rejected": -3.10969614982605, "step": 2130 }, { "epoch": 0.5134357005758158, "grad_norm": 10.389514519832314, "learning_rate": 2.818442637744669e-07, "logits/chosen": -0.7280897498130798, "logits/rejected": -0.7563216686248779, "logps/chosen": -451.57037353515625, "logps/rejected": -583.136962890625, "loss": 0.4995, "rewards/accuracies": 0.875, "rewards/chosen": -1.9117393493652344, "rewards/margins": 1.2607026100158691, "rewards/rejected": -3.1724419593811035, "step": 2140 }, { "epoch": 0.5158349328214972, "grad_norm": 9.893178585305161, "learning_rate": 2.797663900591284e-07, "logits/chosen": -0.7491916418075562, "logits/rejected": -0.7631763815879822, "logps/chosen": -454.49542236328125, "logps/rejected": -534.6981201171875, "loss": 0.4507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7530666589736938, "rewards/margins": 1.1452219486236572, "rewards/rejected": -2.8982887268066406, "step": 2150 }, { "epoch": 0.5182341650671785, "grad_norm": 10.78966013478198, "learning_rate": 2.776864283497874e-07, "logits/chosen": -0.7122198343276978, "logits/rejected": -0.7713319063186646, "logps/chosen": -410.6304626464844, "logps/rejected": -599.1976928710938, "loss": 0.479, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6474449634552002, "rewards/margins": 1.9179481267929077, "rewards/rejected": -3.5653927326202393, "step": 2160 }, { "epoch": 0.5206333973128598, "grad_norm": 7.331401403742752, "learning_rate": 2.756045245474943e-07, "logits/chosen": -0.672527551651001, "logits/rejected": -0.6674192547798157, "logps/chosen": -429.77374267578125, "logps/rejected": -540.6340942382812, "loss": 0.4662, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4047420024871826, "rewards/margins": 0.8396526575088501, "rewards/rejected": -2.2443947792053223, "step": 2170 }, { "epoch": 0.5230326295585412, "grad_norm": 11.47735860978489, "learning_rate": 2.7352082468952977e-07, "logits/chosen": -0.7144309282302856, "logits/rejected": -0.7627060413360596, "logps/chosen": -419.7461853027344, "logps/rejected": -631.6406860351562, "loss": 0.5053, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7272449731826782, "rewards/margins": 1.802207589149475, "rewards/rejected": -3.529452085494995, "step": 2180 }, { "epoch": 0.5254318618042226, "grad_norm": 12.66627984144212, "learning_rate": 2.7143547493916e-07, "logits/chosen": -0.7830231785774231, "logits/rejected": -0.7730289697647095, "logps/chosen": -392.6502380371094, "logps/rejected": -612.9030151367188, "loss": 0.4365, "rewards/accuracies": 0.875, "rewards/chosen": -1.296020746231079, "rewards/margins": 2.1511874198913574, "rewards/rejected": -3.4472084045410156, "step": 2190 }, { "epoch": 0.527831094049904, "grad_norm": 10.665323165924152, "learning_rate": 2.693486215753853e-07, "logits/chosen": -0.7580839395523071, "logits/rejected": -0.778628945350647, "logps/chosen": -419.266845703125, "logps/rejected": -601.0218505859375, "loss": 0.4777, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.676492691040039, "rewards/margins": 1.9639472961425781, "rewards/rejected": -3.640439987182617, "step": 2200 }, { "epoch": 0.5302303262955854, "grad_norm": 15.168171864262504, "learning_rate": 2.6726041098267805e-07, "logits/chosen": -0.8083688616752625, "logits/rejected": -0.835811972618103, "logps/chosen": -480.6859436035156, "logps/rejected": -533.2108764648438, "loss": 0.4966, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.826184868812561, "rewards/margins": 0.8893225789070129, "rewards/rejected": -2.7155075073242188, "step": 2210 }, { "epoch": 0.5326295585412668, "grad_norm": 13.565039831991228, "learning_rate": 2.6517098964071507e-07, "logits/chosen": -0.6329632997512817, "logits/rejected": -0.658043384552002, "logps/chosen": -456.0284118652344, "logps/rejected": -526.2221069335938, "loss": 0.5196, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.818780541419983, "rewards/margins": 0.5884894132614136, "rewards/rejected": -2.4072699546813965, "step": 2220 }, { "epoch": 0.5350287907869482, "grad_norm": 11.227729437674412, "learning_rate": 2.630805041141023e-07, "logits/chosen": -0.7310200333595276, "logits/rejected": -0.7426483035087585, "logps/chosen": -385.03326416015625, "logps/rejected": -597.54541015625, "loss": 0.4777, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4363114833831787, "rewards/margins": 1.9664011001586914, "rewards/rejected": -3.402712345123291, "step": 2230 }, { "epoch": 0.5374280230326296, "grad_norm": 12.301180793094087, "learning_rate": 2.609891010420941e-07, "logits/chosen": -0.756328284740448, "logits/rejected": -0.731390118598938, "logps/chosen": -422.31524658203125, "logps/rejected": -570.0447998046875, "loss": 0.4578, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5025454759597778, "rewards/margins": 1.4451904296875, "rewards/rejected": -2.9477362632751465, "step": 2240 }, { "epoch": 0.539827255278311, "grad_norm": 11.203287249603568, "learning_rate": 2.5889692712830674e-07, "logits/chosen": -0.7012640237808228, "logits/rejected": -0.734104335308075, "logps/chosen": -366.77569580078125, "logps/rejected": -487.832763671875, "loss": 0.452, "rewards/accuracies": 0.875, "rewards/chosen": -1.3068386316299438, "rewards/margins": 1.2463617324829102, "rewards/rejected": -2.5532002449035645, "step": 2250 }, { "epoch": 0.5422264875239923, "grad_norm": 11.401920927705493, "learning_rate": 2.5680412913042843e-07, "logits/chosen": -0.7200027704238892, "logits/rejected": -0.7047854661941528, "logps/chosen": -408.1886291503906, "logps/rejected": -583.43896484375, "loss": 0.4436, "rewards/accuracies": 0.875, "rewards/chosen": -1.5970829725265503, "rewards/margins": 1.6908365488052368, "rewards/rejected": -3.2879199981689453, "step": 2260 }, { "epoch": 0.5446257197696737, "grad_norm": 13.14986787185228, "learning_rate": 2.5471085384992404e-07, "logits/chosen": -0.7282342910766602, "logits/rejected": -0.7267628312110901, "logps/chosen": -395.9429016113281, "logps/rejected": -668.2164916992188, "loss": 0.4305, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5156352519989014, "rewards/margins": 2.5282320976257324, "rewards/rejected": -4.043867588043213, "step": 2270 }, { "epoch": 0.5470249520153551, "grad_norm": 9.395110944586639, "learning_rate": 2.526172481217381e-07, "logits/chosen": -0.6741994619369507, "logits/rejected": -0.6505922675132751, "logps/chosen": -426.7835388183594, "logps/rejected": -573.0272827148438, "loss": 0.4759, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0611157417297363, "rewards/margins": 1.2577565908432007, "rewards/rejected": -3.3188719749450684, "step": 2280 }, { "epoch": 0.5494241842610365, "grad_norm": 13.157814593299042, "learning_rate": 2.5052345880399456e-07, "logits/chosen": -0.727673351764679, "logits/rejected": -0.7585957050323486, "logps/chosen": -428.2842712402344, "logps/rejected": -550.08642578125, "loss": 0.4445, "rewards/accuracies": 0.75, "rewards/chosen": -1.9122216701507568, "rewards/margins": 1.1558340787887573, "rewards/rejected": -3.0680556297302246, "step": 2290 }, { "epoch": 0.5518234165067178, "grad_norm": 11.84384992024014, "learning_rate": 2.4842963276769555e-07, "logits/chosen": -0.6177406311035156, "logits/rejected": -0.5921697616577148, "logps/chosen": -425.8565368652344, "logps/rejected": -591.1513061523438, "loss": 0.4711, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9242675304412842, "rewards/margins": 1.2209120988845825, "rewards/rejected": -3.1451797485351562, "step": 2300 }, { "epoch": 0.5542226487523992, "grad_norm": 10.558621692008028, "learning_rate": 2.463359168864189e-07, "logits/chosen": -0.6363598108291626, "logits/rejected": -0.7210627794265747, "logps/chosen": -480.21820068359375, "logps/rejected": -575.8844604492188, "loss": 0.4867, "rewards/accuracies": 0.75, "rewards/chosen": -1.584695816040039, "rewards/margins": 1.3920116424560547, "rewards/rejected": -2.9767074584960938, "step": 2310 }, { "epoch": 0.5566218809980806, "grad_norm": 13.809223972324538, "learning_rate": 2.4424245802601555e-07, "logits/chosen": -0.7176483869552612, "logits/rejected": -0.7233623266220093, "logps/chosen": -392.2666931152344, "logps/rejected": -544.047607421875, "loss": 0.4604, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4483287334442139, "rewards/margins": 0.9412357211112976, "rewards/rejected": -2.3895645141601562, "step": 2320 }, { "epoch": 0.559021113243762, "grad_norm": 10.320795589655923, "learning_rate": 2.421494030343072e-07, "logits/chosen": -0.5995772480964661, "logits/rejected": -0.665002703666687, "logps/chosen": -429.8282165527344, "logps/rejected": -476.87384033203125, "loss": 0.5063, "rewards/accuracies": 0.75, "rewards/chosen": -1.5427360534667969, "rewards/margins": 1.0240482091903687, "rewards/rejected": -2.566784143447876, "step": 2330 }, { "epoch": 0.5614203454894434, "grad_norm": 11.374067564936935, "learning_rate": 2.400568987307861e-07, "logits/chosen": -0.6323488354682922, "logits/rejected": -0.6464725732803345, "logps/chosen": -405.65814208984375, "logps/rejected": -461.2422790527344, "loss": 0.4381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6847827434539795, "rewards/margins": 0.6106036305427551, "rewards/rejected": -2.29538631439209, "step": 2340 }, { "epoch": 0.5638195777351248, "grad_norm": 11.798719511944995, "learning_rate": 2.379650918963156e-07, "logits/chosen": -0.7201340198516846, "logits/rejected": -0.7137752771377563, "logps/chosen": -407.8214416503906, "logps/rejected": -557.0501708984375, "loss": 0.4396, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9475882053375244, "rewards/margins": 1.3403925895690918, "rewards/rejected": -3.287980556488037, "step": 2350 }, { "epoch": 0.5662188099808061, "grad_norm": 18.10704498970427, "learning_rate": 2.3587412926283438e-07, "logits/chosen": -0.7477551698684692, "logits/rejected": -0.7495108842849731, "logps/chosen": -487.01788330078125, "logps/rejected": -621.1641235351562, "loss": 0.4846, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7702643871307373, "rewards/margins": 1.6865708827972412, "rewards/rejected": -3.4568352699279785, "step": 2360 }, { "epoch": 0.5686180422264875, "grad_norm": 9.023431566315685, "learning_rate": 2.337841575030642e-07, "logits/chosen": -0.6413623690605164, "logits/rejected": -0.6585931777954102, "logps/chosen": -468.298095703125, "logps/rejected": -593.5641479492188, "loss": 0.4868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6977428197860718, "rewards/margins": 1.104875087738037, "rewards/rejected": -2.8026180267333984, "step": 2370 }, { "epoch": 0.5710172744721689, "grad_norm": 9.106685136975523, "learning_rate": 2.316953232202206e-07, "logits/chosen": -0.6131690740585327, "logits/rejected": -0.705342710018158, "logps/chosen": -403.64483642578125, "logps/rejected": -455.7774353027344, "loss": 0.4345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.58620285987854, "rewards/margins": 1.1645066738128662, "rewards/rejected": -2.7507095336914062, "step": 2380 }, { "epoch": 0.5734165067178503, "grad_norm": 12.485032451040595, "learning_rate": 2.2960777293772958e-07, "logits/chosen": -0.5965815186500549, "logits/rejected": -0.6691153049468994, "logps/chosen": -375.786376953125, "logps/rejected": -546.0467529296875, "loss": 0.4677, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4897288084030151, "rewards/margins": 1.8273935317993164, "rewards/rejected": -3.3171226978302, "step": 2390 }, { "epoch": 0.5758157389635317, "grad_norm": 9.104242742336105, "learning_rate": 2.2752165308894974e-07, "logits/chosen": -0.6820736527442932, "logits/rejected": -0.6869875192642212, "logps/chosen": -366.31060791015625, "logps/rejected": -517.474365234375, "loss": 0.4591, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6863653659820557, "rewards/margins": 1.5893397331237793, "rewards/rejected": -3.275705337524414, "step": 2400 }, { "epoch": 0.5782149712092131, "grad_norm": 15.565842828873247, "learning_rate": 2.254371100069005e-07, "logits/chosen": -0.6215115189552307, "logits/rejected": -0.5873704552650452, "logps/chosen": -400.32440185546875, "logps/rejected": -548.8357543945312, "loss": 0.4672, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3634350299835205, "rewards/margins": 1.2007476091384888, "rewards/rejected": -2.564182758331299, "step": 2410 }, { "epoch": 0.5806142034548945, "grad_norm": 10.548601465166293, "learning_rate": 2.2335428991399725e-07, "logits/chosen": -0.6729727387428284, "logits/rejected": -0.6920270919799805, "logps/chosen": -388.9903564453125, "logps/rejected": -711.9734497070312, "loss": 0.4679, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8660037517547607, "rewards/margins": 3.018165111541748, "rewards/rejected": -4.884169101715088, "step": 2420 }, { "epoch": 0.5830134357005758, "grad_norm": 9.156657974638184, "learning_rate": 2.2127333891179458e-07, "logits/chosen": -0.7091829180717468, "logits/rejected": -0.7354472875595093, "logps/chosen": -383.76605224609375, "logps/rejected": -601.3203735351562, "loss": 0.48, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6603578329086304, "rewards/margins": 1.862908959388733, "rewards/rejected": -3.523266553878784, "step": 2430 }, { "epoch": 0.5854126679462572, "grad_norm": 13.232575343054773, "learning_rate": 2.1919440297073782e-07, "logits/chosen": -0.7067408561706543, "logits/rejected": -0.7342425584793091, "logps/chosen": -383.7411804199219, "logps/rejected": -589.1730346679688, "loss": 0.4993, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7060611248016357, "rewards/margins": 1.9039154052734375, "rewards/rejected": -3.609976291656494, "step": 2440 }, { "epoch": 0.5878119001919386, "grad_norm": 9.071189608456551, "learning_rate": 2.1711762791992368e-07, "logits/chosen": -0.6443785429000854, "logits/rejected": -0.6465337872505188, "logps/chosen": -449.90753173828125, "logps/rejected": -567.1222534179688, "loss": 0.4979, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4745603799819946, "rewards/margins": 1.4671189785003662, "rewards/rejected": -2.941678762435913, "step": 2450 }, { "epoch": 0.5902111324376199, "grad_norm": 10.300854259670189, "learning_rate": 2.1504315943687114e-07, "logits/chosen": -0.7359960675239563, "logits/rejected": -0.72270667552948, "logps/chosen": -403.669189453125, "logps/rejected": -606.3935546875, "loss": 0.4464, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6255872249603271, "rewards/margins": 1.5683765411376953, "rewards/rejected": -3.1939637660980225, "step": 2460 }, { "epoch": 0.5926103646833013, "grad_norm": 12.922478808451812, "learning_rate": 2.1297114303730248e-07, "logits/chosen": -0.6276537775993347, "logits/rejected": -0.5880897045135498, "logps/chosen": -394.9385986328125, "logps/rejected": -579.1409912109375, "loss": 0.5033, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5723788738250732, "rewards/margins": 1.3019646406173706, "rewards/rejected": -2.8743433952331543, "step": 2470 }, { "epoch": 0.5950095969289827, "grad_norm": 11.565229978778513, "learning_rate": 2.1090172406493616e-07, "logits/chosen": -0.6361690163612366, "logits/rejected": -0.6227170825004578, "logps/chosen": -399.3015441894531, "logps/rejected": -556.216064453125, "loss": 0.4182, "rewards/accuracies": 0.875, "rewards/chosen": -1.5578765869140625, "rewards/margins": 1.3979572057724, "rewards/rejected": -2.955833911895752, "step": 2480 }, { "epoch": 0.5974088291746641, "grad_norm": 13.512372450905238, "learning_rate": 2.0883504768129146e-07, "logits/chosen": -0.7200502157211304, "logits/rejected": -0.7266454696655273, "logps/chosen": -463.3838806152344, "logps/rejected": -626.7127075195312, "loss": 0.4704, "rewards/accuracies": 0.875, "rewards/chosen": -1.8044846057891846, "rewards/margins": 1.658591628074646, "rewards/rejected": -3.46307635307312, "step": 2490 }, { "epoch": 0.5998080614203455, "grad_norm": 10.955905269308658, "learning_rate": 2.0677125885550571e-07, "logits/chosen": -0.560949444770813, "logits/rejected": -0.6333897113800049, "logps/chosen": -404.170654296875, "logps/rejected": -486.53631591796875, "loss": 0.4561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.536853551864624, "rewards/margins": 1.2563847303390503, "rewards/rejected": -2.7932381629943848, "step": 2500 }, { "epoch": 0.6022072936660269, "grad_norm": 11.614988822427518, "learning_rate": 2.0471050235416587e-07, "logits/chosen": -0.6411922574043274, "logits/rejected": -0.7291480898857117, "logps/chosen": -442.89910888671875, "logps/rejected": -545.2685546875, "loss": 0.438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7440217733383179, "rewards/margins": 1.5008093118667603, "rewards/rejected": -3.24483060836792, "step": 2510 }, { "epoch": 0.6046065259117083, "grad_norm": 12.607898741358827, "learning_rate": 2.026529227311532e-07, "logits/chosen": -0.7110682725906372, "logits/rejected": -0.7056074142456055, "logps/chosen": -416.0528259277344, "logps/rejected": -573.2369384765625, "loss": 0.4998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9202907085418701, "rewards/margins": 1.4374290704727173, "rewards/rejected": -3.357719898223877, "step": 2520 }, { "epoch": 0.6070057581573897, "grad_norm": 10.178927094903639, "learning_rate": 2.005986643175036e-07, "logits/chosen": -0.6290922164916992, "logits/rejected": -0.5829756259918213, "logps/chosen": -434.39288330078125, "logps/rejected": -613.2723999023438, "loss": 0.4025, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4553261995315552, "rewards/margins": 1.7923997640609741, "rewards/rejected": -3.2477259635925293, "step": 2530 }, { "epoch": 0.6094049904030711, "grad_norm": 13.408248580673138, "learning_rate": 1.9854787121128328e-07, "logits/chosen": -0.6658229231834412, "logits/rejected": -0.7100438475608826, "logps/chosen": -389.3979797363281, "logps/rejected": -438.6206970214844, "loss": 0.4888, "rewards/accuracies": 0.75, "rewards/chosen": -1.6204715967178345, "rewards/margins": 0.9446170926094055, "rewards/rejected": -2.5650887489318848, "step": 2540 }, { "epoch": 0.6118042226487524, "grad_norm": 12.880050575259938, "learning_rate": 1.9650068726748106e-07, "logits/chosen": -0.6123485565185547, "logits/rejected": -0.6827987432479858, "logps/chosen": -440.87896728515625, "logps/rejected": -584.0040283203125, "loss": 0.4767, "rewards/accuracies": 0.75, "rewards/chosen": -1.731406569480896, "rewards/margins": 1.396422028541565, "rewards/rejected": -3.127828359603882, "step": 2550 }, { "epoch": 0.6142034548944337, "grad_norm": 10.788734052701793, "learning_rate": 1.9445725608791718e-07, "logits/chosen": -0.6031758785247803, "logits/rejected": -0.648442268371582, "logps/chosen": -400.7000732421875, "logps/rejected": -660.7904052734375, "loss": 0.4752, "rewards/accuracies": 0.75, "rewards/chosen": -1.2926169633865356, "rewards/margins": 2.534430980682373, "rewards/rejected": -3.827047824859619, "step": 2560 }, { "epoch": 0.6166026871401151, "grad_norm": 10.567224853092739, "learning_rate": 1.924177210111705e-07, "logits/chosen": -0.7051092386245728, "logits/rejected": -0.7292466759681702, "logps/chosen": -377.7469787597656, "logps/rejected": -552.3789672851562, "loss": 0.4712, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5156134366989136, "rewards/margins": 1.6158390045166016, "rewards/rejected": -3.1314525604248047, "step": 2570 }, { "epoch": 0.6190019193857965, "grad_norm": 9.777520636965628, "learning_rate": 1.9038222510252364e-07, "logits/chosen": -0.6945359110832214, "logits/rejected": -0.6680124998092651, "logps/chosen": -410.309326171875, "logps/rejected": -502.10760498046875, "loss": 0.4815, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4556920528411865, "rewards/margins": 1.0640310049057007, "rewards/rejected": -2.5197231769561768, "step": 2580 }, { "epoch": 0.6214011516314779, "grad_norm": 11.680219063781859, "learning_rate": 1.883509111439277e-07, "logits/chosen": -0.6178931593894958, "logits/rejected": -0.6295452117919922, "logps/chosen": -406.61749267578125, "logps/rejected": -649.4032592773438, "loss": 0.4905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6173099279403687, "rewards/margins": 1.7425906658172607, "rewards/rejected": -3.359900712966919, "step": 2590 }, { "epoch": 0.6238003838771593, "grad_norm": 8.702965805638529, "learning_rate": 1.8632392162398665e-07, "logits/chosen": -0.7019624710083008, "logits/rejected": -0.6865247488021851, "logps/chosen": -422.4203186035156, "logps/rejected": -627.749755859375, "loss": 0.446, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.275315761566162, "rewards/margins": 2.0367074012756348, "rewards/rejected": -3.312023639678955, "step": 2600 }, { "epoch": 0.6261996161228407, "grad_norm": 10.5146873413561, "learning_rate": 1.84301398727962e-07, "logits/chosen": -0.6342155933380127, "logits/rejected": -0.5797609686851501, "logps/chosen": -340.15179443359375, "logps/rejected": -614.032470703125, "loss": 0.449, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4568018913269043, "rewards/margins": 2.3389487266540527, "rewards/rejected": -3.795750379562378, "step": 2610 }, { "epoch": 0.6285988483685221, "grad_norm": 10.737478950061426, "learning_rate": 1.8228348432779966e-07, "logits/chosen": -0.7070366740226746, "logits/rejected": -0.717880129814148, "logps/chosen": -418.17706298828125, "logps/rejected": -548.4085083007812, "loss": 0.501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8227876424789429, "rewards/margins": 1.3973190784454346, "rewards/rejected": -3.220106840133667, "step": 2620 }, { "epoch": 0.6309980806142035, "grad_norm": 9.684713099622218, "learning_rate": 1.8027031997217773e-07, "logits/chosen": -0.7213168740272522, "logits/rejected": -0.7527577877044678, "logps/chosen": -404.2679748535156, "logps/rejected": -679.9412841796875, "loss": 0.4013, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9124399423599243, "rewards/margins": 2.531831741333008, "rewards/rejected": -4.444272041320801, "step": 2630 }, { "epoch": 0.6333973128598849, "grad_norm": 11.280247842445823, "learning_rate": 1.7826204687657758e-07, "logits/chosen": -0.6257885098457336, "logits/rejected": -0.5951186418533325, "logps/chosen": -457.1598205566406, "logps/rejected": -517.6695556640625, "loss": 0.4193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6708452701568604, "rewards/margins": 1.033866286277771, "rewards/rejected": -2.704711437225342, "step": 2640 }, { "epoch": 0.6357965451055663, "grad_norm": 14.057616949353399, "learning_rate": 1.762588059133781e-07, "logits/chosen": -0.6142539381980896, "logits/rejected": -0.6413928270339966, "logps/chosen": -473.72979736328125, "logps/rejected": -600.7677001953125, "loss": 0.4437, "rewards/accuracies": 0.875, "rewards/chosen": -1.7104568481445312, "rewards/margins": 1.6506239175796509, "rewards/rejected": -3.3610808849334717, "step": 2650 }, { "epoch": 0.6381957773512476, "grad_norm": 10.007284775964992, "learning_rate": 1.7426073760197406e-07, "logits/chosen": -0.768576979637146, "logits/rejected": -0.7556449174880981, "logps/chosen": -412.39984130859375, "logps/rejected": -669.0284423828125, "loss": 0.4858, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6826118230819702, "rewards/margins": 2.168964147567749, "rewards/rejected": -3.8515758514404297, "step": 2660 }, { "epoch": 0.6405950095969289, "grad_norm": 8.96835406227991, "learning_rate": 1.7226798209891935e-07, "logits/chosen": -0.583341658115387, "logits/rejected": -0.6630910038948059, "logps/chosen": -434.9918518066406, "logps/rejected": -545.5853271484375, "loss": 0.4433, "rewards/accuracies": 0.875, "rewards/chosen": -1.7140676975250244, "rewards/margins": 1.6664565801620483, "rewards/rejected": -3.380524158477783, "step": 2670 }, { "epoch": 0.6429942418426103, "grad_norm": 10.326630103018084, "learning_rate": 1.7028067918809535e-07, "logits/chosen": -0.6508952379226685, "logits/rejected": -0.6744917631149292, "logps/chosen": -381.95367431640625, "logps/rejected": -674.4815673828125, "loss": 0.4495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5953381061553955, "rewards/margins": 2.39109468460083, "rewards/rejected": -3.9864323139190674, "step": 2680 }, { "epoch": 0.6453934740882917, "grad_norm": 12.773425150047508, "learning_rate": 1.6829896827090584e-07, "logits/chosen": -0.7672047019004822, "logits/rejected": -0.7807837724685669, "logps/chosen": -420.0484313964844, "logps/rejected": -511.435302734375, "loss": 0.4804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6788091659545898, "rewards/margins": 1.1816326379776, "rewards/rejected": -2.8604416847229004, "step": 2690 }, { "epoch": 0.6477927063339731, "grad_norm": 8.097344363797557, "learning_rate": 1.6632298835649844e-07, "logits/chosen": -0.6450155973434448, "logits/rejected": -0.6305941343307495, "logps/chosen": -443.16607666015625, "logps/rejected": -679.9429321289062, "loss": 0.4316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5886855125427246, "rewards/margins": 1.9584792852401733, "rewards/rejected": -3.5471644401550293, "step": 2700 }, { "epoch": 0.6501919385796545, "grad_norm": 17.48613369684693, "learning_rate": 1.6435287805201364e-07, "logits/chosen": -0.6135013103485107, "logits/rejected": -0.6035945415496826, "logps/chosen": -451.90380859375, "logps/rejected": -557.508544921875, "loss": 0.4885, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8703639507293701, "rewards/margins": 1.1114423274993896, "rewards/rejected": -2.9818062782287598, "step": 2710 }, { "epoch": 0.6525911708253359, "grad_norm": 10.490682463258194, "learning_rate": 1.6238877555286207e-07, "logits/chosen": -0.6777503490447998, "logits/rejected": -0.6797904968261719, "logps/chosen": -436.258056640625, "logps/rejected": -608.3687744140625, "loss": 0.4457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.422296166419983, "rewards/margins": 1.5763972997665405, "rewards/rejected": -2.9986937046051025, "step": 2720 }, { "epoch": 0.6549904030710173, "grad_norm": 12.224882835993649, "learning_rate": 1.60430818633031e-07, "logits/chosen": -0.6939619779586792, "logits/rejected": -0.6975654363632202, "logps/chosen": -427.3936462402344, "logps/rejected": -593.472412109375, "loss": 0.4393, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.530230164527893, "rewards/margins": 1.6595938205718994, "rewards/rejected": -3.189823865890503, "step": 2730 }, { "epoch": 0.6573896353166987, "grad_norm": 10.153894922340301, "learning_rate": 1.5847914463541939e-07, "logits/chosen": -0.6700283288955688, "logits/rejected": -0.6896187663078308, "logps/chosen": -357.0517578125, "logps/rejected": -522.7760620117188, "loss": 0.4347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.513653039932251, "rewards/margins": 1.3927968740463257, "rewards/rejected": -2.906449794769287, "step": 2740 }, { "epoch": 0.6597888675623801, "grad_norm": 8.373116906245645, "learning_rate": 1.5653389046220427e-07, "logits/chosen": -0.60322105884552, "logits/rejected": -0.6336368322372437, "logps/chosen": -365.42059326171875, "logps/rejected": -530.8365478515625, "loss": 0.4393, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.241115927696228, "rewards/margins": 1.4525740146636963, "rewards/rejected": -2.693690299987793, "step": 2750 }, { "epoch": 0.6621880998080614, "grad_norm": 13.542426256092975, "learning_rate": 1.545951925652375e-07, "logits/chosen": -0.6148853302001953, "logits/rejected": -0.6405919194221497, "logps/chosen": -476.22589111328125, "logps/rejected": -573.9639282226562, "loss": 0.4321, "rewards/accuracies": 0.875, "rewards/chosen": -1.4998128414154053, "rewards/margins": 1.6009342670440674, "rewards/rejected": -3.1007466316223145, "step": 2760 }, { "epoch": 0.6645873320537428, "grad_norm": 11.419777174024256, "learning_rate": 1.5266318693647423e-07, "logits/chosen": -0.6193658113479614, "logits/rejected": -0.6027348637580872, "logps/chosen": -428.71490478515625, "logps/rejected": -535.6744995117188, "loss": 0.4509, "rewards/accuracies": 0.75, "rewards/chosen": -1.558387041091919, "rewards/margins": 1.1832726001739502, "rewards/rejected": -2.741659641265869, "step": 2770 }, { "epoch": 0.6669865642994242, "grad_norm": 12.984704637723206, "learning_rate": 1.5073800909843353e-07, "logits/chosen": -0.6714409589767456, "logits/rejected": -0.7126461267471313, "logps/chosen": -428.8514099121094, "logps/rejected": -529.1953735351562, "loss": 0.4526, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5292034149169922, "rewards/margins": 1.5168203115463257, "rewards/rejected": -3.0460238456726074, "step": 2780 }, { "epoch": 0.6693857965451055, "grad_norm": 14.138835125115458, "learning_rate": 1.488197940946922e-07, "logits/chosen": -0.6455475687980652, "logits/rejected": -0.642737090587616, "logps/chosen": -417.8585510253906, "logps/rejected": -533.5113525390625, "loss": 0.4155, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2823712825775146, "rewards/margins": 1.770923376083374, "rewards/rejected": -3.0532946586608887, "step": 2790 }, { "epoch": 0.6717850287907869, "grad_norm": 15.610740173237787, "learning_rate": 1.4690867648041167e-07, "logits/chosen": -0.6172278523445129, "logits/rejected": -0.6776692271232605, "logps/chosen": -418.36578369140625, "logps/rejected": -581.3909912109375, "loss": 0.4866, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4995958805084229, "rewards/margins": 1.887927770614624, "rewards/rejected": -3.387523651123047, "step": 2800 }, { "epoch": 0.6741842610364683, "grad_norm": 10.722965313997076, "learning_rate": 1.4500479031289987e-07, "logits/chosen": -0.6301898956298828, "logits/rejected": -0.6822000741958618, "logps/chosen": -449.79840087890625, "logps/rejected": -575.036865234375, "loss": 0.4839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.575657606124878, "rewards/margins": 1.3444297313690186, "rewards/rejected": -2.9200873374938965, "step": 2810 }, { "epoch": 0.6765834932821497, "grad_norm": 10.427262900663766, "learning_rate": 1.4310826914220747e-07, "logits/chosen": -0.6417717933654785, "logits/rejected": -0.6694071888923645, "logps/chosen": -496.41064453125, "logps/rejected": -595.4664306640625, "loss": 0.468, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6847069263458252, "rewards/margins": 1.2323038578033447, "rewards/rejected": -2.91701078414917, "step": 2820 }, { "epoch": 0.6789827255278311, "grad_norm": 11.587445881578779, "learning_rate": 1.412192460017597e-07, "logits/chosen": -0.6943923234939575, "logits/rejected": -0.6795603036880493, "logps/chosen": -427.15155029296875, "logps/rejected": -583.9935913085938, "loss": 0.4638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.664332628250122, "rewards/margins": 1.5579102039337158, "rewards/rejected": -3.222242832183838, "step": 2830 }, { "epoch": 0.6813819577735125, "grad_norm": 8.40191068876138, "learning_rate": 1.3933785339902504e-07, "logits/chosen": -0.6684115529060364, "logits/rejected": -0.6200501918792725, "logps/chosen": -355.61358642578125, "logps/rejected": -538.16650390625, "loss": 0.4765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4830284118652344, "rewards/margins": 1.320604681968689, "rewards/rejected": -2.803633213043213, "step": 2840 }, { "epoch": 0.6837811900191939, "grad_norm": 9.705292210107526, "learning_rate": 1.374642233062197e-07, "logits/chosen": -0.6299320459365845, "logits/rejected": -0.6841608285903931, "logps/chosen": -473.47222900390625, "logps/rejected": -577.1047973632812, "loss": 0.4608, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.683821439743042, "rewards/margins": 1.3992817401885986, "rewards/rejected": -3.0831027030944824, "step": 2850 }, { "epoch": 0.6861804222648752, "grad_norm": 10.377222974537387, "learning_rate": 1.355984871510511e-07, "logits/chosen": -0.6160681247711182, "logits/rejected": -0.5785273313522339, "logps/chosen": -480.8841857910156, "logps/rejected": -627.6201171875, "loss": 0.4366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7052888870239258, "rewards/margins": 1.3676353693008423, "rewards/rejected": -3.0729241371154785, "step": 2860 }, { "epoch": 0.6885796545105566, "grad_norm": 9.428651099395465, "learning_rate": 1.3374077580749783e-07, "logits/chosen": -0.6848248243331909, "logits/rejected": -0.6872170567512512, "logps/chosen": -386.19873046875, "logps/rejected": -546.9776000976562, "loss": 0.4409, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8132606744766235, "rewards/margins": 1.4182673692703247, "rewards/rejected": -3.2315280437469482, "step": 2870 }, { "epoch": 0.690978886756238, "grad_norm": 13.945186305417156, "learning_rate": 1.3189121958663024e-07, "logits/chosen": -0.6140165328979492, "logits/rejected": -0.6878429651260376, "logps/chosen": -531.7335205078125, "logps/rejected": -573.6845092773438, "loss": 0.4699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3029494285583496, "rewards/margins": 0.8266263008117676, "rewards/rejected": -3.129575490951538, "step": 2880 }, { "epoch": 0.6933781190019194, "grad_norm": 12.851850092870997, "learning_rate": 1.3004994822746895e-07, "logits/chosen": -0.7798065543174744, "logits/rejected": -0.7741595506668091, "logps/chosen": -430.0048828125, "logps/rejected": -566.271240234375, "loss": 0.4687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7590996026992798, "rewards/margins": 1.2983884811401367, "rewards/rejected": -3.057487964630127, "step": 2890 }, { "epoch": 0.6957773512476008, "grad_norm": 12.345371556886562, "learning_rate": 1.2821709088788434e-07, "logits/chosen": -0.5772908329963684, "logits/rejected": -0.6081336140632629, "logps/chosen": -378.8913879394531, "logps/rejected": -535.4404907226562, "loss": 0.4542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5490992069244385, "rewards/margins": 1.543939232826233, "rewards/rejected": -3.093038558959961, "step": 2900 }, { "epoch": 0.6981765834932822, "grad_norm": 14.289160494822747, "learning_rate": 1.2639277613553736e-07, "logits/chosen": -0.6734031438827515, "logits/rejected": -0.6486900448799133, "logps/chosen": -372.4572448730469, "logps/rejected": -487.9190979003906, "loss": 0.461, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6701858043670654, "rewards/margins": 1.1198476552963257, "rewards/rejected": -2.7900338172912598, "step": 2910 }, { "epoch": 0.7005758157389635, "grad_norm": 12.042269042020521, "learning_rate": 1.2457713193885975e-07, "logits/chosen": -0.6462276577949524, "logits/rejected": -0.660896897315979, "logps/chosen": -339.7335510253906, "logps/rejected": -547.5769653320312, "loss": 0.4327, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.608481764793396, "rewards/margins": 1.6944421529769897, "rewards/rejected": -3.3029239177703857, "step": 2920 }, { "epoch": 0.7029750479846449, "grad_norm": 15.19569101654986, "learning_rate": 1.2277028565807838e-07, "logits/chosen": -0.6471028923988342, "logits/rejected": -0.6764336824417114, "logps/chosen": -425.2708435058594, "logps/rejected": -565.1405639648438, "loss": 0.4666, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5919914245605469, "rewards/margins": 1.4726965427398682, "rewards/rejected": -3.064688205718994, "step": 2930 }, { "epoch": 0.7053742802303263, "grad_norm": 13.971571066599896, "learning_rate": 1.209723640362815e-07, "logits/chosen": -0.6739888191223145, "logits/rejected": -0.6833500266075134, "logps/chosen": -457.6881408691406, "logps/rejected": -645.4636840820312, "loss": 0.5118, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7702935934066772, "rewards/margins": 1.9555785655975342, "rewards/rejected": -3.725872039794922, "step": 2940 }, { "epoch": 0.7077735124760077, "grad_norm": 11.688443402207835, "learning_rate": 1.191834931905277e-07, "logits/chosen": -0.6156803369522095, "logits/rejected": -0.6335949897766113, "logps/chosen": -491.3165588378906, "logps/rejected": -634.9505004882812, "loss": 0.4246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7908356189727783, "rewards/margins": 1.4069457054138184, "rewards/rejected": -3.1977813243865967, "step": 2950 }, { "epoch": 0.710172744721689, "grad_norm": 10.926862919127126, "learning_rate": 1.1740379860299988e-07, "logits/chosen": -0.6044400334358215, "logits/rejected": -0.6133986711502075, "logps/chosen": -445.9336853027344, "logps/rejected": -599.2249755859375, "loss": 0.4724, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5493725538253784, "rewards/margins": 1.346935749053955, "rewards/rejected": -2.896308183670044, "step": 2960 }, { "epoch": 0.7125719769673704, "grad_norm": 10.901886694528228, "learning_rate": 1.1563340511220254e-07, "logits/chosen": -0.6457855105400085, "logits/rejected": -0.6682008504867554, "logps/chosen": -491.98797607421875, "logps/rejected": -631.3724975585938, "loss": 0.4918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8513435125350952, "rewards/margins": 1.5765281915664673, "rewards/rejected": -3.4278717041015625, "step": 2970 }, { "epoch": 0.7149712092130518, "grad_norm": 10.052694308856596, "learning_rate": 1.1387243690420556e-07, "logits/chosen": -0.6152561902999878, "logits/rejected": -0.6195570230484009, "logps/chosen": -467.3623962402344, "logps/rejected": -649.93994140625, "loss": 0.4614, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5257104635238647, "rewards/margins": 1.9131603240966797, "rewards/rejected": -3.438870906829834, "step": 2980 }, { "epoch": 0.7173704414587332, "grad_norm": 13.617310196785702, "learning_rate": 1.1212101750393235e-07, "logits/chosen": -0.652159571647644, "logits/rejected": -0.669161856174469, "logps/chosen": -421.92620849609375, "logps/rejected": -561.7144775390625, "loss": 0.4315, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6268739700317383, "rewards/margins": 1.6520655155181885, "rewards/rejected": -3.278939723968506, "step": 2990 }, { "epoch": 0.7197696737044146, "grad_norm": 9.598022401827114, "learning_rate": 1.1037926976649562e-07, "logits/chosen": -0.6599806547164917, "logits/rejected": -0.6797146797180176, "logps/chosen": -446.956787109375, "logps/rejected": -640.7252807617188, "loss": 0.5011, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7980464696884155, "rewards/margins": 1.6290537118911743, "rewards/rejected": -3.427100419998169, "step": 3000 }, { "epoch": 0.722168905950096, "grad_norm": 10.802854623400075, "learning_rate": 1.0864731586857936e-07, "logits/chosen": -0.5600841045379639, "logits/rejected": -0.5877747535705566, "logps/chosen": -460.4256286621094, "logps/rejected": -603.6989135742188, "loss": 0.4476, "rewards/accuracies": 0.875, "rewards/chosen": -1.7123295068740845, "rewards/margins": 1.7583147287368774, "rewards/rejected": -3.470644474029541, "step": 3010 }, { "epoch": 0.7245681381957774, "grad_norm": 10.40502367000872, "learning_rate": 1.0692527729986839e-07, "logits/chosen": -0.6589699387550354, "logits/rejected": -0.6785061955451965, "logps/chosen": -431.74407958984375, "logps/rejected": -569.8807373046875, "loss": 0.4092, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.640982985496521, "rewards/margins": 1.5366647243499756, "rewards/rejected": -3.177647829055786, "step": 3020 }, { "epoch": 0.7269673704414588, "grad_norm": 13.119977378434658, "learning_rate": 1.0521327485452692e-07, "logits/chosen": -0.5950068831443787, "logits/rejected": -0.6270568370819092, "logps/chosen": -422.1985778808594, "logps/rejected": -560.9178466796875, "loss": 0.4589, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6238855123519897, "rewards/margins": 1.5490210056304932, "rewards/rejected": -3.1729063987731934, "step": 3030 }, { "epoch": 0.7293666026871402, "grad_norm": 12.65683765122561, "learning_rate": 1.0351142862272468e-07, "logits/chosen": -0.6144478917121887, "logits/rejected": -0.673326849937439, "logps/chosen": -397.64007568359375, "logps/rejected": -620.5786743164062, "loss": 0.4609, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.721166968345642, "rewards/margins": 2.2845423221588135, "rewards/rejected": -4.005709648132324, "step": 3040 }, { "epoch": 0.7317658349328215, "grad_norm": 12.041290560966866, "learning_rate": 1.0181985798221343e-07, "logits/chosen": -0.550287663936615, "logits/rejected": -0.5756568908691406, "logps/chosen": -453.2012634277344, "logps/rejected": -650.4305419921875, "loss": 0.5115, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7032783031463623, "rewards/margins": 1.8229873180389404, "rewards/rejected": -3.5262656211853027, "step": 3050 }, { "epoch": 0.7341650671785028, "grad_norm": 13.307257346339915, "learning_rate": 1.0013868158995329e-07, "logits/chosen": -0.5099418759346008, "logits/rejected": -0.5463215112686157, "logps/chosen": -423.4984436035156, "logps/rejected": -534.3104248046875, "loss": 0.488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6515445709228516, "rewards/margins": 1.3121881484985352, "rewards/rejected": -2.9637324810028076, "step": 3060 }, { "epoch": 0.7365642994241842, "grad_norm": 10.351075268333476, "learning_rate": 9.84680173737887e-08, "logits/chosen": -0.6528457403182983, "logits/rejected": -0.6615931987762451, "logps/chosen": -444.66717529296875, "logps/rejected": -533.7962646484375, "loss": 0.4646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.675614595413208, "rewards/margins": 1.2561010122299194, "rewards/rejected": -2.931715488433838, "step": 3070 }, { "epoch": 0.7389635316698656, "grad_norm": 9.443696454315829, "learning_rate": 9.680798252417713e-08, "logits/chosen": -0.6959069967269897, "logits/rejected": -0.7350667715072632, "logps/chosen": -372.649658203125, "logps/rejected": -555.9948120117188, "loss": 0.4445, "rewards/accuracies": 0.875, "rewards/chosen": -1.5621325969696045, "rewards/margins": 1.445049524307251, "rewards/rejected": -3.0071818828582764, "step": 3080 }, { "epoch": 0.741362763915547, "grad_norm": 11.47296651628307, "learning_rate": 9.515869348596808e-08, "logits/chosen": -0.6271109580993652, "logits/rejected": -0.6984132528305054, "logps/chosen": -474.30230712890625, "logps/rejected": -584.3538818359375, "loss": 0.4685, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6461492776870728, "rewards/margins": 1.4845248460769653, "rewards/rejected": -3.130673885345459, "step": 3090 }, { "epoch": 0.7437619961612284, "grad_norm": 10.021909817688707, "learning_rate": 9.352026595023493e-08, "logits/chosen": -0.6822315454483032, "logits/rejected": -0.6859509944915771, "logps/chosen": -464.59588623046875, "logps/rejected": -531.1356201171875, "loss": 0.4735, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.5760090351104736, "rewards/margins": 1.0710208415985107, "rewards/rejected": -2.6470298767089844, "step": 3100 }, { "epoch": 0.7461612284069098, "grad_norm": 12.68094636270811, "learning_rate": 9.189281484616004e-08, "logits/chosen": -0.6438357830047607, "logits/rejected": -0.6551543474197388, "logps/chosen": -383.98248291015625, "logps/rejected": -559.4468383789062, "loss": 0.4749, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.792620301246643, "rewards/margins": 1.217846155166626, "rewards/rejected": -3.0104660987854004, "step": 3110 }, { "epoch": 0.7485604606525912, "grad_norm": 11.028927932236359, "learning_rate": 9.027645433297249e-08, "logits/chosen": -0.5930813550949097, "logits/rejected": -0.5801911950111389, "logps/chosen": -542.3621826171875, "logps/rejected": -639.3001708984375, "loss": 0.4961, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2047979831695557, "rewards/margins": 1.3231174945831299, "rewards/rejected": -3.5279152393341064, "step": 3120 }, { "epoch": 0.7509596928982726, "grad_norm": 13.142799913341205, "learning_rate": 8.867129779194066e-08, "logits/chosen": -0.6943696141242981, "logits/rejected": -0.7373479604721069, "logps/chosen": -370.00933837890625, "logps/rejected": -546.3580322265625, "loss": 0.4559, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5144741535186768, "rewards/margins": 1.7807083129882812, "rewards/rejected": -3.295182466506958, "step": 3130 }, { "epoch": 0.753358925143954, "grad_norm": 11.188015298374586, "learning_rate": 8.707745781841866e-08, "logits/chosen": -0.6203271150588989, "logits/rejected": -0.6621488332748413, "logps/chosen": -396.5414123535156, "logps/rejected": -583.9200439453125, "loss": 0.4851, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.708814263343811, "rewards/margins": 1.800474762916565, "rewards/rejected": -3.509288787841797, "step": 3140 }, { "epoch": 0.7557581573896354, "grad_norm": 6.520933471158698, "learning_rate": 8.549504621394831e-08, "logits/chosen": -0.7140206694602966, "logits/rejected": -0.7158041000366211, "logps/chosen": -390.3932189941406, "logps/rejected": -608.0358276367188, "loss": 0.3909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4705148935317993, "rewards/margins": 2.1252219676971436, "rewards/rejected": -3.5957369804382324, "step": 3150 }, { "epoch": 0.7581573896353166, "grad_norm": 13.927678669681274, "learning_rate": 8.392417397841703e-08, "logits/chosen": -0.6197787523269653, "logits/rejected": -0.6519285440444946, "logps/chosen": -416.1162109375, "logps/rejected": -562.5856323242188, "loss": 0.4768, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.56044602394104, "rewards/margins": 1.3187508583068848, "rewards/rejected": -2.879196882247925, "step": 3160 }, { "epoch": 0.760556621880998, "grad_norm": 10.074795778090818, "learning_rate": 8.236495130227083e-08, "logits/chosen": -0.5864537358283997, "logits/rejected": -0.6371886730194092, "logps/chosen": -442.40313720703125, "logps/rejected": -615.7220458984375, "loss": 0.4786, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4601584672927856, "rewards/margins": 2.044390916824341, "rewards/rejected": -3.504549503326416, "step": 3170 }, { "epoch": 0.7629558541266794, "grad_norm": 13.663011005109329, "learning_rate": 8.081748755878612e-08, "logits/chosen": -0.6179635524749756, "logits/rejected": -0.6545384526252747, "logps/chosen": -452.73382568359375, "logps/rejected": -525.165283203125, "loss": 0.4475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.757580041885376, "rewards/margins": 1.2399779558181763, "rewards/rejected": -2.9975578784942627, "step": 3180 }, { "epoch": 0.7653550863723608, "grad_norm": 11.361503632955099, "learning_rate": 7.928189129639632e-08, "logits/chosen": -0.5514404773712158, "logits/rejected": -0.5337514281272888, "logps/chosen": -404.6654052734375, "logps/rejected": -557.6652221679688, "loss": 0.4311, "rewards/accuracies": 0.875, "rewards/chosen": -1.5441641807556152, "rewards/margins": 1.4246327877044678, "rewards/rejected": -2.968797206878662, "step": 3190 }, { "epoch": 0.7677543186180422, "grad_norm": 14.862474133401097, "learning_rate": 7.775827023107834e-08, "logits/chosen": -0.6156660318374634, "logits/rejected": -0.643264889717102, "logps/chosen": -428.32989501953125, "logps/rejected": -573.34130859375, "loss": 0.4982, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7375695705413818, "rewards/margins": 1.2411071062088013, "rewards/rejected": -2.9786763191223145, "step": 3200 }, { "epoch": 0.7701535508637236, "grad_norm": 11.951225803984062, "learning_rate": 7.624673123879682e-08, "logits/chosen": -0.6403513550758362, "logits/rejected": -0.6913474798202515, "logps/chosen": -411.776611328125, "logps/rejected": -524.7989501953125, "loss": 0.4563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.625312089920044, "rewards/margins": 1.2939417362213135, "rewards/rejected": -2.9192535877227783, "step": 3210 }, { "epoch": 0.772552783109405, "grad_norm": 9.593855988626308, "learning_rate": 7.474738034800663e-08, "logits/chosen": -0.7377493381500244, "logits/rejected": -0.728441596031189, "logps/chosen": -364.731201171875, "logps/rejected": -579.6549682617188, "loss": 0.4688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6445392370224, "rewards/margins": 2.2165136337280273, "rewards/rejected": -3.861053466796875, "step": 3220 }, { "epoch": 0.7749520153550864, "grad_norm": 12.640503406225628, "learning_rate": 7.326032273221606e-08, "logits/chosen": -0.6615322828292847, "logits/rejected": -0.6496458053588867, "logps/chosen": -469.51007080078125, "logps/rejected": -604.8178100585938, "loss": 0.4379, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.678299903869629, "rewards/margins": 1.5208964347839355, "rewards/rejected": -3.1991963386535645, "step": 3230 }, { "epoch": 0.7773512476007678, "grad_norm": 13.937512065135813, "learning_rate": 7.178566270260872e-08, "logits/chosen": -0.6518770456314087, "logits/rejected": -0.6966899633407593, "logps/chosen": -453.48797607421875, "logps/rejected": -625.8817138671875, "loss": 0.4913, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9149278402328491, "rewards/margins": 1.3741127252578735, "rewards/rejected": -3.2890403270721436, "step": 3240 }, { "epoch": 0.7797504798464492, "grad_norm": 11.229901391937847, "learning_rate": 7.032350370072709e-08, "logits/chosen": -0.5851765871047974, "logits/rejected": -0.6159471273422241, "logps/chosen": -437.6891174316406, "logps/rejected": -598.682861328125, "loss": 0.4339, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.56214439868927, "rewards/margins": 1.7052253484725952, "rewards/rejected": -3.2673697471618652, "step": 3250 }, { "epoch": 0.7821497120921305, "grad_norm": 11.146263352042403, "learning_rate": 6.887394829121596e-08, "logits/chosen": -0.6397983431816101, "logits/rejected": -0.7075640559196472, "logps/chosen": -459.11553955078125, "logps/rejected": -686.8187255859375, "loss": 0.4385, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8990970849990845, "rewards/margins": 2.3527369499206543, "rewards/rejected": -4.251833915710449, "step": 3260 }, { "epoch": 0.7845489443378119, "grad_norm": 11.067281012027754, "learning_rate": 6.743709815462833e-08, "logits/chosen": -0.702612042427063, "logits/rejected": -0.7422297596931458, "logps/chosen": -440.0489196777344, "logps/rejected": -574.8292846679688, "loss": 0.4316, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7014598846435547, "rewards/margins": 1.6431381702423096, "rewards/rejected": -3.3445980548858643, "step": 3270 }, { "epoch": 0.7869481765834933, "grad_norm": 9.830989747261166, "learning_rate": 6.601305408029287e-08, "logits/chosen": -0.5348180532455444, "logits/rejected": -0.5696184039115906, "logps/chosen": -442.927490234375, "logps/rejected": -583.0240478515625, "loss": 0.4608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.906508445739746, "rewards/margins": 1.4029037952423096, "rewards/rejected": -3.3094124794006348, "step": 3280 }, { "epoch": 0.7893474088291746, "grad_norm": 12.175693604464533, "learning_rate": 6.460191595924366e-08, "logits/chosen": -0.5926901698112488, "logits/rejected": -0.6048527956008911, "logps/chosen": -458.3211975097656, "logps/rejected": -586.0518798828125, "loss": 0.4435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8188337087631226, "rewards/margins": 1.2453868389129639, "rewards/rejected": -3.064220905303955, "step": 3290 }, { "epoch": 0.791746641074856, "grad_norm": 11.750711543669468, "learning_rate": 6.320378277721342e-08, "logits/chosen": -0.6148731112480164, "logits/rejected": -0.6050759553909302, "logps/chosen": -457.68878173828125, "logps/rejected": -555.8333740234375, "loss": 0.4667, "rewards/accuracies": 0.75, "rewards/chosen": -1.960931420326233, "rewards/margins": 1.0489509105682373, "rewards/rejected": -3.0098819732666016, "step": 3300 }, { "epoch": 0.7941458733205374, "grad_norm": 13.919290209857113, "learning_rate": 6.181875260769032e-08, "logits/chosen": -0.6457343101501465, "logits/rejected": -0.6977934241294861, "logps/chosen": -435.4483947753906, "logps/rejected": -547.3876953125, "loss": 0.4771, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3289425373077393, "rewards/margins": 1.8005937337875366, "rewards/rejected": -3.1295368671417236, "step": 3310 }, { "epoch": 0.7965451055662188, "grad_norm": 12.774155044726525, "learning_rate": 6.044692260503797e-08, "logits/chosen": -0.5455335378646851, "logits/rejected": -0.569166362285614, "logps/chosen": -488.89923095703125, "logps/rejected": -631.3294677734375, "loss": 0.4144, "rewards/accuracies": 0.875, "rewards/chosen": -1.8063256740570068, "rewards/margins": 1.6628071069717407, "rewards/rejected": -3.469132661819458, "step": 3320 }, { "epoch": 0.7989443378119002, "grad_norm": 11.394655126013888, "learning_rate": 5.9088388997680984e-08, "logits/chosen": -0.5993139147758484, "logits/rejected": -0.6322329044342041, "logps/chosen": -519.2103271484375, "logps/rejected": -596.4190063476562, "loss": 0.4286, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7323102951049805, "rewards/margins": 1.5095983743667603, "rewards/rejected": -3.241908550262451, "step": 3330 }, { "epoch": 0.8013435700575816, "grad_norm": 12.200878623728958, "learning_rate": 5.774324708135439e-08, "logits/chosen": -0.6741082668304443, "logits/rejected": -0.7074322700500488, "logps/chosen": -373.7408447265625, "logps/rejected": -486.2967224121094, "loss": 0.4564, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4916250705718994, "rewards/margins": 1.3450305461883545, "rewards/rejected": -2.836656093597412, "step": 3340 }, { "epoch": 0.803742802303263, "grad_norm": 10.453220701732908, "learning_rate": 5.641159121241953e-08, "logits/chosen": -0.651732325553894, "logits/rejected": -0.6395163536071777, "logps/chosen": -382.8707580566406, "logps/rejected": -592.2684936523438, "loss": 0.4593, "rewards/accuracies": 0.75, "rewards/chosen": -1.6059911251068115, "rewards/margins": 1.7453718185424805, "rewards/rejected": -3.351362943649292, "step": 3350 }, { "epoch": 0.8061420345489443, "grad_norm": 11.15644450768947, "learning_rate": 5.5093514801245106e-08, "logits/chosen": -0.58311527967453, "logits/rejected": -0.6077650189399719, "logps/chosen": -422.38287353515625, "logps/rejected": -605.2293701171875, "loss": 0.4403, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6799404621124268, "rewards/margins": 1.5105533599853516, "rewards/rejected": -3.1904940605163574, "step": 3360 }, { "epoch": 0.8085412667946257, "grad_norm": 13.646112951195525, "learning_rate": 5.378911030565453e-08, "logits/chosen": -0.520195722579956, "logits/rejected": -0.5245386362075806, "logps/chosen": -505.70819091796875, "logps/rejected": -675.5585327148438, "loss": 0.4727, "rewards/accuracies": 0.75, "rewards/chosen": -2.0382139682769775, "rewards/margins": 1.3585295677185059, "rewards/rejected": -3.3967432975769043, "step": 3370 }, { "epoch": 0.8109404990403071, "grad_norm": 10.108220217158234, "learning_rate": 5.249846922444101e-08, "logits/chosen": -0.6458074450492859, "logits/rejected": -0.7096244096755981, "logps/chosen": -390.3080139160156, "logps/rejected": -658.1427001953125, "loss": 0.4316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7075704336166382, "rewards/margins": 2.7826123237609863, "rewards/rejected": -4.490181922912598, "step": 3380 }, { "epoch": 0.8133397312859885, "grad_norm": 12.66986825653512, "learning_rate": 5.122168209094865e-08, "logits/chosen": -0.5679661631584167, "logits/rejected": -0.5969215631484985, "logps/chosen": -402.6626892089844, "logps/rejected": -498.47479248046875, "loss": 0.4469, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8141701221466064, "rewards/margins": 0.9520853757858276, "rewards/rejected": -2.7662553787231445, "step": 3390 }, { "epoch": 0.8157389635316699, "grad_norm": 9.834215615688265, "learning_rate": 4.995883846672222e-08, "logits/chosen": -0.5988723039627075, "logits/rejected": -0.6316601037979126, "logps/chosen": -566.8046875, "logps/rejected": -627.3985595703125, "loss": 0.4445, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.795799970626831, "rewards/margins": 1.456854224205017, "rewards/rejected": -3.2526543140411377, "step": 3400 }, { "epoch": 0.8181381957773513, "grad_norm": 11.502139473474164, "learning_rate": 4.871002693522486e-08, "logits/chosen": -0.5939972996711731, "logits/rejected": -0.5957666635513306, "logps/chosen": -431.6761169433594, "logps/rejected": -533.3076171875, "loss": 0.471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6297683715820312, "rewards/margins": 1.2114887237548828, "rewards/rejected": -2.841256856918335, "step": 3410 }, { "epoch": 0.8205374280230326, "grad_norm": 9.038305230217524, "learning_rate": 4.7475335095623956e-08, "logits/chosen": -0.598876416683197, "logits/rejected": -0.6024787425994873, "logps/chosen": -451.1143493652344, "logps/rejected": -610.4466552734375, "loss": 0.4555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8450918197631836, "rewards/margins": 1.7127292156219482, "rewards/rejected": -3.5578207969665527, "step": 3420 }, { "epoch": 0.822936660268714, "grad_norm": 17.498105828738847, "learning_rate": 4.6254849556646714e-08, "logits/chosen": -0.5433209538459778, "logits/rejected": -0.5503520965576172, "logps/chosen": -476.6543884277344, "logps/rejected": -635.9801635742188, "loss": 0.4553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6800941228866577, "rewards/margins": 1.8871549367904663, "rewards/rejected": -3.567249298095703, "step": 3430 }, { "epoch": 0.8253358925143954, "grad_norm": 12.97369382211523, "learning_rate": 4.504865593050483e-08, "logits/chosen": -0.5857795476913452, "logits/rejected": -0.6014319658279419, "logps/chosen": -460.88348388671875, "logps/rejected": -596.8003540039062, "loss": 0.4711, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8348299264907837, "rewards/margins": 1.2898591756820679, "rewards/rejected": -3.1246893405914307, "step": 3440 }, { "epoch": 0.8277351247600768, "grad_norm": 12.980103635170739, "learning_rate": 4.385683882688895e-08, "logits/chosen": -0.5943895578384399, "logits/rejected": -0.6202970743179321, "logps/chosen": -484.2207946777344, "logps/rejected": -530.2681884765625, "loss": 0.5219, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8965202569961548, "rewards/margins": 1.0006572008132935, "rewards/rejected": -2.897177219390869, "step": 3450 }, { "epoch": 0.8301343570057581, "grad_norm": 13.02526735064428, "learning_rate": 4.2679481847033985e-08, "logits/chosen": -0.6043378114700317, "logits/rejected": -0.6147600412368774, "logps/chosen": -457.24072265625, "logps/rejected": -620.4854125976562, "loss": 0.4657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8802976608276367, "rewards/margins": 1.4927122592926025, "rewards/rejected": -3.3730101585388184, "step": 3460 }, { "epoch": 0.8325335892514395, "grad_norm": 10.240226182122116, "learning_rate": 4.151666757785435e-08, "logits/chosen": -0.6241481304168701, "logits/rejected": -0.6314017176628113, "logps/chosen": -401.0560302734375, "logps/rejected": -634.5782470703125, "loss": 0.4354, "rewards/accuracies": 0.875, "rewards/chosen": -1.427109956741333, "rewards/margins": 2.2766079902648926, "rewards/rejected": -3.7037181854248047, "step": 3470 }, { "epoch": 0.8349328214971209, "grad_norm": 12.450431309081564, "learning_rate": 4.036847758615136e-08, "logits/chosen": -0.5069397687911987, "logits/rejected": -0.5848828554153442, "logps/chosen": -474.01043701171875, "logps/rejected": -627.2423706054688, "loss": 0.4704, "rewards/accuracies": 0.75, "rewards/chosen": -2.224709987640381, "rewards/margins": 1.437635898590088, "rewards/rejected": -3.6623454093933105, "step": 3480 }, { "epoch": 0.8373320537428023, "grad_norm": 10.138959991207273, "learning_rate": 3.923499241289113e-08, "logits/chosen": -0.6470298171043396, "logits/rejected": -0.6826261878013611, "logps/chosen": -513.265869140625, "logps/rejected": -608.2982177734375, "loss": 0.4976, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9213802814483643, "rewards/margins": 1.5853230953216553, "rewards/rejected": -3.5067031383514404, "step": 3490 }, { "epoch": 0.8397312859884837, "grad_norm": 7.737508793759033, "learning_rate": 3.811629156755541e-08, "logits/chosen": -0.5882548093795776, "logits/rejected": -0.6023901104927063, "logps/chosen": -484.13787841796875, "logps/rejected": -623.9624633789062, "loss": 0.4558, "rewards/accuracies": 0.75, "rewards/chosen": -1.8149983882904053, "rewards/margins": 1.4490883350372314, "rewards/rejected": -3.2640864849090576, "step": 3500 }, { "epoch": 0.8421305182341651, "grad_norm": 9.568696750818255, "learning_rate": 3.701245352256391e-08, "logits/chosen": -0.5902693867683411, "logits/rejected": -0.6269119381904602, "logps/chosen": -477.4012756347656, "logps/rejected": -554.7974853515625, "loss": 0.4444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.74009108543396, "rewards/margins": 1.0817813873291016, "rewards/rejected": -2.8218724727630615, "step": 3510 }, { "epoch": 0.8445297504798465, "grad_norm": 12.00695207332559, "learning_rate": 3.592355570776984e-08, "logits/chosen": -0.6704959273338318, "logits/rejected": -0.6980074644088745, "logps/chosen": -379.5652770996094, "logps/rejected": -551.5906372070312, "loss": 0.4273, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.503880500793457, "rewards/margins": 1.5545501708984375, "rewards/rejected": -3.0584301948547363, "step": 3520 }, { "epoch": 0.8469289827255279, "grad_norm": 9.257056695952906, "learning_rate": 3.484967450502904e-08, "logits/chosen": -0.583393394947052, "logits/rejected": -0.626370370388031, "logps/chosen": -371.80194091796875, "logps/rejected": -579.4976806640625, "loss": 0.4622, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.546886920928955, "rewards/margins": 1.578477382659912, "rewards/rejected": -3.1253647804260254, "step": 3530 }, { "epoch": 0.8493282149712092, "grad_norm": 14.494964173795129, "learning_rate": 3.3790885242841296e-08, "logits/chosen": -0.610164999961853, "logits/rejected": -0.6447314023971558, "logps/chosen": -432.84210205078125, "logps/rejected": -672.5138549804688, "loss": 0.419, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7256730794906616, "rewards/margins": 2.3827433586120605, "rewards/rejected": -4.1084160804748535, "step": 3540 }, { "epoch": 0.8517274472168906, "grad_norm": 10.946709475218693, "learning_rate": 3.274726219106677e-08, "logits/chosen": -0.6309023499488831, "logits/rejected": -0.6657734513282776, "logps/chosen": -481.0071716308594, "logps/rejected": -643.405029296875, "loss": 0.4691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7343776226043701, "rewards/margins": 1.648224115371704, "rewards/rejected": -3.3826019763946533, "step": 3550 }, { "epoch": 0.8541266794625719, "grad_norm": 10.689900945283165, "learning_rate": 3.171887855571642e-08, "logits/chosen": -0.618683934211731, "logits/rejected": -0.5845375657081604, "logps/chosen": -405.7684631347656, "logps/rejected": -498.9336853027344, "loss": 0.4636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7307859659194946, "rewards/margins": 0.9967991709709167, "rewards/rejected": -2.7275853157043457, "step": 3560 }, { "epoch": 0.8565259117082533, "grad_norm": 12.01909548635221, "learning_rate": 3.070580647381643e-08, "logits/chosen": -0.5815375447273254, "logits/rejected": -0.62933748960495, "logps/chosen": -413.2583923339844, "logps/rejected": -569.9872436523438, "loss": 0.4664, "rewards/accuracies": 0.75, "rewards/chosen": -1.719911813735962, "rewards/margins": 1.5588918924331665, "rewards/rejected": -3.278803586959839, "step": 3570 }, { "epoch": 0.8589251439539347, "grad_norm": 11.866805408924924, "learning_rate": 2.9708117008348576e-08, "logits/chosen": -0.57561856508255, "logits/rejected": -0.6152477264404297, "logps/chosen": -487.6710510253906, "logps/rejected": -560.8670043945312, "loss": 0.4308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7181422710418701, "rewards/margins": 1.274820327758789, "rewards/rejected": -2.992962598800659, "step": 3580 }, { "epoch": 0.8613243761996161, "grad_norm": 11.368661908364057, "learning_rate": 2.8725880143264992e-08, "logits/chosen": -0.6161478757858276, "logits/rejected": -0.621782660484314, "logps/chosen": -455.6564025878906, "logps/rejected": -615.894775390625, "loss": 0.5076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.038883686065674, "rewards/margins": 1.1252130270004272, "rewards/rejected": -3.1640963554382324, "step": 3590 }, { "epoch": 0.8637236084452975, "grad_norm": 16.14783859072051, "learning_rate": 2.775916477857948e-08, "logits/chosen": -0.6006742715835571, "logits/rejected": -0.6039419770240784, "logps/chosen": -416.3548278808594, "logps/rejected": -545.7139892578125, "loss": 0.4477, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8900909423828125, "rewards/margins": 1.278205394744873, "rewards/rejected": -3.1682963371276855, "step": 3600 }, { "epoch": 0.8661228406909789, "grad_norm": 11.990331851376549, "learning_rate": 2.680803872553408e-08, "logits/chosen": -0.624252200126648, "logits/rejected": -0.7020074725151062, "logps/chosen": -399.41583251953125, "logps/rejected": -661.79638671875, "loss": 0.4534, "rewards/accuracies": 0.875, "rewards/chosen": -1.495900273323059, "rewards/margins": 2.714350461959839, "rewards/rejected": -4.210250377655029, "step": 3610 }, { "epoch": 0.8685220729366603, "grad_norm": 14.100883523011712, "learning_rate": 2.5872568701842706e-08, "logits/chosen": -0.58869868516922, "logits/rejected": -0.6457717418670654, "logps/chosen": -384.4469299316406, "logps/rejected": -560.6753540039062, "loss": 0.5292, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7529369592666626, "rewards/margins": 1.4963237047195435, "rewards/rejected": -3.249260425567627, "step": 3620 }, { "epoch": 0.8709213051823417, "grad_norm": 14.528120128162028, "learning_rate": 2.495282032701096e-08, "logits/chosen": -0.6547173857688904, "logits/rejected": -0.7049331068992615, "logps/chosen": -343.5353698730469, "logps/rejected": -484.39617919921875, "loss": 0.4662, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.712456464767456, "rewards/margins": 1.597670316696167, "rewards/rejected": -3.310126781463623, "step": 3630 }, { "epoch": 0.8733205374280231, "grad_norm": 14.220779667409527, "learning_rate": 2.4048858117733133e-08, "logits/chosen": -0.6767258644104004, "logits/rejected": -0.7082260847091675, "logps/chosen": -444.30328369140625, "logps/rejected": -614.4713745117188, "loss": 0.4496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7594902515411377, "rewards/margins": 2.157029628753662, "rewards/rejected": -3.9165198802948, "step": 3640 }, { "epoch": 0.8757197696737045, "grad_norm": 11.643290676232011, "learning_rate": 2.3160745483366938e-08, "logits/chosen": -0.6050413846969604, "logits/rejected": -0.6035085916519165, "logps/chosen": -438.84942626953125, "logps/rejected": -599.5989990234375, "loss": 0.4611, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.013329029083252, "rewards/margins": 1.2097079753875732, "rewards/rejected": -3.223036527633667, "step": 3650 }, { "epoch": 0.8781190019193857, "grad_norm": 12.836965077883955, "learning_rate": 2.2288544721485197e-08, "logits/chosen": -0.7066579461097717, "logits/rejected": -0.7247270941734314, "logps/chosen": -371.9989929199219, "logps/rejected": -582.3410034179688, "loss": 0.4348, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5276857614517212, "rewards/margins": 1.909120798110962, "rewards/rejected": -3.4368062019348145, "step": 3660 }, { "epoch": 0.8805182341650671, "grad_norm": 11.52899292103731, "learning_rate": 2.1432317013506117e-08, "logits/chosen": -0.7156012654304504, "logits/rejected": -0.7498332262039185, "logps/chosen": -447.65216064453125, "logps/rejected": -560.2888793945312, "loss": 0.5002, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9097896814346313, "rewards/margins": 1.5398523807525635, "rewards/rejected": -3.449641704559326, "step": 3670 }, { "epoch": 0.8829174664107485, "grad_norm": 13.634086015337521, "learning_rate": 2.0592122420401704e-08, "logits/chosen": -0.5250085592269897, "logits/rejected": -0.57183438539505, "logps/chosen": -402.1532287597656, "logps/rejected": -514.146484375, "loss": 0.4715, "rewards/accuracies": 0.75, "rewards/chosen": -1.7628848552703857, "rewards/margins": 0.9792253375053406, "rewards/rejected": -2.7421107292175293, "step": 3680 }, { "epoch": 0.8853166986564299, "grad_norm": 10.931227876480555, "learning_rate": 1.976801987848459e-08, "logits/chosen": -0.6632574796676636, "logits/rejected": -0.6789246201515198, "logps/chosen": -438.82598876953125, "logps/rejected": -618.0784912109375, "loss": 0.4579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6889044046401978, "rewards/margins": 1.663873314857483, "rewards/rejected": -3.3527779579162598, "step": 3690 }, { "epoch": 0.8877159309021113, "grad_norm": 12.88089932313707, "learning_rate": 1.8960067195273987e-08, "logits/chosen": -0.6466517448425293, "logits/rejected": -0.6885952949523926, "logps/chosen": -386.842529296875, "logps/rejected": -563.5064697265625, "loss": 0.4368, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6945703029632568, "rewards/margins": 1.814061164855957, "rewards/rejected": -3.508631467819214, "step": 3700 }, { "epoch": 0.8901151631477927, "grad_norm": 11.33604603257016, "learning_rate": 1.816832104544072e-08, "logits/chosen": -0.5263174772262573, "logits/rejected": -0.5546278953552246, "logps/chosen": -467.58062744140625, "logps/rejected": -575.5345458984375, "loss": 0.4632, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8927171230316162, "rewards/margins": 1.3500896692276, "rewards/rejected": -3.2428061962127686, "step": 3710 }, { "epoch": 0.8925143953934741, "grad_norm": 10.620251504859734, "learning_rate": 1.7392836966831553e-08, "logits/chosen": -0.5253115892410278, "logits/rejected": -0.5563468933105469, "logps/chosen": -434.1669921875, "logps/rejected": -624.7203369140625, "loss": 0.4168, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.62359619140625, "rewards/margins": 2.1972010135650635, "rewards/rejected": -3.8207976818084717, "step": 3720 }, { "epoch": 0.8949136276391555, "grad_norm": 13.275739947749992, "learning_rate": 1.663366935657373e-08, "logits/chosen": -0.6284958124160767, "logits/rejected": -0.6665322780609131, "logps/chosen": -392.99346923828125, "logps/rejected": -562.0194091796875, "loss": 0.4876, "rewards/accuracies": 0.75, "rewards/chosen": -1.7423441410064697, "rewards/margins": 1.6086766719818115, "rewards/rejected": -3.3510212898254395, "step": 3730 }, { "epoch": 0.8973128598848369, "grad_norm": 15.530425079669428, "learning_rate": 1.5890871467258898e-08, "logits/chosen": -0.5325186252593994, "logits/rejected": -0.5298448204994202, "logps/chosen": -506.15118408203125, "logps/rejected": -606.4880981445312, "loss": 0.4461, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7241334915161133, "rewards/margins": 1.3581666946411133, "rewards/rejected": -3.0822999477386475, "step": 3740 }, { "epoch": 0.8997120921305183, "grad_norm": 10.19750465326913, "learning_rate": 1.5164495403207967e-08, "logits/chosen": -0.6508103609085083, "logits/rejected": -0.6848149299621582, "logps/chosen": -467.98046875, "logps/rejected": -680.7239379882812, "loss": 0.4473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8734798431396484, "rewards/margins": 1.8285648822784424, "rewards/rejected": -3.702044725418091, "step": 3750 }, { "epoch": 0.9021113243761996, "grad_norm": 12.346658548809334, "learning_rate": 1.4454592116815962e-08, "logits/chosen": -0.5491658449172974, "logits/rejected": -0.5658199787139893, "logps/chosen": -436.49505615234375, "logps/rejected": -605.0525512695312, "loss": 0.4476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5681920051574707, "rewards/margins": 1.4636965990066528, "rewards/rejected": -3.031888484954834, "step": 3760 }, { "epoch": 0.904510556621881, "grad_norm": 8.029053228209763, "learning_rate": 1.3761211404977934e-08, "logits/chosen": -0.6819595098495483, "logits/rejected": -0.6787452101707458, "logps/chosen": -416.7828063964844, "logps/rejected": -647.9034423828125, "loss": 0.4124, "rewards/accuracies": 0.75, "rewards/chosen": -1.8057578802108765, "rewards/margins": 2.299898862838745, "rewards/rejected": -4.105656623840332, "step": 3770 }, { "epoch": 0.9069097888675623, "grad_norm": 12.064654099310772, "learning_rate": 1.3084401905596177e-08, "logits/chosen": -0.6439425349235535, "logits/rejected": -0.7118849158287048, "logps/chosen": -462.96624755859375, "logps/rejected": -571.2410278320312, "loss": 0.4681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6763248443603516, "rewards/margins": 1.4996883869171143, "rewards/rejected": -3.176013469696045, "step": 3780 }, { "epoch": 0.9093090211132437, "grad_norm": 11.710340217031446, "learning_rate": 1.2424211094168053e-08, "logits/chosen": -0.4598866403102875, "logits/rejected": -0.5065708756446838, "logps/chosen": -502.1546936035156, "logps/rejected": -631.9319458007812, "loss": 0.4355, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6645265817642212, "rewards/margins": 1.4170030355453491, "rewards/rejected": -3.0815296173095703, "step": 3790 }, { "epoch": 0.9117082533589251, "grad_norm": 10.967428634066259, "learning_rate": 1.1780685280456143e-08, "logits/chosen": -0.5946656465530396, "logits/rejected": -0.6331689953804016, "logps/chosen": -513.2894287109375, "logps/rejected": -712.6598510742188, "loss": 0.5091, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.061929225921631, "rewards/margins": 1.8402255773544312, "rewards/rejected": -3.9021544456481934, "step": 3800 }, { "epoch": 0.9141074856046065, "grad_norm": 12.665118166759022, "learning_rate": 1.1153869605239564e-08, "logits/chosen": -0.5937948226928711, "logits/rejected": -0.6147378087043762, "logps/chosen": -453.622314453125, "logps/rejected": -509.7294006347656, "loss": 0.4617, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6426572799682617, "rewards/margins": 1.0369822978973389, "rewards/rejected": -2.6796395778656006, "step": 3810 }, { "epoch": 0.9165067178502879, "grad_norm": 13.103880470124455, "learning_rate": 1.0543808037147606e-08, "logits/chosen": -0.6876846551895142, "logits/rejected": -0.7068900465965271, "logps/chosen": -422.0281677246094, "logps/rejected": -667.2548828125, "loss": 0.448, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5988212823867798, "rewards/margins": 2.381234645843506, "rewards/rejected": -3.980056047439575, "step": 3820 }, { "epoch": 0.9189059500959693, "grad_norm": 8.997982934710759, "learning_rate": 9.95054336957557e-09, "logits/chosen": -0.6352418661117554, "logits/rejected": -0.6384015083312988, "logps/chosen": -425.876220703125, "logps/rejected": -567.1570434570312, "loss": 0.4062, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.594925045967102, "rewards/margins": 1.3477894067764282, "rewards/rejected": -2.9427144527435303, "step": 3830 }, { "epoch": 0.9213051823416507, "grad_norm": 11.532295976661894, "learning_rate": 9.37411721768286e-09, "logits/chosen": -0.5964576601982117, "logits/rejected": -0.6410446763038635, "logps/chosen": -464.87823486328125, "logps/rejected": -694.9117431640625, "loss": 0.4116, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9001286029815674, "rewards/margins": 1.8672752380371094, "rewards/rejected": -3.767404079437256, "step": 3840 }, { "epoch": 0.9237044145873321, "grad_norm": 11.154793279181051, "learning_rate": 8.81457001547392e-09, "logits/chosen": -0.5532232522964478, "logits/rejected": -0.537521481513977, "logps/chosen": -463.60540771484375, "logps/rejected": -587.6224365234375, "loss": 0.4493, "rewards/accuracies": 0.875, "rewards/chosen": -1.8449652194976807, "rewards/margins": 1.1032222509384155, "rewards/rejected": -2.9481875896453857, "step": 3850 }, { "epoch": 0.9261036468330134, "grad_norm": 11.05082468706879, "learning_rate": 8.271941012961942e-09, "logits/chosen": -0.546120822429657, "logits/rejected": -0.5496604442596436, "logps/chosen": -394.8270568847656, "logps/rejected": -661.518310546875, "loss": 0.452, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7090051174163818, "rewards/margins": 2.0601634979248047, "rewards/rejected": -3.7691688537597656, "step": 3860 }, { "epoch": 0.9285028790786948, "grad_norm": 12.11334503986506, "learning_rate": 7.746268273415568e-09, "logits/chosen": -0.6160927414894104, "logits/rejected": -0.5806897878646851, "logps/chosen": -448.4591369628906, "logps/rejected": -560.7288818359375, "loss": 0.4697, "rewards/accuracies": 0.75, "rewards/chosen": -1.6977341175079346, "rewards/margins": 0.6812986135482788, "rewards/rejected": -2.379032611846924, "step": 3870 }, { "epoch": 0.9309021113243762, "grad_norm": 11.544284346542328, "learning_rate": 7.237588670689076e-09, "logits/chosen": -0.6722389459609985, "logits/rejected": -0.7193390727043152, "logps/chosen": -424.52545166015625, "logps/rejected": -611.8565063476562, "loss": 0.4297, "rewards/accuracies": 0.75, "rewards/chosen": -1.7004144191741943, "rewards/margins": 2.1898179054260254, "rewards/rejected": -3.890232801437378, "step": 3880 }, { "epoch": 0.9333013435700576, "grad_norm": 11.715147679206341, "learning_rate": 6.745937886635606e-09, "logits/chosen": -0.5881049633026123, "logits/rejected": -0.6139761805534363, "logps/chosen": -464.87908935546875, "logps/rejected": -680.6104125976562, "loss": 0.4339, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7303098440170288, "rewards/margins": 2.0931639671325684, "rewards/rejected": -3.8234734535217285, "step": 3890 }, { "epoch": 0.935700575815739, "grad_norm": 10.400118656620084, "learning_rate": 6.271350408604409e-09, "logits/chosen": -0.6110928058624268, "logits/rejected": -0.6207016706466675, "logps/chosen": -360.6642761230469, "logps/rejected": -565.9451904296875, "loss": 0.427, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2418937683105469, "rewards/margins": 1.8387107849121094, "rewards/rejected": -3.0806050300598145, "step": 3900 }, { "epoch": 0.9380998080614203, "grad_norm": 10.41650137772363, "learning_rate": 5.813859527021487e-09, "logits/chosen": -0.5888563394546509, "logits/rejected": -0.6184204816818237, "logps/chosen": -425.99176025390625, "logps/rejected": -601.2418823242188, "loss": 0.4341, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6858470439910889, "rewards/margins": 2.0192112922668457, "rewards/rejected": -3.7050583362579346, "step": 3910 }, { "epoch": 0.9404990403071017, "grad_norm": 11.8325653977223, "learning_rate": 5.373497333054616e-09, "logits/chosen": -0.627325713634491, "logits/rejected": -0.6293385028839111, "logps/chosen": -477.244140625, "logps/rejected": -571.5916748046875, "loss": 0.485, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9243043661117554, "rewards/margins": 1.038652777671814, "rewards/rejected": -2.9629569053649902, "step": 3920 }, { "epoch": 0.9428982725527831, "grad_norm": 12.78783329314139, "learning_rate": 4.950294716362213e-09, "logits/chosen": -0.5876864194869995, "logits/rejected": -0.6184590458869934, "logps/chosen": -502.6764221191406, "logps/rejected": -607.631103515625, "loss": 0.4598, "rewards/accuracies": 0.625, "rewards/chosen": -1.936902403831482, "rewards/margins": 1.1025243997573853, "rewards/rejected": -3.0394270420074463, "step": 3930 }, { "epoch": 0.9452975047984645, "grad_norm": 9.281745185619053, "learning_rate": 4.544281362926422e-09, "logits/chosen": -0.6346616148948669, "logits/rejected": -0.6384531855583191, "logps/chosen": -471.59912109375, "logps/rejected": -625.6812133789062, "loss": 0.4382, "rewards/accuracies": 0.75, "rewards/chosen": -1.560572862625122, "rewards/margins": 1.6168543100357056, "rewards/rejected": -3.177427291870117, "step": 3940 }, { "epoch": 0.9476967370441459, "grad_norm": 11.774371944259814, "learning_rate": 4.15548575297095e-09, "logits/chosen": -0.6360484957695007, "logits/rejected": -0.6688522696495056, "logps/chosen": -415.83685302734375, "logps/rejected": -604.4151611328125, "loss": 0.4287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7383111715316772, "rewards/margins": 1.9439513683319092, "rewards/rejected": -3.682262420654297, "step": 3950 }, { "epoch": 0.9500959692898272, "grad_norm": 9.746861342706476, "learning_rate": 3.7839351589631366e-09, "logits/chosen": -0.6292358040809631, "logits/rejected": -0.5761995911598206, "logps/chosen": -410.7013244628906, "logps/rejected": -591.5662841796875, "loss": 0.4475, "rewards/accuracies": 0.75, "rewards/chosen": -1.7394983768463135, "rewards/margins": 1.2296544313430786, "rewards/rejected": -2.9691526889801025, "step": 3960 }, { "epoch": 0.9524952015355086, "grad_norm": 11.778423651753915, "learning_rate": 3.4296556437010405e-09, "logits/chosen": -0.6809018850326538, "logits/rejected": -0.695138692855835, "logps/chosen": -382.27337646484375, "logps/rejected": -545.0702514648438, "loss": 0.4623, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.746303915977478, "rewards/margins": 1.5834695100784302, "rewards/rejected": -3.32977294921875, "step": 3970 }, { "epoch": 0.95489443378119, "grad_norm": 12.78112886016458, "learning_rate": 3.092672058485124e-09, "logits/chosen": -0.6508705615997314, "logits/rejected": -0.6388789415359497, "logps/chosen": -406.3719482421875, "logps/rejected": -647.2185668945312, "loss": 0.4965, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6541764736175537, "rewards/margins": 2.194026231765747, "rewards/rejected": -3.8482024669647217, "step": 3980 }, { "epoch": 0.9572936660268714, "grad_norm": 11.56058546078559, "learning_rate": 2.7730080413750356e-09, "logits/chosen": -0.5227060914039612, "logits/rejected": -0.5568557977676392, "logps/chosen": -435.97833251953125, "logps/rejected": -572.4579467773438, "loss": 0.4687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5814177989959717, "rewards/margins": 1.316449522972107, "rewards/rejected": -2.897867441177368, "step": 3990 }, { "epoch": 0.9596928982725528, "grad_norm": 10.601860302681265, "learning_rate": 2.4706860155316033e-09, "logits/chosen": -0.6122329235076904, "logits/rejected": -0.6286668181419373, "logps/chosen": -524.0120849609375, "logps/rejected": -665.0354614257812, "loss": 0.4644, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8280586004257202, "rewards/margins": 1.3872017860412598, "rewards/rejected": -3.2152607440948486, "step": 4000 }, { "epoch": 0.9596928982725528, "eval_logits/chosen": -0.6084198951721191, "eval_logits/rejected": -0.626106858253479, "eval_logps/chosen": -439.457275390625, "eval_logps/rejected": -623.4827270507812, "eval_loss": 0.43927037715911865, "eval_rewards/accuracies": 0.8285714387893677, "eval_rewards/chosen": -1.7351824045181274, "eval_rewards/margins": 1.7476173639297485, "eval_rewards/rejected": -3.482799530029297, "eval_runtime": 205.925, "eval_samples_per_second": 21.663, "eval_steps_per_second": 0.34, "step": 4000 }, { "epoch": 0.9620921305182342, "grad_norm": 12.00737344425498, "learning_rate": 2.185727187643843e-09, "logits/chosen": -0.6551751494407654, "logits/rejected": -0.6754254102706909, "logps/chosen": -384.97015380859375, "logps/rejected": -610.7628784179688, "loss": 0.4753, "rewards/accuracies": 0.875, "rewards/chosen": -1.6510511636734009, "rewards/margins": 2.127711296081543, "rewards/rejected": -3.7787623405456543, "step": 4010 }, { "epoch": 0.9644913627639156, "grad_norm": 13.508620118248077, "learning_rate": 1.9181515464413434e-09, "logits/chosen": -0.5892629623413086, "logits/rejected": -0.6146517395973206, "logps/chosen": -518.9400024414062, "logps/rejected": -719.6813354492188, "loss": 0.4167, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4437657594680786, "rewards/margins": 1.971710205078125, "rewards/rejected": -3.4154763221740723, "step": 4020 }, { "epoch": 0.966890595009597, "grad_norm": 10.39563083030194, "learning_rate": 1.6679778612923302e-09, "logits/chosen": -0.5608581304550171, "logits/rejected": -0.6161444187164307, "logps/chosen": -488.674560546875, "logps/rejected": -594.23681640625, "loss": 0.4207, "rewards/accuracies": 0.75, "rewards/chosen": -1.8151352405548096, "rewards/margins": 1.053483247756958, "rewards/rejected": -2.8686180114746094, "step": 4030 }, { "epoch": 0.9692898272552783, "grad_norm": 11.778106767916414, "learning_rate": 1.43522368088686e-09, "logits/chosen": -0.5647310018539429, "logits/rejected": -0.6243175864219666, "logps/chosen": -465.7294006347656, "logps/rejected": -713.4627685546875, "loss": 0.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9448398351669312, "rewards/margins": 2.4202308654785156, "rewards/rejected": -4.365070819854736, "step": 4040 }, { "epoch": 0.9716890595009597, "grad_norm": 13.924155842753079, "learning_rate": 1.2199053320059993e-09, "logits/chosen": -0.5596794486045837, "logits/rejected": -0.5762395858764648, "logps/chosen": -459.1459045410156, "logps/rejected": -605.2967529296875, "loss": 0.4569, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7754650115966797, "rewards/margins": 1.3398131132125854, "rewards/rejected": -3.1152782440185547, "step": 4050 }, { "epoch": 0.974088291746641, "grad_norm": 10.101674481691141, "learning_rate": 1.0220379183764338e-09, "logits/chosen": -0.6857717633247375, "logits/rejected": -0.67542564868927, "logps/chosen": -366.9244384765625, "logps/rejected": -573.9992065429688, "loss": 0.4449, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5600941181182861, "rewards/margins": 1.933546781539917, "rewards/rejected": -3.493640899658203, "step": 4060 }, { "epoch": 0.9764875239923224, "grad_norm": 10.9782818923637, "learning_rate": 8.416353196111503e-10, "logits/chosen": -0.5480167269706726, "logits/rejected": -0.551671028137207, "logps/chosen": -435.30120849609375, "logps/rejected": -583.2462158203125, "loss": 0.4928, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.900551438331604, "rewards/margins": 1.5537331104278564, "rewards/rejected": -3.45428466796875, "step": 4070 }, { "epoch": 0.9788867562380038, "grad_norm": 13.709217546306082, "learning_rate": 6.787101902356873e-10, "logits/chosen": -0.5955997705459595, "logits/rejected": -0.5746399760246277, "logps/chosen": -457.44512939453125, "logps/rejected": -655.3438110351562, "loss": 0.4125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.843754529953003, "rewards/margins": 1.7090778350830078, "rewards/rejected": -3.5528323650360107, "step": 4080 }, { "epoch": 0.9812859884836852, "grad_norm": 15.002773863211088, "learning_rate": 5.332739588005953e-10, "logits/chosen": -0.7055156826972961, "logits/rejected": -0.7228876352310181, "logps/chosen": -376.8843994140625, "logps/rejected": -592.8224487304688, "loss": 0.4463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6472818851470947, "rewards/margins": 1.8167731761932373, "rewards/rejected": -3.464055299758911, "step": 4090 }, { "epoch": 0.9836852207293666, "grad_norm": 13.389436536453555, "learning_rate": 4.053368270797164e-10, "logits/chosen": -0.5252457857131958, "logits/rejected": -0.5572882890701294, "logps/chosen": -437.04620361328125, "logps/rejected": -581.270751953125, "loss": 0.4427, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8863855600357056, "rewards/margins": 1.4563047885894775, "rewards/rejected": -3.3426902294158936, "step": 4100 }, { "epoch": 0.986084452975048, "grad_norm": 9.112429191727882, "learning_rate": 2.949077693545354e-10, "logits/chosen": -0.49874311685562134, "logits/rejected": -0.5566374063491821, "logps/chosen": -467.4441833496094, "logps/rejected": -625.98681640625, "loss": 0.4872, "rewards/accuracies": 0.75, "rewards/chosen": -1.7639166116714478, "rewards/margins": 1.2108697891235352, "rewards/rejected": -2.9747862815856934, "step": 4110 }, { "epoch": 0.9884836852207294, "grad_norm": 10.18504682387471, "learning_rate": 2.0199453178471047e-10, "logits/chosen": -0.5316934585571289, "logits/rejected": -0.5874772071838379, "logps/chosen": -509.3759765625, "logps/rejected": -588.4547119140625, "loss": 0.4279, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8416917324066162, "rewards/margins": 1.1702340841293335, "rewards/rejected": -3.0119261741638184, "step": 4120 }, { "epoch": 0.9908829174664108, "grad_norm": 9.481256396419454, "learning_rate": 1.266036318647301e-10, "logits/chosen": -0.593826413154602, "logits/rejected": -0.6198239326477051, "logps/chosen": -489.977294921875, "logps/rejected": -679.3530883789062, "loss": 0.42, "rewards/accuracies": 0.875, "rewards/chosen": -1.5848186016082764, "rewards/margins": 2.1017608642578125, "rewards/rejected": -3.686579465866089, "step": 4130 }, { "epoch": 0.9932821497120922, "grad_norm": 14.774926148490673, "learning_rate": 6.874035796672339e-11, "logits/chosen": -0.6392898559570312, "logits/rejected": -0.65348219871521, "logps/chosen": -449.4820251464844, "logps/rejected": -634.1976318359375, "loss": 0.4374, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4988961219787598, "rewards/margins": 2.359536647796631, "rewards/rejected": -3.858433246612549, "step": 4140 }, { "epoch": 0.9956813819577736, "grad_norm": 13.569592726410221, "learning_rate": 2.8408768969423458e-11, "logits/chosen": -0.6521973609924316, "logits/rejected": -0.6605287194252014, "logps/chosen": -451.91217041015625, "logps/rejected": -617.4226684570312, "loss": 0.446, "rewards/accuracies": 0.75, "rewards/chosen": -1.6109716892242432, "rewards/margins": 1.5008330345153809, "rewards/rejected": -3.111804485321045, "step": 4150 }, { "epoch": 0.9980806142034548, "grad_norm": 14.763770162561924, "learning_rate": 5.611693973617271e-12, "logits/chosen": -0.5573083162307739, "logits/rejected": -0.5667176246643066, "logps/chosen": -397.83111572265625, "logps/rejected": -570.945068359375, "loss": 0.465, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6926639080047607, "rewards/margins": 1.5600066184997559, "rewards/rejected": -3.2526707649230957, "step": 4160 }, { "epoch": 1.0, "step": 4168, "total_flos": 0.0, "train_loss": 0.5098277106738136, "train_runtime": 16630.655, "train_samples_per_second": 8.019, "train_steps_per_second": 0.251 } ], "logging_steps": 10, "max_steps": 4168, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }